From 1f3f9c07d5fb862b356f5b145bb11e30b4fa63fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 18 Oct 2022 14:31:46 +0200 Subject: [PATCH 001/108] Add "how to cite us" info to README and index.md (#668) --- README.md | 8 ++++++++ jupyter-book/index.md | 15 +++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f0be651eb..ae9ceec47 100644 --- a/README.md +++ b/README.md @@ -32,3 +32,11 @@ See instructions [here](./local-install-instructions.md) ## Contributing See [CONTRIBUTING.md](CONTRIBUTING.md) + +## How to cite us + +The MOOC material is developed publicly under the [CC-BY +license](https://github.com/INRIA/scikit-learn-mooc/blob/main/LICENSE). + +You can cite us through the project's Zenodo archive using the following DOI: +[10.5281/zenodo.7220306](https://doi.org/10.5281/zenodo.7220306). diff --git a/jupyter-book/index.md b/jupyter-book/index.md index b848c5375..cab66acbc 100644 --- a/jupyter-book/index.md +++ b/jupyter-book/index.md @@ -72,15 +72,18 @@ or use the following resources: ## MOOC material -The MOOC material is developed publicly under the [CC-By license]( -https://github.com/INRIA/scikit-learn-mooc/blob/main/LICENSE), -including the notebooks, exercises and solutions to the exercises -(but not the quizz solutions ;) via the following GitHub -repository: +The MOOC material is developed publicly under the [CC-BY license]( +https://github.com/INRIA/scikit-learn-mooc/blob/main/LICENSE). + +You can cite us through the project's Zenodo archive using the following DOI: +[10.5281/zenodo.7220306](https://doi.org/10.5281/zenodo.7220306). + +The following repository includes the notebooks, exercises and solutions to the +exercises (but not the quizz solutions ;): https://github.com/INRIA/scikit-learn-mooc/ -This is also published as a static website at: +The MOOC material is also published as a static website at: https://inria.github.io/scikit-learn-mooc/ From 30fafac9c5dc7a551c79fdd8f447310e9d856d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 20 Oct 2022 11:02:46 +0200 Subject: [PATCH 002/108] Update full-index.md --- full-index.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/full-index.md b/full-index.md index fd55992df..c0c19aea5 100644 --- a/full-index.md +++ b/full-index.md @@ -36,7 +36,8 @@ * [Using numerical and categorical variables together](notebooks/03_categorical_pipeline_column_transformer.ipynb) * [๐Ÿ“ Exercise M1.05](notebooks/03_categorical_pipeline_ex_02.ipynb) * [๐Ÿ“ƒ Solution for Exercise M1.05](notebooks/03_categorical_pipeline_sol_02.ipynb) -* [๐ŸŽฅ Visualizing scikit-learn pipelines in Jupyter](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_video.html) +* [๐ŸŽฅ Visualizing scikit-learn pipelines in Jupyter](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_visualization_video.html) +* [Visualizing scikit-learn pipelines in Jupyter](notebooks/03_categorical_pipeline_visualization.ipynb) * [โœ… Quiz M1.03](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_quiz_m1_03.html) [๐Ÿ Wrap-up quiz 1](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/wrap_up_quiz.html) From 5d27ddd8fa00f73c70a5061949b09cb7a50eb4cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 7 Nov 2022 13:37:15 +0100 Subject: [PATCH 003/108] Disable thebe integration (#670) --- 
jupyter-book/_config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jupyter-book/_config.yml b/jupyter-book/_config.yml index deccfce27..ef85cf4da 100644 --- a/jupyter-book/_config.yml +++ b/jupyter-book/_config.yml @@ -80,7 +80,9 @@ launch_buttons: binderhub_url: "https://mybinder.org" # colab_url: "https://colab.research.google.com" # Not working for now, # because it needs .ipynb - thebe: true +# Disable thebe support since it does not start in the right folder, see +# https://github.com/INRIA/scikit-learn-mooc/issues/669 for more details +# thebe: true binder: binderhub_url : "https://mybinder.org" From e46fef7096f91f802da79bbea59e1c7f536aed9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 10 Nov 2022 16:30:41 +0100 Subject: [PATCH 004/108] Remove deprecated distutils.Version in check_env.py (#672) Fix https://github.com/INRIA/scikit-learn-mooc/issues/549 --- .github/workflows/slides.yml | 7 ---- check_env.py | 70 +++++++++++++++++++----------------- environment-dev.yml | 1 + environment.yml | 1 + 4 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/workflows/slides.yml b/.github/workflows/slides.yml index e07b49b89..7fbb5aa87 100644 --- a/.github/workflows/slides.yml +++ b/.github/workflows/slides.yml @@ -24,17 +24,10 @@ jobs: working-directory: ./slides run: pip install -r requirements.txt - - name: Install npm dependencies - run: npm install -g bellbind/remarkjs-pdf - - name: Export html working-directory: ./slides run: make && mkdir ../dist && mv *.html ../dist/ - - name: Export pdf - working-directory: ./dist - run: for f in $(find -name "*.html" -not -name "index.html") ; do remarkjs-pdf $f || echo "Error converting $f to pdf"; done - - name: Upload distributed files uses: actions/upload-artifact@v1 with: diff --git a/check_env.py b/check_env.py index 8cb38f33e..c515a4792 100644 --- a/check_env.py +++ b/check_env.py @@ -1,22 +1,35 @@ -from __future__ import print_function -from distutils.version import LooseVersion as Version import sys +import importlib -OK = '\x1b[42m[ OK ]\x1b[0m' +OK = "\x1b[42m[ OK ]\x1b[0m" FAIL = "\x1b[41m[FAIL]\x1b[0m" try: - import importlib + from packaging.version import Version except ImportError: - print(FAIL, "Python version 3.6 or above is required," - " but %s is installed." % sys.version) + print(FAIL, "'packaging' package not installed, install it with conda or pip") + sys.exit(1) + +# first check the python version +print("Using python in", sys.prefix) +print(sys.version) +pyversion_str = f"{sys.version_info.major}.{sys.version_info.minor}" +pyversion = Version(pyversion_str) + +if pyversion < Version("3.8"): + print( + FAIL, + "Python version 3.8 or above is required," f" but {pyversion_str} is installed.", + ) + sys.exit(1) +print() def import_version(pkg, min_ver, fail_msg=""): mod = None try: mod = importlib.import_module(pkg) - if pkg in {'PIL'}: + if pkg in {"PIL"}: try: ver = mod.__version__ except AttributeError: @@ -25,39 +38,32 @@ def import_version(pkg, min_ver, fail_msg=""): except AttributeError: try: ver = mod.PILLOW_VERSION - except: + except Exception: raise else: ver = mod.__version__ - if Version(ver) < min_ver: - print(FAIL, "%s version %s or higher required, but %s installed." 
- % (lib, min_ver, ver)) + if Version(ver) < Version(min_ver): + print( + FAIL, + f"{lib} version {min_ver} or higher required, but {ver} installed.", + ) else: - print(OK, '%s version %s' % (pkg, ver)) + print(OK, f"{pkg} version {ver}") except ImportError: - print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) + print(FAIL, f"{pkg} not installed. {fail_msg}") return mod -# first check the python version -print('Using python in', sys.prefix) -print(sys.version) -pyversion = Version(sys.version) -if pyversion >= "3": - if pyversion < "3.6": - print(FAIL, "Python version 3.6 or above is required," - " but %s is installed." % sys.version) -elif pyversion >= "2": - print(FAIL, "Python version 3.6 or above is required," - " but %s is installed." % sys.version) -else: - print(FAIL, "Unknown Python version: %s" % sys.version) - -print() -requirements = {'numpy': "1.16", 'scipy': "1.2", 'matplotlib': "3.0", - 'sklearn': "1.1", 'pandas': "1", - 'seaborn': "0.11", - 'notebook': "5.7", 'plotly': "5.10"} +requirements = { + "numpy": "1.16", + "scipy": "1.2", + "matplotlib": "3.0", + "sklearn": "1.1", + "pandas": "1", + "seaborn": "0.11", + "notebook": "5.7", + "plotly": "5.10", +} # now the dependencies for lib, required_version in list(requirements.items()): diff --git a/environment-dev.yml b/environment-dev.yml index de3403f29..e997e287a 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -10,6 +10,7 @@ dependencies: - jupytext - beautifulsoup4 - IPython + - packaging - pip - pip: - jupyter-book >= 0.11 diff --git a/environment.yml b/environment.yml index 3206ba33b..10e5c3bca 100644 --- a/environment.yml +++ b/environment.yml @@ -13,3 +13,4 @@ dependencies: - jupytext - plotly >= 5.10 - IPython + - packaging From 59e27aba61a44a4b605677553d2733f3b07cfd14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 10 Nov 2022 17:16:20 +0100 Subject: [PATCH 005/108] Remove slides-ci workflow and remove instructions to generate html slides. 
Fix #671 --- .github/workflows/slides.yml | 35 ------------------------- Makefile | 50 ------------------------------------ slides/README.md | 13 ---------- slides/requirements.txt | 1 - 4 files changed, 99 deletions(-) delete mode 100644 .github/workflows/slides.yml delete mode 100644 Makefile delete mode 100644 slides/requirements.txt diff --git a/.github/workflows/slides.yml b/.github/workflows/slides.yml deleted file mode 100644 index 7fbb5aa87..000000000 --- a/.github/workflows/slides.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: slides-ci - -on: - push: - branches: - - main - -jobs: - build: - - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Setup Node - uses: actions/setup-node@v1 - with: - node-version: '12.x' - - uses: actions/setup-python@v1 - with: - python-version: '3.x' - architecture: 'x64' - - - name: Install python dependencies - working-directory: ./slides - run: pip install -r requirements.txt - - - name: Export html - working-directory: ./slides - run: make && mkdir ../dist && mv *.html ../dist/ - - - name: Upload distributed files - uses: actions/upload-artifact@v1 - with: - name: dist - path: dist/ diff --git a/Makefile b/Makefile deleted file mode 100644 index 4356313e2..000000000 --- a/Makefile +++ /dev/null @@ -1,50 +0,0 @@ -PYTHON_SCRIPTS_DIR = python_scripts -NOTEBOOKS_DIR = notebooks -JUPYTER_BOOK_DIR = jupyter-book -JUPYTER_KERNEL := python3 -MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@") - -# This assumes that the folder mooc-scikit-learn-coordination and -# scikit-learn-mooc are siblings, e.g. the repos are in the -# ~/dev/mooc-scikit-learn-coordination and ~/dev/scikit-learn-mooc. This should -# be the case in most development setups. 
If not then you can pass the -# GITLAB_REPO_JUPYTERBOOK_DIR variable with -# make -e GITLAB_REPO_JUPYTERBOOK_DIR=your/gitlab/repo/jupyter-book-dir/goes-here -GITLAB_REPO_JUPYTERBOOK_DIR = ../mooc-scikit-learn-coordination/jupyter-book - -all: $(NOTEBOOKS_DIR) - -.PHONY: $(NOTEBOOKS_DIR) copy_matplotlibrc sanity_check_$(NOTEBOOKS_DIR) all \ - exercises quizzes $(JUPYTER_BOOK_DIR) $(JUPYTER_BOOK_DIR)-clean $(JUPYTER_BOOK_DIR)-full-clean - -$(NOTEBOOKS_DIR): $(MINIMAL_NOTEBOOK_FILES) copy_matplotlibrc sanity_check_$(NOTEBOOKS_DIR) - -$(NOTEBOOKS_DIR)/%.ipynb: $(PYTHON_SCRIPTS_DIR)/%.py - python build_tools/convert-python-script-to-notebook.py $< $@ - -copy_matplotlibrc: - cp $(PYTHON_SCRIPTS_DIR)/matplotlibrc $(NOTEBOOKS_DIR)/ - -sanity_check_$(NOTEBOOKS_DIR): - python build_tools/sanity-check.py $(PYTHON_SCRIPTS_DIR) $(NOTEBOOKS_DIR) - -exercises: - python build_tools/generate-exercise-from-solution.py $(PYTHON_SCRIPTS_DIR) - -quizzes: - python build_tools/generate-quizzes.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(JUPYTER_BOOK_DIR) - -full-index: - python build_tools/generate-md-index.py - -$(JUPYTER_BOOK_DIR): - jupyter-book build $(JUPYTER_BOOK_DIR) - rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html - -$(JUPYTER_BOOK_DIR)-clean: - # keep jupyter-cache cache folder - jupyter-book clean $(JUPYTER_BOOK_DIR) - -$(JUPYTER_BOOK_DIR)-full-clean: - # deletes jupyter-cache cache folder - rm -rf $(JUPYTER_BOOK_DIR)/_build diff --git a/slides/README.md b/slides/README.md index 5e8254a74..3cf7fe230 100644 --- a/slides/README.md +++ b/slides/README.md @@ -18,16 +18,3 @@ python -m http.server # open your browser with the right port (from previous command) using the right md file firefox 'http://localhost:8000/slides/index.html?file=../slides/ml_concepts.md' ``` - -# Export - -To install packages needed to generate the slides: - -``` -pip install -r requirements.txt -``` - -Note: for some reason if you `pip install` `htmlark` and not -`htmlark[parsers]`, you'll get a blank HTML page. - -Then use `make` to export html files. diff --git a/slides/requirements.txt b/slides/requirements.txt deleted file mode 100644 index 8198dab0c..000000000 --- a/slides/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -remarker From c0e08899943d45e47540f9f5193e2fea49f3d5f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 10 Nov 2022 18:03:23 +0100 Subject: [PATCH 006/108] Fix previous commit by putting back Makefile --- Makefile | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..4356313e2 --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +PYTHON_SCRIPTS_DIR = python_scripts +NOTEBOOKS_DIR = notebooks +JUPYTER_BOOK_DIR = jupyter-book +JUPYTER_KERNEL := python3 +MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@") + +# This assumes that the folder mooc-scikit-learn-coordination and +# scikit-learn-mooc are siblings, e.g. the repos are in the +# ~/dev/mooc-scikit-learn-coordination and ~/dev/scikit-learn-mooc. This should +# be the case in most development setups. 
If not then you can pass the +# GITLAB_REPO_JUPYTERBOOK_DIR variable with +# make -e GITLAB_REPO_JUPYTERBOOK_DIR=your/gitlab/repo/jupyter-book-dir/goes-here +GITLAB_REPO_JUPYTERBOOK_DIR = ../mooc-scikit-learn-coordination/jupyter-book + +all: $(NOTEBOOKS_DIR) + +.PHONY: $(NOTEBOOKS_DIR) copy_matplotlibrc sanity_check_$(NOTEBOOKS_DIR) all \ + exercises quizzes $(JUPYTER_BOOK_DIR) $(JUPYTER_BOOK_DIR)-clean $(JUPYTER_BOOK_DIR)-full-clean + +$(NOTEBOOKS_DIR): $(MINIMAL_NOTEBOOK_FILES) copy_matplotlibrc sanity_check_$(NOTEBOOKS_DIR) + +$(NOTEBOOKS_DIR)/%.ipynb: $(PYTHON_SCRIPTS_DIR)/%.py + python build_tools/convert-python-script-to-notebook.py $< $@ + +copy_matplotlibrc: + cp $(PYTHON_SCRIPTS_DIR)/matplotlibrc $(NOTEBOOKS_DIR)/ + +sanity_check_$(NOTEBOOKS_DIR): + python build_tools/sanity-check.py $(PYTHON_SCRIPTS_DIR) $(NOTEBOOKS_DIR) + +exercises: + python build_tools/generate-exercise-from-solution.py $(PYTHON_SCRIPTS_DIR) + +quizzes: + python build_tools/generate-quizzes.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(JUPYTER_BOOK_DIR) + +full-index: + python build_tools/generate-md-index.py + +$(JUPYTER_BOOK_DIR): + jupyter-book build $(JUPYTER_BOOK_DIR) + rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html + +$(JUPYTER_BOOK_DIR)-clean: + # keep jupyter-cache cache folder + jupyter-book clean $(JUPYTER_BOOK_DIR) + +$(JUPYTER_BOOK_DIR)-full-clean: + # deletes jupyter-cache cache folder + rm -rf $(JUPYTER_BOOK_DIR)/_build From 68f4fa1097d694368a04348e7feb56651b395cf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 16 Dec 2022 15:51:20 +0100 Subject: [PATCH 007/108] Simplify deploy-gh-pages.yml --- .github/workflows/deploy-gh-pages.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index 9117dad7b..93379e88e 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -43,14 +43,10 @@ jobs: run: | make jupyter-book - - name: Create folder for gh-pages - run: | - cp -r jupyter-book/_build/html gh-pages - - name: GitHub Pages action if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} uses: peaceiris/actions-gh-pages@v3.6.1 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./gh-pages + publish_dir: ./jupyter-book/_build/html commit_message: "[ci skip] ${{ github.event.head_commit.message }}" From de86e9a039fbc3abde416c7e5975bf9d12f38fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 16 Dec 2022 16:19:18 +0100 Subject: [PATCH 008/108] Use netlify for JupyterBook preview --- .github/workflows/deploy-gh-pages.yml | 13 ++++ .github/workflows/jupyter-book-pr-preview.yml | 60 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 .github/workflows/jupyter-book-pr-preview.yml diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index 93379e88e..6bb94a5ca 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -5,6 +5,9 @@ on: branches: - main - test-ci* + pull_request: + branches: + - main jobs: deploy-gh-pages: @@ -42,6 +45,16 @@ jobs: - name: Build the JupyterBook run: | make jupyter-book + echo ${{github.event.number}} > pull_request_number + + - name: Upload jupyter-book artifact for PRs + if: ${{ github.event_name == 'pull_request' }} + uses: actions/upload-artifact@v3 + with: + name: jupyter-book + path: | + 
jupyter-book/_build/html
+            pull_request_number

       - name: GitHub Pages action
         if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
         uses: peaceiris/actions-gh-pages@v3.6.1
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./jupyter-book/_build/html
           commit_message: "[ci skip] ${{ github.event.head_commit.message }}"
diff --git a/.github/workflows/jupyter-book-pr-preview.yml b/.github/workflows/jupyter-book-pr-preview.yml
new file mode 100644
index 000000000..c13519082
--- /dev/null
+++ b/.github/workflows/jupyter-book-pr-preview.yml
@@ -0,0 +1,60 @@
+name: jupyter-book-pr-preview
+
+on:
+  workflow_run:
+    workflows: ["deploy-gh-pages"]
+    types:
+      - completed
+
+jobs:
+  deploy-preview:
+    runs-on: ubuntu-latest
+    if: ${{github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'success'}}
+    steps:
+      - name: 'Commit Status: Set Workflow Status as Pending'
+        uses: myrotvorets/set-commit-status-action@1.1.6
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          status: pending
+          sha: ${{ github.event.workflow_run.head_sha }}
+          context: 'JupyterBook preview'
+
+      - uses: dawidd6/action-download-artifact@v2
+        with:
+          github_token: ${{secrets.GITHUB_TOKEN}}
+          workflow: deploy-gh-pages.yml
+          pr: ${{steps.pull-request-number.outputs.result}}
+          name: jupyter-book
+
+      - name: Get pull request number
+        id: pull-request-number
+        run: |
+          export PULL_REQUEST_NUMBER=$(cat pull_request_number)
+          echo "result=${PULL_REQUEST_NUMBER}" >> $GITHUB_OUTPUT
+
+      - uses: actions/setup-node@v3
+        with:
+          node-version: '16'
+      - run: npm install --global netlify-cli@6
+      - name: Deploy to Netlify
+        env:
+          NETLIFY_AUTH_TOKEN: ${{secrets.NETLIFY_AUTH_TOKEN}}
+          NETLIFY_SITE_ID: ${{secrets.NETLIFY_SITE_ID}}
+        run: netlify deploy --dir=jupyter-book/_build/html --alias=pull-request-${{steps.pull-request-number.outputs.result}}
+
+      - name: 'Commit Status: Update deployment status'
+        uses: myrotvorets/set-commit-status-action@1.1.6
+        # Always run this step regardless of job failing early
+        if: always()
+        env:
+          DEPLOY_SUCCESS: Successfully deployed preview.
+          DEPLOY_FAILURE: Failed to deploy preview.
+          TARGET_URL_SUCCESS: https://pull-request-${{steps.pull-request-number.outputs.result}}--scikit-learn-mooc.netlify.app
+          TARGET_URL_FAILURE: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          status: ${{ job.status == 'success' && 'success' || 'failure' }}
+          sha: ${{ github.event.workflow_run.head_sha }}
+          context: 'JupyterBook preview'
+          description: ${{ job.status == 'success' && env.DEPLOY_SUCCESS || env.DEPLOY_FAILURE }}
+          targetUrl: ${{ job.status == 'success' && env.TARGET_URL_SUCCESS || env.TARGET_URL_FAILURE }}

From e2fcce6a72a1889ad06570d8833c46210b2cfc70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Fri, 16 Dec 2022 16:50:37 +0100
Subject: [PATCH 009/108] Add newline and tweak wording

---
 python_scripts/03_categorical_pipeline.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py
index e7a6fdbf9..f3658523b 100644
--- a/python_scripts/03_categorical_pipeline.py
+++ b/python_scripts/03_categorical_pipeline.py
@@ -258,8 +258,9 @@
 # - the original categories (before encoding) have an ordering;
 # - the encoded categories follow the same ordering than the original
 #   categories.
-# The **next exercise** highlights the issue of misusing `OrdinalEncoder` with
-# a linear model.
+#
+# The **next exercise** shows what can happen when using an `OrdinalEncoder`
+# with a linear model and the conditions above are not met.
# # One-hot encoding categorical variables with high cardinality can cause # computational inefficiency in tree-based models. Because of this, it is not recommended From 3907405cf54ae4d8261758c73254303628d4fd3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 16 Dec 2022 17:50:54 +0100 Subject: [PATCH 010/108] Remove CircleCI-related files since netlify is used for the JupyterBook preview on PRs --- .circleci/config.yml | 40 ------------------- .../circleci-artifacts-redirector.yml | 14 ------- 2 files changed, 54 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 .github/workflows/circleci-artifacts-redirector.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index ef537b132..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,40 +0,0 @@ -version: 2 - -jobs: - build_jupyter_book: - docker: - - image: continuumio/miniconda - environment: - - OMP_NUM_THREADS: 1 - - MKL_NUM_THREADS: 2 - - OPENBLAS_NUM_THREADS: 2 - - MINICONDA_PATH: ~/miniconda - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - - run: ./build_tools/circle/checksum_python_files.sh /tmp/checksum.txt - - restore_cache: - keys: - - v3-{{ .Branch }}-{{ checksum "/tmp/checksum.txt" }} - - v3-{{ .Branch }} - - v3-main - - run: - command: ./build_tools/circle/build_jupyter_book.sh - no_output_timeout: 30m - - save_cache: - paths: - - jupyter-book/_build/.jupyter_cache - key: v3-{{ .Branch }}-{{ checksum "/tmp/checksum.txt" }} - - store_artifacts: - path: jupyter-book/_build/html - destination: jupyter-book - -workflows: - version: 2 - build_jupyter_book: - jobs: - # TODO add lint step - # - lint - - build_jupyter_book - # requires: - # - lint diff --git a/.github/workflows/circleci-artifacts-redirector.yml b/.github/workflows/circleci-artifacts-redirector.yml deleted file mode 100644 index 08a046a6b..000000000 --- a/.github/workflows/circleci-artifacts-redirector.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: circleci-artifacts-redirector - -on: [status] -jobs: - circleci_artifacts_redirector_job: - runs-on: ubuntu-latest - name: Run CircleCI artifacts redirector - steps: - - name: GitHub Action step - uses: larsoner/circleci-artifacts-redirector-action@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - artifact-path: 0/jupyter-book/_changed.html - circleci-jobs: build_jupyter_book From d1d19e362ff615da61b22b995fdc0e0c12c44c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 16 Dec 2022 18:01:33 +0100 Subject: [PATCH 011/108] Remove lingering CircleCI stuff --- build_tools/circle/build_jupyter_book.sh | 62 --------------------- build_tools/circle/checkout_merge_commit.sh | 32 ----------- build_tools/circle/checksum_python_files.sh | 25 --------- workflow-notes.md | 9 ++- 4 files changed, 4 insertions(+), 124 deletions(-) delete mode 100755 build_tools/circle/build_jupyter_book.sh delete mode 100755 build_tools/circle/checkout_merge_commit.sh delete mode 100755 build_tools/circle/checksum_python_files.sh diff --git a/build_tools/circle/build_jupyter_book.sh b/build_tools/circle/build_jupyter_book.sh deleted file mode 100755 index d07bb5953..000000000 --- a/build_tools/circle/build_jupyter_book.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env bash -set -xe - -jupyter_book_dir=jupyter-book -jupyter_book_build_dir="$jupyter_book_dir/_build/html" - -function show_error_logs { - echo "Some notebooks failed, see logs below:" - for f in $jupyter_book_build_dir/reports/*.log; do - echo 
"================================================================================" - echo $f - echo "================================================================================" - cat $f - done - # You need to exit with non-zero here to cause the build to fail - exit 1 -} - -apt-get install make - -source /opt/conda/etc/profile.d/conda.sh -conda update --yes conda -conda create -n scikit-learn-mooc --yes -c conda-forge python=3.9 -conda activate scikit-learn-mooc -pip install -r requirements-dev.txt - -affected_jupyter_book_paths() { - files=$(git diff --name-only origin/main...$CIRCLE_SHA1) - # TODO: rather than the grep pattern below we could potentially look at - # _toc.yml to know whether the file affects the JupyterBook - echo "$files" | grep python_scripts | perl -pe 's@\.py$@.html@' - echo "$files" | grep -P "$jupyter_book_dir/.+md$" | \ - perl -pe "s@$jupyter_book_dir/(.+)\.md@\1.html@" -} - -write_changed_html() { - affected="$1" - if [ -n "$CI_PULL_REQUEST" ] - then - echo "The following files may have been changed by PR $CI_PULL_REQUEST:" - echo "$affected" - ( - echo '' - echo 'Files changed by PR '"$CI_PULL_REQUEST" - echo '
    ' - echo "$affected" | sed 's|.*|
  • & [main]|' - echo '

This PR JupyterBook index' - echo '' - ) > "$jupyter_book_build_dir/_changed.html" - fi -} - -affected=$(affected_jupyter_book_paths) - -make $jupyter_book_dir 2>&1 | tee $jupyter_book_dir/build.log - -write_changed_html "$affected" - -# Grep the log to make sure there has been no errors when running the notebooks -# since jupyter-book exit code is always 0 -grep 'Execution Failed' $jupyter_book_dir/build.log && show_error_logs || \ - echo 'All notebooks ran successfully' diff --git a/build_tools/circle/checkout_merge_commit.sh b/build_tools/circle/checkout_merge_commit.sh deleted file mode 100755 index d9860b0ab..000000000 --- a/build_tools/circle/checkout_merge_commit.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - - -# Add `main` branch to the update list. -# Otherwise CircleCI will give us a cached one. -FETCH_REFS="+main:main" - -# Update PR refs for testing. -if [[ -n "${CIRCLE_PR_NUMBER}" ]] -then - FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" - FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" -fi - -# Retrieve the refs. -git fetch -u origin ${FETCH_REFS} - -# Checkout the PR merge ref. -if [[ -n "${CIRCLE_PR_NUMBER}" ]] -then - git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( - echo Could not fetch merge commit. >&2 - echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with main. >&2; - exit 1) -fi - -# Check for merge conflicts. -if [[ -n "${CIRCLE_PR_NUMBER}" ]] -then - git branch --merged | grep main > /dev/null - git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null -fi diff --git a/build_tools/circle/checksum_python_files.sh b/build_tools/circle/checksum_python_files.sh deleted file mode 100755 index 9cc7a8546..000000000 --- a/build_tools/circle/checksum_python_files.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -RESULT_FILE=$1 - -if [ -f $RESULT_FILE ]; then - rm $RESULT_FILE -fi -touch $RESULT_FILE - -checksum_file() { - echo `openssl md5 $1 | awk '{print $2}'` -} - -FILES=() -while read -r -d ''; do - FILES+=("$REPLY") -done < <(find python_scripts -name '*.py' -type f -print0) - -# Loop through files and append MD5 to result file -for FILE in ${FILES[@]}; do - echo `checksum_file $FILE` >> $RESULT_FILE -done - -# Sort the file so that it does not depend on the order of find -sort $RESULT_FILE -o $RESULT_FILE - diff --git a/workflow-notes.md b/workflow-notes.md index caa3d7b1e..af57afbc9 100644 --- a/workflow-notes.md +++ b/workflow-notes.md @@ -25,11 +25,10 @@ see [below](#how-our-repo-contents-are-used-on-the-fun-mooc-platform). ### Continuous Integration -We have both: -- Github Actions: builds on push in branches and deploy on gh-pages on push to - main -- CircleCI: used on PRs, very useful to show JupyterBook generated by a PR to - check the rendering +We use Github Actions for: +- building the JupyterBook in pull requests. Previewing the built JupyterBook + is done by deploying to Netlify. 
+- building the JupyterBook on pushes to main and deploy to gh-pages ### Label conventions with the Learning Lab From 5007da83e8c86677e08c1888c58d89ef648db73e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 19 Dec 2022 15:52:18 +0100 Subject: [PATCH 012/108] Check URL should use _changed.html --- .github/workflows/jupyter-book-pr-preview.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/jupyter-book-pr-preview.yml b/.github/workflows/jupyter-book-pr-preview.yml index c13519082..cabe43fe9 100644 --- a/.github/workflows/jupyter-book-pr-preview.yml +++ b/.github/workflows/jupyter-book-pr-preview.yml @@ -49,7 +49,7 @@ jobs: env: DEPLOY_SUCCESS: Successfully deployed preview. DEPLOY_FAILURE: Failed to deploy preview. - TARGET_URL_SUCCESS: https://pull-request-${{steps.pull-request-number.outputs.result}}--scikit-learn-mooc.netlify.app + TARGET_URL_SUCCESS: https://pull-request-${{steps.pull-request-number.outputs.result}}--scikit-learn-mooc.netlify.app/_changed.html TARGET_URL_FAILURE: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} with: token: ${{ secrets.GITHUB_TOKEN }} From 0d99a198194932daf6132abd22cb1798f344986c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 20 Dec 2022 11:18:27 +0100 Subject: [PATCH 013/108] Reuse deleted build_jupyter_book.sh (#680) --- .github/workflows/deploy-gh-pages.yml | 7 ++- build_tools/build_jupyter_book.sh | 63 +++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100755 build_tools/build_jupyter_book.sh diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index 6bb94a5ca..236d4dacc 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -19,8 +19,9 @@ jobs: steps: - uses: actions/checkout@v3 + with: + fetch-depth: 0 - # Install dependencies - name: Set up Python uses: actions/setup-python@v4 with: @@ -43,8 +44,10 @@ jobs: v2-refs/heads/main - name: Build the JupyterBook + env: + GITHUB_PULL_REQUEST_NUMBER: ${{github.event.number}} run: | - make jupyter-book + bash build_tools/build_jupyter_book.sh echo ${{github.event.number}} > pull_request_number - name: Upload jupyter-book artifact for PRs diff --git a/build_tools/build_jupyter_book.sh b/build_tools/build_jupyter_book.sh new file mode 100755 index 000000000..e051a8d30 --- /dev/null +++ b/build_tools/build_jupyter_book.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -xe + +jupyter_book_dir=jupyter-book +jupyter_book_build_dir="$jupyter_book_dir/_build/html" + +function show_error_logs { + echo "Some notebooks failed, see logs below:" + for f in $jupyter_book_build_dir/reports/*.log; do + echo "================================================================================" + echo $f + echo "================================================================================" + cat $f + done + # You need to exit with non-zero here to cause the build to fail + exit 1 +} + +affected_jupyter_book_paths() { + files=$(git diff --name-only origin/main...) 
+    # TODO: rather than the grep pattern below we could potentially look at
+    # _toc.yml to know whether the file affects the JupyterBook
+    echo "$files" | grep python_scripts | perl -pe 's@\.py$@.html@'
+    echo "$files" | grep -P "$jupyter_book_dir/.+md$" | \
+        perl -pe "s@$jupyter_book_dir/(.+)\.md@\1.html@"
+}
+
+write_changed_html() {
+    affected="$1"
+    if [ -n "$GITHUB_PULL_REQUEST_NUMBER" ]
+    then
+        GITHUB_PULL_REQUEST_URL="https://github.com/inria/scikit-learn-mooc/pull/$GITHUB_PULL_REQUEST_NUMBER"
+        echo "The following files may have been changed by PR $GITHUB_PR_NUMBER:"
+        echo "$affected"
+        (
+            echo '<html><body>'
+            echo 'Files changed by PR '"$GITHUB_PR_URL"
+            echo '<br/><br/><ul>'
+            echo "$affected" | sed 's|.*|<li><a href="&">&</a> [<a href="https://inria.github.io/scikit-learn-mooc/&">main</a>]</li>|'
+            echo '</ul><br/><br/><a href="index.html">This PR JupyterBook index</a>'
+            echo '</body></html>'
+        ) > "$jupyter_book_build_dir/_changed.html"
+    fi
+}
+
+git remote -v
+git show --stat
+git log --color --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -20
+git fetch origin main >&2 # || (echo QUICK BUILD: failed to get changed filenames for $git_range; return)
+git diff origin/main... --stat
+git diff origin/main...
+
+affected=$(affected_jupyter_book_paths)
+mkdir -p $jupyter_book_build_dir
+write_changed_html "$affected"
+
+make $jupyter_book_dir 2>&1 | tee $jupyter_book_dir/build.log
+
+
+# Grep the log to make sure there has been no errors when running the notebooks
+# since jupyter-book exit code is always 0
+grep 'Execution Failed' $jupyter_book_dir/build.log && show_error_logs || \
+    echo 'All notebooks ran successfully'

From 2620c15b066b84ef2577c459257c30cb3c717ca0 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Sun, 12 Feb 2023 18:43:37 +0100
Subject: [PATCH 014/108] MAINT Remove remaining set_config (#685)

---
 python_scripts/logistic_regression.py | 4 ----
 python_scripts/logistic_regression_non_linear.py | 4 ----
 python_scripts/parameter_tuning_grid_search.py | 5 -----
 python_scripts/parameter_tuning_nested.py | 6 ------
 python_scripts/parameter_tuning_randomized_search.py | 5 -----
 5 files changed, 24 deletions(-)

diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py
index 890cd460c..c691d7d8f 100644
--- a/python_scripts/logistic_regression.py
+++ b/python_scripts/logistic_regression.py
@@ -74,10 +74,6 @@
 # Scikit-learn provides the class `LogisticRegression` which implements this
 # algorithm.

-# %%
-import sklearn
-sklearn.set_config(display="diagram")
-
 # %%
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler

diff --git a/python_scripts/logistic_regression_non_linear.py b/python_scripts/logistic_regression_non_linear.py
index 19b7729a6..0a51c87e7 100644
--- a/python_scripts/logistic_regression_non_linear.py
+++ b/python_scripts/logistic_regression_non_linear.py
@@ -59,10 +59,6 @@
 # We will create a predictive model by standardizing the dataset followed by
 # a linear support vector machine classifier.

-# %%
-import sklearn
-sklearn.set_config(display="diagram")
-
 # %%
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler

diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py
index 1bf984ce2..fc81136f5 100644
--- a/python_scripts/parameter_tuning_grid_search.py
+++ b/python_scripts/parameter_tuning_grid_search.py
@@ -17,11 +17,6 @@
 #
 # Let us reload the dataset as we did previously:

-# %%
-from sklearn import set_config
-
-set_config(display="diagram")
-
 # %%
 import pandas as pd

diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py
index 32241cbf0..ca262a6ac 100644
--- a/python_scripts/parameter_tuning_nested.py
+++ b/python_scripts/parameter_tuning_nested.py
@@ -40,12 +40,6 @@
 # We now create the predictive model that we want to optimize. Note that
 # this pipeline is identical to the one we used in the previous notebook.
-# %% -from sklearn import set_config - -# To get a diagram visualization of the pipeline -set_config(display="diagram") - # %% from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OrdinalEncoder diff --git a/python_scripts/parameter_tuning_randomized_search.py b/python_scripts/parameter_tuning_randomized_search.py index 6666688c4..7fbf1c8cc 100644 --- a/python_scripts/parameter_tuning_randomized_search.py +++ b/python_scripts/parameter_tuning_randomized_search.py @@ -24,11 +24,6 @@ # # Let us reload the dataset as we did previously: -# %% -from sklearn import set_config - -set_config(display="diagram") - # %% import pandas as pd From 8d307210dc0615a8504ae9157dc25d53ac7dc52d Mon Sep 17 00:00:00 2001 From: Darigov Research <30328618+darigovresearch@users.noreply.github.com> Date: Wed, 15 Feb 2023 13:16:23 +0000 Subject: [PATCH 015/108] feat: Adds license to jupyter-book config (#682) --- jupyter-book/_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter-book/_config.yml b/jupyter-book/_config.yml index ef85cf4da..42816945e 100644 --- a/jupyter-book/_config.yml +++ b/jupyter-book/_config.yml @@ -47,7 +47,7 @@ html:

- Brought to you by + Brought to you under a CC-BY License by Inria Learning Lab, scikit-learn @ La Fondation Inria, Inria Academy, From fb7ff43de9d96fe1eb9fdfb52613e5f204afb1d2 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 21 Feb 2023 15:52:58 +0100 Subject: [PATCH 016/108] MAINT Change parameter names to match scikit-learn notation (#688) --- python_scripts/parameter_tuning_ex_02.py | 6 +++--- python_scripts/parameter_tuning_sol_02.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py index 425032ebc..0f92bc226 100644 --- a/python_scripts/parameter_tuning_ex_02.py +++ b/python_scripts/parameter_tuning_ex_02.py @@ -75,9 +75,9 @@ # Write your code here. # %% [markdown] -# -# Now use the test set to score the model using the best parameters -# that we found using cross-validation in the training set. +# Now use the test set to score the model using the best parameters that we +# found using cross-validation. You will have to refit the model over the full +# training set. # %% # Write your code here. diff --git a/python_scripts/parameter_tuning_sol_02.py b/python_scripts/parameter_tuning_sol_02.py index 311bc8bc8..d82243f97 100644 --- a/python_scripts/parameter_tuning_sol_02.py +++ b/python_scripts/parameter_tuning_sol_02.py @@ -86,21 +86,21 @@ print(f"score: {mean_score:.3f}") if mean_score > best_score: best_score = mean_score - best_params = {'learning-rate': lr, 'max leaf nodes': mln} + best_params = {'learning_rate': lr, 'max_leaf_nodes': mln} print(f"Found new best model with score {best_score:.3f}!") print(f"The best accuracy obtained is {best_score:.3f}") print(f"The best parameters found are:\n {best_params}") # %% [markdown] -# -# Now use the test set to score the model using the best parameters -# that we found using cross-validation in the training set. +# Now use the test set to score the model using the best parameters that we +# found using cross-validation. You will have to refit the model over the full +# training set. 
# %% # solution -best_lr = best_params['learning-rate'] -best_mln = best_params['max leaf nodes'] +best_lr = best_params['learning_rate'] +best_mln = best_params['max_leaf_nodes'] model.set_params(classifier__learning_rate=best_lr, classifier__max_leaf_nodes=best_mln) From c097ae0832291ced3f664b3e87a0681dde0c2e7e Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 22 Feb 2023 15:52:09 +0100 Subject: [PATCH 017/108] MAINT Fix FutureWarnings for v1.2 (#687) --- python_scripts/ensemble_adaboost.py | 4 ++-- python_scripts/ensemble_bagging.py | 6 +++--- python_scripts/ensemble_ex_01.py | 2 +- python_scripts/ensemble_introduction.py | 4 ++-- python_scripts/ensemble_random_forest.py | 4 ++-- python_scripts/ensemble_sol_01.py | 6 +++--- python_scripts/logistic_regression.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python_scripts/ensemble_adaboost.py b/python_scripts/ensemble_adaboost.py index 81f212742..8349e715b 100644 --- a/python_scripts/ensemble_adaboost.py +++ b/python_scripts/ensemble_adaboost.py @@ -166,8 +166,8 @@ # %% from sklearn.ensemble import AdaBoostClassifier -base_estimator = DecisionTreeClassifier(max_depth=3, random_state=0) -adaboost = AdaBoostClassifier(base_estimator=base_estimator, +estimator = DecisionTreeClassifier(max_depth=3, random_state=0) +adaboost = AdaBoostClassifier(estimator=estimator, n_estimators=3, algorithm="SAMME", random_state=0) adaboost.fit(data, target) diff --git a/python_scripts/ensemble_bagging.py b/python_scripts/ensemble_bagging.py index 2a42275ce..70efc88c3 100644 --- a/python_scripts/ensemble_bagging.py +++ b/python_scripts/ensemble_bagging.py @@ -245,7 +245,7 @@ def bootstrap_sample(data, target): from sklearn.ensemble import BaggingRegressor bagged_trees = BaggingRegressor( - base_estimator=DecisionTreeRegressor(max_depth=3), + estimator=DecisionTreeRegressor(max_depth=3), n_estimators=100, ) _ = bagged_trees.fit(data_train, target_train) @@ -333,11 +333,11 @@ def bootstrap_sample(data, target): # base models. # # The ensemble itself is simply built by passing the resulting pipeline as the -# `base_estimator` parameter of the `BaggingRegressor` class: +# `estimator` parameter of the `BaggingRegressor` class: # %% bagging = BaggingRegressor( - base_estimator=polynomial_regressor, + estimator=polynomial_regressor, n_estimators=100, random_state=0, ) diff --git a/python_scripts/ensemble_ex_01.py b/python_scripts/ensemble_ex_01.py index 648d44a6e..382d9fe11 100644 --- a/python_scripts/ensemble_ex_01.py +++ b/python_scripts/ensemble_ex_01.py @@ -38,7 +38,7 @@ # %% [markdown] # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` -# to its parameter `base_estimator`. Train the regressor and evaluate its +# to its parameter `estimator`. Train the regressor and evaluate its # generalization performance on the testing set using the mean absolute error. 
# %% diff --git a/python_scripts/ensemble_introduction.py b/python_scripts/ensemble_introduction.py index b8597a59d..3b8e91d6b 100644 --- a/python_scripts/ensemble_introduction.py +++ b/python_scripts/ensemble_introduction.py @@ -102,9 +102,9 @@ # %%time from sklearn.ensemble import BaggingRegressor -base_estimator = DecisionTreeRegressor(random_state=0) +estimator = DecisionTreeRegressor(random_state=0) bagging_regressor = BaggingRegressor( - base_estimator=base_estimator, n_estimators=20, random_state=0) + estimator=estimator, n_estimators=20, random_state=0) cv_results = cross_validate(bagging_regressor, data, target, n_jobs=2) scores = cv_results["test_score"] diff --git a/python_scripts/ensemble_random_forest.py b/python_scripts/ensemble_random_forest.py index fca332573..fa1aca90f 100644 --- a/python_scripts/ensemble_random_forest.py +++ b/python_scripts/ensemble_random_forest.py @@ -104,7 +104,7 @@ bagged_trees = make_pipeline( preprocessor, BaggingClassifier( - base_estimator=DecisionTreeClassifier(random_state=0), + estimator=DecisionTreeClassifier(random_state=0), n_estimators=50, n_jobs=2, random_state=0, ) ) @@ -121,7 +121,7 @@ # better than the performance of a single tree. # # Now, we will use a random forest. You will observe that we do not need to -# specify any `base_estimator` because the estimator is forced to be a decision +# specify any `estimator` because the estimator is forced to be a decision # tree. Thus, we just specify the desired number of trees in the forest. # %% diff --git a/python_scripts/ensemble_sol_01.py b/python_scripts/ensemble_sol_01.py index 4a7e244cb..57d675788 100644 --- a/python_scripts/ensemble_sol_01.py +++ b/python_scripts/ensemble_sol_01.py @@ -31,7 +31,7 @@ # %% [markdown] # Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` -# to its parameter `base_estimator`. Train the regressor and evaluate its +# to its parameter `estimator`. Train the regressor and evaluate its # generalization performance on the testing set using the mean absolute error. 
# %% @@ -41,7 +41,7 @@ from sklearn.ensemble import BaggingRegressor tree = DecisionTreeRegressor() -bagging = BaggingRegressor(base_estimator=tree, n_jobs=2) +bagging = BaggingRegressor(estimator=tree, n_jobs=2) bagging.fit(data_train, target_train) target_predicted = bagging.predict(data_test) print(f"Basic mean absolute error of the bagging regressor:\n" @@ -72,7 +72,7 @@ "n_estimators": randint(10, 30), "max_samples": [0.5, 0.8, 1.0], "max_features": [0.5, 0.8, 1.0], - "base_estimator__max_depth": randint(3, 10), + "estimator__max_depth": randint(3, 10), } search = RandomizedSearchCV( bagging, param_grid, n_iter=20, scoring="neg_mean_absolute_error" diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py index c691d7d8f..5664349f4 100644 --- a/python_scripts/logistic_regression.py +++ b/python_scripts/logistic_regression.py @@ -80,7 +80,7 @@ from sklearn.linear_model import LogisticRegression logistic_regression = make_pipeline( - StandardScaler(), LogisticRegression(penalty="none") + StandardScaler(), LogisticRegression(penalty=None) ) logistic_regression.fit(data_train, target_train) accuracy = logistic_regression.score(data_test, target_test) From 046d2ba44e9b146330832529cc9ada4f5fe0bf49 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 22 Feb 2023 15:52:59 +0100 Subject: [PATCH 018/108] MAINT Update scikit-learn to v 1.2.1 (#684) --- check_env.py | 2 +- environment-dev.yml | 2 +- environment.yml | 2 +- local-install-instructions.md | 2 +- requirements-dev.txt | 2 +- requirements.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/check_env.py b/check_env.py index c515a4792..357cc200c 100644 --- a/check_env.py +++ b/check_env.py @@ -58,7 +58,7 @@ def import_version(pkg, min_ver, fail_msg=""): "numpy": "1.16", "scipy": "1.2", "matplotlib": "3.0", - "sklearn": "1.1", + "sklearn": "1.2", "pandas": "1", "seaborn": "0.11", "notebook": "5.7", diff --git a/environment-dev.yml b/environment-dev.yml index e997e287a..fab94f5fc 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -2,7 +2,7 @@ name: scikit-learn-course channels: - conda-forge dependencies: - - scikit-learn >= 1.1.1 + - scikit-learn >= 1.2.1 - pandas >= 1 - matplotlib-base - seaborn diff --git a/environment.yml b/environment.yml index 10e5c3bca..3f840ce77 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge dependencies: - - scikit-learn >= 1.1.1 + - scikit-learn >= 1.2.1 - pandas >= 1 - matplotlib-base - seaborn diff --git a/local-install-instructions.md b/local-install-instructions.md index 7bc1e42f4..026b8eaa3 100644 --- a/local-install-instructions.md +++ b/local-install-instructions.md @@ -46,7 +46,7 @@ Using python in /home/lesteve/miniconda3/envs/scikit-learn-course [ OK ] numpy version 1.19.5 [ OK ] scipy version 1.6.0 [ OK ] matplotlib version 3.3.3 -[ OK ] sklearn version 1.1.1 +[ OK ] sklearn version 1.2.1 [ OK ] pandas version 1.2.0 [ OK ] seaborn version 0.11.1 [ OK ] notebook version 6.2.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 79e5e6cdd..e6b46805b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -scikit-learn>=1.1.1 +scikit-learn>=1.2.1 pandas>=1 matplotlib seaborn diff --git a/requirements.txt b/requirements.txt index 4de74870d..49df7fe00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -scikit-learn>=1.1.1 +scikit-learn>=1.2.1 pandas>=1 matplotlib seaborn From 
b107428270dca65cac6a1eb18fa54b524935307a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 22 Feb 2023 15:54:44 +0100 Subject: [PATCH 019/108] FIX use sparse_output parameter for OneHotEncoder (#686) Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> --- notebooks/03_categorical_pipeline.ipynb | 4 ++-- notebooks/03_categorical_pipeline_ex_02.ipynb | 2 +- notebooks/03_categorical_pipeline_sol_02.ipynb | 4 ++-- notebooks/linear_models_sol_04.ipynb | 4 ++-- python_scripts/03_categorical_pipeline.py | 4 ++-- python_scripts/03_categorical_pipeline_ex_02.py | 2 +- python_scripts/03_categorical_pipeline_sol_02.py | 4 ++-- python_scripts/linear_models_sol_04.py | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/notebooks/03_categorical_pipeline.ipynb b/notebooks/03_categorical_pipeline.ipynb index 403a0b385..155670ac8 100644 --- a/notebooks/03_categorical_pipeline.ipynb +++ b/notebooks/03_categorical_pipeline.ipynb @@ -276,7 +276,7 @@ "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "encoder = OneHotEncoder(sparse=False)\n", + "encoder = OneHotEncoder(sparse_output=False)\n", "education_encoded = encoder.fit_transform(education_column)\n", "education_encoded" ] @@ -287,7 +287,7 @@ "source": [ "
\n", "

Note

\n", - "

sparse=False is used in the OneHotEncoder for didactic purposes, namely\n", + "

sparse_output=False is used in the OneHotEncoder for didactic purposes, namely\n", "easier visualization of the data.

\n", "

Sparse matrices are efficient data structures when most of your matrix\n", "elements are zero. They won't be covered in detail in this course. If you\n", diff --git a/notebooks/03_categorical_pipeline_ex_02.ipynb b/notebooks/03_categorical_pipeline_ex_02.ipynb index 01b435624..b8d0ad094 100644 --- a/notebooks/03_categorical_pipeline_ex_02.ipynb +++ b/notebooks/03_categorical_pipeline_ex_02.ipynb @@ -139,7 +139,7 @@ "\n", "Hint: `HistGradientBoostingClassifier` does not yet support sparse input\n", "data. You might want to use\n", - "`OneHotEncoder(handle_unknown=\"ignore\", sparse=False)` to force the use of a\n", + "`OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)` to force the use of a\n", "dense representation as a workaround." ] }, diff --git a/notebooks/03_categorical_pipeline_sol_02.ipynb b/notebooks/03_categorical_pipeline_sol_02.ipynb index 569836225..e43625325 100644 --- a/notebooks/03_categorical_pipeline_sol_02.ipynb +++ b/notebooks/03_categorical_pipeline_sol_02.ipynb @@ -178,7 +178,7 @@ "\n", "Hint: `HistGradientBoostingClassifier` does not yet support sparse input\n", "data. You might want to use\n", - "`OneHotEncoder(handle_unknown=\"ignore\", sparse=False)` to force the use of a\n", + "`OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)` to force the use of a\n", "dense representation as a workaround." ] }, @@ -193,7 +193,7 @@ "\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n", + "categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\n", "preprocessor = ColumnTransformer([\n", " ('one-hot-encoder', categorical_preprocessor, categorical_columns)],\n", " remainder=\"passthrough\")\n", diff --git a/notebooks/linear_models_sol_04.ipynb b/notebooks/linear_models_sol_04.ipynb index 9df06624e..067b287dc 100644 --- a/notebooks/linear_models_sol_04.ipynb +++ b/notebooks/linear_models_sol_04.ipynb @@ -380,7 +380,7 @@ "from sklearn.preprocessing import OneHotEncoder\n", "\n", "single_feature = [\"CentralAir\"]\n", - "encoder = OneHotEncoder(sparse=False, dtype=np.int32)\n", + "encoder = OneHotEncoder(sparse_output=False, dtype=np.int32)\n", "X_trans = encoder.fit_transform(X_train[single_feature])\n", "X_trans = pd.DataFrame(\n", " X_trans,\n", @@ -427,7 +427,7 @@ }, "outputs": [], "source": [ - "encoder = OneHotEncoder(drop=\"first\", sparse=False, dtype=np.int32)\n", + "encoder = OneHotEncoder(drop=\"first\", sparse_output=False, dtype=np.int32)\n", "X_trans = encoder.fit_transform(X_train[single_feature])\n", "X_trans = pd.DataFrame(\n", " X_trans,\n", diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index f3658523b..bd62c6e26 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -172,13 +172,13 @@ # %% from sklearn.preprocessing import OneHotEncoder -encoder = OneHotEncoder(sparse=False) +encoder = OneHotEncoder(sparse_output=False) education_encoded = encoder.fit_transform(education_column) education_encoded # %% [markdown] # ```{note} -# `sparse=False` is used in the `OneHotEncoder` for didactic purposes, namely +# `sparse_output=False` is used in the `OneHotEncoder` for didactic purposes, namely # easier visualization of the data. 
# # Sparse matrices are efficient data structures when most of your matrix diff --git a/python_scripts/03_categorical_pipeline_ex_02.py b/python_scripts/03_categorical_pipeline_ex_02.py index bb3a9ef58..56adf7b71 100644 --- a/python_scripts/03_categorical_pipeline_ex_02.py +++ b/python_scripts/03_categorical_pipeline_ex_02.py @@ -102,7 +102,7 @@ # # Hint: `HistGradientBoostingClassifier` does not yet support sparse input # data. You might want to use -# `OneHotEncoder(handle_unknown="ignore", sparse=False)` to force the use of a +# `OneHotEncoder(handle_unknown="ignore", sparse_output=False)` to force the use of a # dense representation as a workaround. # %% diff --git a/python_scripts/03_categorical_pipeline_sol_02.py b/python_scripts/03_categorical_pipeline_sol_02.py index d07f9dd49..a0185c9e5 100644 --- a/python_scripts/03_categorical_pipeline_sol_02.py +++ b/python_scripts/03_categorical_pipeline_sol_02.py @@ -126,7 +126,7 @@ # # Hint: `HistGradientBoostingClassifier` does not yet support sparse input # data. You might want to use -# `OneHotEncoder(handle_unknown="ignore", sparse=False)` to force the use of a +# `OneHotEncoder(handle_unknown="ignore", sparse_output=False)` to force the use of a # dense representation as a workaround. # %% @@ -135,7 +135,7 @@ from sklearn.preprocessing import OneHotEncoder -categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse=False) +categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False) preprocessor = ColumnTransformer([ ('one-hot-encoder', categorical_preprocessor, categorical_columns)], remainder="passthrough") diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py index aaac72c52..76b0c1d3e 100644 --- a/python_scripts/linear_models_sol_04.py +++ b/python_scripts/linear_models_sol_04.py @@ -208,7 +208,7 @@ from sklearn.preprocessing import OneHotEncoder single_feature = ["CentralAir"] -encoder = OneHotEncoder(sparse=False, dtype=np.int32) +encoder = OneHotEncoder(sparse_output=False, dtype=np.int32) X_trans = encoder.fit_transform(X_train[single_feature]) X_trans = pd.DataFrame( X_trans, @@ -237,7 +237,7 @@ # binary categories. 
# %% tags=["solution"] -encoder = OneHotEncoder(drop="first", sparse=False, dtype=np.int32) +encoder = OneHotEncoder(drop="first", sparse_output=False, dtype=np.int32) X_trans = encoder.fit_transform(X_train[single_feature]) X_trans = pd.DataFrame( X_trans, From 0e83076c847d6f5b3d2bf03b3de3abe722cb6930 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 22 Feb 2023 15:56:58 +0100 Subject: [PATCH 020/108] MAINT Swap ax and ax.twinx in nested-cross-val figure (#689) * MAINT Swap ax and ax.twinx in nested-cross-val figure * Update figure --- figures/nested_cross_validation_diagram.png | Bin 34690 -> 34676 bytes figures/plot_parameter_tuning_cv.py | 15 +++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/figures/nested_cross_validation_diagram.png b/figures/nested_cross_validation_diagram.png index ced3ccd7a3c1068492a5e602ca86229c68dec9fb..6ad68ec6a4b26a1e1b693e4959d892140fe4a547 100644 GIT binary patch literal 34676 zcmd432Ut^Uw=Eo0tQ*m-hzbfyQxH%?QIKMVfJpCMBuYnmhm8dlK@Gi$N|X*_=m8=q zA`(DDF98MVAxI4n0(UO<{?1qKDgQbD{h$B(JPJZq*2??NIp!E+&b1zDT~p!MesDVq zh2p@dUeZCKHd-Nnw{AwEP-?gxc=&SGgeL(Xm_38%ht`^ z&czumAtiD81ph4$4_9|NNlB-Fdx3nJDc`d!dcXzD&H*`!rK^N5T65bPkR^&)3V?sK= zKG{8$X@-707%-wZqBu<709S*;TSjE;K)wkX4@DuL6?NoD2pfD+R^4ia{CC&y8{i62 zsJnOn@B1|_QJWhsE#xOTMZP2*AAr#iYKti4eIeR&%-#6Kyhn~9 z>7vo-X1ngfG5Q;gq#)C@D;z=Rw=A@bw`CLJBrSs$no{IRPo>0jSZ+J_Yt_ehbzm2; zIHzDctTVQ<#YB(G3R`KDUn`lGH>CW#zXsnUQ-kAuX)8%-iA%J_m6i9{QzX924{hpn_l(zk(9i}YWmSaq~k6(UCN0U?7^kWh@qUZ2lqrR)c$H5@QY+&dVn3I!82hj7aajqol4;lD79L2~~<`M&eVBv~57W4-yDr z8#+lK2q~PfD+v8nl=B#yU&;*oVQ~Lgym6p+|9;Oy+HQ>?Fxv1HT zjQL=^F>;r6V)5RBtad7QT~WT!m>On8RqkpvH+3z_BaPTB_N_@U;+e{#S<5l^<66@Uq~ZvZi~01H2PTr$1}Yb-;2z|$EuB- zFlfRsE-yw(OGgYdAryFG%*#9V5F;@~aBQJ7&<3;5G&mYRjSCVU*7m3-kBz{#4eF8@q)qbqOMar7)$q3Ian6B+t zyewlq0#8rfMl($)9rQkVXJ4HuZL~w?_sTXE9aHu+BTLc6mxE0M9Zlrzdmjg3C_F+t zEB!jD!3X5?X&jglBJ-lqvXUxg9@iAzYC>V)EMh$F5GBRG(ZeRWR+3h)QGOJ33?B9- zRaSIt$oNgdXCO^x*3j{BfZB3Xl%NW{%HF$Q@8cwJbFyfr@x$Gu2_dTB*kw$xj_c^q z5!`0=tk>Uvz1>}~+@V_j`jSGn4U>m|%$WVw{w=LQXRK& z9=FP$nH~CF6rFuwB0mP7x&4X`kD8p?`Obhd#i=(e>MDKD7<@R6vv_1Gt5fYT=R><4 z8_H;E!99;FG^TvRNkq3ckp0S;k(-#Jy6pXFDi}HQYSPm0m;~IidbtMWCGJXXw}L?C zz0frS=a|Bi#Q~U5IeWE4p)nEMbDdv7_L5^AxaEYdC%vrx+ezlHzE`@E@E0&6YRh*L zSUpreZms%8%Yp*>mi~S1;pLHm)>wroJ4V;d)IeNZoZ*k=H@M-(BtzyimBaOirR`f2 z7pqK)?AwM)2P1nH=UOy_E0vfT5|L+zsVg%jW9(mrxtOEI6=RZc2@kI`{Bm zo<)y3U+`(dId;oSYr*v^?PLV*U= zBQXMFkvI}9smq={OBwyKGE{uhS^=?>Qed&R~q zxqJ@2QL{g=wsJkNR`XixC5kvbD6n@1vpw|N6DrHHz$smIIU7%Cq_KGio9l?Mk6bm^ zZF-CWdxkLqqvC zsmU4Xh)4rc9lCF*D4XjLHVXBW|E?{GGt?;A4gC7iY^53m)3Aky-r@{4jr=lru)%Bk8 z1ZtMmN9PQD2Umq6&f`^rg5J@swLK#p->4l4UE#RRNycdd{71%qW1mV1Q!n(ynA$W8 zT&%w3sKFWubj=an|6Z?}ajX8n#XdE=?KU%@i>!!~ z`bf6_$dXqw8jy}6x|||-b3Lxvpqi!_xa=sgRBHX<{$*qMGu_ZKDWbGg-#%l?mRT(| zr`+YN`|IA^b~I7!;ldP=>2SMb>iWdahtyz`PhPV&j4?ujh{)x_BrnrBniG*Ayd$BLWMW*Q`2x{958 znqXZN+}z!t96amxv${WcEiiMcfb-lh3W&OFPCEb^~&0FO}MA98n^r@@G&|C95ILKSQ@2N+InnD0;2fXlm*eTY za>1)BCPK{Pc}w@WEi#lmM!$M-9F!V0AO*3UZ>D$}^oW}lcZRa>cz5mG^v4gJa-%Qk z@DrNb2M$>&u`(9&S^CL-t7DqMUk;zT{;?Xlc$4&NDRhx!*P0*GOwk6QX=sVk@1Zit zakzwpt~A?*So2Sh545;37baf^8H-OX8EknR!e-m^>U;7lZmzp;57HIA_?X@O;l+Dh zEZYnFxhCbmwJz>sFhp>9s$YYO1k0F8WZFsgD7obQ#1!$d<<478fS&Q=8pjftX(@Pmg%m|@Rk!Lf=`Iv|2#?^X6S zVF4ixG~td=3jdgKqql9a?#Y^jfWU{i6UI6dA2^kA#4tCLZ+}|W z&o*E&V@;jy6s&9RxU9 zd;c2!mKK-lGw_hdd$QX$HE_W=7q*P`Yp|!JKPV|7Daq8MLA+!|URJi`*3!&SLdzSC zhRL?J6CL@@`{li8GuKk&octM7Z_dOsx34$6R*!4ZR77Wb({>*-*|))qc>CZAdcoR?VA%z z#}rGdPm?;H+2%}S@GD0z%X%^-xyaiWl?rD2u$c00GF?>h#HoiJ(_Wa@O)qs%E`{}0 
zekLD|llny788f$o-TLQ`(glINoAUOkAC|a?{ODjLJjn(zlZhmIp8Gb zS6Lv0yY^6iFTqHJop3s@%SH?pLkVP1<5&<^?ZHqZtIka15Zcu#Pmf%bDYk3Djy%{c zu+x90UW8{SpJD(@eyTzj@XC+)Ml*F)<6&cP=?&F}Sn=?gnVDjBosp)L5+VNwgY_b* z6{~-2IlO3%t(zjPDGm-Nxj0vPj8Vt{arBj6A2=OIa}@P1WOw5Y@?zV+QDE~I(V8k5ve{xL!@Gk9$^X_+y#_TiG%<;$0smzsmCs*DL( zb3UK%f6#cGORe?Im{U__BhRM<#|m4n^(-#*xO4==)w{!PKF<%DP3q5y!s}`(cn$)@ zwU0^$FR6P4pd7xvs*OxQ-(3B*_7))7aRTC{jyP2%?_G0uo7k7E+wet$v^<{O9n9?M z-^c4uD=>Ziy+6~LT26GSnEoWg7qs|ehL1I>(u4D@YGVuI->~gtlm3C6?-l8%u=|2n z^-f!8{SunV>ZFC@1knyJt!hGEs|B3tbz~2FWNwdDx{7wh=5jrLivQrj-ts!`>;^2h zQElZ=@R>M47TPP-{Cd}$(z;sg@%?N|fwYyJ#lW_Q8p1~~?m{e6%90Sn0n4$Me8VuW zQ7rLJUtlf zw!GseGfyc4Z>4q4e>VO}pQ@cNQ?F)Dhc&MmQKzFG>s`9spGMp(t7BW;I&B+qnmc&x z*hHmn?8x48B|1@Y@y5aIFPZml1nHr)&Q?G%;^2|C*Yj8&OV95wb5DQda`FCeT~D7q zdnwx=K&TQt1ZkTj>)acE%IbZ{MLt-``welDn)Uo2?9x*VjAm|^I8%$&C;(2+gfRJ) z3nqaWxshadwL=R5t4kxnsn4Ei(>Yb^4*S#ITn^vYc9uyaa0UTP9n3+>lQsd2fG!T% z-mf`<%mtNpsDA!fEVg+w$3c(&a!-edW<_R>R{-<;oz1zDOQrCd9zna#x7JE0n#p9n zo?@qxHD>=>pUuxPpzV^LKfho^QVPoe#N_*YM^cujox&TRD-f;gA`g|!jefIn=q0uI zjVa~po^CFYv)c6AF%?Ha$G4%!iNQSbyth5ri!EYOo>0Za)4rV$z?hUc zn`>a~yWZJm8Xp8ts0~Px8>dSZKv2hIXF=L0D45Ca5Jbaj>q(sCTshx9bxm#z--z@0 zcA2|qsljp;=Rfv3PGX8mWW1r#Xh+d45U=e)5HkM1z4`g`XLQzCF|h`rOhBX8nhppb{=6^S<>(8d4qNBH0spM<9-ujxb9lMQXP(88w z$AHAxcY1$(7yEXL4kgj8k2%E_r_yueyi8oD$0meQUh}`_R`b)TzW1g~^@P?vDLV>2 z0}~~DxfwQp#5UKZJ*SJ)+UDbf-S-5pJU-T2?39EgY+s{nfA0W|(5!e)QgW&Vn5-~5 zZCbHS1Ov6J!Lj8{BWDqN>u>XTNEX%vHasLVnBA?=is5tC7-KZ5S`ss*!j{T{iC?V11fHe z*^FCFzC`r$HxA0j`!^WPTj7{fm2pFs*^ftfiHAFGOrCWL3?TOGz58WmfuvUbxuLJU z8MN`2A$s$%t|-n>#u%*?gnmoA*Q0oKyZ&y=B#Azdw0 z2$^eWZtnjD`sQXwhW5ouzj;vy9lSzoo_Ws}j)R2-S0WB7x_*CO@kD90^jb%rc}Yi> zzC+1ga#noS^77mmK^q77UGS-PL=i)|!>QQm1Fzxg)&yGS&55uKp#BaPUR}16`y?%r zW0I1HZRF^L1mp3qWA#xYBed?A=Q%TEbaS*^zGGAJ*)1RtPB{Gh{4ASM+_+9Xo&>Mh z%z^@=p98KDRgUfHd+s9o{GzQ2Y9qMgucho;Gm7Bae4?uFY$@Rb$L34g6Ggx91iLZH z>jc8hW@n$Jp}2fnPs!9UYH`w#1gfTQWcEVvTpFJf&l%V6<(aza#jf7KFPiUbDlXxhS~GQ% z+OiE@3?mLcOgG)H7~sFzClFIv7lfl6R`7F-!tc>McbnCs zsg&a0q9N-v(Oopr4g{OnC-CGP&J-@HQ1C66n8A(be&d;8&%OJ69bzRdpRW^PdoaVB z#PUk7=d;nuC-O{}YIaCUDgL;-N6x~nUP$bT+;~QAPl-!Oe${do1Tf(sBG8uKTnyVO zzfNq}egk7YMhA<1GfYmHUDA-Hw$d`wpMR+WyiC&YG zaLKAtpYQMg;MrsNtn={!bX67YU4xh6>f+#8L<%H%(mHQ9z{V{PAy8ZeHGSJf9U!MN z3d^Hu8Wui3?$KcVu~N>xrK5F9tJ9xMT`K2^u)L*$EMShS!#H(`>4$qniaV@zySghlxp%=RO1X2QFTY0!sKtnPJihbf~n18lg{NGWaPX?6BYH zOZCAH^IsoToQCr|OPlwM;nhZl=S$9=tl$|jR!dayUpNmbF>waoF+T_dSe&;4S?$N` zIEyT$TVG%71d?ib`ufKQH5n7z8saR2x)zvIq~PM4&Cj{RiZs`IAXz6bXcj&Pb+s92 zco6lug1>X7IkC7oNjAUfXML1up;M0+6BvG#Bx#GDY@1w2!hsLLDAbD!UlNlyw;3XP zh2ZD6)YZB4;Urm?yl0crD2 z-qLT>kt^mZp311GkG`((#^lb z&x!0Pq(8%K>#=>ZKP~y0O%ZnaBfeYhmF6;Db2&#ZaQHEA(pwG`ZO4`O$O-uc|9)a< zrs-LW(JQhV7xK9~BFZC8CgmLMR*CYZ$E37&A}us!u42|it!Ihka;ZhDCv69MBH=0# zs3a|_qN3suvpm%%@~p5Xt}l3vb=lh4^bpkk94hDs(>3^qDL8H-Dz@H_boj8a^yDkUA~ z?8e}Jm4o#AxM1T`8-@5}9XsD16N5iNz7;@pO~Bnb*TtHnb-DLcwU;XOY_QFI6yXGvp)RLS*??(I>_~<&Zo1lrI!I+FY-DbK6vJ+3ap$Oyjqyo*hL`@pdh5 zq>)7H8Tt9sBUd;58UrZO%4aYg%4>mCi&s4vxVTLSvc)3H1*K@^%64X2I63iz)9u_sjQrJH9DUBejqVjBdL9H zWwzNRDkX(l1gT7A&QExEeu)u7Lm4n||Mk5(ObT+w)Y+M?opvR{zS4apc=I7{ag%6U ztU_G`PlqFT3gSsJPJVj1CPjX*i0G-wl70!s$_K>A8j7>*dx!KG3Gd$d{e<&NyZ!up z2VrzN#`ua6{}&lL0#*f2J>TJ5sI0Eecd~+qM8_mBXn_-)uwiy@-G;u9`&Wi#C-X`N z9u4{W`u6b1IF_p8V!^I)cpt(x0kmQ!24Ha>;4Gtw>fKMhK0Q!dlnPuo|I36o!atrs z{~$v5%~fL)Bcph2Ev>KipPwF?BrOhvFG)KHuO=pwi{s3_jI$AXyaX#o@EiU5@}7E} zMB_m;_G%BvBV$0-Vv+H+HbYB=QA5&l+sl#X{`B$b!MfnMXU~49$DYWX{}INwy35BP z-y-E1Xf`krgMiUCdhOe{4|_FLQ2`=P@z5H$>XHxmD#eZ+yG12>PjNtf>NiB}sX!=5 zMUWSSo?MGcx!IplntiU-?A#|36#TvJK<$%pp_b1_t_$`0$tfU1*f3tV&&2@364XLF 
z-A_&>)b{JJ9=^!$LZP0$Bai6*H*7K0@ukDZvh+|F87;rQ2&O5lWwJzDF7P@S2bm6bti#=8ay`5QxDV)~_^ z(QEiB2$H3A2%Ilg`QmM;fJlIbRV@uCZAl53so%zV=Y&g7jF=&I1!{}4jL)y{p0Jmr zvbB@tUX$hXuoQG73zYvziZ`$uLzj-Zu@=(t0w0@t8Q2-Bgl>OE(ZJoXl)$8?cy)g|>pm<6mmxwJ z+-MYWP^=JCGf)!O{&9!t7&C3HNoEPE)1KK@Y zFc27)F_Sn64xX_K=xHyIUwL9(c@SSorp1oN^v98aAuGd31^;dT$?so5^5^`R8ERnc zKZCp{9VkaKH1PSf5yv?pAwj{RBadnW1g=PNZDm&BIao!k9qNko&VoS$%)xU$u9+il zi$B6#%;C*5KO7G)k=$un;3dEA79yx?aq~w2IGqP&fEqaWF80>F_aqTe$0p<_ql}-|BE|qw211xI|?0sZe#$?eIRVco=)tBG*ZjTlK zrSQaf7hsyw#g|KM!7A3Io{Aa)2O-j6Y>_<<5W% za0>@7_$0-X-_q)affB4WKyaK0n8+>JH#{`tBtCH#kxi`+tPBid)peYmowZkHCVNWa z)Zs|tb&$a?Tv(6 zpOWx5KfvJ0)@|upG!cTryAW(sa+ehhHGrqqWY$UwUel;;0CUjvc+&yVowsZwegd}? zCIG)3iVTg|f9jPhqS)XQ6S@dDJBW}|rpbW|U11|bLxC`8(J`s1Fs~P7^F+bRrq|kE z^u&prt{3O;+zqp&ha*JylcXu)7=Y@S+VWF~0*oPBp*`;(wf%>}Cn@1378|>)$ZXZslr}1JvmLJ9!k0Yr9(=NV zscxW4q;SrQD+A$@r`G^B+L?%>>E+V%b7aH?(I5ne){K`3oyK65{Qavk{SIqq+q2Ph zF(1e|S9-&Y;yPI?TkWBsx2gK3pNgK|=(-0J-f+ug`1oGcrgiTsh_y_VsGOq|BNu!c zJqwk6D9^McPbYxn1Y$WaT%!Ee5K!@!72p9>Pq6K{!2y+;0BCV}2Fi;7tGaE=1_`K7 zv4~hchx7mtW4JC++QI%>@|gl(M7GV-ss2%d^OHT^thJTqwL&O!VkQgl!@&yQ+3~B- z&y`&Ux1TXl)8;=&ABUpB{FbJY-&^799hh~_2{}AsUn6Nvr>IMsdjk3uJt$$rSRbBvIJpVm5%H0 z4w9IQuTFqW)8iaU8PK}FJ>`P7c2T#d>iF9{5B1B3z*6nTE!qMqtiGx&Y6ck}otnC4 zyfTAsKIiWIO7Sn!E_w~RDv^srOo9_slesXZ73<6OV>3!n+H-)byjy_&!0)TXb+!KM z$A=CPd${;RPc)=0?aP4ati>Tss0|&qlakP(646L3bfn-h4GRXk z#y#zjIiSkgxk~!{CLW!Zc^2J^ph_lcpS<WFw0?Z@QG58E)t58W~Fc2LEI$qd3-%jyz=C5rSSOjt_V@C z_k#6QS=EDwhyYc3Udb1#FlY&G8NypHzt5n4EeC}aMJGxs(qV-RH#A~m>o zJF3?Hk~tg&r{n&c+*nu8s0-KWZH$2l>@ZF*#S+TtQ(knGU>WgVB${O5pCjCEg$TJUSo23rsbu%4-T6i z9`$T5;p!TOBCbet31*t!0Tn_Y1ly8oTeLL&B)Hcanu?x=IY!gBflz-@aI~BGS+9Y# zR_Wl8F$EOkU zKQ?Z0&QVQm`}WrQLw6;RjCm6LqxmXN$b5YsQNMF#%CMDX)mSqBD$m9;8H$&K1tk3y zPK7zG1ujFg0^uOmKy5eyG5U*jmI5Wg?258r9=t1NQD~X%PT)cl)C(W zK2ve-PgPvd_wrhxBSQ>*JzM#*`2N&s&tSd~CK_@<^7fqvnn<#bF)Q(BI5)<(?_+ zqu!Bo(i7$#+p_ezkDuA}4aNY#%$`?Y@Mlhzj1;@{uYLx+IE*yaW zlcPEcZ0fG%lUA3{EBMWg8X+xrkX&>(7?_diFLE#}%b(iFdG2c|2nUr`z#4z>GKeLh zeP*H2-}%Oe1!uQ^?%s5E3m+I5D(%4P{Sj5A#O!MBCe1do3^&|pBoz$?7;xs0s@@z`rTFE=8|`R#lmn0 zX?1=a0;;47qQLFV)r1-I=@~-{-`9QMnVC+IRwqbRF?bgeO9l;%gHh;}FL?te`UX@J z4YD$}2i8CUVX#Xe3`NzvQbDPop9HhX^A@n>h2y$!0`%eIQ33y)mO+BbM!FDh)ZxS5l3(z*t&mg&@fciSEl@M4rE&$#qeunVO&AQD%y;#bW@1n35!v)aFk zP>ts&tuJIF4k$0|a(CtA;QtZLH`+xQDaHlv+q<`r4vmoekls_-_jYi%S3#{ZfrfwA zB%&OBL5H}&&}8u<87UDl~P`Zg5KqWveW zl#$FYv2qcJhVCxXv}Ym{D3xIQ8SsK%k;Hw%NQ+tAs6S8>IwogF&{by z#vNM@`|!)JHUu4oIRf#4kYn?pAxj*&HbH+$|b5MVYtH~1$@p)lfqlygg~sQ7#J8d1IOfh2e2Iow3e$oU$8L&o!+-F z*)s_w{&-P7LKz%jmZ+p^9k%ox5qEsxmp<)*KxF3(2;>+!vr~Q3b$ldW5LE{tqtDD0 z)E#yx4Op5{*lghi0e}n%q`QwaskG)>l1!0SGg1eP%029Io0ZlAo{Y4E_^H7?LA(Qf z95C97s-H99r3Fj?j4qs!Dg>E&ul9#nyYtN_hZSU4gM4e96N0DnD`wO_K+WSqKGTFZ zy*G292bsu#`S(#!_BpAv+1wNBMfE`|bV5maU>Fb)JQl{y0dtV+>3!s;P7 zM_w$ z{4<_MJ;;@IK?lTK9)zWBZ38S97TmjxUks$-w-2cb=pZou8`Q;<&bnjoAPrvVO1W#- zf8YbnOKcwD8v6NjFNT8bc;{M{e)j2x?Pg5-|GG)(96X+xo`_7F@C@$k(Y3a=ZiGRG zXr%@f_FSWa>rX~1{N~3KA3k>npnH@kWv4R-#3#U_eHl7bthY|S&9c(X*Vj&M0NNb# znKf=mG9d&qy@*h3Op&`T`VwwFFMG2Xa1wJGZ3`+jvGY36z_az_&HJa;`z6Q(4hP`h z!bKl3QBev|!zA-}DG6nBsm~4{KjY{E_~U;y$uiqOToWru$46c3HiPejX$5s^lM`Uz zWzRxvb}T^)DRE{t*1jWe68wX1A0UqsJw*<2fh*H>p7`%e$hpg_xR2%3kcDqHgC% zv_y*3YFgK}o`{KdLw4${{z(@D9Vel0H@7o+S@of!(-zQB$RYpLP0-1P>`m$}b|MA> zGj0^L1L%A$9Dio|JS}HssU9uG2sQ@-Et^o7`7BU24{7GQfd?jF-XUDhdaAEd z-psR2zX2JMq%wNlHCq-x46n}~z)Sl*A>)cGVaMVU5;T)!#Jz!y`3Hp2!9Z^Zb43P0 zF8B4ZKA?=#(1yT136})Zo`zJU*nXhz;z+NoL>`&yV!>=nQw5~2DP#`40tl(0!Q4{x z5HhD)=GgU)9Cc03ZXL_Po_RC;@!_7Nj(p2tL%2p+j1J^k$6E8+ie@-85MS9MKWzG2 z9|<%NRC#Hx_T;+_xU21z2ln 
z?x~-I6aoqqlUE3-mxrw(tx>0fI*Lop!}p0ldJe{^WM0+scssYb$G0O;TzjN%e17VQ zykQnpx$MAt#r(42M14dFj7Rb*Rm~%_k0!;AWrIkA2cN?NayHrE?(Xh=ElDP~3mFbD zM@XR(YfQ7FABt9gO!EF4!MxtRx09eSch7y|ad9*cP_>2^$xvZ%#5hZCe2I;dZ#) z>rF?ysJOV2w496kv&F8wM}G15rr^4oZzSgwC1{)`c#|EY63PMIq`BNVjQeAqA8^a$xTBL8re+HWD1l12$Sf{r<_A&S) zoMntrOoZ`HT|tFX&YoroxP5H}HpgUFO$cj*4pK+I32f~S_2vGPjK&iW-~hW}I+1ac z3F2QO9?YUeq?OwXb;}g79U-Z#GqX4|oF6@U1SATu$ZOIs3wpq7zq$9XI}xHU;D1k= zQ{)+G&}m4ifT6bqoc(3V!mZo>@-u3VAXj?iBmsRr`0yd3QZ?C4(8-Ho zcvN(U^5=D;^26se{q}iUP$u5%rNIXN^~P_=u2OmKYn$JnwSt3DyY~GDw=0sVX?aIZ z|ASdVu-eHX+=6G-4gPumP^b$Rkj(+385jNfQqxYZR7cjVs@ zv|Gsz-{UNsXSs#z=SZei`zw66!HA+5#cS?Mf(hd@{_f;EmB-pmp9@UTf!C30Arfei zt(fjHhDUnUoz7+@RidNM^Fw_JxWCHGsOCPshYTH0I{N0%&bv-1E6bBek^Z0Ez^R;= zV9ba1|4iPO(6tZEEJ;wX!J5Cy<4`VLU-vQZhxj`}8quwm6t03gzWA=&+WR*1n@Oih z!1-PrEN!J4#+X@Q^VOEsDEhejmvy*8zOGY-r`YKqV#PUHlo6asn8mJ;&+Dsr)CyaM zlT%OR^Atzbd|2OgxKHxl!*-&7n&9$DOv~Pt{XsX?wr|uDT7P|wcNo4!FlCCqLm-b` z`N8@naXlq(H9U&-3df5HMcihD>1gi>$-3kI9xmv@b*wC=+Yi$2WcurMEMAm1!VXp$Sgd;%@SAyzyFw!^T|_>thnj(l97W ziNRfa=23v3#9S|Hgxwqb0qjEfE%gn%gs-e0`XYIE-b^t|hZ0v1 zCizKtptO#>I!-6*YV(DBi9kRtWV9mR>d|s<#tyb!e-3|aB9jH@WA%>NIpJ$jTRGNG zfQN>ouw5p;4A-t|e`M?&Mg+6$yuzx;)x|B6_{bnT8^2hi@AtfqXhtyQnNc1hGz#Rqg?WCd=0 zfyMYge6u$m+sdxDdExt%8h~GJK7gMJ0Rg{%7`Vx83N*O2mfgLMizO2s=h>HhMdOQ( zWftQ`(f0x)Ezasr98QRSYrgQ{)RzNi4IJKRUd_rCL+2Veee6c7;or7kMLX4Bp~3IQv_Psu{v z%~Laf&_(rDF^96DeEH*r_IlhkZ^liVhQ6mB8F->#ZemyM{OM zRr*!#pH_NVLr`2IZLIwF2{yewTRc!=gB#!OMbAD}gBP`#zMA3<;A6{r_74ZeTIYT_ zJoF`!nlEN^C}iW_C|9@?d`CJi@+4t#%G~@qWAZQHm6oJ5rnz z9PJ73NvkE`4q^`OEeXKmFVmYg@hVH}aB-dw_{Z7|RG=yW2OpV~ApLAE20tq}kCPJo zQiHmi4?{`bB^eHIO4V2*F(E~Gti3b$S*YEJrZd;2r$80-g~2 zoG&SVf~`K+e-Pt;wMHJ6`L{}6`b0LvWz8aIz}f6G+ZLuifa>J?|0L4?_E3b(1Dg*fm9%71+h{sPxV>e2zw$&oy@SkHoCQycDGN_c8 zQfmJ1m-^<-5cywv|GSaD4hE`dw1X2iicC9%?zw?HnIJ81hw|cj9Ub#7#apU&3y#e{ zaA4bYBoA5Vr$H`6e_hOJLW>|rWHB#F+ZxF-{>I)w?^bD42JiZT@MMT}Yc2zY7Q0&5 zN9DPvyDcQ^!QVKYfE6r1*ZZy9A*^ZyiT~jV5(QR@k#}I}=lE zcMLoKjdi-|dhP(Dl;4Y_r$f{dW7~CLy)bO;rM}8zQ=U7s<^NI2{|yD>i7Mt7Z;;7C znQ|l~Y0qM-Pt05u;j{|=f5Dgj;nV!f<1&zEQfM7GLGQ_H9i;8H5yT)c=1PQbqtR|T z?N?Ld^?Js8Z(i=cW+$VJ*b8s}X+-f)3t=<{o&AxSC&xJZT3sY!?Uq-K*I@a){&_|K z7da_L&lEG4P@||4{O*NXMO>D9D#Vzu<3WRlPNqDjAPhd1<5E{YCrSbO1|uz_g~Z_V^YXt4{DHLJi5N^?1Jc;mJ=pxWT5Enh zQ<^wK7q)HEZ>ZZ**Jo{ z?SH#Q{{O`P_A#6c$8ST`#Nj=kTvaRmrvmEaNhch2N|)k!fegojsvzwY{oJfuopbr$ zMqs}r=nn*{q|O;9Fq0{-qdo;o$y*a|QbA!t{LPl4y3 z@Sj@`!@F>Q`LRW|icGvVPEK{%9DkW@>w3v}v4Na(@|&NxOSIMeZe3@A6nMaY@w(&DNPjXd zpI#-;HP4pOyH4dFG%>fo$!Lx!--Y_Yk3`F*_vEWOE|`0`_A4g8g?#<<1h5P?xLCDB zF}n!-K3r3j2M*u==?#917RS2Ep-l1NnzT>O2V7+DSX3<|x|M${}#R+~+V0NLa1q z#1;ySA4Snf9*0AHlJVR^9Igyb%DX6*UclB@hfCkJ&On5#;gDeevG9*b`zKUKuHN{g zUE%a`>wkjrKT-I&ZkuaJIPh-2=|^75Ws-Ue?{W3=aMx%GqsowYXLBF+S`9 zC&p5(o2%m!l~p6v4ol+7@1sh2h+U`%eb`h*dETG>liq*Vj5}nz+D#PG_8}?wcVewO zJ6Z^oiq({&?yt&vL5{0mBO92g7_;sT@x=7mmD8{@<3EMLoKxnCYg#W3_z)uZCG(9T zt*XppG3;fIrv#+U{1J&Ya=0p5L+&(pX;W2v*U%au4cd+c` zA=ps-r4u(&PK0l^UWw_ z(bu!QZ+V1rpL*mFB_+#h$n>Xk7CDW_xkhJoye+vpI#5w)Ep@>3pL(R@x6xNC1rv+~Fl(CCj29Dv)ThfcY zQsRmM7#+v1RG~wL&vwGIHXMy#;+utv`btg%EKsrtcy>Sz=OvO_j%2ej4qM7>tNo$A zy=qGtQyPa=UB0ZA#s1V_>^CdO1Ysl6k#;NzYG^2D-qKJJNRv)W@YhBSJ_&0G=8D-Y z0p)LKF`*=FYbOZ>J>Hyk7vdXk5qCus>!{&sc_#^p+k@(jOO@mwG`=~basFuq;qm-E zW($%ij|9u_8=mf9h(@<6Rxf|BaYE_2jq$QE(?UP&vXmw0HqxGUI=#Sm*i5{kk0{cg zdL-i2djm-|ym0L4-b(FPrU#O8^t;l`x(b-@uG9@Q#?B?Z!#(DW8$}IQ&chGxOg^gdQ~eqNYL&=PU_(TeI6m!Wl9l! 
zRYNU%^V5BH99Fo!J50RsSxRW0bgM^gJiQ8VS3ZPW{Z)*MYI#37@mOgo@bVW(vHz?G?QLo1Q z7D4JkqBO2hx%ys$WpCW7;N)$)!zlfWUqGG0Q*`#FhEoF#X}5iQsggPGo&Rn>CYD7 zsaS&!s`tmkBJLU1SLDJsyj|1T_rACC%BwoKGSzO4?HdfWdu>qqW-)RIxqcz%$_Tym zgiX)fj_37hf@GtkYv}yOMp}c%Bp(X(UD+Bd(3oQ$IzLHvyzxMYty5*}+|{=!c)xA! ze1e%y2N`g`g6j((Ey^_ffX89u3tafjxm+x+_)=*wAW~M%7vt39cHaB%p*oe0D{LW? zgM>4^A6`yR>iS_T)fg*fMp4P^g&2L$mP5vzA?G~9>k5f;0x@HL)bXA603XlJjP_D& zI$zpSRj!Asbk&%R>jwC2X39WPqw7vAnhes1BCfo)4;~bx8vUU`cVlF?=yL+*d7>OL zzK_#|$J8-#>VArO3F_TH=@UA|gMsiQCGY|s@rQ6c^GFPx(iae7rG`!!3YFnCaD6dz z+#~87T4S%JLSn`{X`3)RdILR8yq<1ANbA#_w;PRmtNNdM5QwD13sr1*gYFGR`pSR4 z;pq^zGp_jg3uLT;ZuQrt0^C7|W@BzDRr2ahkoKr8zoWC$zehh&z!JO|E*B)XD&ND; z?)vBm@iw0Gy>kc`<+gWilK%Wz>;~L>L`JC^vNQ@tjaztM|; zkHo?AMNYE+A!b<}_Q_H(kUtai=T<161RnPgcs9h00iW+#$k(Goz?kT@TN7+sKG-4~`exDZPojP_VB`qEAhch_7 zQ2B){rw-5NdPYC2#FPqA6NakZ$o1F_qoYl$@_IPmj5(P6i3N+wdn^(+_m=1qRVZJ4 z;v%j$unKR7TeZAY#Fi1_vH7hhZs#LbrFQi%pQPU>yhRsh{?!Avm@%(Ve!ki1nNvNJ$B6?=iw2Dys z2&S9&yg5qrSW3%RlIWN>R;HU;qF!!F*|Yu1Co`Qx81D4jYsd~fBB{$6Qz8oLBvlXE zU`_X~ynyxsw?GyL(x&)*XH+!T4$InxYjGZVcyruyHwNjA?FdHn`CXfD^-k*I&3{+> z$hZHh_Wz?+uqe~4?HZ!~O3nX$3QqV#ag1=9k$`q{%(UIDS#!G(p1`%*KrKlGMb)*v zi6aURcSCHm?gtLUaxuzg$^s zz3Z({yYJ^&^vN?3v*57WdF1r&!bO!3-?ZNhpG#=sl%Ccq9=EhcgwO4VMb_pkf}}%2 z+$cK_4Js4WK>TG_8Tz8HWnml3Bit#(CjCQbZSAjz&nE>|@|C@2tf$?8L0z?nc`m26 zvcb|8-`?rY&_V^nv-8G5CRm0U?j`CP7P-vBp!5nT`xOkb%qW9&C+j{5t|e5>vxMx{ z-E}V!NX2aoeeS*vq3M9^iwS1)0rDJ+ef%257V(Sa z7)lOQ*x(R}%phEXUQOyP-;akM$zF@$?H6!jeQ{NEb+vI*NX$-s4~`HpU*(Wg zbA~qLY-z9?UVAfSE*LJ%+^U$u0kgG3F~?;)w6vL}{a}CXD|Q9yMOuXC=S*Z-88|s< z@O3H&1qF5+Jnn3WE%PWlRP3;r#L0A_pUc<6&Tk$fA8WA53T@z3}JCWK-V{J_933 zJqz}NChnW<-5B!%6S9+T5TY&X@t##fX%RPw55&_R(Ph+fGDcFqzbpjb|ES7CB2O#r zIp*$sI!eP5x`JS+0t;OVnnHJ_ZPl)?tRSYqKqd)86pe2~w=>lA-#f7HtR}>TXaLeF zVFsKBW2`eWu+YlfdlTlZefqs-^- zj3CINL`~fEHshpN&OqFCVx7Zu;sL|83u7>@iIWwf=uQ|&@@aG-(7qEo1;WXdwc8*7%nH!iBsYH2>KndsqY{wxS8KpW+C zev`0=$DTE)-6|^~E>tF8?*#@J>!S8JUv&cADAEF%+q(|RZkLx=_pc&#g?70Ei20or z96y!z$6D{MXUa02eIGvT91-9LTsOkED}?rFLLJY46}L5r$$3I zJI)>p7M7M+eB$;>d;)cJ$)|aSGu(T1b7DY^2!`jyFM}1tY5>*d$$ydpnJ*fH15zr^p6rW&X#pgcHF^hwu1j*NO z+B`?NvmIc`_K8FhAV&Wf4<-dk-6scy4n72ye-tN!<1NCn{Y+ zN4W@ay(|EOMLx_pY6l+Rae`DiLGyK7zcUmNe6UZAyaZ=(=od~Bao4M#{8FS|Cb87) zJZ2njp1QwB2=U3x)tpA3RK!q!p?`Sx3pkD-E5OWuxi`DTg;59hkxCM{9$y=MoY6~c3b?$5{Plb7XSROgT4gkzL`PR$Xp}^#Tz-tu`zC=?IH0AnS$gFqqLB~oB0cCL zC|OA_)Tz;gqTZh&tzoz*5ki1AOih7bLashIaqD?^M%HI@wy{wP7u$3c%C-#Hi4&5me-Re&>qKe6?K5k&1RpyR_2NDhaV%N4$E-BRxQ(g&GV~>ahm0k z5+1Q7!)F75?aFm}TQhI8fK?+KaF4zV!>coJg5CTK0MB-(!#3hD6=)DR2*3}me#ljk zjJw|_*fZXtT?Xd6(HqivhLxi#E-F0<5ecH7H)z+px8$h27lACw?$4>>hm=7qh2Q?5 zDq$;_aZWkOr2WRA=gUf?pjlLWtO3!N=d zCfZSI{^d!8Oi%)|7sx_Q|BuF4Aw@@)p}_l4Na;1sAcA$)E72xx){mbd9V|HoU-4`>jH6`leS8`tcs?rsSGJxVA2MPLwCWm#H8tMI#IveeGB8Pb92Aj zo5v2UYI|!Ly@jh(>YTNEdw0RdE?(b?01=1P)jfjQ9IN>PQWabejTaB!|D^)GGGdO& z>I0{Vme|HL$%vo;Q(8V*qs4)tu-3iLuZ$WVAZ!PWL4keWCH*&R2-8(rkfhLfvGnSa zJ3}2mB;gy3fU~)(Lfn$l%^d1~G=D_WePJOf;suz%5XI2BDG{7~tP1y3=7ITrZ;ft| ze*h%E^~fSu(iGc+wNiVs3yw19=?2w%^fP(AL-e3b;*{!$lVoHHqBEPpjhT37wxpU_ zfI_?L_t&5oTp7~e+@-{y={M}PAH8yg-aTX^9e0%*3s1&EBa-{{-@!L7GG7x(%v+zp zi!3MYp?A>u2-^+7n6wrQ-qynp=YfuNSXC{5dFy_L$?IIM+hYu@ONaWIIB(!umB#{a zg_c|VUOZ$V%dwS;dAm_%N-FAZH#*9=#6Q>N@-XKb;cA>?)h z?7iiHAJSwU5L7w@x{t8a;JRWTg0i2hwsF<0<9b}wWl{~gF8ldj-+aG!Z0R$e>j)y} z$5-8#f;}%yEuDKR`q_2^k5|HTm)b3(U3N|P$tu4>kO>hsdrbVaDsH1AGcwSuJKsEM zL|6l9ALM71JHnpWbVHbCY&EuZ)=TYJ=a-(5NLb1`_cc4g_R-_?``=u@`%2z!Xy!P3 zY3(;R(;WEl9+Y{Tx(9LnnbG-xii+}QDy2&t`yDG4RJ}-!UYaHL8UJ~#(y`#rqyEc# zg`mI^O+MPbc8^-1r#5HJ{@&q)3igeOWe`O98 zSyJvXy`ZZXcMFa&6iRv`9WOqC)b28kI;~eAEq5FE?Yd(M3VttYvcz~X#0&sTP0%OR 
z@Cn);Jl!Obg4Td%i%EdFByFARk82H?+uYe3Px+KT0OFz##(laok(iOd!_3Mx0hHNDqp49d{tYAwyG7|!hnZNk{ zq-4mPFmu5V?&B5(7lWLw=I_%NkKv?qU79T~3fLV|S?L{X2i{J~qH2ZCO`z&B0c$QD zrda1H`!^Adym2Nex6Kh9){jtf-|rI!en^O*bXdzi#wMsWWA-{x85lCIB2egB|6jipgZK0#;Ie#ie#G7-F8l5YH^^;Kn~AhM(-bq0 zKb(d3FC)#MKc;0|89sHGLR~VR5}a(-%qD4ja^0mmA`v9CYFu5IbDZ`Vmj<+sJ~V4B zG-n+RC@f|NqzMHy?r85MDF zEbA2SD-!V9C}05jp$b%092hvr7NQI>hn9n20Ym8;vXUGJa^!isV3{`+-CR2S^Oem5 zaSrP^f|otZ#k{fOKN{JEm>6Y?w)FrMf@DCzmWzvU5n)Rw0i^mKh6dacHD`i8Iu{59 zqddl9%aN+hj<$PRf2JS9)L2Vw)2;E!>#cJ73W4oLI5B2Lv0@0nsg3bsY!gv7;s^dr zw7{gHV0y6J2s?aD?L_(Ty~C%eCWW{8RCyeR%azN_X`We*&zLjmM|V0fH1P@HaIhOks39SNA$&piF!zykw&U~)bQ6G0?3h5ZO&0)anz3>AM zsJ3dWg5;lasYp|R4|>L(R|WH$>9i z)5#=I@$nqhq_x2ur+4t~99eC5#R{6S^~YsB7}!RyX5)F8%+JLq3?A@86n3DvyC?)$ zLQbofzBBL|+v ziYsQ?#{t)V0hClyW^ID}EDh}M3O^7MRFJf_rf5lV!8GzIKlI1T5p_ulR})lNE*RNS zHfBxmI?mnsgu~l`@6gGgbDwwkbui$=1U^tHl~Z%e_qe5jGJXQmanj6>8%M?ud`QSVC|;av`Do)g)@!zwb#&9HE+*!{-CNJ4 zt)n+xpn#L|@z8g6EZ}-c(ii29gR&tA@_DizUp@G|g}ruVD+XAVU>~;4g#tp?zDyaU z8Vl@-)JiAq$O+)HnAM`eP%?nG zk%C%z!Ohs7u?g?BD%uRO$pLKv)QUCa^-#X$yWR-w@XG4k383*EZ$aRM)c-eBCC6~m z5CU0%=WlBxUR98lyx$KxYn#O#d;SyXM}{QzZMVM&F0cg)3e%?Aha7tQoh3H9>>z~0 z!O_;Ljv=rb=<&7JY*|!FDih&&1gi>A%fF-TB4?&;;o1J4j9Jhk!cGNSjsU@m-^1-^ zrom7^Qa^aKi6aR)T)hD#B$(3&iMro7|BlgIDab8R@^APx;PQU}Y5!L=ivI-Xh{b@o z95cW<`q$BhchvELu|MRf@pzdve}QxK*ZF3^HU<*CwSn1Ojw1H@!$KBrVuUgtGYc@S zTfQ-^e@r`%$>T^Mu2pt@$4+T07cj;0EMxA>a4mU)h&RY9So}v$=r{JoGWuk4Pmx7C z+g7_q4$5OLWS;BfTcyk680{!bj&WZ_^-?YOD9c2gTftplMFIA)1KkM4=s)3k|DXW= z$&%^dFHz3?oI|v>kBkStlxY}!A3VyI3ROXnzkO=xk0VW7W%+LcQGaszc`}92>HSyZ zR2~LAvMz{IRoxV!j8%9KR&p!gJ#qZhL@wZJkWlJx3hMoat(NfaMbi^dGw<)Lp#NS@ z!8cldd^UwchDGUmp@EhMXXcl(K_qI}=UfdX`OXL~eSAE11Z1$Zm|nTKf4Fyjv02d4UJ?t&Nd!RudH z-T!-BMm2!XE;2+6m>i~-^hsr?+ltc1+-%_?PMGF#<#v1H0Kllo62iACfjon~r}nG@ z5`;mg-^g%OWP!@l4?JQ|QmIcalC%+5c-%D7P;t)!t*44~4etBNb>;VT2nL-Q1L(qp z&ctx;0RPh);V@f1a%Gwh2=tDB1#*`#n!~gC{$JC-FLLVDKJJLQzYbz>GbIN!!Ou6z zauwU2ZC2)HQLg{E$VxY^a4T}iUHSC0v-IYm-(|<}_FV65Aea64I_lr^&;FYPGah~Q ze+g#)DT3MmOs&{+s#OgX1Z`*m1wlb36(|V$8_oAUnb*Xnn9G&y`P~);#xY2TVfbAe z54Wm8$}Ek)K0WpD<3w@NSHrb**NpEZT+t6R67}WF{;A5Qef=5)mQTqW9B0;CB`F-` zp~Z59L{Ub8IQ~slBBsCcc!QQD2Y&|OGmE_#6x1sVi)BV+lZdOcgcW_hk!rjDWp;ou zPl829X4TS|_;!sYXRa~0{P$$+2O0sJ$}3$=Uh-_qG6oF@uT3I9FpnxyX*kROgR9_$iVFEh$7OHO`0G=qnKuk-msdp zXm8K09d>yFx$27QdKxMv6MUmA@_;n(W$aS@*1E@@v>W)421_+@lAfyZts<)8mx1Vz zDoYs}Yvc%FE>ciH8tuhdMekuO;9l-!I0H`PbMVy&Zn{qkD7B<+>6VbS^%1kk`fA)E zN+4*~@;;LZP~eEb$(NQ4K0#?*yiOYfvA5VtP#cnl=F?D`2LTRBUV$G{^p60DoJ@r- zk`A&6nVSAdq%PdcE%oH4{70+Bjm}WZpn$tc#Zol$(nRy<2iaX~D5RusxZHOq%;AU5$VKGXT;zfl5q^o1ez{=~EYwua9! 
zfz4JN#tZxawf%cbK+*<{Aa63KfkgmHmUqM)5lZfePr^#x-2eXmtxKLJ2m0Ob{-U8l zR6-Jndh|`QZ|-%JB3$O_{FB@i^7~QjJA&8mcPB$G+|N z`KlG+MF-v$89Vg-zK~Y7I}WEczSMvpy9;N& z+0foUR!%u%A1iqEx8whNz2Q~ZiJPzuJlwh`vf^;Itm(x;NYWPgrSuuR#&S{ zcb*qQmpe;&!yur33N;0hgq-kr=LpfzJKH-tuyvt<_c!}f^!bz_c&q-< zRSl^19*IBgxwxn{w_}zP-ul(@jR9BhT8^ibDq6uPjoT;%eog+z!Y}!DBF`dh5uOYU zNXuX7Aom>_EyoeU&$0@=tf-6KCu#B6y2EfHG@dP(gMt{FD##;*5#`=@zyz&0$UzoMefZ%`Ghq2KTbTb#pn|Ck_T%14J$6%v{E_+GqUDw!dQe5yVP>#w`d8f!7pTzS`c3^@2`H64_^t(mD>|7=Ks}=7 zC-=*1GrHxVK6Q)>0a?l?%zu-AQRJXte(tFMnv{SjruYf5kPXuhnhYY%dP@_R1PjIP zNc4z*c7$CP+}QVC(c%brgsq_GAk$=fN0GPzvFJQSG(=}wWLSkJnwUqh-K>A~X@}hY zODJ)HRK9p)+)x<(8EJ!{Ash#CQe`?wB|D|ga-^n(X@z?h<^&%Lh>^+Zoa7iMRFfNa;V>z9eTYhk6Lv#S z9kECS_F1lBk?r_m4Mi_}s~|=ES1;kmf}aps*cW@we#$Fd&JwB6LJB*+@SmB*SipMy z>had$h-&1HxTkhiA`4rl2`H+qYPHL@&rbr4A@*`Zi5;Ph3D`LKM!p>(?>FTKmppLp zu9t5VXq3(=@^{2GJRO`kCKSHacQNhL@v$Ovd`>CL4~(qbMV%%uP$@HWWvT7TLay(b6C)67bcgOY^yW(9`!UY(bhm_5!B4_s6=a3vNsLK)Bx>5p+hVwhrUHqTlhe=4 z=ah7uL{2a36Q19%W4p!D zb*w$EOysN({jqH-?Sck;c#@zhb0)R=W+najLC- z_(H||VzY}7G1wPXMrV!Iz;}}Bdme7~4_7?HeBMRWHd&J~dW7an`$BW|-rUFOb0B~t z5b9w9HYQv-+HjK}SEx+lG}Ov`)(dw0fULc!JG9~RBeKEG3GQaCC#um9@WA>LqSNYq zM|)I zx6dsqUL_daFhdwIZ*|nQc;@Sld|3yMo8^aqVMT90Z@0eKo0+sc@;FB+t}BhyrJMdu zH|%dyf6P^+FzI~tUsft4Vup_|ysaT~B&yx>na6gZ?wIB-1jw}-MMZt_h82{XHg-@@ z6>#jY{>&@Q*3Z0xk#)5 z{iPq}hx!7+d2r?aFjrXkl5xCtgG$F9ICf)4Bov=EIt68+zT9Jvslfv|qiRr;tp39* zkZOl9lxh>^GIvK>S^wx2E!~cV^!%LBPE_mo;hlON^`fy~k9Q$uYtc;>a5}|d`%T*q zO(<%r{=??)je6i%Drw8o|YuoKv zgo65X%gIo?B1=!Pqm)pJ6n1*CS?cLh&~U)8`>-D^E>W7VL0H}so4w+r2mb)y!1^7; zsOXV&0zaVvDNZV2pOC3GnzkAa^KJlQdBcd^uoA5yF*tD~kxz_&am9{Ga;ie~<4oFP zY~74qe<96#tdn)m)(~CZE zo`$AgN1woYY##~)+sADe78Js73Jxn-i)g*NcAE*G@rpY+P68W7)9fuH%Fd8OL5$OH8pj0zUA&361fFAxjZ(*>A9TH9r1&iqmw9V@@%WTVpl8@e$q2 ze?ogYbXW|kR|xWu>waebMz7m^n5h)(7uj&{=8=TqvD%+(&NN5Tb&3QBke{EtIzi15 zHhZ~7xS(P@k2XPQ6>8y|6*AANr+r&asOOzOi}^(sV+T!?b8&;Dd0?p54Z+ZBzmL|96KSY-eFVgU~+clVNdGioOE@h&DG|))(kz{$pzX_a0(3#F|O+i z?_(sPdK)+hlfHP1c%$tBA>^Urke@Awq^m!7i^haDydE0X&*nS(%t`O-2g6HL#AoA> z&%egjvv*j-1CeU&29Lp~QH6@OGvc4;w3I%*(0hbp`1K6RI(^_!8Eb#-7rceziu7$Ih0V(PsZryDp0&P{ z3G~-3bCrTE#L;`TeAC*)#zH9sMPGrH70u@H@nSXCV49Y*@a0c=FlXjT%`cM@9F^?UIEk0y`SX#x3PQY~d*n_IUD7TFB~Cv+DOPzNbk`Zyp^w zcgO>7M=qJm3W^4==QOt)0hXDxLr)@KpG0Vcu!QvdcIjZV&mK9ihwshCE!1qC;D7p?q4`GKmPv%YyPh~KL3|~ f(?2|(z9hV9mrf*z z3MdAUP(lkR2pCEb>4d;HChxn>clJ7a@3qeL?H?c4rGc55nK_?lJomWodyF|BUAm~r z!G4e(g+g&)w9XlzP@8O!f7`a8P$+Hu4g$O=d1_qoG<3sy`doFlL+M`iyy@)b>FjWg z-`mdJ!@vb40z|Jp&y&D~yF{O;ftILP*!TBaT-)Q-K#zYVCw zZZOSlb9k)4zBSeq-ZVbDxN;-yibv!$4Zc*yf@6~Va=0(V%{s5gHUIvan1-{azg zfc^sNTfEX}D$@e}@gJ?)GzZHrKVA@78+mmNXHbl7N(|>w zv^u-EQS7{{D`OcBiq&r1F;lTw=6U=zzMnY5M$%rg^WB6zx#D!&-ru&9Iz$)FwF5?0 zSZ_hTXniYer~LeYY0@jF4M&Z0{N~1FZ99)#diL6+z)IJEb}90}NNupE9X3nL&x}%z zUnVj~Z|SxMK+Se}Ej)4R*+(rj zia&4HyWq{_2@|{!X89C_-^bbq705E6*x@6hOAIOgcvtU*@-Nei16s-#C?@z-O-jA{ zp*0SSi@;%Nx)nxPrx8C}MzAI_2P@{~1uP}$78p&D#plc*ysP_yZ(UCUp%pLnq+Ii)k0|&TSD?{lTfWmk9NxZR^4G@;KY`G#ad+X-cEKOEOYT*m@FbOqCN8n3tRy|3&iPr*!aXktC(b2$4UdrZMF^HmD#8`7`HKZ zwY;}kTYa|Hye33hWI@m6b97mzpJ^;_*>wL$}COxVG@Xv9mP`hZhVb%@i{KDK&A+mohm~EZnoOXSYy>rYY$8m*AoJ-82tv@)tT0=TX^5Y3&X8fID~Z)cbB`}3!(n} z+{Kb$n@X`4WL>80$24eFX&FeP$GKr`a zw2(~5WM4I})TO-dnit$Y_DaejmcWPcJ73GpP9a!hls}Vk-x$^nsuKomQ};s!xwMq^ zC@I+aQlh%(b9~a;WMnqRaKT$OC z?3iYL_*9PX%t%V43Zp`|MU6S}W|PwS%dN%sEhjO4Ypexs4ND#6qrn?8m@y%oA<|mP zDvv}>3MSJd)e}rpLpWcqJ#EkW`5`YsRlwm6;WO6z-1P3KL2@oBb<{6Qo>$N}?^fJ2H z-0t?}PkqrDuM6Kq*}mJ~d(0+-&Jm{@u^C$* zx%lFl=M~0y@4koR?6$k zIh)j|9;+)$@(q!E10{NCYDt5ex@PHZ>IK#{tErn*w>-uohfh*JJBL(cipXwFw>N4JaKkq3q~{EwBUM{VLiRTX7>9U<~3jWd+G(u 
zg6M6I{-^C{4RqMr+B^5lyDxBH=Y`qQ1-2)cmW4cfSyLBUaK$=<#krVTp>|O8&%*&E zk$^hhajbVz$ZlerKz07Nxo3pEO%uFxpzbGAFTtxP#=JNK>K2_Q#TQXce?~e^*)8syushEk4Af zuQA7BO$uvFBfH8J*RiM0R9)eSJba2nVYHk5@lDC>0%Z8vOW(qbavR zyx^_DXtGDM1MT_gyHa!G)G0p-M)+`%bN3taU<0kpCCM^ix=zyF-My+Eg5NxnSM~Sm z>Ou`QV#>ps@ph)UZ+^1ZC--uWak;YBn5W9%!($OujH>a(eUoYGLAY?_+Z?*z*ku8~ z@phvSKDP>|1^oU#Gul$1|2FMqqBR`7Dl5g?ED0!#7hJ6?wYXV1XRb5P+@&^z%~Uw` z_O;%#JCz9Lz?hT=ZV!crhpFT0pB(UllTTZn&6@cBA++4_>+{obW~Gi^D&Ozz?WyO> zk#p+I{ao?u{SI>l_{{^0#V!Eh^5RwX^DI8B;vKz+EzKP@=aSjDCE7J7#N)A*!LIkO zE1B`EiSbj?JqELjlvtLO+rXpP`tpha^;pl#A@fcGpJ@Zfi;wM-7`u6G!aIV%b1hwI z{L8ECpA}TC<(qsikBw7vA5xF%^JR3KHI}(oGZ?tD&St%OT*(V_Iv8qvw_e#=r ztA_3P=E(6?8Me_6O-8@mWY19cp`8Tjn=wIRYLA_;S>o|ROQeuiYQesH&;MW>Wlgql zdknLDD)nk-Ni`zdvU0eRhdM25l{~lp9{#qvvQRI0ELi(>Vd14f=Ddqy(CSK$iN&qD za#jENzBe|tA&EYHw=`$&@8ItATcG)H+g-o@v%|z%^+#OI%JBEm6rcXYTSN7MQO}-z z*NK(Lp03@>*B(niH=nTmFxHV%Lz?R}H}AtSC#`us$j8kEvtf$1Ym%fYf8Z5Z%!Pqp zzdmqETb4PSc2?XPGD{Ae?+&eIGKd2$eo8ZBbT7oPcz4ZmM*(TFEvqGSX=apq65=hQ z>#eWH`IjU?`kk%3pGd3AHM`XgqVRSKRj4$aKWn0rq32hJd2 zWjBChy38TF_zsW)XGU7m)-bNFuGvA-iq2h^CKrbyrwSpqpB%TX&zE?97{YolOGy{l~TmwgiKx1JG*VTpn#Vg+1oEl`>> z#Pb)Oc1w3t2aB&r((umKi7$f+(zUy(r%wlqOuLOStVOPH{R+A^THS>A%uAq|%Sj%8 zM4gtx2sXbkEL;rjtNuz(jh5>n?~I%EV7vOGwPOAQ*XI1aVS3zh#XB+T%<7m0Rb>l` zIRocC9ggiRi!K|k?(f=z5pK|}lc4C=zJB7g;I}VWoBkq{w-E@DK9DD&MA}iza)bJ+eirDt-ql~B>e1ehcrcWAcQz?<9y;-hdH)^z zjvaHwX)NcnL-par3n|L-%z7R-i^J4aB6F%{jR&I)*P>Vrf$x=uTc#(&#vr;~oMo96 z9=F_<=IS?DN*!C9T3dDV4nR5el+c?K(Dt~swL-vSx2fK8w@_FB(hKvwZfj*F2i(aa=Q$0b;Bxhu7txI)zg1d`ZxYw>&MH|}B^ zRSj1tUYyCoPrnS|d`A6f%pEO|U~0&=V~d2zApK@B*99(OADLv0?IN3r{CNa(-+?_= zN7`#MnS79&sftsM6JV*#qlsN*O}7@p$ObYSe-pcwOs5-ObD3W_gCEgeHlwiBMLJ>U z_T^qT$?Vdih~u{;n${RZn2c=GmXKi?Qf$!U`xd&^xQWaF=iV1}Yc$))f&8`9V_p26 z`DAJciVL!7F_BkLM_1yjTaE!^$JnI_VcUb$RtJHi;u_}(k z%AU68&p+q~+^@oWu1q($b(c68)%T`g^tmIXv(r9wq7{EhLxht)z z`g~KF+kmTL|83hL@5!FSmx@L@S#TyGXXE*I!LlFEzkeWdn;|$+eki#=igg6(M># zX7;#Qts`#D9;@-?YuZa2_U}~lHr(E^^rDvuh>n!mDU%S znh==F|Eb;Zrj_YidjXCpIpKM$Mx(;t-*MJwZ>aaDw`d(az;ydA&o;Wq#p0=F#)T|2 z&S5)G)xKHM(b*oW!K*K$J5#>j!u6XZBuGmkhZyZ3U6hdlph>pXrAwDo#l>^lhsUUS z<|dZ-i)u?hn%XjT&D}vfDOP<8*%2WkAyJYZq3k#wuGqiWlwhBlFe{H98Xg&mjqf)} zKhF_IfGNKmMF2u%5=Q{)5OXEtI7~DAE$!hqGaP44!i_ekbnphwwkD%nPNGflYu~o2 zcLnsiln1sUGmrP2uH!CW8CYWt^nH=839y^$uU53Tx93hwxTreU?hP<8;3Oj`ESz<& z$<)+T3IlGCl)fGoo9Wh9ajO8;PQXM}8z+-Lf9{?G7@R~a=2{XJX5dh{MFl}w!1#>y z=9i4wHp}pDXOL(iVQZx4-^_7_zmBMNH>`D%)l7aJ`L&a^aE3fu;h(_0%X`7U$+PJ3 z@}%R*kL#J2(**La6df%FX*fWA<3yF>+P6?_w5T!|rgF}6NeY7CfxOK6f%4CVxUZD$%xSoFGtKi#3mkzB>mJeX--OP#JE+gMV z`FLv{QWp-&-MDlGut=5j7ak8LoQ;vXx8^_A|O~3EVah^9&Mh_6(A&imx%%4P^YHUo}zQMC^5nYafq4r@rv| zdZBn;6RAHeqncB@w-d|95}M-;Px1HGort}_5$QPP(LKL3kVACSm*JXStDDjpnJx;f zc{&?*E^_NrzfD{I=vw^!McUpZ#~2*1IWZS~LUnttX&VBp*+AlPjiOmUU|ngJ!f?My z2kX_JU~{?d@#_K*i%9}>;D|Rac`~8|SW7KIu7&;!4EMuXfZ+bkV3lwn2!N~OGAS{A zd;69#II9Y1L!keUvfK6wUnF-|tH_%U1Llk0vg%G&^4MuwbWNl4;qGId6XVBO%X7WI zfH@RlKx=rHy7Ve!JC#k2FByOIcr~%^h`)+HA91LX@mV;{`NZ{4PrZ+6Mp$@}k2g5W zqb03LCp{2^xAGm1$yr{Qsu^w+FK{lqR`-~!7P#!}*!^a>8CkV_>bQU?pfU^wjktit zjP;a`d!ySvaH=Pp*Wg$*d8f`< z;jm9%?B^(9#VUGclG(m}`-}=q3i=sfy1SCrR+(|-jvcQr;d{#6YS8NHYdKqaZ$F;~ zVxTOT~Mg zvDtIu-S%mD3#-g2Dx)>Mp%XOx@>|~-N8RKTMGoVgc|L#esjU#-_@BCLn6W87;rPl{ z6sj?!q|V`zS4WPC+rmV5@%S2DFUau?0Hq^bw8+Wnk9!sS{HLl1+OrKqTN1l;Vs;|6 zmUnl44%Dp0uCkax2fYq&9g8-+*!_dyk)EFp3JNoL@&pQXDfZi*r>L7YdkVt0!zWO$ zWH!qnuiLh7_yc)8a|X2uUV@+QdWZbUFOVCu4TXAIPpTeGj8%!1%s=dPSiYxZpy!98 zkB9z^=YJp{50)#l?H=q=*&k>mJf=J!nlPu8Ny`G(e+z_RGfVYiORDOO0G+ikV7+g| zHs>W`XZRv=2RGBC$fs$kW2b?9>nQ;%|H;xl+%l`QDIE10_#N`8KuHDb${|P{sdo;I z0{0*KEyBAa`TLQfp88WCMPFLL70wpmbvt% 
z=U317WFm)MRC}DcN1K9JK#8W2@TIS6LCw$m0CSGZ;*FD+L;{k$gi zm43QE;!1%zkSE)s-dE+7V+j!4WCn013N+9kazN{`^G|LdG%DuPQ9IRvyKKj7U{GJR z!+MMH?Y4_6tsHxuV>~iEJb40~NFN;vUQoyJ4_Im+Voa00f-;Of+t*%Wb(RkJ*n6T0#{uIA(l7;ENe*S-oj zSy|bD&v-$3wZ(6Dr#7{~3WP1Po*y_Ab8jGuHzXwF36PCph^O)o>T63aY}4sJOG`_V zehZB1X;9~8RX4xCPgEQT@Cfs0R?31ob&^FS_VoK5(iXBX*M3pLdajh08eu7*eTK#c zBZI!(*?N%AyX$H%Sll%MkVy_84pQ6cyIY2<7yA8oL@1BFGDy?Adi4b)qh41q!%GV) z1|Pe@!7A;*w3DBz7>XF#!7Y_L2WF*IVouOQ)!s;&I`AZVO10a4xwcIBI&$z|a%3^}Zc z;ueLsBZ*GNeO^>caY1^!p&m`nzB&0U;uB0Pq9C5+gNDKt)pGRHALtl>bEJ6K_s6po zNU@zlb<2|9|8MLPyf10%j*amG#N~`V3k|95;KegK>bCPK&J*Sv`JQI2n1Eg<^!j_V zn6z2b>}B$S<3d+??q$K`-k<)ig9oF_dI+&4mSI}jEsL?bBfwK?()<>I^t6M?a_jvr zFCb{nBDfk@gXV(tFlo{!yyvONK)CKP7_zqJlQZPGy52bi;ebdrTC`4sT{iHWIk2W? z3kfwe&7V;@4scSO0g-#Zt{2jTCgfN?_sokaiNt`V-v^Tv-I_&u79jg{Jel(N_U;ZM z?m;;x32+>cC4BU7=q;Np^Xd1^vT}%&X16Z#CRX`Ok(R*A>>d?ME&$W<1)|gt>q&86 zTIUJRpI^fd95~>IoZ`9_wLZ4Z8xZKuh)qf&QpFyQQws{b!BZG%f`)=-w;*OO7x4Kj zE5MVakW^k1O#q;aT~bm6&GvEY3M`~50ha+1E0~W6#OV)=nB=)1KYm14z8-?1dh3g% z*v#3#x*yfL96MkpH_8}m@RDa_Iw0KpjO zNY%#!>xKnb3;=+ZbJx?RYD0D-iPFds>u)ep#-`X(eClz3UE0_1V>$D+A;sO~`*s{vAdzy@H zkvoaFDs{TOhNn-To&zgiFd0{=PJ&@0FS@LYa^SXs5k~TGQHXDPzeRPS&pu-kwC?mm zq$F@`a`8jn^+fK_K(<((5{cM%;WJP$vx}aWS=8;q$%TUA;$J{mrv`#x!>QLzf!U&d z9kPY#=goyj5!PILcdOfL^=3||CL?(zCw?G7uWzcEhN;8({#>MMKd+Wn_rVu7+LJL}B>B4e)i!ckn z+TX)pilIW1p9wQagqlk1FJ-KWm9Jefm$>oud3u|@;g?uRAE3Eq%Zt-PN$q*&K0b(f zpkuJM%2HfuSzFO-%gL{LZHjDcN>VK7{P<{Zl7eIV&tNHwYH3D(f3<%(kGy01Yf1Am zBO=T}QpYR(4{jCTAH*>I$e*4aqpB|bh~j`OOL6`2=~+&M(nx$BHgot(oPBeuD)TGU zgxoK~ca;mw1aR(;8RUS@sGBnAVr9H*)GQh^Cm48?j`ayT-dII31zTkY0)8z8z%}-B zLpkT_y2A@AZZ6Imw={-Sb1d8EFcdh+8l@jk&w8sb|4a|hgqhrH{+{(^Wo2c=YrIqU zgvKw(8ojchYR)UtJPpHMirxnUmlgn?jWjZEF9c|}8V-wt)1Qps$>_>-WRPNEd=IMMY&*_AH*j2PKh7Ziy?Gn7u%WUIAsXORZ6d8GV4z(HSt`amf&(hWJESYD>tP zoCVq=B|c>vC6CCAb$dWpkycW*I1h!G_QNM{-(<#F`8iA@E=BgqKx2&$4>+7=fckZ^ zAbF}qRK*%$a!^7U^PGsTtV&p;m6xuS&gUrWQ;XEWm=7rh730U!1|nxK&4Zm`q#wAv}rH z`Suu^g{6Lo7BxI*akG|9vbeaI*bXVJ7fcDykC60ajdQQ)2Yz~bBov&20;hG2rWxT} z1XjuvU(JoymlEjl~ ze-^Afvp^8&f^RN>OZrBODEbfyh1)A*CN*X=?>2GDr9u!igSk!;?O8=Qz|vSwf#V9a zG^}akNua>UVA8%fhc6jNeu)t`!Cp*|6)cCoN=@AF&%!gnO|9Z5j<3xX1a%xjVn0^W zoDnbQK%oGwno2n7>z-b-I#<9k*ehlFhJci+6f6)+5DI>i!1J!=L2kft*oUnMqr-_zIIU zS;r)|w79hreWt=WRcF6akOqUdVCtDeH;Ixh%k$Co4Y(Z`efgM=FI}F6zLY%V&YM54n`nIv2riTT^@J z+Ew^s8KPp=6w+pyHYGbNwTcc7MjjJHr=9Tssb}M^Y$p77dUpS1`rmri-FJdf517$` zoA0*gvGQn=7f1MLjK*A30($f-LQ)HYqF_xg@qorQg~kjEJYqZ3KOKvl2pG>RR}k%V z+YVQD$$Mf6hwC)8cX>(xBIb&C-1(pw_@-yu^!WVX=x$mgOUC;zoZbw;Pe9r)yb%U2 z{r1-xRZLICE!$F9!5OC^Q?VnW#vK3elwgOI`iWB{^94=y<<|O_zXUOWAXIQ(%hX7d zD@!v@$v{V45<%oEA>aqzS|qTP!)5j@yn(QOhn4dM;4-tCBMz%f^m(*M-pU0x9S1d+ z$+Bu#R)U#gK&5Y(=r4-718a0eQkeDylP5m^Y4Lg3YLhgmqE>*MxHiDaW8jMR%A?W6 zny~1I($=K6^0mZn|!F?#hM=w@n~OoB5N!KON%`q(9T-btr@+ zT&dMc5DNV3`3$lq)|iSsQ^azO;C8l%NK~C&c)*Z$=xQY@YHS^*fE1mF!4(?p9JW#W9IkovY7cq1wXIj<-tKXw(UFn; z_3ER807Hh=&!0c{L&9LN9O*IZ1$6`|!$cJsym%(A;Ag@-nJkzsRs%eZUkO_~it!m3 zziamzd;lM97_5sbU^XH2t)nL(?g9BB?2~l9#Do?^!?z{q2iMZZ$1SUw_ zzJjYTv`t9Qikzh11jMCP5Yj?JhO5_~rbiAnM7AJKr5SfknF&Jgu2kH-I=MJ*QfIx@ z8S!TkeDYqGLjjAwLIz!6u#{m4JX(jHlVYk;FR9hr!CfyRE+Rol}uD#JBdU9GvMXO)I4J2Yf_(Za0%zyTDeh zNN8(^^X;+vG6GKWho}RO#-UFw7JAVZ_v)S0AH0IF&^&0u&h13Q>~FQ7q_yNY(87AW99ZFd zc3eolJ@|+Rv9Xe2{C)jlK+D#fm3X0l9PID$Sd}~B;1$)47u4WxfJxA6jX*pnh;P+} zfuNxDMF)?-<=JinU{EyB=)U7Tt`%4BCoBP5bEiVirrQ66;dMMkC&;P>>IsqW+fNg!`+dAtDaMw~1gL7a_zBBSc^aG+X7 zC0H#<>*aE!TN>#L^8hy4f%H!kfKd{mfLm>fM=GY@gH}g_60N*CE;k|ewF5N8mkM1> z?kXvvCver$I)H0r{ppdjJ>iFx+&hjp z94hW0mw=;41_r{1Oh*0~QfIoJ>SlL(o#AKv0zwlo38lu?734Ax)krKMJZLy0&T0(S 
zn_CrFb&iM;aS{sqEf=9~UpB1Tj2xK-a@``JNuFAA2-wxtxv>uMLjX(5ZLq$G11lqn zd0{>B(CPLIBo*{zIu*Fwf{H1hF%WAS6ml$s05%HL#&e5%xTa>x0a(X?o~V~Lg!53Z zA*MS*+bRo0Lwsmn)JEWU6$z3kN#@g?6O4+XGK5*Q zLSVMPL@*Dq#~W#hngLUy<}I}7-p6}FQ8l*@dN1n%TvBI{V)remDV0I#)#WoZa3-3Z ztjRK&>7~E0GeD(CTQFenkV(Mc^Q|xB@2Ex15~0Fb^4!Dj_cJ zqzCZhGe6ehWC;1hM~E39zN2}=-cdvzzq#D$@lGm$DGfKOEV%rjDvz~4|J<2f-MsvI zBKDQz=x6_@r1(3(yS4{rmR=yf=w`0!HZ! ztvvXBW0XaS_eVbG*D#1j9AJ4uWXmoaL^k&3%?2i<-N^iQR$; z+A-6$UaZv42THkP1lj{YT$6Z^E|U=;lqEpb?&D71_-3OIG#0Xq+W=IR&~OXfqECE- z(7G#g2U?#`DgnTALQbW*LE&mrpyOec5P!H!)NCdp;VNG|QkZ-yICd+8ozt4JvLdJI>6D)I-4dDr6|^2#spFABeDH)@ z1yUCmD$!jnT{=aw@vGS)U3a1cQ#aJ#+(1op%Md6=9XiR?>FUvbcvB#U;v)YAhjXrC}R!E6A%_m&M zk7!!%K(-0sT@82X$e^R-G)_N#n-$GsyGn(O36qKM+;D+H!(WW?yhD_JQn4S|s3qb8 z?Flq7x+5l?#(GBSsqkmXF8_L5uzp6`aj~Pp3EfTj{3y>nqTZ7#-u2I%?}59Hc*N=F zF^W}8nm!d=Kosh;c8=&ntam9P0D~5&-tRL#e3+pojen24$G;s}1RN2plY-vju9XCZ z7sl(K8GdTKflTw)v|RsdqM$@&>-zUnxc&~{e{Nj(r!c(6;Pv8ob{ri?c)fBE!v9Bn z0^EuqIcR_)96$eV{hR;I$o;Q6CI0CP4OGWAif#IrWkgZm&$Q4?LlA zl~`oS6tC$;4Ywr47@1>cP45Jb`0zX}^e{%#=BmQY+xNLToW>|?h+$C7*Q`c{denjZlc4ap@CzJoA0$$F*z#mids6+%AX_2#LTVyb8 zf*fHil0GGYv0A=(9&5Y{tB<_-6mBd%{npq(4|7(aDO!$3#__sf*59~d77<*@1xsT?Y(xKe0j;cZA-j+77_5%TPEl& z=Nbi$Joz)~-;P29GTAJ?do$Xv0EG6i^cNAcRz*g z06Nd}2x&S!67;!;P8)W;fZgz^ah~~bLodsSvJumg)css52lneKFYl{}GtRSskgz)zvgai+pMkwz94N6+Po51bx)k6Bl0DxT zQgnUHVREjm5mGupoie33ZoXh3jM<^Rw4bhCw-9O$S?`rLIhVf`=TpFUQ4mGZhpNZ} z{QkH%fq_M=fxn6)aG$>@4f+k1a15B%6E;t7ZaA?E3K*TDuF>K`D)SzJ)8TfkJ96la zJV@YtCG;LWl%}OjArFo-4l$L~fB3;>6u$#}Bo#WzDc>q4j|oV+`|SS)n`7_^DOa)f zTFY0R+qM_EY)a4FajF7R{@~wr5&7s6Bgzt#MGhWZ4^+F`$_yNCV$>buawc!{sq3)7 zD~U7zV7$S6v*i8AUl?m4cLBpm-Pd z?>)ZYxKCu&gHQ1ZItG_dV7w*IFC^M7kK`zHYX7l8hcw~ZpJf*Wma zttkjH>^b=)G3%T%@;@|cKS5WfxE#916DkwtBC4d2vD7*bB`A!GAj^V+#S~~-tY_nB zohr+%6Ouk|0vN3KYr>J%*!5KRStn<=%wGWupDe6_Ib4rVWu0=-hr`#fY=09ljX#kR zD-7qMc3_59>a95R%u{V-SJe4;$p8Eje-ALGNB6W`G5T5!?_Q>FNW!pI&mg1VoY8uc zj~1QW7*EI=VvIK6q~h9z+Sft#`5}rFM2^Pa$(M7z=i}d>xel zfhd?FM&IaT{pVjI1IA@e|o7RTc18bFkQiz5{`HETp*_B|D{`Ej1isTh*`p8wfrO*gI`qo0<8r8g4{g_ z0XcYx>8m|q+_#Qd>(SA8ByVFpaUlZm34Xe)9ej4Kj|4&d-+Qpi$3E4ugGX(#Qkct2 zh%EYi9F4&f%|B+Kv~2%ProPPv`Raf5B$xlmz=O`RnEy8R>_HZ9??t^^wb}fickcdE z?7+bSmQ$WJ>u`SDw(VgO3&4W~-;M{^%H#z%knSck_dc`_Qn^!{sH3eBh~;S$Htl z@nT2z_qN9{#{^5`W|OqSAWfr!WeIkuC^<_?Vb>IoU<#cP6y|mv%XW4Dy3l+?7~l8N zNpLP&?i}R;rmbmWv^92JkUYv52K^dzgdWyKi@UbtR-YR-=+=U2T~CKzdt`#|*lT33 zU_8ZZvqV3WJ@yqC8tba3E3Smo28+e}>4-fok)m8fPLBUhNn*$ite%#cltVtD6rZTA zY;C}Ysfm!pAP>1g#TX#qJf)>R`4YJ^WP|>gG3t1MfPudJ^8R0+7Fc%(*YwkdwbDLpE4d$l(e*6V~`EF`O`c zBi!FVb_R7Q-Y9ibrxhg~Z!CV|mS;ABHa*$L`VzhG>E`tm^7--Np~7r@6=qC=ei&nY zW2)scgTZ~@k_Rr`4%M4(qtyP=PG4u3@k}yK8*v&xKRyvWiFuizxm3x1%H6>CaA8sm z|7|!GekDd~W7~PuJrTQOHuQCK<7uvl<~!7(lLksik_l$Newuwki2kc9Q(Bp?Q8VdJQkbNW80kN5%=zH8AD^y3um#W#b|kxH;%$Lds9Z+LO#Pps|Oy zrzmPEpQcpf-D%Ys#PaCqV{~44Fo|W_`yn{MAqiLfY#t$kp;k}hi=gGhz5B*>J*Zbd zV1}FVax_|j*TU3eg)09$#5H4zCq`ZG`!o#n&ixJAiGvLBA)k+Bsv`7?uQ?kpxn}=i zBPme*%w{dANCBd5t2}z#2^$&TeH7j6h=oUTcwjq~K1;n3aM3Odvd!fP)|QIdkbBC2 zxS82o-vb{wG$9q!u1&d(*(7MmMO>3c>tKI4z91AgITy?<*!DI*F5PhHGXz;vtY<<3 z$HRfESYga^48gsi(-!MRc7BOG-pK_!eJJ=6S(qP!#XKIKMwRUy`!RuP%M{AW^+Rz9#eDfX!0!-DNWd!SQG|;oHAO3Mb!L+T zpI^4H8sIJ_4UwwlSoiqP;_=*Vyumnm+E{wR8YvHCQjG-{RyQ?2db zUQSdG_!hHKt2CNT#>d(ftNzrJNT_8$6Sa?Qy=sl2Rx;y;>22gKqKjoe|CT50`NzEr zLhPpC+{IMUUt-!ZwWBZLUNX0)iL02if`^4Z3rIejeMuB1o#GGHK9<3KErKoA!`8&5 zp^h&_tzO$NR{-Ze#raS<58|%L%OWHX;B-^*0W?<&Kik^G< zyqVl!q?Ubh>x4h=l}SdNyLhz4Tpl9gr_M=CL~{UU(oP$DPki_POBI+m(3kN#RUc_S4GdxZ zL~Bp}(hXYGkCQ_xE>L*UIg<<(+l8nDlZ_&yxAx5W@~1n&F+}aKiaPLgsZxg8@zZVa 
zgeQv`aSYLR;maU?o6RXflV=+yuxxAJx$9{KTI2f^8@ZwmGb>4p^>0WoWY=ZYw)+X; zF`nrJ9-8WRUkiqnWNRRbp7S0^#HetU^=Y+da%A%Bx|FTPo?J^yq9=r4HElRvoFh1_ zlX!$)S`j8ab#uY@*}g6Oo}x^JI`a8=(Sv#lxDMO3#~Fh!F7VOQhEo55mo!F9$9m7?BMK~-05r10%e)m**-fS7kb zJ;K-5<9sk#4XznW7L<6I#~W|ji1}@vILkKbnGkaS_^O`4%Jvsh4#fl?L6#dEA0G`G`fNESf~-353MDe=~UIxtP4Kb!fYEsYvO6p z;IhSg1to7T$hR7^eLIygaY)8T{rKfT04gIv#wr&4>{owgTirX0zmE0VmwTmx>eT?5 zM8*-0pk)_o)$?fP|=rz&l(eJnBd2M(ndjaQ#q}lmt8Riv5nl zY@_!oSN%WpNtcDAwivmOSdD25;fl6~3E@|{nAI}##n>X2 zwAR5r*wd(aJ+Vz!9uT~W2FmMjyk^d8N7-iLA;g|``C&po;syj9;s~*I^HT;gGuuuH z#xMf%nARo73IJ1pDz)*zoTPm3*qN4nj2Tkw6#0v-xYMi)WU@Q^9<)UIq z@x^=V_|?$m}OXZ=HJc@(&x-PF_)^t^ZGZ-x=1_ z)~y>rL{Y#(QA9yV=uN|>NL6WzqJo5uAYwF9m8K#Pu%Sp>2$-Ov7!(511tFARLo~=X zB0YeBxM@K}2oixi7wUKR-sjujx9>Ukx#zjhbNQun%~dkjobMdt9q$;6!4Te7mdWrl z`DPTpm_&sK5 zcG~Hl-A_i`DqYR!jF4Smf$0Y2xt|+60kvC=ve{!Vj!B=-4yKDw`+9gsfS3VHnNG5` zoSg+OXByV~O52`ISwBJxdq#Wg292)YC}wpO&ZqVH6F+}wwn9P+1<#HCCF}NPAdoMT zdE_t~d*sQrb=|TD&JAipfmU;6ln7Sjp^l|B;eH`1H8#gig4B5jHcBo3r7nfG*ZLsh zM+%&6$Gp87a+b1*;g>M8;QpQ+W%^qhk75Rxl@^!hjquX`?(2{_C{v;#b|!!~c^GW{ zdu$FOIF7sU-Qg0GI}B*|J{ug;h)PbhuM!;KAoJiCow5SWL}2d)X>Kpey4IPjJtd7R5j)1`|&A2 zwG6QpR2~Wg^UwsRxr?b4o+U%@SK>-2&oVcCUl)fUk!mhJtX$%JW3gH;y9)$(lr8A&lvYosDwL(X!gH`$)yaN6 zREDwL^X}7o_VoG*oigStTd=Nj9eU>ZuwX?!o}1j|!%=h|@_Rse(_@%}jYm0LL5!9@ z0)8G;abs4&71qZzMz9bEBGa=G0Bt(XA>rH18zg$ntY1gV6Vhpc38mVoCTrlI_0;ZT z&Mwi0TmbQq>ma)u-5kfdJXwTMYG>77$u>gzoxgUE+*tw!LaQu~`+9OCKI{TT`kD5P zHQm}l(Z0Y9%pFR%+M(94uoc*n%2f`D#Q9xJU%0osap5ugMv>Rso1cHOn*8n0!6WH* zIB(?U=Y1I~_?Xm2-l7I+J5ta8>ZJ#s|ki*wz*|l2wIr*ej8{T90mG%ve@4Sy@*qlu-iIU3|dBMT2UZ+soSEo1vVq zVyD33BvPNT_E((vg&36cc^6~Q{;dx9O(@D8!Bt*n@%|6FZcl+NmWxm>ym)Pk2Er!3 zo?6YYdQ~<%a=*JIk(;0ONXm zLt4fDMCO9e(f;H3oAF<_yOUh53N(zc1s zBYVFXyUvUm3cH$ntRnJdz`*m)1T`UDZ(LsTKd)h6vZ=>1aO8zj90JlmJWzS->JEma zxyK43-+sgMk)CoRw%APyBL~9!u01-mIr-^)_tl44hG)vr1SIfgq#iMQ^!=3$cOJZX zrtr2xnX_^F(+DWqaQ!do!IVgC*Sp81U+w8B{9PDD<&u#~af_X_hGqO>Z~loakSK`u zTVHAmDs;N~;FGN8Y?y7c=4>Oi@khWqQReFygO6EN?_>($E6gJnZwG}N%brL0DXG<4 zlxDbf@NH{Qo-)lHbj%Q3K9ti63v$Ba3C zrkXSzUv7eeAQvD|SdHbz$G-oqC^h6Z1z*BC4I6V3N`-KZl8@f3)O0!QA+TavtR)>@8EO3R_zp!@!e2D7 z`h~p50Iud<2}m#B#k7N??s;eBdmqSX#g1nKaszC~QdCGH=gm&9hC-s z*fJ-K8ZL-ca*&Ht0x14|<`7g#A<+d+&F^&6nynCUPx5+#U~D*o=L%^lHkoo!<=yq%MktKL*Sj9`|ZzIt|-X2Z^`p= z5+p!{Y$fvt_M;tE4p?@B&A{t$+x2eoz|I|B0h|msD;WUrqBs1;3mp>KhfbubFS3GFm1V6Wx04$aufntc=lwzZA zbyY3x19AH5m(tk&2O4blMquKvTvD?)QJ?HzlWBF@qv1of9EcE@ zc;`v+(DvHjrC>3j-rBU1fjD>@J6|$}#}7GGP=w)9^bP3)W~dniRsg5d%!5-bla;qipMJ z>PnNZ;5ARY?hQ^GtfJiuQTk?z#0X?u>=4AQrSw6UJ$If?ik)Hy3f3#5-lx6zh_yW| zpb2huwSl32HKdjjZB^~Y{MJ!;2kjv@avJM4%8pLVHAO*IohzJoC`$6UwHAe5ngCHN zYq-r=d+WQwvfu0E!2V(68wTMLJlybzAe3@5i zpt^Vxt5!3>GZb1>P@qmJ8Q*5f#K66)YQdm5ro;z{ zs#TZhvjHxlgc*~VT6My0lA$VLryti2R^O+|8x~*+{OF6o*kLckn{p7NE4pW14Y6L) zkW=Htf<5yqdm;Lr)ACz2vb5#x3z}tvpa^>_1bB2%#7=c2AUqoL0^u0#n^>OP%Q7(2 z$syO!spY23dRF9K%2OUoSA^~tLaV`FjzE3S*q*Y4yQKLZR@UWGT>$zHm-+8UKMg=J z_knJI41sumkFcaUc4_RErp-KE4$-m;m*N=fp!~JPvUo?^ohSX!1PYLyMXe_+say74 zC|~Ub-?f$$=SJS4U|I8lpArB+$H)dxh>}SWya6H;_1LnzVx1q1+*R`KQMST(D>C2^ zMczF`x)k>*&q)<`qt$fEB$~$^urG7_7cdZe;XG~eCCl>!C(D|Q8CmFPeKIgHs!*Mi ztqp#{Xx$6Y^AYYg z@c9A)ROG6oB`Z z5q4)vgk+}ePjA)~VVvNUVSY&qIZrWzqqdF+Vci^FsrRLXWXH;IswrGCgNs|)MtP3} z5u*cbKmAmjc=TM@R(!Zd5QmGoLG2g02pw~KWh7t46=Buhra2|eOZBrga&Gw`8@L?5 zrypOOi6XEvOBv&;SZ*dI2ab{-af8Ay9$&JGAT#7qB$aB{l##$pi2>phss^DlHn;+8 zu%)!gJ@1!DaBX}#sOZg~RWfJwV&BK7Gv$h7yvf%EL98iVZp45;T6BjJkTjK0^f88` z?0n_9YhZ8%^uaI*3yVnt36>sYoOV@-5FZ=py#>_rSNRO48Vbh)o=za^@)jO_95X_& z?$W6jEFGtyMdT1Ka_HV=VXqrKkvsW0AXK|g?1nOO!5zbFv38Qn1&SL+Etq4BIm%Ii 
z)G!{CVlF0nB%s1LhXKD_>mVLRhgT*iwx?)R#GKu0rm;KvCdC&$pMNE6(t7TGY^vpw zKLg4oUL<*tb5lt3FBk!3+2>Z=vd{l*ZE*$$7tY~<%5NwUDrq8x#k?;rA^Oqef)(cU zR~cG3`@P{=nYTPd_&oze!Z`|$bA{*&$a!1f1J;!`HIJ$ra}?cOc}lfZs6OV zF#=LR2y(zV0u1eV0x}}|&a3a%fbeaO%+=5YrYiQ`R02F|MO6U2I~;PErjhsS*^Kp| zk~Xp}7O##}gQTsFA>kCj4wQOxyDjH(D1DZ$%6XZzK#@DzC~nasU=Wd9Y#|Bj9lM!! zg0b!9(3IF2{i$SHVE#wGjP>8Szj2hfKE4Z?a(YTDy+*PXiG5{#0P3xO z7DFZmt)+ub_C+zVZv~6eLrN*u=!s;lH;h+_rJAUBKtGKCeV}INIPW>oJq@PSkF%W~ za`&?rF3^lM2*s2lv2)u%x_K9Xb9>v zSXTe#;b z8s6c^9)xCr@U_zWRd|%ZZ6HBQMfbHtBE!-AkI?ly%gVLIp_ezkzuc5K@9uf#kW1e_ z(#9Bowx*95HuixKzo5||Dwmh3MvRmtfGB4Crn{4fd@Pm>F0^iYnF3lZL(7Tj3fW?A zXyhePJR1Sj?&GI)i$3{)dFuqK84Z|Qx;_z%1C~5|+JborzH52fgE=o~bW!Ly2LiEf z8TS!wUhVzNK8&FsUI=(uA1CZIpxsMqP%lsKt?{?`Womhy$H|`etg6+*@-+xCv8%B4 zq6Z1!2~hj)B18QnzaD9j@&A!fj!Lfhduobx!JJ&*xGSV8H12&nxB|M7fri)tYcqId z5|%O={s(nKf33(}5{%VMf;E;ZDZ)eqm41IHHS~id$j9IPuj_{Xz@Pq` zyeYStv0Hh`!ucU{DqcfJP6Sdn9sbvhS8?;FZN@*W1e+iso>srk&ILILrUIhcOr&K= zbF19#Gxw>0b&MjOu%rRsRUA+k;IG(N_q&{X*H2mB&kO%a#q#s;HJXLTZl6D79HxPf z3EF#hLwEpiHE2^6e^pz#k%C2q%;l2s7e`nv`Wd_V}ee&Ek5I)|m_h5ReI@c(Tv z%1zn}Dv-UM`>%18-vap7Rqzn!|K^%O1CbbMb0EG9`Tj?D0N$73mq-%2^7{#g_)#Gb zD5)3^S;9pirw9vMs>FiGsYiDJAFh;=5g&*JTc8?l&9j@Av(=H8U};{RK%98WT7~gX zE?4^eRzUK_VN2-Fz zwbnuHN^u}Sr$mSY>Y2Nloh^6NmV}M#tX~2h?cy$`L{KgmsTU?^Y=v_TM@-c&Yg6b};`W^jw|ONyI*fgNMUTw2N7J*e31wmY^xdn$lgJ6Srj0$Xy|67d$BQdK?p`!~dK;wPfGNS@R@6`J&%pD?Jm(zXWxb>1>O^$1g9TFq6S)zH2C`EFc>Hb*m{)n19x`s1=yPrp<-y7o zgI{*_1g9D9jq5w0wZD&_^(O6 zf4yq{vuGLUSMhi?hF>hH)*qkQ%xEqMZ~d9Yg(QHfjC_E#txp>cxFTbE%bhu~!z-{V z0trj*#|T}jK=?eAH`L#@D0I0omV=Z_3T1U%zhN=Zq98h=JrO5Sf*(@#5&Edwz zGr>zafxJlb`{XcyqiR3*^q}uD+;e{?@CbZ|f0C%8@-7IH`kzfyF^@bmSI}Xk5X#O) zAhsg_pzVuEZIiE4c+6YJb&-xew`1d#mvZ!=;3 z!-)KoUpv1z!!~p+EUg)w&S*2dJ%*%a-|>t?{*Ujd>t?T&=Zj zRqorv#m6$d$Algn%U@xw6oOM{G{3=co564>E$9)yF{!SvOrk7g(W3Or zx3sd+RVNwuPcm}2#wWUr;Bq}Fz@R`dCDEA(!Dz_dLX{5wi_Vi1_X7WgTi34AMA}E6 z*C^9r1H-1TP@8;}T2m41erx1#FCu&Hw1ZlVLrO{qxz5bhxON>FD7hXI+5H| zQ!V0*yRqtecVD{C#-Lwpx6NeTiL*v7jz3hFK%7sfR0;SW7-q66p_1-8){VPL@aT4cTwQ71I{?l4dYjRc4zEFjxj@q$1rh?JbQVB zd0+9pvizL=`T}w_=uKvZoEPqap!3@(T7gI0;BIJ75!E(wUfeePI{KGzJ1gX5R&)E> z;+=iBU9zJZ?F3|z^Y)WC{LV1SnkYn{mJ>1uE%4oKMd~|kJ`=ozf;JWlcWl(!drMjwC+I(vys?m4cY*LCgM2_qC)RYu#oEo|tL+S68s{;YlCrg^ZGiy3mUmLrk(LUGr)cU;7OASETKy|ieX`IB3NECfHgr4ss~>E5 z;ma?^a>+v@(5tI;>Tjld@~RNAsRlrB9pWw@cw367*zO9H1W&iq+kiV)0@q6qx1nYN=b`@<^aA;@*yUJTp{+d+Mo$n~u6Ia~E=V#$BxNA3kyg?P8L8 zC!ad%;-9G}$u}YJC6G4*6@iMUj(&>dHou80KmXQ=xSKj!K$s(eU+%wd(JFXF@mgJ4 z`#xK-Ii-F(C`=mXl}hDgwTHk4D^6|?Iu^Z1os%sV*9~FIUiUlu)Tz6I=b_+9&1+f5 z9dO~ynsye?@KMG$zUKNH(NaAJZ-U7Jm*>crpdwZmdq2OppfWe(`|Zo;&yTlqqll-* z9N^7_sNvc(*Rr-H`(&b5!3LU5E9u^3Yl{Y{g!5<`Zy;aded?x*A=~y2zPkglZA!N_ zeSpV1D_f+XF4rsu{=VUu?ICt>@KJMGFVA>)UnKQ0!|o&;wb#a*C(Q8i;*QYznd2RvYNk863-vc*LZW5-`VxpG^jJOq^ z;R#Bb;dX28J#>#6l`_Oj#tAoo5!sb`a^*l8Jz#hGr3u!XqaDAnBx_Z=8xOX(O~#zc z42F7_*R0enq#9`!cYnKbZ}L}FyL!3dqpe$3t5THcZ5;oI*F!#r#iCPMg%$5~w&ZHLRKm}jI=nzfJfZ!m$msL5pqOt6v_ zyeGRubUa#nthT?^7$8%-a*FzR=qu@i#7tt@QT0o1#_ikoP-plCu=hIrqn??-hhiQR z(rrBD8P;AKpu(_0aNe}K-??s6Ww z&bPEp$o!oa1ZjRQN4ZXLxJ%1 z)S)9#z}`~zCKqUn{_MBjp1bHI>(4o89fd0OrU+Qo^7F-scHh*V^dVm;Y?JP@6iB(L z4oa7Ic~i$0gM>3DAo)|K@cvZy%%OIFKLz>Dfb^Ed`+8or6=)Hg6Rr^>!)geypM1V; z3p{iMy(zWlnvV;7(RPZ=m9LU%x-j!0*bF|jeD3Lr@PlSk=LEa&wSSu^Z0+SkiIyN2s4h52H7k@L2F9ElZ?QQ@IC%^@^==6U_Ds zg`lm)w+-7|`;XIK5?>OJIKJJCz0N5fR1a)9C>ONtqfr0e7$?4bVxz-b5C1+h`g{6r zVlDwuRM&1yoFAqfzxl__iSVpjE}vL7E?LQcp*A2Of;*iQr8tqJ~0Yymfug zrk_x&PgMs4)52{h4wOWzYJT`y6jgs$=~xu+8QzopwrjNt7Q=RTh4BRl7#aLpS8d4l z6e;&y?#KJ;?)+K1?geVa;zFdqMTNgF@qFaOs_u$S#E9t!8Ul_YHNBh|X})NKU!yD| 
Date: Tue, 7 Mar 2023 15:07:48 +0100
Subject: [PATCH 021/108] Update notebooks

---
 notebooks/03_categorical_pipeline.ipynb            |  5 +++--
 notebooks/ensemble_adaboost.ipynb                  |  4 ++--
 notebooks/ensemble_bagging.ipynb                   |  6 +++---
 notebooks/ensemble_ex_01.ipynb                     |  2 +-
 notebooks/ensemble_introduction.ipynb              |  4 ++--
 notebooks/ensemble_random_forest.ipynb             |  4 ++--
 notebooks/ensemble_sol_01.ipynb                    |  6 +++---
 notebooks/logistic_regression.ipynb                | 12 +-----------
 notebooks/logistic_regression_non_linear.ipynb     | 10 ----------
 notebooks/parameter_tuning_ex_02.ipynb             |  6 +++---
 notebooks/parameter_tuning_grid_search.ipynb       | 11 -----------
 notebooks/parameter_tuning_nested.ipynb            | 12 ------------
 notebooks/parameter_tuning_randomized_search.ipynb | 11 -----------
 notebooks/parameter_tuning_sol_02.ipynb            | 12 ++++++------
 14 files changed, 26 insertions(+), 79 deletions(-)

diff --git a/notebooks/03_categorical_pipeline.ipynb b/notebooks/03_categorical_pipeline.ipynb
index 155670ac8..ac940c898 100644
--- a/notebooks/03_categorical_pipeline.ipynb
+++ b/notebooks/03_categorical_pipeline.ipynb
@@ -425,8 +425,9 @@
     "- the original categories (before encoding) have an ordering;\n",
     "- the encoded categories follow the same ordering as the original\n",
     "  categories.\n",
-    "The **next exercise** highlights the issue of misusing `OrdinalEncoder` with\n",
-    "a linear model.\n",
+    "\n",
+    "The **next exercise** shows what can happen when using an `OrdinalEncoder`\n",
+    "with a linear model and the conditions above are not met.\n",
     "\n",
     "One-hot encoding categorical variables with high cardinality can cause \n",
     "computational inefficiency in tree-based models. 
Because of this, it is not recommended\n", diff --git a/notebooks/ensemble_adaboost.ipynb b/notebooks/ensemble_adaboost.ipynb index 397ff1c01..4fae9cb45 100644 --- a/notebooks/ensemble_adaboost.ipynb +++ b/notebooks/ensemble_adaboost.ipynb @@ -247,8 +247,8 @@ "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "\n", - "base_estimator = DecisionTreeClassifier(max_depth=3, random_state=0)\n", - "adaboost = AdaBoostClassifier(base_estimator=base_estimator,\n", + "estimator = DecisionTreeClassifier(max_depth=3, random_state=0)\n", + "adaboost = AdaBoostClassifier(estimator=estimator,\n", " n_estimators=3, algorithm=\"SAMME\",\n", " random_state=0)\n", "adaboost.fit(data, target)" diff --git a/notebooks/ensemble_bagging.ipynb b/notebooks/ensemble_bagging.ipynb index 1f611e1f1..bd43a80cb 100644 --- a/notebooks/ensemble_bagging.ipynb +++ b/notebooks/ensemble_bagging.ipynb @@ -344,7 +344,7 @@ "from sklearn.ensemble import BaggingRegressor\n", "\n", "bagged_trees = BaggingRegressor(\n", - " base_estimator=DecisionTreeRegressor(max_depth=3),\n", + " estimator=DecisionTreeRegressor(max_depth=3),\n", " n_estimators=100,\n", ")\n", "_ = bagged_trees.fit(data_train, target_train)" @@ -469,7 +469,7 @@ "base models.\n", "\n", "The ensemble itself is simply built by passing the resulting pipeline as the\n", - "`base_estimator` parameter of the `BaggingRegressor` class:" + "`estimator` parameter of the `BaggingRegressor` class:" ] }, { @@ -481,7 +481,7 @@ "outputs": [], "source": [ "bagging = BaggingRegressor(\n", - " base_estimator=polynomial_regressor,\n", + " estimator=polynomial_regressor,\n", " n_estimators=100,\n", " random_state=0,\n", ")\n", diff --git a/notebooks/ensemble_ex_01.ipynb b/notebooks/ensemble_ex_01.ipynb index ce9cfecbc..aac64f1ac 100644 --- a/notebooks/ensemble_ex_01.ipynb +++ b/notebooks/ensemble_ex_01.ipynb @@ -44,7 +44,7 @@ "metadata": {}, "source": [ "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor`\n", - "to its parameter `base_estimator`. Train the regressor and evaluate its\n", + "to its parameter `estimator`. Train the regressor and evaluate its\n", "generalization performance on the testing set using the mean absolute error." ] }, diff --git a/notebooks/ensemble_introduction.ipynb b/notebooks/ensemble_introduction.ipynb index 153cb165a..2abcee2e3 100644 --- a/notebooks/ensemble_introduction.ipynb +++ b/notebooks/ensemble_introduction.ipynb @@ -145,9 +145,9 @@ "%%time\n", "from sklearn.ensemble import BaggingRegressor\n", "\n", - "base_estimator = DecisionTreeRegressor(random_state=0)\n", + "estimator = DecisionTreeRegressor(random_state=0)\n", "bagging_regressor = BaggingRegressor(\n", - " base_estimator=base_estimator, n_estimators=20, random_state=0)\n", + " estimator=estimator, n_estimators=20, random_state=0)\n", "\n", "cv_results = cross_validate(bagging_regressor, data, target, n_jobs=2)\n", "scores = cv_results[\"test_score\"]\n", diff --git a/notebooks/ensemble_random_forest.ipynb b/notebooks/ensemble_random_forest.ipynb index 8861b7481..4da0667c2 100644 --- a/notebooks/ensemble_random_forest.ipynb +++ b/notebooks/ensemble_random_forest.ipynb @@ -149,7 +149,7 @@ "bagged_trees = make_pipeline(\n", " preprocessor,\n", " BaggingClassifier(\n", - " base_estimator=DecisionTreeClassifier(random_state=0),\n", + " estimator=DecisionTreeClassifier(random_state=0),\n", " n_estimators=50, n_jobs=2, random_state=0,\n", " )\n", ")" @@ -176,7 +176,7 @@ "better than the performance of a single tree.\n", "\n", "Now, we will use a random forest. 
You will observe that we do not need to\n", - "specify any `base_estimator` because the estimator is forced to be a decision\n", + "specify any `estimator` because the estimator is forced to be a decision\n", "tree. Thus, we just specify the desired number of trees in the forest." ] }, diff --git a/notebooks/ensemble_sol_01.ipynb b/notebooks/ensemble_sol_01.ipynb index 9d26700ac..91f547842 100644 --- a/notebooks/ensemble_sol_01.ipynb +++ b/notebooks/ensemble_sol_01.ipynb @@ -44,7 +44,7 @@ "metadata": {}, "source": [ "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor`\n", - "to its parameter `base_estimator`. Train the regressor and evaluate its\n", + "to its parameter `estimator`. Train the regressor and evaluate its\n", "generalization performance on the testing set using the mean absolute error." ] }, @@ -60,7 +60,7 @@ "from sklearn.ensemble import BaggingRegressor\n", "\n", "tree = DecisionTreeRegressor()\n", - "bagging = BaggingRegressor(base_estimator=tree, n_jobs=2)\n", + "bagging = BaggingRegressor(estimator=tree, n_jobs=2)\n", "bagging.fit(data_train, target_train)\n", "target_predicted = bagging.predict(data_test)\n", "print(f\"Basic mean absolute error of the bagging regressor:\\n\"\n", @@ -112,7 +112,7 @@ " \"n_estimators\": randint(10, 30),\n", " \"max_samples\": [0.5, 0.8, 1.0],\n", " \"max_features\": [0.5, 0.8, 1.0],\n", - " \"base_estimator__max_depth\": randint(3, 10),\n", + " \"estimator__max_depth\": randint(3, 10),\n", "}\n", "search = RandomizedSearchCV(\n", " bagging, param_grid, n_iter=20, scoring=\"neg_mean_absolute_error\"\n", diff --git a/notebooks/logistic_regression.ipynb b/notebooks/logistic_regression.ipynb index 57c464969..fa30beced 100644 --- a/notebooks/logistic_regression.ipynb +++ b/notebooks/logistic_regression.ipynb @@ -108,16 +108,6 @@ "algorithm." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sklearn\n", - "sklearn.set_config(display=\"diagram\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -129,7 +119,7 @@ "from sklearn.linear_model import LogisticRegression\n", "\n", "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"none\")\n", + " StandardScaler(), LogisticRegression(penalty=None)\n", ")\n", "logistic_regression.fit(data_train, target_train)\n", "accuracy = logistic_regression.score(data_test, target_test)\n", diff --git a/notebooks/logistic_regression_non_linear.ipynb b/notebooks/logistic_regression_non_linear.ipynb index 2f08c6851..dae0e199b 100644 --- a/notebooks/logistic_regression_non_linear.ipynb +++ b/notebooks/logistic_regression_non_linear.ipynb @@ -78,16 +78,6 @@ "a linear support vector machine classifier." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sklearn\n", - "sklearn.set_config(display=\"diagram\")" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/parameter_tuning_ex_02.ipynb b/notebooks/parameter_tuning_ex_02.ipynb index 16a849ae1..b6b18b96a 100644 --- a/notebooks/parameter_tuning_ex_02.ipynb +++ b/notebooks/parameter_tuning_ex_02.ipynb @@ -92,9 +92,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Now use the test set to score the model using the best parameters\n", - "that we found using cross-validation in the training set." + "Now use the test set to score the model using the best parameters that we\n", + "found using cross-validation. 
You will have to refit the model over the full\n", + "training set." ] }, { diff --git a/notebooks/parameter_tuning_grid_search.ipynb b/notebooks/parameter_tuning_grid_search.ipynb index 4c9e56fa6..ec9372599 100644 --- a/notebooks/parameter_tuning_grid_search.ipynb +++ b/notebooks/parameter_tuning_grid_search.ipynb @@ -20,17 +20,6 @@ "Let us reload the dataset as we did previously:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import set_config\n", - "\n", - "set_config(display=\"diagram\")" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/parameter_tuning_nested.ipynb b/notebooks/parameter_tuning_nested.ipynb index 5b72895e9..2d7d02637 100644 --- a/notebooks/parameter_tuning_nested.ipynb +++ b/notebooks/parameter_tuning_nested.ipynb @@ -49,18 +49,6 @@ "this pipeline is identical to the one we used in the previous notebook." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import set_config\n", - "\n", - "# To get a diagram visualization of the pipeline\n", - "set_config(display=\"diagram\")" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/parameter_tuning_randomized_search.ipynb b/notebooks/parameter_tuning_randomized_search.ipynb index 62e66ecdc..14d344ee8 100644 --- a/notebooks/parameter_tuning_randomized_search.ipynb +++ b/notebooks/parameter_tuning_randomized_search.ipynb @@ -27,17 +27,6 @@ "Let us reload the dataset as we did previously:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import set_config\n", - "\n", - "set_config(display=\"diagram\")" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/parameter_tuning_sol_02.ipynb b/notebooks/parameter_tuning_sol_02.ipynb index 6b4e25597..67158f8fc 100644 --- a/notebooks/parameter_tuning_sol_02.ipynb +++ b/notebooks/parameter_tuning_sol_02.ipynb @@ -106,7 +106,7 @@ " print(f\"score: {mean_score:.3f}\")\n", " if mean_score > best_score:\n", " best_score = mean_score\n", - " best_params = {'learning-rate': lr, 'max leaf nodes': mln}\n", + " best_params = {'learning_rate': lr, 'max_leaf_nodes': mln}\n", " print(f\"Found new best model with score {best_score:.3f}!\")\n", "\n", "print(f\"The best accuracy obtained is {best_score:.3f}\")\n", @@ -117,9 +117,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Now use the test set to score the model using the best parameters\n", - "that we found using cross-validation in the training set." + "Now use the test set to score the model using the best parameters that we\n", + "found using cross-validation. You will have to refit the model over the full\n", + "training set." 
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# solution\n",
-    "best_lr = best_params['learning-rate']\n",
-    "best_mln = best_params['max leaf nodes']\n",
+    "best_lr = best_params['learning_rate']\n",
+    "best_mln = best_params['max_leaf_nodes']\n",
     "\n",
     "model.set_params(classifier__learning_rate=best_lr,\n",
     "                 classifier__max_leaf_nodes=best_mln)\n",
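PATCH 021 ports the notebooks to the scikit-learn 1.2 API: `base_estimator`
becomes `estimator` in the bagging and AdaBoost ensembles, and
`LogisticRegression(penalty="none")` becomes `penalty=None`. A minimal
sketch of the renamed ensemble parameter, assuming scikit-learn >= 1.2 and
using toy data rather than the MOOC's datasets:

import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 1))
y = np.sin(4 * X[:, 0]) + rng.normal(scale=0.1, size=200)

# scikit-learn >= 1.2: the base model is passed as `estimator`;
# `base_estimator` was deprecated by that release and removed later.
bagging = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    random_state=0,
)
bagging.fit(X, y)
print(f"R2 on training data: {bagging.score(X, y):.3f}")

Note that nested hyperparameter names follow the rename as well, which is
why the randomized-search grid in ensemble_sol_01.ipynb now uses
"estimator__max_depth" instead of "base_estimator__max_depth".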
From 11552394a3f726fc9d681fc1a43ab3b4c60bba97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Thu, 9 Mar 2023 12:13:32 +0100
Subject: [PATCH 022/108] Add full-index.ipynb

---
 full-index.ipynb | 281 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 281 insertions(+)
 create mode 100644 full-index.ipynb

diff --git a/full-index.ipynb b/full-index.ipynb
new file mode 100644
index 000000000..bc885df14
--- /dev/null
+++ b/full-index.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a4dde4d5",
+   "metadata": {},
+   "source": [
+    "# Machine Learning Concepts\n",
+    "\n",
+    "[🎥 Introducing machine-learning concepts](https://inria.github.io/scikit-learn-mooc/ml_concepts/slides.html)\n",
+    "\n",
+    "[✅ Quiz Intro.01](https://inria.github.io/scikit-learn-mooc/ml_concepts/quiz_intro_01.html)\n",
+    "\n",
+    "# The predictive modeling pipeline\n",
+    "\n",
+    "[Module overview](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/predictive_modeling_module_intro.html)\n",
+    "\n",
+    "### Tabular data exploration\n",
+    "\n",
+    "* [First look at our dataset](notebooks/01_tabular_data_exploration.ipynb)\n",
+    "* [📝 Exercise M1.01](notebooks/01_tabular_data_exploration_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M1.01](notebooks/01_tabular_data_exploration_sol_01.ipynb)\n",
+    "* [✅ Quiz M1.01](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.html)\n",
+    "\n",
+    "### Fitting a scikit-learn model on numerical data\n",
+    "\n",
+    "* [First model with scikit-learn](notebooks/02_numerical_pipeline_introduction.ipynb)\n",
+    "* [📝 Exercise M1.02](notebooks/02_numerical_pipeline_ex_00.ipynb)\n",
+    "* [📃 Solution for Exercise M1.02](notebooks/02_numerical_pipeline_sol_00.ipynb)\n",
+    "* [Working with numerical data](notebooks/02_numerical_pipeline_hands_on.ipynb)\n",
+    "* [📝 Exercise M1.03](notebooks/02_numerical_pipeline_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M1.03](notebooks/02_numerical_pipeline_sol_01.ipynb)\n",
+    "* [Preprocessing for numerical features](notebooks/02_numerical_pipeline_scaling.ipynb)\n",
+    "* [🎥 Validation of a model](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/02_numerical_pipeline_video_cross_validation.html)\n",
+    "* [Model evaluation using cross-validation](notebooks/02_numerical_pipeline_cross_validation.ipynb)\n",
+    "* [✅ Quiz M1.02](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.html)\n",
+    "\n",
+    "### Handling categorical data\n",
+    "\n",
+    "* [Encoding of categorical variables](notebooks/03_categorical_pipeline.ipynb)\n",
+    "* [📝 Exercise M1.04](notebooks/03_categorical_pipeline_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M1.04](notebooks/03_categorical_pipeline_sol_01.ipynb)\n",
+    "* [Using numerical and categorical variables together](notebooks/03_categorical_pipeline_column_transformer.ipynb)\n",
+    "* [📝 Exercise M1.05](notebooks/03_categorical_pipeline_ex_02.ipynb)\n",
+    "* [📃 Solution for Exercise M1.05](notebooks/03_categorical_pipeline_sol_02.ipynb)\n",
+    "* [🎥 Visualizing scikit-learn pipelines in Jupyter](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_visualization_video.html)\n",
+    "* [Visualizing scikit-learn pipelines in Jupyter](notebooks/03_categorical_pipeline_visualization.ipynb)\n",
+    "* [✅ Quiz M1.03](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_quiz_m1_03.html)\n",
+    "\n",
+    "[🏁 Wrap-up quiz 1](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/wrap_up_quiz.html)\n",
+    "\n",
+    "[Main take-away](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/predictive_modeling_module_take_away.html)\n",
+    "\n",
+    "# Selecting the best model\n",
+    "\n",
+    "[Module overview](https://inria.github.io/scikit-learn-mooc/overfit/overfit_module_intro.html)\n",
+    "\n",
+    "### Overfitting and underfitting\n",
+    "\n",
+    "* [🎥 Overfitting and Underfitting](https://inria.github.io/scikit-learn-mooc/overfit/overfitting_vs_under_fitting_slides.html)\n",
+    "* [Cross-validation framework](notebooks/cross_validation_train_test.ipynb)\n",
+    "* [✅ Quiz M2.01](https://inria.github.io/scikit-learn-mooc/overfit/overfitting_vs_under_fitting_quiz_m2_01.html)\n",
+    "\n",
+    "### Validation and learning curves\n",
+    "\n",
+    "* [🎥 Comparing train and test errors](https://inria.github.io/scikit-learn-mooc/overfit/learning_validation_curves_slides.html)\n",
+    "* [Overfit-generalization-underfit](notebooks/cross_validation_validation_curve.ipynb)\n",
+    "* [Effect of the sample size in cross-validation](notebooks/cross_validation_learning_curve.ipynb)\n",
+    "* [📝 Exercise M2.01](notebooks/cross_validation_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M2.01](notebooks/cross_validation_sol_01.ipynb)\n",
+    "* [✅ Quiz M2.02](https://inria.github.io/scikit-learn-mooc/overfit/learning_validation_curves_quiz_m2_02.html)\n",
+    "\n",
+    "### Bias versus variance trade-off\n",
+    "\n",
+    "* [🎥 Bias versus Variance](https://inria.github.io/scikit-learn-mooc/overfit/bias_vs_variance_slides.html)\n",
+    "* [✅ Quiz M2.03](https://inria.github.io/scikit-learn-mooc/overfit/bias_vs_variance_quiz_m2_03.html)\n",
+    "\n",
+    "[🏁 Wrap-up quiz 2](https://inria.github.io/scikit-learn-mooc/overfit/overfit_wrap_up_quiz.html)\n",
+    "\n",
+    "[Main take-away](https://inria.github.io/scikit-learn-mooc/overfit/overfit_take_away.html)\n",
+    "\n",
+    "# Hyperparameter tuning\n",
+    "\n",
+    "[Module overview](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_module_intro.html)\n",
+    "\n",
+    "### Manual tuning\n",
+    "\n",
+    "* [Set and get hyperparameters in scikit-learn](notebooks/parameter_tuning_manual.ipynb)\n",
+    "* [📝 Exercise M3.01](notebooks/parameter_tuning_ex_02.ipynb)\n",
+    "* [📃 Solution for Exercise M3.01](notebooks/parameter_tuning_sol_02.ipynb)\n",
+    "* [✅ Quiz M3.01](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_manual_quiz_m3_01.html)\n",
+    "\n",
+    "### Automated tuning\n",
+    "\n",
+    "* [Hyperparameter tuning by grid-search](notebooks/parameter_tuning_grid_search.ipynb)\n",
+    "* [Hyperparameter tuning by randomized-search](notebooks/parameter_tuning_randomized_search.ipynb)\n",
+    "* [🎥 Analysis of hyperparameter search results](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_parallel_plot_video.html)\n",
+    "* [Analysis of hyperparameter search results](notebooks/parameter_tuning_parallel_plot.ipynb)\n",
+    "* [Evaluation and hyperparameter tuning](notebooks/parameter_tuning_nested.ipynb)\n",
+    "* [📝 Exercise M3.02](notebooks/parameter_tuning_ex_03.ipynb)\n",
+    "* [📃 Solution for Exercise M3.02](notebooks/parameter_tuning_sol_03.ipynb)\n",
+    "* [✅ Quiz M3.02](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_automated_quiz_m3_02.html)\n",
+    "\n",
+    "[🏁 Wrap-up quiz 3](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_wrap_up_quiz.html)\n",
+    "\n",
+    "[Main take-away](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_module_take_away.html)\n",
+    "\n",
+    "# Linear models\n",
+    "\n",
+    "[Module overview](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_module_intro.html)\n",
+    "\n",
+    "### Intuitions on linear models\n",
+    "\n",
+    "* [🎥 Intuitions on linear models](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_slides.html)\n",
+    "* [✅ Quiz M4.01](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_01.html)\n",
+    "\n",
+    "### Linear regression\n",
+    "\n",
+    "* [Linear regression without scikit-learn](notebooks/linear_regression_without_sklearn.ipynb)\n",
+    "* [📝 Exercise M4.01](notebooks/linear_models_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M4.01](notebooks/linear_models_sol_01.ipynb)\n",
+    "* [Linear regression using scikit-learn](notebooks/linear_regression_in_sklearn.ipynb)\n",
+    "* [✅ Quiz M4.02](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_02.html)\n",
+    "\n",
+    "### Modelling non-linear features-target relationships\n",
+    "\n",
+    "* [📝 Exercise M4.02](notebooks/linear_models_ex_02.ipynb)\n",
+    "* [📃 Solution for Exercise M4.02](notebooks/linear_models_sol_02.ipynb)\n",
+    "* [Linear regression for a non-linear features-target relationship](notebooks/linear_regression_non_linear_link.ipynb)\n",
+    "* [📝 Exercise M4.03](notebooks/linear_models_ex_03.ipynb)\n",
+    "* [📃 Solution for Exercise M4.03](notebooks/linear_models_sol_03.ipynb)\n",
+    "* [✅ Quiz M4.03](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_03.html)\n",
+    "\n",
+    "### Regularization in linear model\n",
+    "\n",
+    "* [🎥 Intuitions on regularized linear models](https://inria.github.io/scikit-learn-mooc/linear_models/regularized_linear_models_slides.html)\n",
+    "* [Regularization of linear regression model](notebooks/linear_models_regularization.ipynb)\n",
+    "* [📝 Exercise M4.04](notebooks/linear_models_ex_04.ipynb)\n",
+    "* [📃 Solution for Exercise M4.04](notebooks/linear_models_sol_04.ipynb)\n",
+    "* [✅ Quiz M4.04](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_04.html)\n",
+    "\n",
+    "### Linear model for classification\n",
+    "\n",
+    "* [Linear model for classification](notebooks/logistic_regression.ipynb)\n",
+    "* [📝 Exercise M4.05](notebooks/linear_models_ex_05.ipynb)\n",
+    "* [📃 Solution for Exercise M4.05](notebooks/linear_models_sol_05.ipynb)\n",
+    "* [Beyond linear separation in classification](notebooks/logistic_regression_non_linear.ipynb)\n",
+    "* [✅ Quiz M4.05](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_05.html)\n",
+    "\n",
+    "[🏁 Wrap-up quiz 4](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_wrap_up_quiz.html)\n",
+    "\n",
+    "[Main take-away](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_module_take_away.html)\n",
+    "\n",
+    "# Decision tree models\n",
+    "\n",
+    "[Module overview](https://inria.github.io/scikit-learn-mooc/trees/trees_module_intro.html)\n",
+    "\n",
+    "### Intuitions on tree-based models\n",
+    "\n",
+    "* [🎥 Intuitions on tree-based models](https://inria.github.io/scikit-learn-mooc/trees/slides.html)\n",
+    "* [✅ Quiz M5.01](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_01.html)\n",
+    "\n",
+    "### Decision tree in classification\n",
+    "\n",
+    "* [Build a classification decision tree](notebooks/trees_classification.ipynb)\n",
+    "* [📝 Exercise M5.01](notebooks/trees_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M5.01](notebooks/trees_sol_01.ipynb)\n",
+    "* [✅ Quiz M5.02](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_02.html)\n",
+    "\n",
+    "### Decision tree in regression\n",
+    "\n",
+    "* [Decision tree for regression](notebooks/trees_regression.ipynb)\n",
+    "* [📝 Exercise M5.02](notebooks/trees_ex_02.ipynb)\n",
+    "* [📃 Solution for Exercise M5.02](notebooks/trees_sol_02.ipynb)\n",
+    "* [✅ Quiz M5.03](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_03.html)\n",
+    "\n",
+    "### Hyperparameters of decision tree\n",
+    "\n",
+    "* [Importance of decision tree hyperparameters on generalization](notebooks/trees_hyperparameters.ipynb)\n",
+    "* [✅ Quiz M5.04](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_04.html)\n",
+    "\n",
+    "[🏁 Wrap-up quiz 5](https://inria.github.io/scikit-learn-mooc/trees/trees_wrap_up_quiz.html)\n",
+    "\n",
+    "[Main take-away](https://inria.github.io/scikit-learn-mooc/trees/trees_module_take_away.html)\n",
+    "\n",
+    "# Ensemble of models\n",
+    "\n",
+    "[Module overview](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_module_intro.html)\n",
+    "\n",
+    "### Ensemble method using bootstrapping\n",
+    "\n",
+    "* [🎥 Intuitions on ensemble models: bagging](https://inria.github.io/scikit-learn-mooc/ensemble/bagging_slides.html)\n",
+    "* [Introductory example to ensemble models](notebooks/ensemble_introduction.ipynb)\n",
+    "* [Bagging](notebooks/ensemble_bagging.ipynb)\n",
+    "* [📝 Exercise M6.01](notebooks/ensemble_ex_01.ipynb)\n",
+    "* [📃 Solution for Exercise M6.01](notebooks/ensemble_sol_01.ipynb)\n",
+    "* [Random forests](notebooks/ensemble_random_forest.ipynb)\n",
+    "* [📝 Exercise M6.02](notebooks/ensemble_ex_02.ipynb)\n",
+    "* [📃 Solution for Exercise M6.02](notebooks/ensemble_sol_02.ipynb)\n",
+    "* [✅ Quiz M6.01](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_quiz_m6_01.html)\n",
+    "\n",
+    "### Ensemble based on boosting\n",
+    "\n",
+    "* [🎥 Intuitions on ensemble models: boosting](https://inria.github.io/scikit-learn-mooc/ensemble/boosting_slides.html)\n",
+    "* [Adaptive Boosting (AdaBoost)](notebooks/ensemble_adaboost.ipynb)\n",
+    "* [Gradient-boosting decision tree (GBDT)](notebooks/ensemble_gradient_boosting.ipynb)\n",
+    "* [📝 Exercise M6.03](notebooks/ensemble_ex_03.ipynb)\n",
+    "* [📃 Solution for Exercise M6.03](notebooks/ensemble_sol_03.ipynb)\n",
+    "* [Speeding-up gradient-boosting](notebooks/ensemble_hist_gradient_boosting.ipynb)\n",
+    "* [✅ Quiz M6.02](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_quiz_m6_02.html)\n",
+    "\n",
+    "### Hyperparameter tuning with ensemble methods\n",
+    "\n",
+    "* [Hyperparameter tuning](notebooks/ensemble_hyperparameters.ipynb)\n",
+    "* [📝 Exercise M6.04](notebooks/ensemble_ex_04.ipynb)\n",
+    "* [📃 Solution for Exercise M6.04](notebooks/ensemble_sol_04.ipynb)\n",
+    "* [✅ Quiz 
M6.03](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_quiz_m6_03.html)\n", + "\n", + "[๐Ÿ Wrap-up quiz 6](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_wrap_up_quiz.html)\n", + "\n", + "[Main take-away](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_module_take_away.html)\n", + "\n", + "# Evaluating model performance\n", + "\n", + "[Module overview](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_module_intro.html)\n", + "\n", + "### Comparing a model with simple baselines\n", + "\n", + "* [Comparing model performance with a simple baseline](notebooks/cross_validation_baseline.ipynb)\n", + "* [๐Ÿ“ Exercise M7.01](notebooks/cross_validation_ex_02.ipynb)\n", + "* [๐Ÿ“ƒ Solution for Exercise M7.01](notebooks/cross_validation_sol_02.ipynb)\n", + "* [โœ… Quiz M7.01](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_01.html)\n", + "\n", + "### Choice of cross-validation\n", + "\n", + "* [Stratification](notebooks/cross_validation_stratification.ipynb)\n", + "* [Sample grouping](notebooks/cross_validation_grouping.ipynb)\n", + "* [Non i.i.d. data](notebooks/cross_validation_time.ipynb)\n", + "* [โœ… Quiz M7.02](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_02.html)\n", + "\n", + "### Nested cross-validation\n", + "\n", + "* [Nested cross-validation](notebooks/cross_validation_nested.ipynb)\n", + "* [โœ… Quiz M7.03](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_03.html)\n", + "\n", + "### Classification metrics\n", + "\n", + "* [Classification](notebooks/metrics_classification.ipynb)\n", + "* [๐Ÿ“ Exercise M7.02](notebooks/metrics_ex_01.ipynb)\n", + "* [๐Ÿ“ƒ Solution for Exercise M7.02](notebooks/metrics_sol_01.ipynb)\n", + "* [โœ… Quiz M7.04](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_04.html)\n", + "\n", + "### Regression metrics\n", + "\n", + "* [Regression](notebooks/metrics_regression.ipynb)\n", + "* [๐Ÿ“ Exercise M7.03](notebooks/metrics_ex_02.ipynb)\n", + "* [๐Ÿ“ƒ Solution for Exercise M7.03](notebooks/metrics_sol_02.ipynb)\n", + "* [โœ… Quiz M7.05](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_05.html)\n", + "\n", + "[๐Ÿ Wrap-up quiz 7](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_wrap_up_quiz.html)\n", + "\n", + "[Main take-away](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_module_take_away.html)\n", + "\n", + "# Concluding remarks\n", + "\n", + "[๐ŸŽฅ Concluding remarks](https://inria.github.io/scikit-learn-mooc/concluding_remarks_video.html)\n", + "\n", + "[Concluding remarks](https://inria.github.io/scikit-learn-mooc/concluding_remarks.html)" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 28128b5c1ba6b16daafbc87611d4ab9a3c55ea33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Thu, 9 Mar 2023 19:01:58 +0100 Subject: [PATCH 023/108] Move index to ipynb format (#691) --- CONTRIBUTING.md | 4 +- ...generate-md-index.py => generate-index.py} | 31 ++- environment.yml | 1 - full-index.ipynb | 11 +- full-index.md | 261 ------------------ local-install-instructions.md | 4 +- requirements.txt | 1 - 7 files changed, 31 insertions(+), 282 deletions(-) rename build_tools/{generate-md-index.py => generate-index.py} (92%) delete mode 100644 full-index.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md 
index ad1741142..bf2d13378 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -89,9 +89,9 @@ quizzes (quizzes with solutions).
make quizzes
```
-### Generating full-index.md
+### Generating full-index.ipynb
-full-index.md is useful when giving the course live. It makes it easier to guide people
+full-index.ipynb is useful when giving the course live. It makes it easier to guide people
through the notebooks.
```
diff --git a/build_tools/generate-md-index.py b/build_tools/generate-index.py
similarity index 92%
rename from build_tools/generate-md-index.py
rename to build_tools/generate-index.py
index cbf868072..3894b245b 100644
--- a/build_tools/generate-md-index.py
+++ b/build_tools/generate-index.py
@@ -1,9 +1,10 @@
# %%
from pathlib import Path
-import yaml
import click
+import nbformat
+
import jupytext
from sphinx_external_toc.parsing import parse_toc_yaml
@@ -73,7 +74,7 @@ def get_single_file_markdown(docname):
    title = get_first_title(path)
    target = path
    # For now the target is relative to the repo root directory since that is
-    # where full-index.md lives. Maybe one day this can be another argument of
+    # where full-index.ipynb lives. Maybe one day this can be another argument of
    # the script?
    target = target.relative_to(root_dir)
@@ -208,6 +209,20 @@ def test_get_full_index_markdown():
    print(get_full_index_markdown(toc_path))
+def get_full_index_ipynb(toc_path):
+    md_str = get_full_index_markdown(toc_path)
+
+    nb = nbformat.v4.new_notebook(cells=[nbformat.v4.new_markdown_cell(md_str)])
+
+    # In nbformat 5, markdown cells carry an id; nbformat assigns a random id
+    # to each cell when a notebook is created, which would make the generated
+    # file change on every run, so we drop the ids before writing it out.
+ for c in nb.cells: + del c["id"] + + return nbformat.v4.writes(nb) + + def test_json_manipulation(): """Gives a few hints about the json format""" # %% @@ -288,15 +307,15 @@ def test_json_manipulation(): ) @click.option( "--output", - default="full-index.md", - help="Path where the markdown index will be written", + default="full-index.ipynb", + help="Path where the index notebook will be written", ) def main(toc, output): toc_path = Path(toc) output_path = Path(output) - md_str = get_full_index_markdown(toc_path) - output_path.write_text(md_str) + ipynb_str = get_full_index_ipynb(toc_path) + output_path.write_text(ipynb_str) if __name__ == "__main__": diff --git a/environment.yml b/environment.yml index 3f840ce77..3ca255705 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,6 @@ dependencies: - seaborn - jupyterlab - notebook - - jupytext - plotly >= 5.10 - IPython - packaging diff --git a/full-index.ipynb b/full-index.ipynb index bc885df14..d5ab13817 100644 --- a/full-index.ipynb +++ b/full-index.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "a4dde4d5", "metadata": {}, "source": [ "# Machine Learning Concepts\n", @@ -269,13 +268,7 @@ ] } ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/full-index.md b/full-index.md deleted file mode 100644 index c0c19aea5..000000000 --- a/full-index.md +++ /dev/null @@ -1,261 +0,0 @@ -# Machine Learning Concepts - -[๐ŸŽฅ Introducing machine-learning concepts](https://inria.github.io/scikit-learn-mooc/ml_concepts/slides.html) - -[โœ… Quiz Intro.01](https://inria.github.io/scikit-learn-mooc/ml_concepts/quiz_intro_01.html) - -# The predictive modeling pipeline - -[Module overview](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/predictive_modeling_module_intro.html) - -### Tabular data exploration - -* [First look at our dataset](notebooks/01_tabular_data_exploration.ipynb) -* [๐Ÿ“ Exercise M1.01](notebooks/01_tabular_data_exploration_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M1.01](notebooks/01_tabular_data_exploration_sol_01.ipynb) -* [โœ… Quiz M1.01](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.html) - -### Fitting a scikit-learn model on numerical data - -* [First model with scikit-learn](notebooks/02_numerical_pipeline_introduction.ipynb) -* [๐Ÿ“ Exercise M1.02](notebooks/02_numerical_pipeline_ex_00.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M1.02](notebooks/02_numerical_pipeline_sol_00.ipynb) -* [Working with numerical data](notebooks/02_numerical_pipeline_hands_on.ipynb) -* [๐Ÿ“ Exercise M1.03](notebooks/02_numerical_pipeline_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M1.03](notebooks/02_numerical_pipeline_sol_01.ipynb) -* [Preprocessing for numerical features](notebooks/02_numerical_pipeline_scaling.ipynb) -* [๐ŸŽฅ Validation of a model](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/02_numerical_pipeline_video_cross_validation.html) -* [Model evaluation using cross-validation](notebooks/02_numerical_pipeline_cross_validation.ipynb) -* [โœ… Quiz M1.02](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.html) - -### Handling categorical data - -* [Encoding of categorical variables](notebooks/03_categorical_pipeline.ipynb) -* [๐Ÿ“ Exercise 
M1.04](notebooks/03_categorical_pipeline_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M1.04](notebooks/03_categorical_pipeline_sol_01.ipynb) -* [Using numerical and categorical variables together](notebooks/03_categorical_pipeline_column_transformer.ipynb) -* [๐Ÿ“ Exercise M1.05](notebooks/03_categorical_pipeline_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M1.05](notebooks/03_categorical_pipeline_sol_02.ipynb) -* [๐ŸŽฅ Visualizing scikit-learn pipelines in Jupyter](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_visualization_video.html) -* [Visualizing scikit-learn pipelines in Jupyter](notebooks/03_categorical_pipeline_visualization.ipynb) -* [โœ… Quiz M1.03](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/03_categorical_pipeline_quiz_m1_03.html) - -[๐Ÿ Wrap-up quiz 1](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/predictive_modeling_pipeline/predictive_modeling_module_take_away.html) - -# Selecting the best model - -[Module overview](https://inria.github.io/scikit-learn-mooc/overfit/overfit_module_intro.html) - -### Overfitting and underfitting - -* [๐ŸŽฅ Overfitting and Underfitting](https://inria.github.io/scikit-learn-mooc/overfit/overfitting_vs_under_fitting_slides.html) -* [Cross-validation framework](notebooks/cross_validation_train_test.ipynb) -* [โœ… Quiz M2.01](https://inria.github.io/scikit-learn-mooc/overfit/overfitting_vs_under_fitting_quiz_m2_01.html) - -### Validation and learning curves - -* [๐ŸŽฅ Comparing train and test errors](https://inria.github.io/scikit-learn-mooc/overfit/learning_validation_curves_slides.html) -* [Overfit-generalization-underfit](notebooks/cross_validation_validation_curve.ipynb) -* [Effect of the sample size in cross-validation](notebooks/cross_validation_learning_curve.ipynb) -* [๐Ÿ“ Exercise M2.01](notebooks/cross_validation_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M2.01](notebooks/cross_validation_sol_01.ipynb) -* [โœ… Quiz M2.02](https://inria.github.io/scikit-learn-mooc/overfit/learning_validation_curves_quiz_m2_02.html) - -### Bias versus variance trade-off - -* [๐ŸŽฅ Bias versus Variance](https://inria.github.io/scikit-learn-mooc/overfit/bias_vs_variance_slides.html) -* [โœ… Quiz M2.03](https://inria.github.io/scikit-learn-mooc/overfit/bias_vs_variance_quiz_m2_03.html) - -[๐Ÿ Wrap-up quiz 2](https://inria.github.io/scikit-learn-mooc/overfit/overfit_wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/overfit/overfit_take_away.html) - -# Hyperparameter tuning - -[Module overview](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_module_intro.html) - -### Manual tuning - -* [Set and get hyperparameters in scikit-learn](notebooks/parameter_tuning_manual.ipynb) -* [๐Ÿ“ Exercise M3.01](notebooks/parameter_tuning_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M3.01](notebooks/parameter_tuning_sol_02.ipynb) -* [โœ… Quiz M3.01](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_manual_quiz_m3_01.html) - -### Automated tuning - -* [Hyperparameter tuning by grid-search](notebooks/parameter_tuning_grid_search.ipynb) -* [Hyperparameter tuning by randomized-search](notebooks/parameter_tuning_randomized_search.ipynb) -* [๐ŸŽฅ Analysis of hyperparameter search results](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_parallel_plot_video.html) -* [Analysis of hyperparameter search 
results](notebooks/parameter_tuning_parallel_plot.ipynb) -* [Evaluation and hyperparameter tuning](notebooks/parameter_tuning_nested.ipynb) -* [๐Ÿ“ Exercise M3.02](notebooks/parameter_tuning_ex_03.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M3.02](notebooks/parameter_tuning_sol_03.ipynb) -* [โœ… Quiz M3.02](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_automated_quiz_m3_02.html) - -[๐Ÿ Wrap-up quiz 3](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/tuning/parameter_tuning_module_take_away.html) - -# Linear models - -[Module overview](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_module_intro.html) - -### Intuitions on linear models - -* [๐ŸŽฅ Intuitions on linear models](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_slides.html) -* [โœ… Quiz M4.01](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_01.html) - -### Linear regression - -* [Linear regression without scikit-learn](notebooks/linear_regression_without_sklearn.ipynb) -* [๐Ÿ“ Exercise M4.01](notebooks/linear_models_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M4.01](notebooks/linear_models_sol_01.ipynb) -* [Linear regression using scikit-learn](notebooks/linear_regression_in_sklearn.ipynb) -* [โœ… Quiz M4.02](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_02.html) - -### Modelling non-linear features-target relationships - -* [๐Ÿ“ Exercise M4.02](notebooks/linear_models_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M4.02](notebooks/linear_models_sol_02.ipynb) -* [Linear regression for a non-linear features-target relationship](notebooks/linear_regression_non_linear_link.ipynb) -* [๐Ÿ“ Exercise M4.03](notebooks/linear_models_ex_03.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M4.03](notebooks/linear_models_sol_03.ipynb) -* [โœ… Quiz M4.03](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_03.html) - -### Regularization in linear model - -* [๐ŸŽฅ Intuitions on regularized linear models](https://inria.github.io/scikit-learn-mooc/linear_models/regularized_linear_models_slides.html) -* [Regularization of linear regression model](notebooks/linear_models_regularization.ipynb) -* [๐Ÿ“ Exercise M4.04](notebooks/linear_models_ex_04.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M4.04](notebooks/linear_models_sol_04.ipynb) -* [โœ… Quiz M4.04](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_04.html) - -### Linear model for classification - -* [Linear model for classification](notebooks/logistic_regression.ipynb) -* [๐Ÿ“ Exercise M4.05](notebooks/linear_models_ex_05.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M4.05](notebooks/linear_models_sol_05.ipynb) -* [Beyond linear separation in classification](notebooks/logistic_regression_non_linear.ipynb) -* [โœ… Quiz M4.05](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_05.html) - -[๐Ÿ Wrap-up quiz 4](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_module_take_away.html) - -# Decision tree models - -[Module overview](https://inria.github.io/scikit-learn-mooc/trees/trees_module_intro.html) - -### Intuitions on tree-based models - -* [๐ŸŽฅ Intuitions on tree-based models](https://inria.github.io/scikit-learn-mooc/trees/slides.html) -* [โœ… Quiz 
M5.01](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_01.html) - -### Decision tree in classification - -* [Build a classification decision tree](notebooks/trees_classification.ipynb) -* [๐Ÿ“ Exercise M5.01](notebooks/trees_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M5.01](notebooks/trees_sol_01.ipynb) -* [โœ… Quiz M5.02](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_02.html) - -### Decision tree in regression - -* [Decision tree for regression](notebooks/trees_regression.ipynb) -* [๐Ÿ“ Exercise M5.02](notebooks/trees_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M5.02](notebooks/trees_sol_02.ipynb) -* [โœ… Quiz M5.03](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_03.html) - -### Hyperparameters of decision tree - -* [Importance of decision tree hyperparameters on generalization](notebooks/trees_hyperparameters.ipynb) -* [โœ… Quiz M5.04](https://inria.github.io/scikit-learn-mooc/trees/trees_quiz_m5_04.html) - -[๐Ÿ Wrap-up quiz 5](https://inria.github.io/scikit-learn-mooc/trees/trees_wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/trees/trees_module_take_away.html) - -# Ensemble of models - -[Module overview](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_module_intro.html) - -### Ensemble method using bootstrapping - -* [๐ŸŽฅ Intuitions on ensemble models: bagging](https://inria.github.io/scikit-learn-mooc/ensemble/bagging_slides.html) -* [Introductory example to ensemble models](notebooks/ensemble_introduction.ipynb) -* [Bagging](notebooks/ensemble_bagging.ipynb) -* [๐Ÿ“ Exercise M6.01](notebooks/ensemble_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M6.01](notebooks/ensemble_sol_01.ipynb) -* [Random forests](notebooks/ensemble_random_forest.ipynb) -* [๐Ÿ“ Exercise M6.02](notebooks/ensemble_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M6.02](notebooks/ensemble_sol_02.ipynb) -* [โœ… Quiz M6.01](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_quiz_m6_01.html) - -### Ensemble based on boosting - -* [๐ŸŽฅ Intuitions on ensemble models: boosting](https://inria.github.io/scikit-learn-mooc/ensemble/boosting_slides.html) -* [Adaptive Boosting (AdaBoost)](notebooks/ensemble_adaboost.ipynb) -* [Gradient-boosting decision tree (GBDT)](notebooks/ensemble_gradient_boosting.ipynb) -* [๐Ÿ“ Exercise M6.03](notebooks/ensemble_ex_03.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M6.03](notebooks/ensemble_sol_03.ipynb) -* [Speeding-up gradient-boosting](notebooks/ensemble_hist_gradient_boosting.ipynb) -* [โœ… Quiz M6.02](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_quiz_m6_02.html) - -### Hyperparameter tuning with ensemble methods - -* [Hyperparameter tuning](notebooks/ensemble_hyperparameters.ipynb) -* [๐Ÿ“ Exercise M6.04](notebooks/ensemble_ex_04.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M6.04](notebooks/ensemble_sol_04.ipynb) -* [โœ… Quiz M6.03](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_quiz_m6_03.html) - -[๐Ÿ Wrap-up quiz 6](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/ensemble/ensemble_module_take_away.html) - -# Evaluating model performance - -[Module overview](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_module_intro.html) - -### Comparing a model with simple baselines - -* [Comparing model performance with a simple baseline](notebooks/cross_validation_baseline.ipynb) -* [๐Ÿ“ Exercise M7.01](notebooks/cross_validation_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for 
Exercise M7.01](notebooks/cross_validation_sol_02.ipynb) -* [โœ… Quiz M7.01](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_01.html) - -### Choice of cross-validation - -* [Stratification](notebooks/cross_validation_stratification.ipynb) -* [Sample grouping](notebooks/cross_validation_grouping.ipynb) -* [Non i.i.d. data](notebooks/cross_validation_time.ipynb) -* [โœ… Quiz M7.02](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_02.html) - -### Nested cross-validation - -* [Nested cross-validation](notebooks/cross_validation_nested.ipynb) -* [โœ… Quiz M7.03](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_03.html) - -### Classification metrics - -* [Classification](notebooks/metrics_classification.ipynb) -* [๐Ÿ“ Exercise M7.02](notebooks/metrics_ex_01.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M7.02](notebooks/metrics_sol_01.ipynb) -* [โœ… Quiz M7.04](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_04.html) - -### Regression metrics - -* [Regression](notebooks/metrics_regression.ipynb) -* [๐Ÿ“ Exercise M7.03](notebooks/metrics_ex_02.ipynb) -* [๐Ÿ“ƒ Solution for Exercise M7.03](notebooks/metrics_sol_02.ipynb) -* [โœ… Quiz M7.05](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_quiz_m7_05.html) - -[๐Ÿ Wrap-up quiz 7](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_wrap_up_quiz.html) - -[Main take-away](https://inria.github.io/scikit-learn-mooc/evaluation/evaluation_module_take_away.html) - -# Concluding remarks - -[๐ŸŽฅ Concluding remarks](https://inria.github.io/scikit-learn-mooc/concluding_remarks_video.html) - -[Concluding remarks](https://inria.github.io/scikit-learn-mooc/concluding_remarks.html) \ No newline at end of file diff --git a/local-install-instructions.md b/local-install-instructions.md index 026b8eaa3..cc69b9ab6 100644 --- a/local-install-instructions.md +++ b/local-install-instructions.md @@ -58,9 +58,9 @@ Using python in /home/lesteve/miniconda3/envs/scikit-learn-course ```sh # Activate your conda environment conda activate scikit-learn-course -jupyter notebook full-index.md +jupyter notebook full-index.ipynb ``` -`full-index.md` is an index file helping to navigate the notebooks. +`full-index.ipynb` is an index file helping to navigate the notebooks. All the Jupyter notebooks are located in the `notebooks` folder. diff --git a/requirements.txt b/requirements.txt index 49df7fe00..7a6c74f9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,4 @@ seaborn plotly jupyterlab notebook -jupytext IPython From 14f469d4782c99c474da4c02afb5c01a08cf0189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 21 Mar 2023 16:37:03 +0100 Subject: [PATCH 024/108] MNT Modify the Binder links to go to full-index.ipynb (#692) --- CONTRIBUTING.md | 6 +++--- README.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bf2d13378..638a3aa61 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -196,8 +196,8 @@ See [this](./workflow-notes.md). 
## Direct binder links to OVH, GESIS and GKE to trigger and cache builds
-- [OVH Binder](https://ovh.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main)
+- [OVH Binder](https://ovh.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
-- [GESIS Binder](https://gesis.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main)
+- [GESIS Binder](https://gesis.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
-- [GKE Binder](https://gke.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main)
+- [GKE Binder](https://gke.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
diff --git a/README.md b/README.md
index ae9ceec47..dc52e5b95 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ https://inria.github.io/scikit-learn-mooc/index.html
A few different ways are available:
- Launch an online notebook environment using [![Binder](https://mybinder.org/badge_logo.svg)](
- https://mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main)
+ https://mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
- Browse [website](https://inria.github.io/scikit-learn-mooc) generated with
[Jupyter Book](https://jupyterbook.org/)

From ccaee259eadc6b2630a07d6313ce79d2d5057cba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?=
Date: Tue, 21 Mar 2023 17:11:54 +0100
Subject: [PATCH 025/108] Add Binder grafana dashboard link

---
CONTRIBUTING.md | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 638a3aa61..00e9b92d0 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -195,9 +195,11 @@ See [this](./workflow-notes.md).
## Direct binder links to OVH, GESIS and GKE to trigger and cache builds
-
- [OVH Binder](https://ovh.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
- [GESIS Binder](https://gesis.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
- [GKE Binder](https://gke.mybinder.org/v2/gh/INRIA/scikit-learn-mooc/main?filepath=full-index.ipynb)
+
+The [Binder Grafana dashboard](https://grafana.mybinder.org/d/3SpLQinmk/1-overview?orgId=1)
+can be handy to check whether there are currently any Binder issues.
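A note on the cell-id handling that patch 023 introduces in `get_full_index_ipynb`: the short sketch below is purely illustrative and is not part of any patch in this series. It assumes nbformat >= 5, where freshly created cells receive a random `id`; stripping those ids is what keeps the generated `full-index.ipynb` identical across runs of the script.

```python
import nbformat

# Build a single-cell notebook the same way generate-index.py does.
nb = nbformat.v4.new_notebook(
    cells=[nbformat.v4.new_markdown_cell("# Machine Learning Concepts")]
)

# Without this, every run would embed a fresh random "id" per cell and
# produce spurious diffs in the committed notebook.
for cell in nb.cells:
    del cell["id"]

print(nbformat.v4.writes(nb))
```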
From a3f96ad0ef3d0132a3ed4e0b36c6f6c163110214 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 5 Jun 2023 12:06:08 +0200 Subject: [PATCH 026/108] MNT Use lint and black format (#693) --------- Co-authored-by: ArturoAmorQ Co-authored-by: Guillaume Lemaitre --- .github/workflows/formatting.yml | 28 ++ .pre-commit-config.yaml | 22 ++ pyproject.toml | 25 ++ python_scripts/01_tabular_data_exploration.py | 73 +++-- .../02_numerical_pipeline_cross_validation.py | 23 +- python_scripts/02_numerical_pipeline_ex_00.py | 1 + python_scripts/02_numerical_pipeline_ex_01.py | 3 +- .../02_numerical_pipeline_hands_on.py | 89 +++--- .../02_numerical_pipeline_introduction.py | 103 +++--- .../02_numerical_pipeline_scaling.py | 128 ++++---- .../02_numerical_pipeline_sol_00.py | 19 +- .../02_numerical_pipeline_sol_01.py | 52 ++-- python_scripts/03_categorical_pipeline.py | 180 ++++++----- ...categorical_pipeline_column_transformer.py | 104 ++++--- .../03_categorical_pipeline_ex_02.py | 20 +- .../03_categorical_pipeline_sol_01.py | 63 ++-- .../03_categorical_pipeline_sol_02.py | 120 ++++--- .../03_categorical_pipeline_visualization.py | 67 ++-- python_scripts/cross_validation_baseline.py | 19 +- python_scripts/cross_validation_grouping.py | 79 +++-- .../cross_validation_learning_curve.py | 49 ++- python_scripts/cross_validation_nested.py | 74 ++--- python_scripts/cross_validation_sol_01.py | 15 +- python_scripts/cross_validation_sol_02.py | 75 ++--- .../cross_validation_stratification.py | 87 +++--- python_scripts/cross_validation_time.py | 47 +-- python_scripts/cross_validation_train_test.py | 51 +-- .../cross_validation_validation_curve.py | 116 ++++--- python_scripts/datasets_ames_housing.py | 38 ++- python_scripts/datasets_bike_rides.py | 110 +++---- python_scripts/datasets_blood_transfusion.py | 41 ++- python_scripts/datasets_california_housing.py | 98 +++--- python_scripts/dev_features_importance.py | 294 ++++++++++-------- python_scripts/ensemble_adaboost.py | 128 +++++--- python_scripts/ensemble_bagging.py | 214 ++++++++----- python_scripts/ensemble_ex_01.py | 3 +- python_scripts/ensemble_ex_02.py | 3 +- python_scripts/ensemble_ex_03.py | 3 +- python_scripts/ensemble_gradient_boosting.py | 205 ++++++------ .../ensemble_hist_gradient_boosting.py | 125 ++++---- python_scripts/ensemble_hyperparameters.py | 35 ++- python_scripts/ensemble_introduction.py | 66 ++-- python_scripts/ensemble_random_forest.py | 70 +++-- python_scripts/ensemble_sol_01.py | 41 +-- python_scripts/ensemble_sol_02.py | 35 ++- python_scripts/ensemble_sol_03.py | 34 +- python_scripts/ensemble_sol_04.py | 58 ++-- python_scripts/feature_selection_ex_01.py | 28 +- .../feature_selection_introduction.py | 58 ++-- .../feature_selection_limitation_model.py | 27 +- python_scripts/feature_selection_sol_01.py | 52 ++-- python_scripts/linear_models_ex_02.py | 15 +- python_scripts/linear_models_ex_05.py | 8 +- .../linear_models_regularization.py | 277 ++++++++++------- python_scripts/linear_models_sol_01.py | 38 ++- python_scripts/linear_models_sol_02.py | 31 +- python_scripts/linear_models_sol_03.py | 44 +-- python_scripts/linear_models_sol_04.py | 78 +++-- python_scripts/linear_models_sol_05.py | 32 +- .../linear_regression_in_sklearn.py | 33 +- .../linear_regression_non_linear_link.py | 124 ++++---- .../linear_regression_without_sklearn.py | 116 +++---- python_scripts/logistic_regression.py | 52 ++-- .../logistic_regression_non_linear.py | 128 +++++--- python_scripts/matplotlibrc | 2 +- 
python_scripts/metrics_classification.py | 155 ++++----- python_scripts/metrics_regression.py | 112 ++++--- python_scripts/metrics_sol_01.py | 32 +- python_scripts/metrics_sol_02.py | 16 +- python_scripts/parameter_tuning_ex_02.py | 31 +- python_scripts/parameter_tuning_ex_03.py | 3 +- .../parameter_tuning_grid_search.py | 137 ++++---- python_scripts/parameter_tuning_manual.py | 85 ++--- python_scripts/parameter_tuning_nested.py | 129 ++++---- .../parameter_tuning_parallel_plot.py | 46 +-- .../parameter_tuning_randomized_search.py | 179 ++++++----- python_scripts/parameter_tuning_sol_02.py | 66 ++-- python_scripts/parameter_tuning_sol_03.py | 44 +-- python_scripts/trees_classification.py | 93 +++--- python_scripts/trees_dataset.py | 30 +- python_scripts/trees_ex_01.py | 20 +- python_scripts/trees_hyperparameters.py | 138 ++++---- python_scripts/trees_regression.py | 58 ++-- python_scripts/trees_sol_01.py | 53 ++-- python_scripts/trees_sol_02.py | 71 +++-- 85 files changed, 3377 insertions(+), 2597 deletions(-) create mode 100644 .github/workflows/formatting.yml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml new file mode 100644 index 000000000..1730027f4 --- /dev/null +++ b/.github/workflows/formatting.yml @@ -0,0 +1,28 @@ +name: Formatting + +on: + push: + branches: + - "main" + + pull_request: + branches: + - '*' + +jobs: + run-linters: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: "3.11" + allow-prereleases: true + + - name: Run the linters via pre-commit + run: | + python -m pip install pre-commit + # only run pre-commit on the folder `python_scripts` + pre-commit run --files python_scripts/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..bcea7193e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-yaml + exclude: doc/ + - id: end-of-file-fixer + exclude: doc/ + - id: trailing-whitespace + exclude: doc/ +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + exclude: doc/ +- repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + entry: pflake8 + additional_dependencies: [pyproject-flake8] + types: [file, python] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..11c889d00 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.black] +line-length = 79 +target_version = ['py38', 'py39', 'py310', 'py311'] +preview = true +exclude = ''' +/( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.mypy_cache + | \.vscode + | build + | dist +)/ +''' + +[tool.flake8] +ignore = [ + 'E402', # module level import not at top of file + 'F401', # imported but unused + 'E501', # line too long + 'E203', # whitespace before ':' + 'W503', # line break before binary operator + 'W504', # Line break occurred after a binary operator + 'E24', +] diff --git a/python_scripts/01_tabular_data_exploration.py b/python_scripts/01_tabular_data_exploration.py index 765f1e353..69427f8d0 100644 --- a/python_scripts/01_tabular_data_exploration.py +++ b/python_scripts/01_tabular_data_exploration.py @@ -15,8 +15,8 @@ # * looking at the variables in the dataset, in particular, differentiate # between numerical and categorical variables, 
which need different
# preprocessing in most machine learning workflows;
-# * visualizing the distribution of the variables to gain some insights into
-# the dataset.
+# * visualizing the distribution of the variables to gain some insights into the
+# dataset.

# %% [markdown]
# ## Loading the adult census dataset
@@ -50,9 +50,9 @@
# %% [markdown]
# ## The variables (columns) in the dataset
#
-# The data are stored in a `pandas` dataframe. A dataframe is a type of structured
-# data composed of 2 dimensions. This type of data is also referred as tabular
-# data.
+# The data are stored in a `pandas` dataframe. A dataframe is a type of
+# structured data composed of 2 dimensions. This type of data is also referred
+# to as tabular data.
#
# Each row represents a "sample". In the field of machine learning or
# descriptive statistics, commonly used equivalent terms are "record",
@@ -71,12 +71,12 @@
adult_census.head()

# %% [markdown]
-# The column named **class** is our target variable (i.e., the variable which
-# we want to predict). The two possible classes are `<=50K` (low-revenue) and
-# `>50K` (high-revenue). The resulting prediction problem is therefore a
-# binary classification problem as `class` has only two possible values.
-# We will use the left-over columns (any column other than `class`) as input
-# variables for our model.
+# The column named **class** is our target variable (i.e., the variable which we
+# want to predict). The two possible classes are `<=50K` (low-revenue) and
+# `>50K` (high-revenue). The resulting prediction problem is therefore a binary
+# classification problem as `class` has only two possible values. We will use
+# the left-over columns (any column other than `class`) as input variables for
+# our model.

# %%
target_column = "class"
@@ -84,14 +84,14 @@

# %% [markdown]
# ```{note}
-# Here, classes are slightly imbalanced, meaning there are more samples of one or
-# more classes compared to others. In this case, we have many more samples with
-# `" <=50K"` than with `" >50K"`. Class imbalance happens often in practice
+# Here, classes are slightly imbalanced, meaning there are more samples of one
+# or more classes compared to others. In this case, we have many more samples
+# with `" <=50K"` than with `" >50K"`. Class imbalance happens often in practice
# and may need special techniques when building a predictive model.
#
-# For example in a medical setting, if we are trying to predict whether
-# subjects will develop a rare disease, there will be a lot more healthy
-# subjects than ill subjects in the dataset.
+# For example, in a medical setting, if we are trying to predict whether subjects
+# will develop a rare disease, there will be a lot more healthy subjects than
+# ill subjects in the dataset.
# ```

# %% [markdown]
@@ -197,9 +197,9 @@
# real life setting.
#
# We recommend our readers to refer to [fairlearn.org](https://fairlearn.org)
# for resources on how to quantify and potentially mitigate fairness issues
# related to the deployment of automated decision making systems that rely on
# machine learning components.
#
# Studying why the data collection process of this dataset led to such an
# unexpected gender imbalance is beyond the scope of this MOOC but we should
@@ -211,21 +211,24 @@
adult_census["education"].value_counts()

# %% [markdown]
-# As noted above, `"education-num"` distribution has two clear peaks around 10 and
-# 13. It would be reasonable to expect that `"education-num"` is the number of
-# years of education.
+# As noted above, `"education-num"` distribution has two clear peaks around 10
+# and 13. It would be reasonable to expect that `"education-num"` is the number
+# of years of education.
#
# Let's look at the relationship between `"education"` and `"education-num"`.

# %%
-pd.crosstab(index=adult_census["education"], columns=adult_census["education-num"])
+pd.crosstab(
+    index=adult_census["education"], columns=adult_census["education-num"]
+)

# %% [markdown]
# For every entry in `\"education\"`, there is only one corresponding
-# value in `\"education-num\"`. This shows that `"education"` and `"education-num"`
-# give you the same information. For example, `"education-num"=2` is equivalent to
-# `"education"="1st-4th"`. In practice that means we can remove
-# `"education-num"` without losing information. Note that having redundant (or
-# highly correlated) columns can be a problem for machine learning algorithms.
+# value in `\"education-num\"`. This shows that `"education"` and
+# `"education-num"` give you the same information. For example,
+# `"education-num"=2` is equivalent to `"education"="1st-4th"`. In practice that
+# means we can remove `"education-num"` without losing information. Note that
+# having redundant (or highly correlated) columns can be a problem for machine
+# learning algorithms.

# %% [markdown]
# ```{note}
@@ -299,7 +302,9 @@
plt.axvline(x=age_limit, ymin=0, ymax=1, color="black", linestyle="--")

hours_per_week_limit = 40
-plt.axhline(y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--")
+plt.axhline(
+    y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--"
+)

plt.annotate("<=50K", (17, 25), rotation=90, fontsize=35)
plt.annotate("<=50K", (35, 20), fontsize=35)
@@ -322,10 +327,10 @@
# will choose the "best" splits based on data without human intervention or
# inspection. Decision trees will be covered in more detail in a future module.
#
-# Note that machine learning is often used when creating rules by hand
-# is not straightforward. For example because we are in high dimension (many
-# features in a table) or because there are no simple and obvious rules that
-# separate the two classes as in the top-right region of the previous plot.
+# Note that machine learning is often used when creating rules by hand is not
+# straightforward. For example, because we are in high dimension (many features
+# in a table) or because there are no simple and obvious rules that separate the
+# two classes as in the top-right region of the previous plot.
#
# To sum up, the important thing to remember is that in a machine-learning
# setting, a model automatically creates the "rules" from the existing data in
diff --git a/python_scripts/02_numerical_pipeline_cross_validation.py b/python_scripts/02_numerical_pipeline_cross_validation.py
index 1d333ea37..0edbd1cf8 100644
--- a/python_scripts/02_numerical_pipeline_cross_validation.py
+++ b/python_scripts/02_numerical_pipeline_cross_validation.py
@@ -13,7 +13,7 @@
# We will discuss the practical aspects of assessing the generalization
# performance of our model via **cross-validation** instead of a single
# train-test split.
-#
+#
# ## Data preparation
#
# First, let's load the full adult census dataset.
@@ -79,10 +79,11 @@
#
# ```{note}
# This figure shows the particular case of the **K-fold** cross-validation strategy.
-# For each cross-validation split, the procedure trains a clone of model on all the red
-# samples and evaluate the score of the model on the blue samples.
-# As mentioned earlier, there is a variety of different cross-validation
-# strategies. Some of these aspects will be covered in more detail in future notebooks.
+# For each cross-validation split, the procedure trains a clone of the model on
+# all the red samples and evaluates the score of the model on the blue samples.
+# As mentioned earlier, there is a variety of different cross-validation
+# strategies. Some of these aspects will be covered in more detail in future
+# notebooks.
# ```
#
# Cross-validation is therefore computationally intensive because it requires
@@ -104,8 +105,10 @@
# %% [markdown]
# The output of `cross_validate` is a Python dictionary, which by default
# contains three entries:
-# - (i) the time to train the model on the training data for each fold, `fit_time`
-# - (ii) the time to predict with the model on the testing data for each fold, `score_time`
+# - (i) the time to train the model on the training data for each fold,
+# `fit_time`
+# - (ii) the time to predict with the model on the testing data for each fold,
+# `score_time`
# - (iii) the default score on the testing data for each fold, `test_score`.
#
# Setting `cv=5` created 5 distinct splits to get 5 variations for the training
@@ -144,8 +147,8 @@
# we can estimate the uncertainty of our model generalization performance. This
# is the main advantage of cross-validation and can be crucial in practice, for
# example when comparing different models to figure out whether one is better
-# than the other or whether our measures of the generalization performance of each
-# model are within the error bars of one-another.
+# than the other or whether our measures of the generalization performance of
+# each model are within the error bars of one another.
#
# In this particular case, only the first 2 decimals seem to be trustworthy. If
# you go up in this notebook, you can check that the performance we get with
@@ -153,6 +156,6 @@
# %% [markdown]
# ## Notebook recap
-#
+#
# In this notebook we assessed the generalization performance of our model via
# **cross-validation**.
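To make the `cross_validate` description in the diff above concrete, here is a minimal, illustrative sketch; it is not part of any patch in this series, and the synthetic `make_classification` data plus the scaler and logistic-regression pipeline are assumptions rather than the notebook's exact model. It shows the three entries of the returned dictionary for `cv=5` and the mean/std reporting discussed in the prose:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for the adult census data used in the notebook.
data, target = make_classification(n_samples=1_000, random_state=0)
model = make_pipeline(StandardScaler(), LogisticRegression())

cv_result = cross_validate(model, data, target, cv=5)
# cv_result is a dict with one array per entry, one value per fold:
# "fit_time", "score_time" and "test_score".
print(sorted(cv_result))

scores = cv_result["test_score"]
print(f"Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
```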
diff --git a/python_scripts/02_numerical_pipeline_ex_00.py b/python_scripts/02_numerical_pipeline_ex_00.py index 8ae3a8a4e..5d41ab982 100644 --- a/python_scripts/02_numerical_pipeline_ex_00.py +++ b/python_scripts/02_numerical_pipeline_ex_00.py @@ -24,6 +24,7 @@ # %% import pandas as pd + adult_census = pd.read_csv("../datasets/adult-census-numeric.csv") data = adult_census.drop(columns="class") target = adult_census["class"] diff --git a/python_scripts/02_numerical_pipeline_ex_01.py b/python_scripts/02_numerical_pipeline_ex_01.py index ab4e9bca2..826f99759 100644 --- a/python_scripts/02_numerical_pipeline_ex_01.py +++ b/python_scripts/02_numerical_pipeline_ex_01.py @@ -49,8 +49,7 @@ # notebook. # %% -numerical_columns = [ - "age", "capital-gain", "capital-loss", "hours-per-week"] +numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"] data_numeric = data[numerical_columns] diff --git a/python_scripts/02_numerical_pipeline_hands_on.py b/python_scripts/02_numerical_pipeline_hands_on.py index f17b46e7a..83f4346ed 100644 --- a/python_scripts/02_numerical_pipeline_hands_on.py +++ b/python_scripts/02_numerical_pipeline_hands_on.py @@ -8,12 +8,11 @@ # %% [markdown] # # Working with numerical data # -# In the previous notebook, we trained a k-nearest neighbors model on -# some data. +# In the previous notebook, we trained a k-nearest neighbors model on some data. # # However, we oversimplified the procedure by loading a dataset that contained -# exclusively numerical data. Besides, we used datasets which were already -# split into train-test sets. +# exclusively numerical data. Besides, we used datasets which were already split +# into train-test sets. # # In this notebook, we aim at: # @@ -27,8 +26,8 @@ # # ## Loading the entire dataset # -# As in the previous notebook, we rely on pandas to open the CSV file into -# a pandas dataframe. +# As in the previous notebook, we rely on pandas to open the CSV file into a +# pandas dataframe. # %% import pandas as pd @@ -59,18 +58,17 @@ # ``` # %% [markdown] -# At this point, we can focus on the data we want to use to train our -# predictive model. +# At this point, we can focus on the data we want to use to train our predictive +# model. # # ## Identify numerical data # # Numerical data are represented with numbers. They are linked to measurable -# (quantitative) data, such as age or the number of hours a person works a -# week. +# (quantitative) data, such as age or the number of hours a person works a week. # -# Predictive models are natively designed to work with numerical data. -# Moreover, numerical data usually requires very little work before getting -# started with training. +# Predictive models are natively designed to work with numerical data. Moreover, +# numerical data usually requires very little work before getting started with +# training. # # The first task here will be to identify numerical data in our dataset. # @@ -86,16 +84,16 @@ data.dtypes # %% [markdown] -# We seem to have only two data types: `int64` and `object`. We can make -# sure by checking for unique data types. +# We seem to have only two data types: `int64` and `object`. We can make sure by +# checking for unique data types. # %% data.dtypes.unique() # %% [markdown] -# Indeed, the only two types in the dataset are integer `int64` and `object`. -# We can look at the first few lines of the dataframe to understand the -# meaning of the `object` data type. +# Indeed, the only two types in the dataset are integer `int64` and `object`. 
We +# can look at the first few lines of the dataframe to understand the meaning of +# the `object` data type. # %% data.head() @@ -111,9 +109,9 @@ data[numerical_columns].head() # %% [markdown] -# Now that we limited the dataset to numerical columns only, -# we can analyse these numbers to figure out what they represent. We can -# identify two types of usage. +# Now that we limited the dataset to numerical columns only, we can analyse +# these numbers to figure out what they represent. We can identify two types of +# usage. # # The first column, `"age"`, is self-explanatory. We can note that the values # are continuous, meaning they can take up any number in a given range. Let's @@ -150,7 +148,8 @@ from sklearn.model_selection import train_test_split data_train, data_test, target_train, target_test = train_test_split( - data_numeric, target, random_state=42, test_size=0.25) + data_numeric, target, random_state=42, test_size=0.25 +) # %% [markdown] # ```{tip} @@ -163,18 +162,22 @@ # %% [markdown] # When calling the function `train_test_split`, we specified that we would like # to have 25% of samples in the testing set while the remaining samples (75%) -# will be available in the training set. We can check quickly if we got -# what we expected. +# will be available in the training set. We can check quickly if we got what we +# expected. # %% -print(f"Number of samples in testing: {data_test.shape[0]} => " - f"{data_test.shape[0] / data_numeric.shape[0] * 100:.1f}% of the" - f" original set") +print( + f"Number of samples in testing: {data_test.shape[0]} => " + f"{data_test.shape[0] / data_numeric.shape[0] * 100:.1f}% of the" + " original set" +) # %% -print(f"Number of samples in training: {data_train.shape[0]} => " - f"{data_train.shape[0] / data_numeric.shape[0] * 100:.1f}% of the" - f" original set") +print( + f"Number of samples in training: {data_train.shape[0]} => " + f"{data_train.shape[0] / data_numeric.shape[0] * 100:.1f}% of the" + " original set" +) # %% [markdown] # In the previous notebook, we used a k-nearest neighbors model. While this @@ -203,17 +206,17 @@ model = LogisticRegression() # %% [markdown] -# Now that the model has been created, you can use it exactly the same way as -# we used the k-nearest neighbors model in the previous notebook. In -# particular, we can use the `fit` method to train the model using the training -# data and labels: +# Now that the model has been created, you can use it exactly the same way as we +# used the k-nearest neighbors model in the previous notebook. In particular, we +# can use the `fit` method to train the model using the training data and +# labels: # %% model.fit(data_train, target_train) # %% [markdown] -# We can also use the `score` method to check the model generalization performance -# on the test set. +# We can also use the `score` method to check the model generalization +# performance on the test set. # %% accuracy = model.score(data_test, target_test) @@ -222,16 +225,16 @@ # %% [markdown] # ## Notebook recap # -# In scikit-learn, the `score` method of a classification model returns the accuracy, -# i.e. the fraction of correctly classified samples. In this case, around -# 8 / 10 of the times the logistic regression predicts the right income of a -# person. Now the real question is: is this generalization performance relevant -# of a good predictive model? Find out by solving the next exercise! +# In scikit-learn, the `score` method of a classification model returns the +# accuracy, i.e. 
the fraction of correctly classified samples. In this case,
+# around 8 / 10 of the time the logistic regression predicts the right income
+# of a person. Now the real question is: is this generalization performance
+# indicative of a good predictive model? Find out by solving the next exercise!
#
# In this notebook, we learned to:
#
# * identify numerical data in a heterogeneous dataset;
# * select the subset of columns corresponding to numerical data;
-# * use the scikit-learn `train_test_split` function to separate data into
-# a train and a test set;
+# * use the scikit-learn `train_test_split` function to separate data into a
+# train and a test set;
# * train and evaluate a logistic regression model.
diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py
index cb93af61a..51171c320 100644
--- a/python_scripts/02_numerical_pipeline_introduction.py
+++ b/python_scripts/02_numerical_pipeline_introduction.py
@@ -24,8 +24,8 @@
# .
#
# Numerical data is the most natural type of data used in machine learning and
-# can (almost) directly be fed into predictive models. We will load a
-# subset of the original data with only the numerical columns.
+# can (almost) directly be fed into predictive models. We will load a subset of
+# the original data with only the numerical columns.

# %%
import pandas as pd
@@ -40,10 +40,9 @@
# %% [markdown]
# We see that this CSV file contains all information: the target that we would
-# like to predict (i.e. `"class"`) and the data that we want to use to train
-# our predictive model (i.e. the remaining columns). The first step is to
-# separate columns to get on one side the target and on the other side the
-# data.
+# like to predict (i.e. `"class"`) and the data that we want to use to train our
+# predictive model (i.e. the remaining columns). The first step is to separate
+# columns to get on one side the target and on the other side the data.
#
# ## Separate the data and the target
@@ -53,7 +52,7 @@
target

# %%
-data = adult_census.drop(columns=[target_name, ])
+data = adult_census.drop(columns=[target_name])
data.head()

# %% [markdown]
@@ -65,16 +64,18 @@
data.columns

# %%
-print(f"The dataset contains {data.shape[0]} samples and "
- f"{data.shape[1]} features")
+print(
+    f"The dataset contains {data.shape[0]} samples and "
+    f"{data.shape[1]} features"
+)

# %% [markdown]
# ## Fit a model and make predictions
#
-# We will build a classification model using the "K-nearest neighbors"
-# strategy. To predict the target of a new sample, a k-nearest neighbors takes
-# into account its `k` closest samples in the training set and predicts the
-# majority target of these samples.
+# We will build a classification model using the "K-nearest neighbors" strategy.
+# To predict the target of a new sample, a k-nearest neighbors model takes into
+# account its `k` closest samples in the training set and predicts the majority
+# target of these samples.
#
# ```{caution}
# We use a K-nearest neighbors model here. However, be aware that it is seldom useful
@@ -96,11 +97,11 @@
#
# ![Predictor fit diagram](../figures/api_diagram-predictor.fit.svg)
#
-# The method `fit` is composed of two elements: (i) a **learning algorithm**
-# and (ii) some **model states**. The learning algorithm takes the training
-# data and training target as input and sets the model states. These model
-# states will be used later to either predict (for classifiers and regressors)
-# or transform data (for transformers).
+# The method `fit` is composed of two elements: (i) a **learning algorithm** and +# (ii) some **model states**. The learning algorithm takes the training data and +# training target as input and sets the model states. These model states will be +# used later to either predict (for classifiers and regressors) or transform +# data (for transformers). # # Both the learning algorithm and the type of model states are specific to each # type of model. @@ -128,8 +129,8 @@ # model states, the prediction function is specific for each type of model. # %% [markdown] -# Let's now have a look at the computed predictions. For the sake of -# simplicity, we will look at the five first predicted targets. +# Let's now have a look at the computed predictions. For the sake of simplicity, +# we will look at the five first predicted targets. # %% target_predicted[:5] @@ -147,8 +148,10 @@ target[:5] == target_predicted[:5] # %% -print(f"Number of correct prediction: " - f"{(target[:5] == target_predicted[:5]).sum()} / 5") +print( + "Number of correct prediction: " + f"{(target[:5] == target_predicted[:5]).sum()} / 5" +) # %% [markdown] # Here, we see that our model makes a mistake when predicting for the first @@ -160,29 +163,27 @@ (target == target_predicted).mean() # %% [markdown] -# This result means that the model makes a correct prediction for -# approximately 82 samples out of 100. Note that we used the same data -# to train and evaluate our model. Can this evaluation be trusted or is -# it too good to be true? +# This result means that the model makes a correct prediction for approximately +# 82 samples out of 100. Note that we used the same data to train and evaluate +# our model. Can this evaluation be trusted or is it too good to be true? # # ## Train-test data split # # When building a machine learning model, it is important to evaluate the # trained model on data that was not used to fit it, as **generalization** is # more than memorization (meaning we want a rule that generalizes to new data, -# without comparing to data we memorized). -# It is harder to conclude on never-seen instances than on already seen ones. +# without comparing to data we memorized). It is harder to conclude on +# never-seen instances than on already seen ones. # # Correct evaluation is easily done by leaving out a subset of the data when -# training the model and using it afterwards for model evaluation. -# The data used to fit a model is called training data while the data used to -# assess a model is called testing data. +# training the model and using it afterwards for model evaluation. The data used +# to fit a model is called training data while the data used to assess a model +# is called testing data. # -# We can load more data, which was actually left-out from the original data -# set. +# We can load more data, which was actually left-out from the original data set. # %% -adult_census_test = pd.read_csv('../datasets/adult-census-numeric-test.csv') +adult_census_test = pd.read_csv("../datasets/adult-census-numeric-test.csv") # %% [markdown] # From this new data, we separate our input features and the target to predict, @@ -190,41 +191,41 @@ # %% target_test = adult_census_test[target_name] -data_test = adult_census_test.drop(columns=[target_name, ]) +data_test = adult_census_test.drop(columns=[target_name]) # %% [markdown] # We can check the number of features and samples available in this new set. 
# %% -print(f"The testing dataset contains {data_test.shape[0]} samples and " - f"{data_test.shape[1]} features") +print( + f"The testing dataset contains {data_test.shape[0]} samples and " + f"{data_test.shape[1]} features" +) # %% [markdown] -# -# Instead of computing the prediction and manually computing the average -# success rate, we can use the method `score`. When dealing with classifiers -# this method returns their performance metric. +# Instead of computing the prediction and manually computing the average success +# rate, we can use the method `score`. When dealing with classifiers this method +# returns their performance metric. # %% accuracy = model.score(data_test, target_test) model_name = model.__class__.__name__ -print(f"The test accuracy using a {model_name} is " - f"{accuracy:.3f}") +print(f"The test accuracy using a {model_name} is {accuracy:.3f}") # %% [markdown] # Let's check the underlying mechanism when the `score` method is called: # # ![Predictor score diagram](../figures/api_diagram-predictor.score.svg) # -# To compute the score, the predictor first computes the predictions (using -# the `predict` method) and then uses a scoring function to compare the -# true target `y` and the predictions. Finally, the score is returned. +# To compute the score, the predictor first computes the predictions (using the +# `predict` method) and then uses a scoring function to compare the true target +# `y` and the predictions. Finally, the score is returned. # %% [markdown] -# If we compare with the accuracy obtained by wrongly evaluating the model -# on the training set, we find that this evaluation was indeed optimistic -# compared to the score obtained on a held-out test set. +# If we compare with the accuracy obtained by wrongly evaluating the model on +# the training set, we find that this evaluation was indeed optimistic compared +# to the score obtained on a held-out test set. # # It shows the importance to always testing the generalization performance of # predictive models on a different set than the one used to train these models. @@ -250,5 +251,5 @@ # * fitted a **k-nearest neighbors** model on a training dataset; # * evaluated its generalization performance on the testing data; # * introduced the scikit-learn API `.fit(X, y)` (to train a model), -# `.predict(X)` (to make predictions) and `.score(X, y)` -# (to evaluate a model). +# `.predict(X)` (to make predictions) and `.score(X, y)` (to evaluate a +# model). diff --git a/python_scripts/02_numerical_pipeline_scaling.py b/python_scripts/02_numerical_pipeline_scaling.py index 892fb5ec6..6516e79f9 100644 --- a/python_scripts/02_numerical_pipeline_scaling.py +++ b/python_scripts/02_numerical_pipeline_scaling.py @@ -13,8 +13,7 @@ # We will introduce these new aspects: # # * an example of preprocessing, namely **scaling numerical variables**; -# * using a scikit-learn **pipeline** to chain preprocessing and model -# training. +# * using a scikit-learn **pipeline** to chain preprocessing and model training. # # ## Data preparation # @@ -26,8 +25,8 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # %% [markdown] -# We will now drop the target from the data we will use to train our -# predictive model. +# We will now drop the target from the data we will use to train our predictive +# model. # %% target_name = "class" @@ -35,12 +34,10 @@ data = adult_census.drop(columns=target_name) # %% [markdown] -# Then, we select only the numerical columns, as seen in the previous -# notebook. 
+# Then, we select only the numerical columns, as seen in the previous notebook. # %% -numerical_columns = [ - "age", "capital-gain", "capital-loss", "hours-per-week"] +numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"] data_numeric = data[numerical_columns] @@ -51,15 +48,16 @@ from sklearn.model_selection import train_test_split data_train, data_test, target_train, target_test = train_test_split( - data_numeric, target, random_state=42) + data_numeric, target, random_state=42 +) # %% [markdown] # ## Model fitting with preprocessing # -# A range of preprocessing algorithms in scikit-learn allow us to transform -# the input data before training a model. In our case, we will standardize the -# data and then train a new logistic regression model on that new version of -# the dataset. +# A range of preprocessing algorithms in scikit-learn allow us to transform the +# input data before training a model. In our case, we will standardize the data +# and then train a new logistic regression model on that new version of the +# dataset. # # Let's start by printing some statistics about the training data. @@ -85,8 +83,8 @@ # # Whether or not a machine learning model requires scaling the features depends # on the model family. Linear models such as logistic regression generally -# benefit from scaling the features while other models such as decision trees -# do not need such preprocessing (but will not suffer from it). +# benefit from scaling the features while other models such as decision trees do +# not need such preprocessing (but will not suffer from it). # # We show how to apply such normalization using a scikit-learn transformer # called `StandardScaler`. This transformer shifts and scales each feature @@ -113,8 +111,8 @@ # ![Transformer fit diagram](../figures/api_diagram-transformer.fit.svg) # # In this case, the algorithm needs to compute the mean and standard deviation -# for each feature and store them into some NumPy arrays. Here, these -# statistics are the model states. +# for each feature and store them into some NumPy arrays. Here, these statistics +# are the model states. # # ```{note} # The fact that the model states of this scaler are arrays of means and @@ -154,17 +152,18 @@ # Let's illustrate the internal mechanism of the `transform` method and put it # to perspective with what we already saw with predictors. # -# ![Transformer transform diagram](../figures/api_diagram-transformer.transform.svg) +# ![Transformer transform +# diagram](../figures/api_diagram-transformer.transform.svg) # -# The `transform` method for transformers is similar to the `predict` method -# for predictors. It uses a predefined function, called a **transformation +# The `transform` method for transformers is similar to the `predict` method for +# predictors. It uses a predefined function, called a **transformation # function**, and uses the model states and the input data. However, instead of # outputting predictions, the job of the `transform` method is to output a # transformed version of the input data. # %% [markdown] -# Finally, the method `fit_transform` is a shorthand method to call -# successively `fit` and then `transform`. +# Finally, the method `fit_transform` is a shorthand method to call successively +# `fit` and then `transform`. 
# # ![Transformer fit_transform diagram](../figures/api_diagram-transformer.fit_transform.svg) @@ -173,39 +172,50 @@ data_train_scaled # %% -data_train_scaled = pd.DataFrame(data_train_scaled, - columns=data_train.columns) +data_train_scaled = pd.DataFrame(data_train_scaled, columns=data_train.columns) data_train_scaled.describe() # %% [markdown] -# Notice that the mean of all the columns is close to 0 and the standard deviation -# in all cases is close to 1. -# We can also visualize the effect of `StandardScaler` using a jointplot to show -# both the histograms of the distributions and a scatterplot of any pair of numerical -# features at the same time. We can observe that `StandardScaler` does not change -# the structure of the data itself but the axes get shifted and scaled. +# Notice that the mean of all the columns is close to 0 and the standard +# deviation in all cases is close to 1. We can also visualize the effect of +# `StandardScaler` using a jointplot to show both the histograms of the +# distributions and a scatterplot of any pair of numerical features at the same +# time. We can observe that `StandardScaler` does not change the structure of +# the data itself but the axes get shifted and scaled. # %% -import matplotlib.pyplot as plt +import matplotlib.pyplot as plt import seaborn as sns # number of points to visualize to have a clearer plot num_points_to_plot = 300 -sns.jointplot(data=data_train[:num_points_to_plot], x="age", - y="hours-per-week", marginal_kws=dict(bins=15)) -plt.suptitle("Jointplot of 'age' vs 'hours-per-week' \nbefore StandardScaler", y=1.1) - -sns.jointplot(data=data_train_scaled[:num_points_to_plot], x="age", - y="hours-per-week", marginal_kws=dict(bins=15)) -_ = plt.suptitle("Jointplot of 'age' vs 'hours-per-week' \nafter StandardScaler", y=1.1) +sns.jointplot( + data=data_train[:num_points_to_plot], + x="age", + y="hours-per-week", + marginal_kws=dict(bins=15), +) +plt.suptitle( + "Jointplot of 'age' vs 'hours-per-week' \nbefore StandardScaler", y=1.1 +) + +sns.jointplot( + data=data_train_scaled[:num_points_to_plot], + x="age", + y="hours-per-week", + marginal_kws=dict(bins=15), +) +_ = plt.suptitle( + "Jointplot of 'age' vs 'hours-per-week' \nafter StandardScaler", y=1.1 +) # %% [markdown] -# We can easily combine sequential operations with a scikit-learn -# `Pipeline`, which chains together operations and is used as any other -# classifier or regressor. The helper function `make_pipeline` will create a -# `Pipeline`: it takes as arguments the successive transformations to perform, -# followed by the classifier or regressor model. +# We can easily combine sequential operations with a scikit-learn `Pipeline`, +# which chains together operations and is used as any other classifier or +# regressor. The helper function `make_pipeline` will create a `Pipeline`: it +# takes as arguments the successive transformations to perform, followed by the +# classifier or regressor model. # %% import time @@ -226,8 +236,8 @@ # %% [markdown] # This predictive pipeline exposes the same methods as the final predictor: -# `fit` and `predict` (and additionally `predict_proba`, `decision_function`, -# or `score`). +# `fit` and `predict` (and additionally `predict_proba`, `decision_function`, or +# `score`). 
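+
+# %% [markdown]
+# As a brief aside (an addition for illustration, not part of the original
+# notebook): a `Pipeline` also behaves like a sequence of its steps, so
+# `model[0]` is the `StandardScaler` and `model[-1]` is the
+# `LogisticRegression`; the latter indexing is what we rely on further below to
+# read the number of iterations. The steps are also exposed as a dictionary
+# whose keys were auto-generated from the class names by `make_pipeline`:
+
+# %%
+# mapping of step names to the (not yet fitted) estimators
+model.named_steps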
# %% start = time.time() @@ -235,8 +245,8 @@ elapsed_time = time.time() - start # %% [markdown] -# We can represent the internal mechanism of a pipeline when calling `fit` -# by the following diagram: +# We can represent the internal mechanism of a pipeline when calling `fit` by +# the following diagram: # # ![pipeline fit diagram](../figures/api_diagram-pipeline.fit.svg) # @@ -265,20 +275,22 @@ # the predictor that will output the predicted target by calling its method # `predict`. # -# As a shorthand, we can check the score of the full predictive pipeline -# calling the method `model.score`. Thus, let's check the computational and +# As a shorthand, we can check the score of the full predictive pipeline calling +# the method `model.score`. Thus, let's check the computational and # generalization performance of such a predictive pipeline. # %% model_name = model.__class__.__name__ score = model.score(data_test, target_test) -print(f"The accuracy using a {model_name} is {score:.3f} " - f"with a fitting time of {elapsed_time:.3f} seconds " - f"in {model[-1].n_iter_[0]} iterations") +print( + f"The accuracy using a {model_name} is {score:.3f} " + f"with a fitting time of {elapsed_time:.3f} seconds " + f"in {model[-1].n_iter_[0]} iterations" +) # %% [markdown] -# We could compare this predictive model with the predictive model used in -# the previous notebook which did not scale features. +# We could compare this predictive model with the predictive model used in the +# previous notebook which did not scale features. # %% model = LogisticRegression() @@ -289,9 +301,11 @@ # %% model_name = model.__class__.__name__ score = model.score(data_test, target_test) -print(f"The accuracy using a {model_name} is {score:.3f} " - f"with a fitting time of {elapsed_time:.3f} seconds " - f"in {model.n_iter_[0]} iterations") +print( + f"The accuracy using a {model_name} is {score:.3f} " + f"with a fitting time of {elapsed_time:.3f} seconds " + f"in {model.n_iter_[0]} iterations" +) # %% [markdown] # We see that scaling the data before training the logistic regression was diff --git a/python_scripts/02_numerical_pipeline_sol_00.py b/python_scripts/02_numerical_pipeline_sol_00.py index a0bc17dd0..7ac9a5496 100644 --- a/python_scripts/02_numerical_pipeline_sol_00.py +++ b/python_scripts/02_numerical_pipeline_sol_00.py @@ -17,6 +17,7 @@ # %% import pandas as pd + adult_census = pd.read_csv("../datasets/adult-census-numeric.csv") data = adult_census.drop(columns="class") target = adult_census["class"] @@ -24,12 +25,12 @@ # %% [markdown] # In the previous notebook we used `model = KNeighborsClassifier()`. All # scikit-learn models can be created without arguments. This is convenient -# because it means that you don't need to understand the full details of a -# model before starting to use it. +# because it means that you don't need to understand the full details of a model +# before starting to use it. # -# One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls -# the number of neighbors we are going to use to make a prediction for a new -# data point. +# One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the +# number of neighbors we are going to use to make a prediction for a new data +# point. # # What is the default value of the `n_neighbors` parameter? Hint: Look at the # documentation on the [scikit-learn @@ -40,7 +41,7 @@ # %% from sklearn.neighbors import KNeighborsClassifier -KNeighborsClassifier? +# KNeighborsClassifier? 
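+
+# %% [markdown]
+# As a complement (this cell is an addition, offered as a suggestion rather
+# than part of the original exercise), the default value can also be read
+# programmatically, since every scikit-learn estimator exposes its constructor
+# parameters through `get_params`:
+
+# %%
+# the returned dict maps parameter names to their current values
+KNeighborsClassifier().get_params()["n_neighbors"]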
# %% [markdown] tags=["solution"] # We can see that the default value for `n_neighbors` is 5. @@ -75,11 +76,13 @@ # %% tags=["solution"] number_of_correct_predictions = ( - first_predictions == first_target_values).sum() + first_predictions == first_target_values +).sum() number_of_predictions = len(first_predictions) print( f"{number_of_correct_predictions}/{number_of_predictions} " - "of predictions are correct") + "of predictions are correct" +) # %% [markdown] # Compute the accuracy on the training data. diff --git a/python_scripts/02_numerical_pipeline_sol_01.py b/python_scripts/02_numerical_pipeline_sol_01.py index 86751da6c..70a21c31d 100644 --- a/python_scripts/02_numerical_pipeline_sol_01.py +++ b/python_scripts/02_numerical_pipeline_sol_01.py @@ -9,19 +9,19 @@ # # ๐Ÿ“ƒ Solution for Exercise M1.03 # # The goal of this exercise is to compare the performance of our classifier in -# the previous notebook (roughly 81% accuracy with `LogisticRegression`) to -# some simple baseline classifiers. The simplest baseline classifier is one -# that always predicts the same class, irrespective of the input data. +# the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some +# simple baseline classifiers. The simplest baseline classifier is one that +# always predicts the same class, irrespective of the input data. # # - What would be the score of a model that always predicts `' >50K'`? # - What would be the score of a model that always predicts `' <=50K'`? # - Is 81% or 82% accuracy a good score for this problem? # -# Use a `DummyClassifier` and do a train-test split to evaluate -# its accuracy on the test set. This +# Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on +# the test set. This # [link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators) -# shows a few examples of how to evaluate the generalization performance of these -# baseline models. +# shows a few examples of how to evaluate the generalization performance of +# these baseline models. # %% import pandas as pd @@ -42,8 +42,7 @@ # notebook. # %% -numerical_columns = [ - "age", "capital-gain", "capital-loss", "hours-per-week"] +numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"] data_numeric = data[numerical_columns] @@ -54,44 +53,47 @@ from sklearn.model_selection import train_test_split # solution -data_numeric_train, data_numeric_test, target_train, target_test = \ +data_numeric_train, data_numeric_test, target_train, target_test = ( train_test_split(data_numeric, target, random_state=42) +) # %% [markdown] -# Use a `DummyClassifier` such that the resulting classifier will always -# predict the class `' >50K'`. What is the accuracy score on the test set? -# Repeat the experiment by always predicting the class `' <=50K'`. +# Use a `DummyClassifier` such that the resulting classifier will always predict +# the class `' >50K'`. What is the accuracy score on the test set? Repeat the +# experiment by always predicting the class `' <=50K'`. # -# Hint: you can set the `strategy` parameter of the `DummyClassifier` to -# achieve the desired behavior. +# Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve +# the desired behavior. 
# %% from sklearn.dummy import DummyClassifier # solution class_to_predict = " >50K" -high_revenue_clf = DummyClassifier(strategy="constant", - constant=class_to_predict) +high_revenue_clf = DummyClassifier( + strategy="constant", constant=class_to_predict +) high_revenue_clf.fit(data_numeric_train, target_train) score = high_revenue_clf.score(data_numeric_test, target_test) print(f"Accuracy of a model predicting only high revenue: {score:.3f}") # %% [markdown] tags=["solution"] -# We clearly see that the score is below 0.5 which might be surprising at -# first. We will now check the generalization performance of a model which always +# We clearly see that the score is below 0.5 which might be surprising at first. +# We will now check the generalization performance of a model which always # predict the low revenue class, i.e. `" <=50K"`. # %% tags=["solution"] class_to_predict = " <=50K" -low_revenue_clf = DummyClassifier(strategy="constant", - constant=class_to_predict) +low_revenue_clf = DummyClassifier( + strategy="constant", constant=class_to_predict +) low_revenue_clf.fit(data_numeric_train, target_train) score = low_revenue_clf.score(data_numeric_test, target_test) print(f"Accuracy of a model predicting only low revenue: {score:.3f}") # %% [markdown] tags=["solution"] -# We observe that this model has an accuracy higher than 0.5. This is due to -# the fact that we have 3/4 of the target belonging to low-revenue class. +# We observe that this model has an accuracy higher than 0.5. This is due to the +# fact that we have 3/4 of the target belonging to low-revenue class. # %% [markdown] tags=["solution"] # Therefore, any predictive model giving results below this dummy classifier @@ -104,8 +106,8 @@ (target == " <=50K").mean() # %% [markdown] tags=["solution"] -# In practice, we could have the strategy `"most_frequent"` to predict the -# class that appears the most in the training target. +# In practice, we could have the strategy `"most_frequent"` to predict the class +# that appears the most in the training target. # %% tags=["solution"] most_freq_revenue_clf = DummyClassifier(strategy="most_frequent") diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index bd62c6e26..3c4ce7048 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -8,9 +8,9 @@ # %% [markdown] # # Encoding of categorical variables # -# In this notebook, we will present typical ways of dealing with -# **categorical variables** by encoding them, namely **ordinal encoding** and -# **one-hot encoding**. +# In this notebook, we will present typical ways of dealing with **categorical +# variables** by encoding them, namely **ordinal encoding** and **one-hot +# encoding**. # %% [markdown] # Let's first load the entire adult dataset containing both numerical and @@ -32,25 +32,23 @@ # # ## Identify categorical variables # -# As we saw in the previous section, a numerical variable is a -# quantity represented by a real or integer number. These variables can be -# naturally handled by machine learning algorithms that are typically composed -# of a sequence of arithmetic instructions such as additions and -# multiplications. +# As we saw in the previous section, a numerical variable is a quantity +# represented by a real or integer number. These variables can be naturally +# handled by machine learning algorithms that are typically composed of a +# sequence of arithmetic instructions such as additions and multiplications. 
# -# In contrast, categorical variables have discrete values, typically -# represented by string labels (but not only) taken from a finite list of -# possible choices. For instance, the variable `native-country` in our dataset -# is a categorical variable because it encodes the data using a finite list of -# possible countries (along with the `?` symbol when this information is -# missing): +# In contrast, categorical variables have discrete values, typically represented +# by string labels (but not only) taken from a finite list of possible choices. +# For instance, the variable `native-country` in our dataset is a categorical +# variable because it encodes the data using a finite list of possible countries +# (along with the `?` symbol when this information is missing): # %% data["native-country"].value_counts().sort_index() # %% [markdown] -# How can we easily recognize categorical columns among the dataset? Part of -# the answer lies in the columns' data type: +# How can we easily recognize categorical columns among the dataset? Part of the +# answer lies in the columns' data type: # %% data.dtypes @@ -63,8 +61,8 @@ # # In the previous notebook, we manually defined the numerical columns. We could # do a similar approach. Instead, we will use the scikit-learn helper function -# `make_column_selector`, which allows us to select columns based on -# their data type. We will illustrate how to use this helper. +# `make_column_selector`, which allows us to select columns based on their data +# type. We will illustrate how to use this helper. # %% from sklearn.compose import make_column_selector as selector @@ -97,9 +95,8 @@ # ### Encoding ordinal categories # # The most intuitive strategy is to encode each category with a different -# number. The `OrdinalEncoder` will transform the data in such manner. -# We will start by encoding a single column to understand how the encoding -# works. +# number. The `OrdinalEncoder` will transform the data in such manner. We will +# start by encoding a single column to understand how the encoding works. # %% from sklearn.preprocessing import OrdinalEncoder @@ -126,45 +123,44 @@ data_encoded[:5] # %% -print( - f"The dataset encoded contains {data_encoded.shape[1]} features") +print(f"The dataset encoded contains {data_encoded.shape[1]} features") # %% [markdown] # We see that the categories have been encoded for each feature (column) # independently. We also note that the number of features before and after the # encoding is the same. # -# However, be careful when applying this encoding strategy: -# using this integer representation leads downstream predictive models -# to assume that the values are ordered (0 < 1 < 2 < 3... for instance). +# However, be careful when applying this encoding strategy: using this integer +# representation leads downstream predictive models to assume that the values +# are ordered (0 < 1 < 2 < 3... for instance). # # By default, `OrdinalEncoder` uses a lexicographical strategy to map string -# category labels to integers. This strategy is arbitrary and often -# meaningless. For instance, suppose the dataset has a categorical variable -# named `"size"` with categories such as "S", "M", "L", "XL". We would like the -# integer representation to respect the meaning of the sizes by mapping them to -# increasing integers such as `0, 1, 2, 3`. -# However, the lexicographical strategy used by default would map the labels -# "S", "M", "L", "XL" to 2, 1, 0, 3, by following the alphabetical order. +# category labels to integers. 
This strategy is arbitrary and often meaningless.
+# For instance, suppose the dataset has a categorical variable named `"size"`
+# with categories such as "S", "M", "L", "XL". We would like the integer
+# representation to respect the meaning of the sizes by mapping them to
+# increasing integers such as `0, 1, 2, 3`. However, the lexicographical
+# strategy used by default would map the labels "S", "M", "L", "XL" to 2, 1, 0,
+# 3, by following the alphabetical order.
#
-# The `OrdinalEncoder` class accepts a `categories` constructor argument to
-# pass categories in the expected ordering explicitly. You can find more
-# information in the
-# [scikit-learn documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)
+# The `OrdinalEncoder` class accepts a `categories` constructor argument to pass
+# categories in the expected ordering explicitly. You can find more information
+# in the [scikit-learn
+# documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)
# if needed.
#
-# If a categorical variable does not carry any meaningful order information
-# then this encoding might be misleading to downstream statistical models and
-# you might consider using one-hot encoding instead (see below).
+# If a categorical variable does not carry any meaningful order information then
+# this encoding might be misleading to downstream statistical models and you
+# might consider using one-hot encoding instead (see below).
#
# ### Encoding nominal categories (without assuming any order)
#
-# `OneHotEncoder` is an alternative encoder that prevents the downstream
-# models to make a false assumption about the ordering of categories. For a
-# given feature, it will create as many new columns as there are possible
-# categories. For a given sample, the value of the column corresponding to the
-# category will be set to `1` while all the columns of the other categories
-# will be set to `0`.
+# `OneHotEncoder` is an alternative encoder that prevents downstream models
+# from making a false assumption about the ordering of categories. For a given
+# feature, it will create as many new columns as there are possible categories.
+# For a given sample, the value of the column corresponding to the category will
+# be set to `1` while all the columns of the other categories will be set to
+# `0`.
#
# We will start by encoding a single feature (e.g. `"education"`) to illustrate
# how the encoding works.
@@ -178,19 +174,19 @@

# %% [markdown]
# ```{note}
-# `sparse_output=False` is used in the `OneHotEncoder` for didactic purposes, namely
-# easier visualization of the data.
+# `sparse_output=False` is used in the `OneHotEncoder` for didactic purposes,
+# namely easier visualization of the data.
#
# Sparse matrices are efficient data structures when most of your matrix
# elements are zero. They won't be covered in detail in this course. If you want
# more details about them, you can look at
# [this](https://scipy-lectures.org/advanced/scipy_sparse/introduction.html#why-sparse-matrices).
# ```

# %% [markdown]
# We see that encoding a single feature will give a NumPy array full of zeros
-# and ones. We can get a better understanding using the associated feature
-# names resulting from the transformation.
+# and ones. 
We can get a better understanding using the associated feature names +# resulting from the transformation. # %% feature_names = encoder.get_feature_names_out(input_features=["education"]) @@ -204,8 +200,7 @@ # Let's apply this encoding on the full dataset. # %% -print( - f"The dataset is composed of {data_categorical.shape[1]} features") +print(f"The dataset is composed of {data_categorical.shape[1]} features") data_categorical.head() # %% @@ -213,8 +208,7 @@ data_encoded[:5] # %% -print( - f"The encoded dataset contains {data_encoded.shape[1]} features") +print(f"The encoded dataset contains {data_encoded.shape[1]} features") # %% [markdown] # Let's wrap this NumPy array in a dataframe with informative column names as @@ -225,11 +219,11 @@ pd.DataFrame(data_encoded, columns=columns_encoded).head() # %% [markdown] -# Look at how the `"workclass"` variable of the 3 first records has been -# encoded and compare this to the original string representation. +# Look at how the `"workclass"` variable of the 3 first records has been encoded +# and compare this to the original string representation. # -# The number of features after the encoding is more than 10 times larger than -# in the original data because some variables such as `occupation` and +# The number of features after the encoding is more than 10 times larger than in +# the original data because some variables such as `occupation` and # `native-country` have many possible categories. # %% [markdown] @@ -240,18 +234,17 @@ # %% [markdown] # ```{note} -# In general `OneHotEncoder` is the encoding strategy used when the -# downstream models are **linear models** while `OrdinalEncoder` is often a -# good strategy with **tree-based models**. +# In general `OneHotEncoder` is the encoding strategy used when the downstream +# models are **linear models** while `OrdinalEncoder` is often a good strategy +# with **tree-based models**. # ``` # %% [markdown] -# -# Using an `OrdinalEncoder` will output ordinal categories. This means -# that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The -# impact of violating this ordering assumption is really dependent on the -# downstream models. Linear models will be impacted by misordered categories -# while tree-based models will not. +# Using an `OrdinalEncoder` will output ordinal categories. This means that +# there is an order in the resulting categories (e.g. `0 < 1 < 2`). The impact +# of violating this ordering assumption is really dependent on the downstream +# models. Linear models will be impacted by misordered categories while +# tree-based models will not. # # You can still use an `OrdinalEncoder` with linear models but you need to be # sure that: @@ -262,18 +255,19 @@ # The **next exercise** shows what can happen when using an `OrdinalEncoder` # with a liner model and the conditions above are not met. # -# One-hot encoding categorical variables with high cardinality can cause -# computational inefficiency in tree-based models. Because of this, it is not recommended -# to use `OneHotEncoder` in such cases even if the original categories do not -# have a given order. We will show this in the **final exercise** of this sequence. +# One-hot encoding categorical variables with high cardinality can cause +# computational inefficiency in tree-based models. Because of this, it is not +# recommended to use `OneHotEncoder` in such cases even if the original +# categories do not have a given order. We will show this in the **final +# exercise** of this sequence. 
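+
+# %% [markdown]
+# To make the contrast concrete, here is a small sketch (added for this
+# discussion; it uses a toy `"size"` column rather than our dataset) that
+# encodes the same feature with both strategies:
+
+# %%
+sizes = pd.DataFrame({"size": ["S", "M", "L", "XL", "M"]})
+# a single column of ordered codes, with the order made explicit
+ordinal = OrdinalEncoder(categories=[["S", "M", "L", "XL"]])
+print(ordinal.fit_transform(sizes))
+# one 0/1 column per category, without assuming any order
+one_hot = OneHotEncoder(sparse_output=False)
+print(one_hot.fit_transform(sizes))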
# %% [markdown] # ## Evaluate our predictive pipeline # # We can now integrate this encoder inside a machine learning pipeline like we # did with numerical data: let's train a linear classifier on the encoded data -# and check the generalization performance of this machine learning pipeline using -# cross-validation. +# and check the generalization performance of this machine learning pipeline +# using cross-validation. # # Before we create the pipeline, we have to linger on the `native-country`. # Let's recall some statistics regarding this column. @@ -291,22 +285,21 @@ # # * list all the possible categories and provide it to the encoder via the # keyword argument `categories`; -# * use the parameter `handle_unknown`, i.e. if an unknown category is encountered -# during transform, the resulting one-hot encoded columns for this feature will -# be all zeros. +# * use the parameter `handle_unknown`, i.e. if an unknown category is +# encountered during transform, the resulting one-hot encoded columns for this +# feature will be all zeros. # # Here, we will use the latter solution for simplicity. # %% [markdown] # ```{tip} -# Be aware the `OrdinalEncoder` exposes as well a parameter -# `handle_unknown`. It can be set to `use_encoded_value`. If that option is chosen, -# you can define a fixed value to which all unknowns will be set to during -# `transform`. For example, -# `OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=42)` -# will set all values encountered during `transform` to `42` which are not part of -# the data encountered during the `fit` call. -# You are going to use these parameters in the next exercise. +# Be aware the `OrdinalEncoder` exposes as well a parameter `handle_unknown`. It +# can be set to `use_encoded_value`. If that option is chosen, you can define a +# fixed value to which all unknowns will be set to during `transform`. For +# example, `OrdinalEncoder(handle_unknown='use_encoded_value', +# unknown_value=42)` will set all values encountered during `transform` to `42` +# which are not part of the data encountered during the `fit` call. You are +# going to use these parameters in the next exercise. # ``` # %% [markdown] @@ -323,10 +316,10 @@ # %% [markdown] # ```{note} # Here, we need to increase the maximum number of iterations to obtain a fully -# converged `LogisticRegression` and silence a `ConvergenceWarning`. Contrary -# to the numerical features, the one-hot encoded categorical features are all -# on the same scale (values are 0 or 1), so they would not benefit from -# scaling. In this case, increasing `max_iter` is the right thing to do. +# converged `LogisticRegression` and silence a `ConvergenceWarning`. Contrary to +# the numerical features, the one-hot encoded categorical features are all on +# the same scale (values are 0 or 1), so they would not benefit from scaling. In +# this case, increasing `max_iter` is the right thing to do. # ``` # %% [markdown] @@ -335,6 +328,7 @@ # %% from sklearn.model_selection import cross_validate + cv_results = cross_validate(model, data_categorical, target) cv_results @@ -343,9 +337,9 @@ print(f"The accuracy is: {scores.mean():.3f} ยฑ {scores.std():.3f}") # %% [markdown] -# As you can see, this representation of the categorical variables is -# slightly more predictive of the revenue than the numerical variables -# that we used previously. +# As you can see, this representation of the categorical variables is slightly +# more predictive of the revenue than the numerical variables that we used +# previously. 
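+
+# %% [markdown]
+# If you want to check this claim, a minimal sketch (an addition to the
+# notebook; it assumes the `data` and `target` variables defined above are
+# still in memory) is to cross-validate the same kind of linear model on the
+# numerical columns alone and compare the mean accuracies:
+
+# %%
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"]
+numerical_model = make_pipeline(
+    StandardScaler(), LogisticRegression(max_iter=500)
+)
+cv_results_numerical = cross_validate(
+    numerical_model, data[numerical_columns], target
+)
+print(
+    "Accuracy on the numerical columns alone: "
+    f"{cv_results_numerical['test_score'].mean():.3f}"
+)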
# %% [markdown] # diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index cd70ab282..002889af3 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -69,15 +69,14 @@ # # We first define the columns depending on their data type: # -# * **one-hot encoding** will be applied to categorical columns. Besides, we -# use `handle_unknown="ignore"` to solve the potential issues due to rare +# * **one-hot encoding** will be applied to categorical columns. Besides, we use +# `handle_unknown="ignore"` to solve the potential issues due to rare # categories. # * **numerical scaling** numerical features which will be standardized. # -# Now, we create our `ColumnTransfomer` by specifying three values: -# the preprocessor name, the transformer, and the columns. -# First, let's create the preprocessors for the numerical and categorical -# parts. +# Now, we create our `ColumnTransfomer` by specifying three values: the +# preprocessor name, the transformer, and the columns. First, let's create the +# preprocessors for the numerical and categorical parts. # %% from sklearn.preprocessing import OneHotEncoder, StandardScaler @@ -86,15 +85,18 @@ numerical_preprocessor = StandardScaler() # %% [markdown] -# Now, we create the transformer and associate each of these preprocessors -# with their respective columns. +# Now, we create the transformer and associate each of these preprocessors with +# their respective columns. # %% from sklearn.compose import ColumnTransformer -preprocessor = ColumnTransformer([ - ('one-hot-encoder', categorical_preprocessor, categorical_columns), - ('standard_scaler', numerical_preprocessor, numerical_columns)]) +preprocessor = ColumnTransformer( + [ + ("one-hot-encoder", categorical_preprocessor, categorical_columns), + ("standard_scaler", numerical_preprocessor, numerical_columns), + ] +) # %% [markdown] # We can take a minute to represent graphically the structure of a @@ -107,14 +109,14 @@ # * It **splits the columns** of the original dataset based on the column names # or indices provided. We will obtain as many subsets as the number of # transformers passed into the `ColumnTransformer`. -# * It **transforms each subsets**. A specific transformer is applied to -# each subset: it will internally call `fit_transform` or `transform`. The -# output of this step is a set of transformed datasets. +# * It **transforms each subsets**. A specific transformer is applied to each +# subset: it will internally call `fit_transform` or `transform`. The output +# of this step is a set of transformed datasets. # * It then **concatenates the transformed datasets** into a single dataset. -# The important thing is that `ColumnTransformer` is like any other -# scikit-learn transformer. In particular it can be combined with a classifier -# in a `Pipeline`: +# The important thing is that `ColumnTransformer` is like any other scikit-learn +# transformer. 
In particular it can be combined with a classifier in a +# `Pipeline`: # %% from sklearn.linear_model import LogisticRegression @@ -124,8 +126,8 @@ model # %% [markdown] -# The final model is more complex than the previous models but still follows -# the same API (the same set of methods that can be called by the user): +# The final model is more complex than the previous models but still follows the +# same API (the same set of methods that can be called by the user): # # - the `fit` method is called to preprocess the data and then train the # classifier of the preprocessed data; @@ -139,16 +141,16 @@ from sklearn.model_selection import train_test_split data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=42) + data, target, random_state=42 +) # %% [markdown] # # ```{caution} # Be aware that we use `train_test_split` here for didactic purposes, to show # the scikit-learn API. In a real setting one might prefer to use -# cross-validation to also be able to evaluate the uncertainty of -# our estimation of the generalization performance of a model, -# as previously demonstrated. +# cross-validation to also be able to evaluate the uncertainty of our estimation +# of the generalization performance of a model, as previously demonstrated. # ``` # # Now, we can train the model on the train set. @@ -160,8 +162,7 @@ # Then, we can send the raw dataset straight to the pipeline. Indeed, we do not # need to make any manual preprocessing (calling the `transform` or # `fit_transform` methods) as it will be handled when calling the `predict` -# method. As an example, we predict on the five first samples from the test -# set. +# method. As an example, we predict on the five first samples from the test set. # %% data_test.head() @@ -194,8 +195,10 @@ # %% scores = cv_results["test_score"] -print("The mean cross-validation accuracy is: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f}") +print( + "The mean cross-validation accuracy is: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f}" +) # %% [markdown] # The compound model has a higher predictive accuracy than the two models that @@ -204,15 +207,15 @@ # %% [markdown] # ## Fitting a more powerful model # -# **Linear models** are nice because they are usually cheap to train, -# **small** to deploy, **fast** to predict and give a **good baseline**. +# **Linear models** are nice because they are usually cheap to train, **small** +# to deploy, **fast** to predict and give a **good baseline**. # # However, it is often useful to check whether more complex models such as an # ensemble of decision trees can lead to higher predictive performance. In this # section we will use such a model called **gradient-boosting trees** and -# evaluate its generalization performance. More precisely, the scikit-learn model -# we will use is called `HistGradientBoostingClassifier`. Note that boosting -# models will be covered in more detail in a future module. +# evaluate its generalization performance. More precisely, the scikit-learn +# model we will use is called `HistGradientBoostingClassifier`. Note that +# boosting models will be covered in more detail in a future module. 
# # For tree-based models, the handling of numerical and categorical variables is # simpler than for linear models: @@ -220,19 +223,21 @@ # * using an **ordinal encoding for the categorical variables** is fine even if # the encoding results in an arbitrary ordering # -# Therefore, for `HistGradientBoostingClassifier`, the preprocessing pipeline -# is slightly simpler than the one we saw earlier for the `LogisticRegression`: +# Therefore, for `HistGradientBoostingClassifier`, the preprocessing pipeline is +# slightly simpler than the one we saw earlier for the `LogisticRegression`: # %% from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.preprocessing import OrdinalEncoder -categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", - unknown_value=-1) +categorical_preprocessor = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 +) -preprocessor = ColumnTransformer([ - ('categorical', categorical_preprocessor, categorical_columns)], - remainder="passthrough") +preprocessor = ColumnTransformer( + [("categorical", categorical_preprocessor, categorical_columns)], + remainder="passthrough", +) model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) @@ -248,19 +253,18 @@ # %% [markdown] # We can observe that we get significantly higher accuracies with the Gradient -# Boosting model. This is often what we observe whenever the dataset has a -# large number of samples and limited number of informative features (e.g. less -# than 1000) with a mix of numerical and categorical variables. +# Boosting model. This is often what we observe whenever the dataset has a large +# number of samples and limited number of informative features (e.g. less than +# 1000) with a mix of numerical and categorical variables. # -# This explains why Gradient Boosted Machines are very popular among -# datascience practitioners who work with tabular data. +# This explains why Gradient Boosted Machines are very popular among datascience +# practitioners who work with tabular data. # %% [markdown] # In this notebook we: # -# * used a `ColumnTransformer` to apply different preprocessing for -# categorical and numerical variables; -# * used a pipeline to chain the `ColumnTransformer` preprocessing and -# logistic regression fitting; -# * saw that **gradient boosting methods** can outperform **linear -# models**. +# * used a `ColumnTransformer` to apply different preprocessing for categorical +# and numerical variables; +# * used a pipeline to chain the `ColumnTransformer` preprocessing and logistic +# regression fitting; +# * saw that **gradient boosting methods** can outperform **linear models**. 
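+
+# %% [markdown]
+# As a closing aside (an addition, not in the original notebook): scikit-learn
+# can also render such compound pipelines as a diagram in Jupyter, which is a
+# convenient way to double-check how the `ColumnTransformer` routes the
+# columns:
+
+# %%
+from sklearn import set_config
+
+set_config(display="diagram")
+# leaving `model` as the last expression of a cell displays the diagram
+model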
diff --git a/python_scripts/03_categorical_pipeline_ex_02.py b/python_scripts/03_categorical_pipeline_ex_02.py index 56adf7b71..6211844c8 100644 --- a/python_scripts/03_categorical_pipeline_ex_02.py +++ b/python_scripts/03_categorical_pipeline_ex_02.py @@ -63,11 +63,13 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.ensemble import HistGradientBoostingClassifier -categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", - unknown_value=-1) -preprocessor = ColumnTransformer([ - ('categorical', categorical_preprocessor, categorical_columns)], - remainder="passthrough") +categorical_preprocessor = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 +) +preprocessor = ColumnTransformer( + [("categorical", categorical_preprocessor, categorical_columns)], + remainder="passthrough", +) model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) @@ -77,9 +79,11 @@ scores = cv_results["test_score"] -print("The mean cross-validation accuracy is: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f} " - f"with a fitting time of {elapsed_time:.3f}") +print( + "The mean cross-validation accuracy is: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f} " + f"with a fitting time of {elapsed_time:.3f}" +) # %% [markdown] # ## Scaling numerical features diff --git a/python_scripts/03_categorical_pipeline_sol_01.py b/python_scripts/03_categorical_pipeline_sol_01.py index fed949fcb..0847e7e30 100644 --- a/python_scripts/03_categorical_pipeline_sol_01.py +++ b/python_scripts/03_categorical_pipeline_sol_01.py @@ -9,8 +9,8 @@ # # ๐Ÿ“ƒ Solution for Exercise M1.04 # # The goal of this exercise is to evaluate the impact of using an arbitrary -# integer encoding for categorical variables along with a linear -# classification model such as Logistic Regression. +# integer encoding for categorical variables along with a linear classification +# model such as Logistic Regression. # # To do so, let's try to use `OrdinalEncoder` to preprocess the categorical # variables. This preprocessor is assembled in a pipeline with @@ -50,8 +50,8 @@ # # Because `OrdinalEncoder` can raise errors if it sees an unknown category at # prediction time, you can set the `handle_unknown="use_encoded_value"` and -# `unknown_value` parameters. You can refer to the -# [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) +# `unknown_value` parameters. You can refer to the [scikit-learn +# documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) # for more details regarding these parameters. # %% @@ -62,7 +62,8 @@ # solution model = make_pipeline( OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), - LogisticRegression(max_iter=500)) + LogisticRegression(max_iter=500), +) # %% [markdown] # Your model is now defined. Evaluate it using a cross-validation using @@ -70,13 +71,12 @@ # # ```{note} # Be aware that if an error happened during the cross-validation, -# `cross_validate` will raise a warning and return NaN (Not a Number) -# as scores. To make it raise a standard Python exception with a traceback, -# you can pass the `error_score="raise"` argument in the call to -# `cross_validate`. An exception will be raised instead of a warning at the first -# encountered problem and `cross_validate` will stop right away instead of -# returning NaN values. This is particularly handy when developing -# complex machine learning pipelines. 
+# `cross_validate` will raise a warning and return NaN (Not a Number) as scores.
+# To make it raise a standard Python exception with a traceback, you can pass
+# the `error_score="raise"` argument in the call to `cross_validate`. An
+# exception will be raised instead of a warning at the first encountered problem
+# and `cross_validate` will stop right away instead of returning NaN values.
+# This is particularly handy when developing complex machine learning pipelines.
# ```

# %%
# solution
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data_categorical, target)
scores = cv_results["test_score"]
-print("The mean cross-validation accuracy is: "
-      f"{scores.mean():.3f} ± {scores.std():.3f}")
+print(
+    "The mean cross-validation accuracy is: "
+    f"{scores.mean():.3f} ± {scores.std():.3f}"
+)

# %% [markdown] tags=["solution"]
# Using an arbitrary mapping from string labels to integers as done here causes
@@ -101,34 +103,39 @@
# %% tags=["solution"]
from sklearn.dummy import DummyClassifier

-cv_results = cross_validate(DummyClassifier(strategy="most_frequent"),
-                            data_categorical, target)
+cv_results = cross_validate(
+    DummyClassifier(strategy="most_frequent"), data_categorical, target
+)
scores = cv_results["test_score"]
-print("The mean cross-validation accuracy is: "
-      f"{scores.mean():.3f} ± {scores.std():.3f}")
+print(
+    "The mean cross-validation accuracy is: "
+    f"{scores.mean():.3f} ± {scores.std():.3f}"
+)

# %% [markdown]
# Now, we would like to compare the generalization performance of our previous
-# model with a new model where instead of using an `OrdinalEncoder`, we will
-# use a `OneHotEncoder`. Repeat the model evaluation using cross-validation.
-# Compare the score of both models and conclude on the impact of choosing a
-# specific encoding strategy when using a linear model.
+# model with a new model where instead of using an `OrdinalEncoder`, we will use
+# a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare
+# the score of both models and conclude on the impact of choosing a specific
+# encoding strategy when using a linear model.

# %%
from sklearn.preprocessing import OneHotEncoder

# solution
model = make_pipeline(
-    OneHotEncoder(handle_unknown="ignore"),
-    LogisticRegression(max_iter=500))
+    OneHotEncoder(handle_unknown="ignore"), LogisticRegression(max_iter=500)
+)
cv_results = cross_validate(model, data_categorical, target)
scores = cv_results["test_score"]
-print("The mean cross-validation accuracy is: "
-      f"{scores.mean():.3f} ± {scores.std():.3f}")
+print(
+    "The mean cross-validation accuracy is: "
+    f"{scores.mean():.3f} ± {scores.std():.3f}"
+)

# %% [markdown] tags=["solution"]
-# With the linear classifier chosen, using an encoding that does not assume
-# any ordering lead to much better result.
+# With the linear classifier chosen, using an encoding that does not assume any
+# ordering leads to much better results.
#
# The important message here is: linear model and `OrdinalEncoder` are used
# together only for ordinal categorical features, i.e. 
features that have a diff --git a/python_scripts/03_categorical_pipeline_sol_02.py b/python_scripts/03_categorical_pipeline_sol_02.py index a0185c9e5..f73671fe4 100644 --- a/python_scripts/03_categorical_pipeline_sol_02.py +++ b/python_scripts/03_categorical_pipeline_sol_02.py @@ -29,9 +29,9 @@ data = adult_census.drop(columns=[target_name, "education-num"]) # %% [markdown] -# As in the previous notebooks, we use the utility `make_column_selector` -# to select only columns with a specific data type. Besides, we list in -# advance all categories for the categorical columns. +# As in the previous notebooks, we use the utility `make_column_selector` to +# select only columns with a specific data type. Besides, we list in advance all +# categories for the categorical columns. # %% from sklearn.compose import make_column_selector as selector @@ -56,11 +56,13 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.ensemble import HistGradientBoostingClassifier -categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", - unknown_value=-1) -preprocessor = ColumnTransformer([ - ('categorical', categorical_preprocessor, categorical_columns)], - remainder="passthrough") +categorical_preprocessor = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 +) +preprocessor = ColumnTransformer( + [("categorical", categorical_preprocessor, categorical_columns)], + remainder="passthrough", +) model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) @@ -70,9 +72,11 @@ scores = cv_results["test_score"] -print("The mean cross-validation accuracy is: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f} " - f"with a fitting time of {elapsed_time:.3f}") +print( + "The mean cross-validation accuracy is: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f} " + f"with a fitting time of {elapsed_time:.3f}" +) # %% [markdown] # ## Scaling numerical features @@ -86,11 +90,18 @@ from sklearn.preprocessing import StandardScaler -preprocessor = ColumnTransformer([ - ('numerical', StandardScaler(), numerical_columns), - ('categorical', OrdinalEncoder(handle_unknown="use_encoded_value", - unknown_value=-1), - categorical_columns)]) +preprocessor = ColumnTransformer( + [ + ("numerical", StandardScaler(), numerical_columns), + ( + "categorical", + OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ), + categorical_columns, + ), + ] +) model = make_pipeline(preprocessor, HistGradientBoostingClassifier()) @@ -100,9 +111,11 @@ scores = cv_results["test_score"] -print("The mean cross-validation accuracy is: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f} " - f"with a fitting time of {elapsed_time:.3f}") +print( + "The mean cross-validation accuracy is: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f} " + f"with a fitting time of {elapsed_time:.3f}" +) # %% [markdown] tags=["solution"] # ### Analysis @@ -119,15 +132,15 @@ # # We observed that integer coding of categorical variables can be very # detrimental for linear models. However, it does not seem to be the case for -# `HistGradientBoostingClassifier` models, as the cross-validation score -# of the reference pipeline with `OrdinalEncoder` is reasonably good. +# `HistGradientBoostingClassifier` models, as the cross-validation score of the +# reference pipeline with `OrdinalEncoder` is reasonably good. # # Let's see if we can get an even better accuracy with `OneHotEncoder`. # -# Hint: `HistGradientBoostingClassifier` does not yet support sparse input -# data. 
-# `OneHotEncoder(handle_unknown="ignore", sparse_output=False)` to force the use of a
-# dense representation as a workaround.
+# Hint: `HistGradientBoostingClassifier` does not yet support sparse input data.
+# You might want to use `OneHotEncoder(handle_unknown="ignore",
+# sparse_output=False)` to force the use of a dense representation as a
+# workaround.

 # %%
 # solution

 from sklearn.preprocessing import OneHotEncoder

-categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
-preprocessor = ColumnTransformer([
-    ('one-hot-encoder', categorical_preprocessor, categorical_columns)],
-    remainder="passthrough")
+categorical_preprocessor = OneHotEncoder(
+    handle_unknown="ignore", sparse_output=False
+)
+preprocessor = ColumnTransformer(
+    [("one-hot-encoder", categorical_preprocessor, categorical_columns)],
+    remainder="passthrough",
+)

 model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

@@ -148,41 +164,43 @@

 scores = cv_results["test_score"]

-print("The mean cross-validation accuracy is: "
-      f"{scores.mean():.3f} ± {scores.std():.3f} "
-      f"with a fitting time of {elapsed_time:.3f}")
+print(
+    "The mean cross-validation accuracy is: "
+    f"{scores.mean():.3f} ± {scores.std():.3f} "
+    f"with a fitting time of {elapsed_time:.3f}"
+)

 # %% [markdown] tags=["solution"]
 # ### Analysis
 #
-# From an accuracy point of view, the result is almost exactly the same.
-# The reason is that `HistGradientBoostingClassifier` is expressive
-# and robust enough to deal with misleading ordering of integer coded
-# categories (which was not the case for linear models).
+# From an accuracy point of view, the result is almost exactly the same. The
+# reason is that `HistGradientBoostingClassifier` is expressive and robust
+# enough to deal with misleading ordering of integer coded categories (which was
+# not the case for linear models).
 #
-# However from a computation point of view, the training time is
-# much longer: this is caused by the fact that `OneHotEncoder`
-# generates approximately 10 times more features than `OrdinalEncoder`.
+# However from a computation point of view, the training time is much longer:
+# this is caused by the fact that `OneHotEncoder` generates approximately 10
+# times more features than `OrdinalEncoder`.
 #
-# Note that the current implementation `HistGradientBoostingClassifier`
-# is still incomplete, and once sparse representation are handled
-# correctly, training time might improve with such kinds of encodings.
+# Note that the current implementation of `HistGradientBoostingClassifier` is
+# still incomplete, and once sparse representations are handled correctly,
+# training time might improve with such kinds of encodings.
 #
-# The main take away message is that arbitrary integer coding of
-# categories is perfectly fine for `HistGradientBoostingClassifier`
-# and yields fast training times.
+# The main take-away message is that arbitrary integer coding of categories is
+# perfectly fine for `HistGradientBoostingClassifier` and yields fast training
+# times.

 # %% [markdown] tags=["solution"]
 # ```{important}
 # Which encoder should I use?
 #
-# | | Meaningful order | Non-meaningful order
-# ------------ | ------------- | -------------
-# Tree-based model | `OrdinalEncoder` | `OrdinalEncoder`
-# Linear model | `OrdinalEncoder` with caution | `OneHotEncoder`
+# |                  | Meaningful order              | Non-meaningful order |
+# | ---------------- | ----------------------------- | -------------------- |
+# | Tree-based model | `OrdinalEncoder`              | `OrdinalEncoder`     |
+# | Linear model     | `OrdinalEncoder` with caution | `OneHotEncoder`      |
 #
-# - `OneHotEncoder`: will always do something meaningful, but can be
-#   unnecessary slow with trees.
+# - `OneHotEncoder`: will always do something meaningful, but can be
+#   unnecessarily slow with trees.
 # - `OrdinalEncoder`: can be detrimental for linear models unless your category
 #   has a meaningful order and you make sure that `OrdinalEncoder` respects this
 #   order. Trees can deal with `OrdinalEncoder` fine as long as they are deep
diff --git a/python_scripts/03_categorical_pipeline_visualization.py b/python_scripts/03_categorical_pipeline_visualization.py
index 101fe04f6..0b10a6f42 100644
--- a/python_scripts/03_categorical_pipeline_visualization.py
+++ b/python_scripts/03_categorical_pipeline_visualization.py
@@ -10,7 +10,7 @@

 # %% [markdown]
 # The goal of keeping this notebook is to:
-
+#
 # - make it available for users that want to reproduce it locally
 # - archive the script in the event we want to rerecord this video with an
 #   update in the UI of scikit-learn in a future release.
@@ -19,15 +19,19 @@
 # ## First we load the dataset

 # %% [markdown]
-# We need to define our data and target. In this case we will build a classification model
+# We need to define our data and target. In this case we will build a
+# classification model.

 # %%
 import pandas as pd

-ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values='?')
+ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")

 target_name = "SalePrice"
-data, target = ames_housing.drop(columns=target_name), ames_housing[target_name]
+data, target = (
+    ames_housing.drop(columns=target_name),
+    ames_housing[target_name],
+)
 target = (target > 200_000).astype(int)

 # %% [markdown]
@@ -41,8 +45,8 @@
 # this arbitrary subset of data:

 # %%
-numeric_features = ['LotArea', 'FullBath', 'HalfBath']
-categorical_features = ['Neighborhood', 'HouseStyle']
+numeric_features = ["LotArea", "FullBath", "HalfBath"]
+categorical_features = ["Neighborhood", "HouseStyle"]
 data = data[numeric_features + categorical_features]

 # %% [markdown]
@@ -56,12 +60,17 @@
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, OneHotEncoder

-numeric_transformer = Pipeline(steps=[
-    ('imputer', SimpleImputer(strategy='median')),
-    ('scaler', StandardScaler(),
-)])
+numeric_transformer = Pipeline(
+    steps=[
+        ("imputer", SimpleImputer(strategy="median")),
+        (
+            "scaler",
+            StandardScaler(),
+        ),
+    ]
+)

-categorical_transformer = OneHotEncoder(handle_unknown='ignore')
+categorical_transformer = OneHotEncoder(handle_unknown="ignore")

 # %% [markdown]
 # The next step is to apply the transformations using `ColumnTransformer`

 # %%
 from sklearn.compose import ColumnTransformer

-preprocessor = ColumnTransformer(transformers=[
-    ('num', numeric_transformer, numeric_features),
-    ('cat', categorical_transformer, categorical_features),
-])
+preprocessor = ColumnTransformer(
+    transformers=[
+        ("num", numeric_transformer, numeric_features),
+        ("cat", categorical_transformer, categorical_features),
+    ]
+)

 # %% [markdown]
 # Then we define the model and join the steps in order
@@ -80,10 +91,12 @@
 # %%
 from sklearn.linear_model import LogisticRegression

-model = Pipeline(steps=[
-    ('preprocessor', preprocessor),
-    ('classifier', LogisticRegression()),
-])
+model = Pipeline(
+    steps=[
+        ("preprocessor", preprocessor),
+        ("classifier", LogisticRegression()),
+    ]
+)

 # %% [markdown]
 # Let's visualize it!
@@ -99,21 +112,23 @@

 cv_results = cross_validate(model, data, target, cv=5)
 scores = cv_results["test_score"]
-print("The mean cross-validation accuracy is: "
-      f"{scores.mean():.3f} ± {scores.std():.3f}")
+print(
+    "The mean cross-validation accuracy is: "
+    f"{scores.mean():.3f} ± {scores.std():.3f}"
+)

 # %% [markdown]
 # ```{note}
 # In this case, around 86% of the time the pipeline correctly predicts whether
-# the price of a house is above or below the 200_000 dollars threshold. But
-# be aware that this score was obtained by picking some features by hand, which
-# is not necessarily the best thing we can do for this classification task. In this
+# the price of a house is above or below the 200_000 dollars threshold. But be
+# aware that this score was obtained by picking some features by hand, which is
+# not necessarily the best thing we can do for this classification task. In this
 # example we can hope that fitting a complex machine learning pipeline on a
 # richer set of features can improve upon this performance level.
 #
 # Reducing a price estimation problem to a binary classification problem with a
-# single threshold at 200_000 dollars is probably too coarse to be useful in
-# in practice. Treating this problem as a regression problem is probably a better
+# single threshold at 200_000 dollars is probably too coarse to be useful in
+# practice. Treating this problem as a regression problem is probably a better
 # idea. We will see later in this MOOC how to train and evaluate the performance
 # of various regression models.
 # ```
diff --git a/python_scripts/cross_validation_baseline.py b/python_scripts/cross_validation_baseline.py
index 04923b1b5..832e4400a 100644
--- a/python_scripts/cross_validation_baseline.py
+++ b/python_scripts/cross_validation_baseline.py
@@ -29,8 +29,8 @@
 target *= 100  # rescale the target in k$

 # %% [markdown]
-# Across all evaluations, we will use a `ShuffleSplit` cross-validation
-# splitter with 20% of the data held on the validation side of the split.
+# Across all evaluations, we will use a `ShuffleSplit` cross-validation splitter
+# with 20% of the data held on the validation side of the split.

 # %%
 from sklearn.model_selection import ShuffleSplit
@@ -75,7 +75,6 @@
 )
 errors_dummy_regressor.describe()

-
 # %% [markdown]
 # We now plot the cross-validation testing errors for the mean target baseline
 # and the actual decision tree regressor.
@@ -99,13 +98,13 @@

 # %% [markdown]
 # We see that the generalization performance of our decision tree is far from
-# being perfect: the price predictions are off by more than 45,000 US dollars
-# on average. However it is much better than the mean price baseline. So this
-# confirms that it is possible to predict the housing price much better by
-# using a model that takes into account the values of the input features
-# (housing location, size, neighborhood income...). Such a model makes more
-# informed predictions and approximately divides the error rate by a factor of 2
-# compared to the baseline that ignores the input features.
+# being perfect: the price predictions are off by more than 45,000 US dollars on
+# average. However, it is much better than the mean price baseline. So this
+# confirms that it is possible to predict the housing price much better by using
+# a model that takes into account the values of the input features (housing
+# location, size, neighborhood income...). Such a model makes more informed
+# predictions and approximately divides the error rate by a factor of 2 compared
+# to the baseline that ignores the input features.
 #
 # Note that here we used the mean price as the baseline prediction. We could
 # have used the median instead. See the online documentation of the
diff --git a/python_scripts/cross_validation_grouping.py b/python_scripts/cross_validation_grouping.py
index 597928fca..3c473ecdf 100644
--- a/python_scripts/cross_validation_grouping.py
+++ b/python_scripts/cross_validation_grouping.py
@@ -18,8 +18,8 @@
 data, target = digits.data, digits.target

 # %% [markdown]
-# We will recreate the same model used in the previous notebook:
-# a logistic regression classifier with a preprocessor to scale the data.
+# We will recreate the same model used in the previous notebook: a logistic
+# regression classifier with a preprocessor to scale the data.

 # %%
 from sklearn.preprocessing import MinMaxScaler
@@ -36,11 +36,12 @@
 from sklearn.model_selection import cross_val_score, KFold

 cv = KFold(shuffle=False)
-test_score_no_shuffling = cross_val_score(model, data, target, cv=cv,
-                                          n_jobs=2)
-print(f"The average accuracy is "
-      f"{test_score_no_shuffling.mean():.3f} ± "
-      f"{test_score_no_shuffling.std():.3f}")
+test_score_no_shuffling = cross_val_score(model, data, target, cv=cv, n_jobs=2)
+print(
+    "The average accuracy is "
+    f"{test_score_no_shuffling.mean():.3f} ± "
+    f"{test_score_no_shuffling.std():.3f}"
+)

 # %% [markdown]
 # Now, let's repeat the experiment by shuffling the data within the
@@ -48,16 +49,19 @@

 # %%
 cv = KFold(shuffle=True)
-test_score_with_shuffling = cross_val_score(model, data, target, cv=cv,
-                                            n_jobs=2)
-print(f"The average accuracy is "
-      f"{test_score_with_shuffling.mean():.3f} ± "
-      f"{test_score_with_shuffling.std():.3f}")
+test_score_with_shuffling = cross_val_score(
+    model, data, target, cv=cv, n_jobs=2
+)
+print(
+    "The average accuracy is "
+    f"{test_score_with_shuffling.mean():.3f} ± "
+    f"{test_score_with_shuffling.std():.3f}"
+)

 # %% [markdown]
-# We observe that shuffling the data improves the mean accuracy.
-# We could go a little further and plot the distribution of the testing
-# score. We can first concatenate the test scores.
+# We observe that shuffling the data improves the mean accuracy. We could go a
+# little further and plot the distribution of the testing score. We can first
+# concatenate the test scores.

 # %%
 import pandas as pd
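The concatenation and plotting code itself falls outside the visible hunk context; a sketch consistent with the `all_scores` construction shown further down, reusing the two score arrays defined above, might be:

```python
all_scores = pd.DataFrame(
    [test_score_no_shuffling, test_score_with_shuffling],
    index=["KFold without shuffling", "KFold with shuffling"],
).T

# Plot the distribution of the testing scores for both strategies.
all_scores.plot.hist(bins=10, edgecolor="black", alpha=0.7)
```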
@@ -136,8 +140,23 @@

 # defines the lower and upper bounds of sample indices
 # for each writer
-writer_boundaries = [0, 130, 256, 386, 516, 646, 776, 915, 1029,
-                     1157, 1287, 1415, 1545, 1667, 1797]
+writer_boundaries = [
+    0,
+    130,
+    256,
+    386,
+    516,
+    646,
+    776,
+    915,
+    1029,
+    1157,
+    1287,
+    1415,
+    1545,
+    1667,
+    1797,
+]
 groups = np.zeros_like(target)
 lower_bounds = writer_boundaries[:-1]
 upper_bounds = writer_boundaries[1:]
@@ -164,23 +183,27 @@
 from sklearn.model_selection import GroupKFold

 cv = GroupKFold()
-test_score = cross_val_score(model, data, target, groups=groups, cv=cv,
-                             n_jobs=2)
-print(f"The average accuracy is "
-      f"{test_score.mean():.3f} ± "
-      f"{test_score.std():.3f}")
+test_score = cross_val_score(
+    model, data, target, groups=groups, cv=cv, n_jobs=2
+)
+print(
+    f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}"
+)

 # %% [markdown]
-# We see that this strategy is less optimistic regarding the model generalization
-# performance. However, this is the most reliable if our goal is to make
-# handwritten digits recognition writers independent. Besides, we can as well
-# see that the standard deviation was reduced.
+# We see that this strategy is less optimistic regarding the model
+# generalization performance. However, this is the most reliable if our goal is
+# to make handwritten digit recognition independent of the writer. Besides, we
+# can also see that the standard deviation was reduced.

 # %%
 all_scores = pd.DataFrame(
     [test_score_no_shuffling, test_score_with_shuffling, test_score],
-    index=["KFold without shuffling", "KFold with shuffling",
-           "KFold with groups"],
+    index=[
+        "KFold without shuffling",
+        "KFold with shuffling",
+        "KFold with groups",
+    ],
 ).T

 # %%
diff --git a/python_scripts/cross_validation_learning_curve.py b/python_scripts/cross_validation_learning_curve.py
index 3a73516c8..26e615c14 100644
--- a/python_scripts/cross_validation_learning_curve.py
+++ b/python_scripts/cross_validation_learning_curve.py
@@ -13,8 +13,8 @@
 # generalizing. Besides these aspects, it is also important to understand how
 # the different errors are influenced by the number of samples available.
 #
-# In this notebook, we will show this aspect by looking a the variability of
-# the different errors.
+# In this notebook, we will show this aspect by looking at the variability of
+# the different errors.
 #
 # Let's first load the data and create the same model as in the previous
 # notebook.
@@ -50,14 +50,15 @@
 # the validation curve, but instead of varying a hyperparameter, we vary the
 # number of training samples. This curve is called the **learning curve**.
 #
-# It gives information regarding the benefit of adding new training samples
-# to improve a model's generalization performance.
+# It gives information regarding the benefit of adding new training samples to
+# improve a model's generalization performance.
 #
-# Let's compute the learning curve for a decision tree and vary the
-# proportion of the training set from 10% to 100%.
+# Let's compute the learning curve for a decision tree and vary the proportion
+# of the training set from 10% to 100%.
 # %%
 import numpy as np

+
 train_sizes = np.linspace(0.1, 1.0, num=5, endpoint=True)
 train_sizes
@@ -76,8 +77,14 @@
 from sklearn.model_selection import learning_curve

 results = learning_curve(
-    regressor, data, target, train_sizes=train_sizes, cv=cv,
-    scoring="neg_mean_absolute_error", n_jobs=2)
+    regressor,
+    data,
+    target,
+    train_sizes=train_sizes,
+    cv=cv,
+    scoring="neg_mean_absolute_error",
+    n_jobs=2,
+)
 train_size, train_scores, test_scores = results[:3]
 # Convert the scores into errors
 train_errors, test_errors = -train_scores, -test_scores
@@ -88,10 +95,18 @@
 # %%
 import matplotlib.pyplot as plt

-plt.errorbar(train_size, train_errors.mean(axis=1),
-             yerr=train_errors.std(axis=1), label="Training error")
-plt.errorbar(train_size, test_errors.mean(axis=1),
-             yerr=test_errors.std(axis=1), label="Testing error")
+plt.errorbar(
+    train_size,
+    train_errors.mean(axis=1),
+    yerr=train_errors.std(axis=1),
+    label="Training error",
+)
+plt.errorbar(
+    train_size,
+    test_errors.mean(axis=1),
+    yerr=test_errors.std(axis=1),
+    label="Testing error",
+)
 plt.legend()

 plt.xscale("log")
@@ -104,11 +119,11 @@
 # means that the trained model (i.e. decision tree) is clearly overfitting the
 # training data.
 #
-# Looking at the testing error alone, we observe that the more samples are
-# added into the training set, the lower the testing error becomes. Also, we
-# are searching for the plateau of the testing error for which there is no
-# benefit to adding samples anymore or assessing the potential gain of adding
-# more samples into the training set.
+# Looking at the testing error alone, we observe that the more samples are added
+# into the training set, the lower the testing error becomes. Also, we are
+# searching for the plateau of the testing error, beyond which adding more
+# samples no longer improves the model, and assessing the potential gain of
+# adding more samples into the training set.
 #
 # If we achieve a plateau and adding new samples in the training set does not
 # reduce the testing error, we might have reached the Bayes error rate using the
diff --git a/python_scripts/cross_validation_nested.py b/python_scripts/cross_validation_nested.py
index 0d8028102..9c1b78e26 100644
--- a/python_scripts/cross_validation_nested.py
+++ b/python_scripts/cross_validation_nested.py
@@ -8,16 +8,14 @@
 # %% [markdown]
 # # Nested cross-validation
 #
-# Cross-validation can be used both for hyperparameter tuning and for
-# estimating the generalization performance of a model. However, using
-# it for both purposes at the same time is problematic, as the resulting
-# evaluation can underestimate some overfitting that results from
-# the hyperparameter tuning procedure itself.
+# Cross-validation can be used both for hyperparameter tuning and for estimating
+# the generalization performance of a model. However, using it for both purposes
+# at the same time is problematic, as the resulting evaluation can underestimate
+# some overfitting that results from the hyperparameter tuning procedure itself.
 #
-# Philosophically, hyperparameter tuning is a form of machine learning
-# itself and therefore, we need another outer loop of cross-validation to
-# properly evaluate the generalization performance of the full modeling
-# procedure.
+# Philosophically, hyperparameter tuning is a form of machine learning itself
+# and therefore, we need another outer loop of cross-validation to properly
+# evaluate the generalization performance of the full modeling procedure.
 #
 # This notebook highlights nested cross-validation and its impact on the
 # estimated generalization performance compared to naively using a single level
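As a preview of what this notebook builds up step by step, a minimal sketch of the two nested loops, assuming the same `SVC` model and parameter grid used below and the `data`/`target` arrays loaded in the notebook:

```python
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.svm import SVC

# Inner loop: a cross-validated grid search tunes the hyperparameters.
inner_cv = KFold(n_splits=4, shuffle=True, random_state=0)
tuned_model = GridSearchCV(
    SVC(), param_grid={"C": [0.1, 1, 10], "gamma": [0.01, 0.1]}, cv=inner_cv
)

# Outer loop: an independent cross-validation evaluates the full procedure,
# so the tuning step is re-run inside every outer training fold.
outer_cv = KFold(n_splits=4, shuffle=True, random_state=0)
scores = cross_val_score(tuned_model, data, target, cv=outer_cv)
```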
@@ -39,23 +37,21 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import SVC

-param_grid = {"C": [0.1, 1, 10], "gamma": [.01, .1]}
+param_grid = {"C": [0.1, 1, 10], "gamma": [0.01, 0.1]}

 model_to_tune = SVC()

-search = GridSearchCV(
-    estimator=model_to_tune, param_grid=param_grid, n_jobs=2
-)
+search = GridSearchCV(estimator=model_to_tune, param_grid=param_grid, n_jobs=2)
 search.fit(data, target)

 # %% [markdown]
 # We recall that, internally, `GridSearchCV` trains several models, each on
 # sub-sampled training sets, and evaluates each of them on the matching testing
-# sets using cross-validation. This evaluation procedure is controlled via
-# using the `cv` parameter. The procedure is then repeated for all possible
+# sets using cross-validation. This evaluation procedure is controlled via the
+# `cv` parameter. The procedure is then repeated for all possible
 # combinations of parameters given in `param_grid`.
 #
-# The attribute `best_params_` gives us the best set of parameters that
-# maximize the mean score on the internal test sets.
+# The attribute `best_params_` gives us the best set of parameters that maximize
+# the mean score on the internal test sets.

 # %%
 print(f"The best parameters found are: {search.best_params_}")
@@ -69,9 +65,8 @@
 # %% [markdown]
 # At this stage, one should be extremely careful using this score. The
 # misinterpretation would be the following: since this mean score was computed
-# using cross-validation test sets, we could use it to assess the
-# generalization performance of the model trained with the best
-# hyper-parameters.
+# using cross-validation test sets, we could use it to assess the generalization
+# performance of the model trained with the best hyper-parameters.
 #
 # However, we should not forget that we used this score to pick the best
 # model. It means that we used knowledge from the test sets (i.e. test scores)
@@ -89,8 +84,8 @@
 # dedicated to estimating the testing error of our tuned model.
 #
 # In this case, our inner cross-validation always gets the training set of the
-# outer cross-validation, making it possible to always compute the final
-# testing scores on completely independent sets of samples.
+# outer cross-validation, making it possible to always compute the final testing
+# scores on completely independent sets of samples.
 #
 # Let us do this in one go as follows:

@@ -108,8 +103,10 @@
 # Outer cross-validation to compute the testing score
 test_score = cross_val_score(model, data, target, cv=outer_cv, n_jobs=2)
-print(f"The mean score using nested cross-validation is: "
-      f"{test_score.mean():.3f} ± {test_score.std():.3f}")
+print(
+    "The mean score using nested cross-validation is: "
+    f"{test_score.mean():.3f} ± {test_score.std():.3f}"
+)

 # %% [markdown]
 # The reported score is more trustworthy and should be close to production's
 # expected generalization performance.
 #
 # We would like to better assess the difference between the nested and
 # non-nested cross-validation scores to show that the latter can be too
-# optimistic in practice. To do this, we repeat the experiment several times
-# and shuffle the data differently to ensure that our conclusion does not
-# depend on a particular resampling of the data.
+# optimistic in practice. To do this, we repeat the experiment several times and
+# shuffle the data differently to ensure that our conclusion does not depend on
+# a particular resampling of the data.

 # %%
 test_score_not_nested = []
@@ -135,8 +132,9 @@
     outer_cv = KFold(n_splits=3, shuffle=True, random_state=i)

     # Non-nested parameter search and scoring
-    model = GridSearchCV(estimator=model_to_tune, param_grid=param_grid,
-                         cv=inner_cv, n_jobs=2)
+    model = GridSearchCV(
+        estimator=model_to_tune, param_grid=param_grid, cv=inner_cv, n_jobs=2
+    )
     model.fit(data, target)
     test_score_not_nested.append(model.best_score_)

@@ -162,17 +160,19 @@
 color = {"whiskers": "black", "medians": "black", "caps": "black"}
 all_scores.plot.box(color=color, vert=False)
 plt.xlabel("Accuracy")
-_ = plt.title("Comparison of mean accuracy obtained on the test sets with\n"
-              "and without nested cross-validation")
+_ = plt.title(
+    "Comparison of mean accuracy obtained on the test sets with\n"
+    "and without nested cross-validation"
+)

 # %% [markdown]
 # We observe that the generalization performance estimated without using nested
-# CV is higher than what we obtain with nested CV. The reason is that the
-# tuning procedure itself selects the model with the highest inner CV score. If
-# there are many hyper-parameter combinations and if the inner CV scores have
-# comparatively large standard deviations, taking the maximum value can lure
-# the naive data scientist into over-estimating the true generalization
-# performance of the result of the full learning procedure. By using an outer
+# CV is higher than what we obtain with nested CV. The reason is that the tuning
+# procedure itself selects the model with the highest inner CV score. If there
+# are many hyper-parameter combinations and if the inner CV scores have
+# comparatively large standard deviations, taking the maximum value can lure the
+# naive data scientist into over-estimating the true generalization performance
+# of the result of the full learning procedure. By using an outer
 # cross-validation procedure one gets a more trustworthy estimate of the
 # generalization performance of the full learning procedure, including the
 # effect of tuning the hyperparameters.
diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py
index 3525bd521..7dcd6ad96 100644
--- a/python_scripts/cross_validation_sol_01.py
+++ b/python_scripts/cross_validation_sol_01.py
@@ -81,7 +81,7 @@

 # %% tags=["solution"]
 print(
-    f"Accuracy score of our model:\n"
+    "Accuracy score of our model:\n"
     f"{cv_results['test_score'].mean():.3f} ± "
     f"{cv_results['test_score'].std():.3f}"
 )
@@ -110,8 +110,14 @@
 gammas = np.logspace(-3, 2, num=30)
 param_name = "svc__gamma"
 train_scores, test_scores = validation_curve(
-    model, data, target, param_name=param_name, param_range=gammas, cv=cv,
-    n_jobs=2)
+    model,
+    data,
+    target,
+    param_name=param_name,
+    param_range=gammas,
+    cv=cv,
+    n_jobs=2,
+)

 # %% [markdown]
 # Plot the validation curve for the train and test scores.
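The requested plot is not part of the visible hunks; a sketch in this notebook's plotting style, reusing `gammas`, `train_scores` and `test_scores` from the cell above, could be:

```python
import matplotlib.pyplot as plt

plt.errorbar(
    gammas,
    train_scores.mean(axis=1),
    yerr=train_scores.std(axis=1),
    label="Training score",
)
plt.errorbar(
    gammas,
    test_scores.mean(axis=1),
    yerr=test_scores.std(axis=1),
    label="Testing score",
)
plt.xscale("log")
plt.xlabel("gamma of the SVC")
plt.ylabel("Accuracy")
_ = plt.legend()
```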
@@ -160,7 +166,8 @@
 train_sizes = np.linspace(0.1, 1, num=10)

 results = learning_curve(
-    model, data, target, train_sizes=train_sizes, cv=cv, n_jobs=2)
+    model, data, target, train_sizes=train_sizes, cv=cv, n_jobs=2
+)
 train_size, train_scores, test_scores = results[:3]

 # %% tags=["solution"]
diff --git a/python_scripts/cross_validation_sol_02.py b/python_scripts/cross_validation_sol_02.py
index c56f57cc1..1de2517aa 100644
--- a/python_scripts/cross_validation_sol_02.py
+++ b/python_scripts/cross_validation_sol_02.py
@@ -8,12 +8,12 @@
 # %% [markdown]
 # # 📃 Solution for Exercise M7.01
 #
-# In this exercise we will define dummy classification baselines and use them
-# as reference to assess the relative predictive performance of a given model
-# of interest.
+# In this exercise we will define dummy classification baselines and use them as
+# reference to assess the relative predictive performance of a given model of
+# interest.
 #
-# We illustrate those baselines with the help of the Adult Census dataset,
-# using only the numerical features for the sake of simplicity.
+# We illustrate those baselines with the help of the Adult Census dataset, using
+# only the numerical features for the sake of simplicity.

 # %%
 import pandas as pd
@@ -44,8 +44,8 @@
 classifier = make_pipeline(StandardScaler(), LogisticRegression())

 # %% [markdown]
-# Compute the cross-validation (test) scores for the classifier on this
-# dataset. Store the results pandas Series as we did in the previous notebook.
+# Compute the cross-validation (test) scores for the classifier on this dataset.
+# Store the results in a pandas Series as we did in the previous notebook.

 # %%
 # solution
@@ -61,9 +61,9 @@
 test_score_logistic_regression

 # %% [markdown]
-# Now, compute the cross-validation scores of a dummy classifier that
-# constantly predicts the most frequent class observed the training set. Please
-# refer to the online documentation for the [sklearn.dummy.DummyClassifier
+# Now, compute the cross-validation scores of a dummy classifier that constantly
+# predicts the most frequent class observed in the training set. Please refer to
+# the online documentation for the [sklearn.dummy.DummyClassifier
 # ](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)
 # class.
 #
@@ -78,26 +78,27 @@
     most_frequent_classifier, data, target, cv=cv, n_jobs=2
 )
 test_score_most_frequent = pd.Series(
-    cv_results_most_frequent["test_score"], name="Most frequent class predictor"
+    cv_results_most_frequent["test_score"],
+    name="Most frequent class predictor",
 )
 test_score_most_frequent

 # %% [markdown]
-# Now that we collected the results from the baseline and the model,
-# concatenate the test scores as columns a single pandas dataframe.
+# Now that we collected the results from the baseline and the model, concatenate
+# the test scores as columns of a single pandas dataframe.

 # %%
 # solution
 all_test_scores = pd.concat(
     [test_score_logistic_regression, test_score_most_frequent],
-    axis='columns',
+    axis="columns",
 )
 all_test_scores

 # %% [markdown]
 #
-# Next, plot the histogram of the cross-validation test scores for both
-# models with the help of [pandas built-in plotting
+# Next, plot the histogram of the cross-validation test scores for both models
+# with the help of [pandas built-in plotting
 # function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms).
 #
 # What conclusions do you draw from the results?
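The histogram itself falls outside the visible hunk context; a sketch using pandas' built-in plotting, reusing `all_test_scores` from above, might look like:

```python
import matplotlib.pyplot as plt

# One overlaid histogram of the cross-validated accuracies per model.
all_test_scores.plot.hist(bins=50, edgecolor="black", alpha=0.7)
_ = plt.xlabel("Accuracy")
```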
@@ -115,10 +116,10 @@

 # %% [markdown] tags=["solution"]
 # We observe that the two histograms are well separated. Therefore the dummy
-# classifier with the strategy `most_frequent` has a much lower accuracy
-# than the logistic regression classifier. We conclude that the logistic
-# regression model can successfully find predictive information in the input
-# features to improve upon the baseline.
+# classifier with the strategy `most_frequent` has a much lower accuracy than
+# the logistic regression classifier. We conclude that the logistic regression
+# model can successfully find predictive information in the input features to
+# improve upon the baseline.

 # %% [markdown]
 # Change the `strategy` of the dummy classifier to `"stratified"`, compute the
@@ -160,7 +161,7 @@
         test_score_dummy_stratified,
         test_score_dummy_uniform,
     ],
-    axis='columns',
+    axis="columns",
 )

 # %% tags=["solution"]
@@ -172,16 +173,16 @@
 # %% [markdown] tags=["solution"]
 # We see that using `strategy="stratified"`, the results are much worse than
 # with the `most_frequent` strategy. Since the classes are imbalanced,
-# predicting the most frequent involves that we will be right for the
-# proportion of this class (~75% of the samples). However, the `"stratified"`
-# strategy will randomly generate predictions by respecting the training
-# set's class distribution, resulting in some wrong predictions even for
-# the most frequent class, hence we obtain a lower accuracy.
+# predicting the most frequent class implies that we will be right for the
+# proportion of this class (~75% of the samples). However, the `"stratified"`
+# strategy will randomly generate predictions by respecting the training set's
+# class distribution, resulting in some wrong predictions even for the most
+# frequent class, hence we obtain a lower accuracy.
 #
-# This is even more so for the `strategy="uniform"`: this strategy assigns
-# class labels uniformly at random. Therefore, on a binary classification
-# problem, the cross-validation accuracy is 50% on average, which is the
-# weakest of the three dummy baselines.
+# This is even more so for `strategy="uniform"`: this strategy assigns class
+# labels uniformly at random. Therefore, on a binary classification problem, the
+# cross-validation accuracy is 50% on average, which is the weakest of the three
+# dummy baselines.

 # %% [markdown] tags=["solution"]
 # Note: one could argue that the `"uniform"` or `strategy="stratified"`
@@ -198,14 +199,14 @@
 # of interest. When training on such randomly permuted labels, many machine
 # learning estimators would end up approximately behaving much like the
 # `DummyClassifier(strategy="most_frequent")` by always predicting the majority
-# class, irrespective of the input features. As a result, this
-# `"most_frequent"` baseline is sometimes called the "chance level" for
-# imbalanced classification problems, even though its predictions are
-# completely deterministic and do not involve much "chance" anymore.
+# class, irrespective of the input features. As a result, this `"most_frequent"`
+# baseline is sometimes called the "chance level" for imbalanced classification
+# problems, even though its predictions are completely deterministic and do not
+# involve much "chance" anymore.
 #
 # Defining the chance level using `permutation_test_score` is quite
 # computation-intensive because it requires fitting many non-dummy models on
-# random permutations of the data. Using dummy classifiers as baselines is
-# often enough for practical purposes. For imbalanced classification problems,
-# the `"most_frequent"` strategy is the strongest of the three baselines and
-# therefore the one we should use.
+# random permutations of the data. Using dummy classifiers as baselines is often
+# enough for practical purposes. For imbalanced classification problems, the
+# `"most_frequent"` strategy is the strongest of the three baselines and
+# therefore the one we should use.
diff --git a/python_scripts/cross_validation_stratification.py b/python_scripts/cross_validation_stratification.py
index 927bca80b..4acad6a36 100644
--- a/python_scripts/cross_validation_stratification.py
+++ b/python_scripts/cross_validation_stratification.py
@@ -12,11 +12,11 @@
 # In the previous notebooks, we always used either a default `KFold` or a
 # `ShuffleSplit` cross-validation strategy to iteratively split our dataset.
 # However, you should not assume that these approaches are always the best
-# option: some other cross-validation strategies might be better adapted to
-# your problem.
+# option: some other cross-validation strategies might be better adapted to your
+# problem.
 #
-# Let's start with the concept of stratification by giving an example where
-# we can get into trouble if we are not careful. Let's load the iris dataset.
+# Let's start with the concept of stratification by giving an example where we
+# can get into trouble if we are not careful. Let's load the iris dataset.

 # %%
 from sklearn.datasets import load_iris
@@ -37,9 +37,9 @@

 # %% [markdown]
 # Once we created our model, we will use the cross-validation framework to
-# evaluate it. We will use the `KFold` cross-validation strategy.
-# We will define a dataset with nine samples and repeat the cross-validation
-# three times (i.e. `n_splits`).
+# evaluate it. We will use the `KFold` cross-validation strategy. We will define
+# a dataset with nine samples and repeat the cross-validation three times (i.e.
+# `n_splits`).

 # %%
 import numpy as np
@@ -52,11 +52,11 @@

 # %% [markdown]
 # By defining three splits, we will use three samples for testing and six for
-# training each time. `KFold` does not shuffle by default. It means that it
-# will select the three first samples for the testing set at the first split,
-# then the three next three samples for the second split, and the three next
-# for the last split. In the end, all samples have been used in testing at
-# least once among the different splits.
+# training each time. `KFold` does not shuffle by default. It means that it will
+# select the first three samples for the testing set at the first split, then
+# the next three samples for the second split, and the next three for the last
+# split. In the end, all samples have been used in testing at least once among
+# the different splits.
 #
 # Now, let's apply this strategy to check the generalization performance of our
 # model.
@@ -67,13 +67,14 @@
 cv = KFold(n_splits=3)
 results = cross_validate(model, data, target, cv=cv)
 test_score = results["test_score"]
-print(f"The average accuracy is "
-      f"{test_score.mean():.3f} ± {test_score.std():.3f}")
+print(
+    f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}"
+)

 # %% [markdown]
 # It is a real surprise that our model cannot correctly classify any sample in
-# any cross-validation split. We will now check our target's value to
-# understand the issue.
+# any cross-validation split. We will now check our target's value to understand
+# the issue.
 # %%
 import matplotlib.pyplot as plt

@@ -86,8 +87,8 @@

 # %% [markdown]
 # We see that the target vector `target` is ordered. It will have some
-# unexpected consequences when using the `KFold` cross-validation. To
-# illustrate the consequences, we will show the class count in each fold of the
+# unexpected consequences when using the `KFold` cross-validation. To illustrate
+# the consequences, we will show the class count in each fold of the
 # cross-validation in the train and test set.
 #
 # Let's compute the class counts for both the training and testing sets using
@@ -117,14 +118,16 @@
 # information regarding the fold within the same dataset.

 # %%
-train_cv_counts = pd.concat(train_cv_counts, axis=1,
-                            keys=[f"Fold #{idx}" for idx in range(n_splits)])
+train_cv_counts = pd.concat(
+    train_cv_counts, axis=1, keys=[f"Fold #{idx}" for idx in range(n_splits)]
+)
 train_cv_counts.index.name = "Class label"
 train_cv_counts

 # %%
-test_cv_counts = pd.concat(test_cv_counts, axis=1,
-                           keys=[f"Fold #{idx}" for idx in range(n_splits)])
+test_cv_counts = pd.concat(
+    test_cv_counts, axis=1, keys=[f"Fold #{idx}" for idx in range(n_splits)]
+)
 test_cv_counts.index.name = "Class label"
 test_cv_counts

@@ -144,20 +147,21 @@
 _ = plt.title("Test set")

 # %% [markdown]
-# We can confirm that in each fold, only two of the three classes are present
-# in the training set and all samples of the remaining class is used as a test
-# set. So our model is unable to predict this class that was unseen during the
+# We can confirm that in each fold, only two of the three classes are present in
+# the training set and all samples of the remaining class are used as a test
+# set. So our model is unable to predict this class that was unseen during the
 # training stage.
 #
-# One possibility to solve the issue is to shuffle the data before splitting
-# the data into three groups.
+# One possibility to solve the issue is to shuffle the data before splitting it
+# into three groups.
 # %%
 cv = KFold(n_splits=3, shuffle=True, random_state=0)
 results = cross_validate(model, data, target, cv=cv)
 test_score = results["test_score"]
-print(f"The average accuracy is "
-      f"{test_score.mean():.3f} ± {test_score.std():.3f}")
+print(
+    f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}"
+)

 # %% [markdown]
 # We get results that are closer to what we would expect with an accuracy above
@@ -174,10 +178,12 @@
     train_cv_counts.append(target_train.value_counts())
     test_cv_counts.append(target_test.value_counts())

-train_cv_counts = pd.concat(train_cv_counts, axis=1,
-                            keys=[f"Fold #{idx}" for idx in range(n_splits)])
-test_cv_counts = pd.concat(test_cv_counts, axis=1,
-                           keys=[f"Fold #{idx}" for idx in range(n_splits)])
+train_cv_counts = pd.concat(
+    train_cv_counts, axis=1, keys=[f"Fold #{idx}" for idx in range(n_splits)]
+)
+test_cv_counts = pd.concat(
+    test_cv_counts, axis=1, keys=[f"Fold #{idx}" for idx in range(n_splits)]
+)
 train_cv_counts.index.name = "Class label"
 test_cv_counts.index.name = "Class label"
@@ -211,8 +217,9 @@
 # %%
 results = cross_validate(model, data, target, cv=cv)
 test_score = results["test_score"]
-print(f"The average accuracy is "
-      f"{test_score.mean():.3f} ± {test_score.std():.3f}")
+print(
+    f"The average accuracy is {test_score.mean():.3f} ± {test_score.std():.3f}"
+)

 # %%
 train_cv_counts = []
@@ -222,10 +229,12 @@
     train_cv_counts.append(target_train.value_counts())
     test_cv_counts.append(target_test.value_counts())

-train_cv_counts = pd.concat(train_cv_counts, axis=1,
-                            keys=[f"Fold #{idx}" for idx in range(n_splits)])
-test_cv_counts = pd.concat(test_cv_counts, axis=1,
-                           keys=[f"Fold #{idx}" for idx in range(n_splits)])
+train_cv_counts = pd.concat(
+    train_cv_counts, axis=1, keys=[f"Fold #{idx}" for idx in range(n_splits)]
+)
+test_cv_counts = pd.concat(
+    test_cv_counts, axis=1, keys=[f"Fold #{idx}" for idx in range(n_splits)]
+)
 train_cv_counts.index.name = "Class label"
 test_cv_counts.index.name = "Class label"
diff --git a/python_scripts/cross_validation_time.py b/python_scripts/cross_validation_time.py
index d595d4f28..3328f6742 100644
--- a/python_scripts/cross_validation_time.py
+++ b/python_scripts/cross_validation_time.py
@@ -27,9 +27,14 @@
 # %%
 import pandas as pd

-symbols = {"TOT": "Total", "XOM": "Exxon", "CVX": "Chevron",
-           "COP": "ConocoPhillips", "VLO": "Valero Energy"}
-template_name = ("../datasets/financial-data/{}.csv")
+symbols = {
+    "TOT": "Total",
+    "XOM": "Exxon",
+    "CVX": "Chevron",
+    "COP": "ConocoPhillips",
+    "VLO": "Valero Energy",
+}
+template_name = "../datasets/financial-data/{}.csv"

 quotes = {}
 for symbol in symbols:
@@ -60,7 +65,8 @@
 data, target = quotes.drop(columns=["Chevron"]), quotes["Chevron"]

 data_train, data_test, target_train, target_test = train_test_split(
-    data, target, shuffle=True, random_state=0)
+    data, target, shuffle=True, random_state=0
+)

 # %% [markdown]
 # We will use a decision tree regressor that we expect to overfit and thus not
@@ -88,10 +94,10 @@
 # %%
 from sklearn.model_selection import cross_val_score

-test_score = cross_val_score(regressor, data_train, target_train, cv=cv,
-                             n_jobs=2)
-print(f"The mean R2 is: "
-      f"{test_score.mean():.2f} ± {test_score.std():.2f}")
+test_score = cross_val_score(
+    regressor, data_train, target_train, cv=cv, n_jobs=2
+)
+print(f"The mean R2 is: {test_score.mean():.2f} ± {test_score.std():.2f}")

 # %% [markdown]
 # Surprisingly, we get outstanding generalization performance. We will
@@ -117,8 +123,8 @@
 print(f"The R2 on this single split is: {test_score:.2f}")

 # %% [markdown]
-# Similarly, we obtain good results in terms of $R^2$.
-# We will plot the training, testing and prediction samples.
+# Similarly, we obtain good results in terms of $R^2$. We will plot the
+# training, testing and prediction samples.

 # %%
 target_train.plot(label="Training")
@@ -145,7 +151,10 @@

 # %%
 data_train, data_test, target_train, target_test = train_test_split(
-    data, target, shuffle=False, random_state=0,
+    data,
+    target,
+    shuffle=False,
+    random_state=0,
 )
 regressor.fit(data_train, target_train)
 target_predicted = regressor.predict(data_test)
@@ -184,10 +193,10 @@
 groups = quotes.index.to_period("Q")
 cv = LeaveOneGroupOut()
-test_score = cross_val_score(regressor, data, target,
-                             cv=cv, groups=groups, n_jobs=2)
-print(f"The mean R2 is: "
-      f"{test_score.mean():.2f} ± {test_score.std():.2f}")
+test_score = cross_val_score(
+    regressor, data, target, cv=cv, groups=groups, n_jobs=2
+)
+print(f"The mean R2 is: {test_score.mean():.2f} ± {test_score.std():.2f}")

 # %% [markdown]
 # In this case, we see that we cannot make good predictions, which is less
@@ -203,10 +212,10 @@
 from sklearn.model_selection import TimeSeriesSplit

 cv = TimeSeriesSplit(n_splits=groups.nunique())
-test_score = cross_val_score(regressor, data, target,
-                             cv=cv, groups=groups, n_jobs=2)
-print(f"The mean R2 is: "
-      f"{test_score.mean():.2f} ± {test_score.std():.2f}")
+test_score = cross_val_score(
+    regressor, data, target, cv=cv, groups=groups, n_jobs=2
+)
+print(f"The mean R2 is: {test_score.mean():.2f} ± {test_score.std():.2f}")

 # %% [markdown]
 # In conclusion, it is really important not to use an off-the-shelf
diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py
index cb3ae4a0f..f249a91fb 100644
--- a/python_scripts/cross_validation_train_test.py
+++ b/python_scripts/cross_validation_train_test.py
@@ -8,9 +8,9 @@
 # %% [markdown]
 # # Cross-validation framework
 #
-# In the previous notebooks, we introduce some concepts regarding the
-# evaluation of predictive models. While this section could be slightly
-# redundant, we intend to go into details into the cross-validation framework.
+# In the previous notebooks, we introduced some concepts regarding the
+# evaluation of predictive models. While this section could be slightly
+# redundant, we intend to go into the details of the cross-validation framework.
 #
 # Before we dive in, let's linger on the reasons for always having training and
 # testing sets. Let's first look at the limitation of using a dataset without
@@ -44,8 +44,8 @@
 data.head()

 # %% [markdown]
-# To simplify future visualization, let's transform the prices from the
-# 100 (k\$) range to the thousand dollars (k\$) range.
+# To simplify future visualization, let's transform the prices from the 100
+# (k\$) range to the thousand dollars (k\$) range.

 # %%
 target *= 100
@@ -69,9 +69,10 @@
 regressor.fit(data, target)

 # %% [markdown]
-# After training the regressor, we would like to know its potential generalization
-# performance once deployed in production. For this purpose, we use the mean
-# absolute error, which gives us an error in the native unit, i.e. k\$.
+# After training the regressor, we would like to know its potential
+# generalization performance once deployed in production. For this purpose, we
+# use the mean absolute error, which gives us an error in the native unit, i.e.
+# k\$.
 # %%
 from sklearn.metrics import mean_absolute_error
@@ -96,11 +97,11 @@
 # In this MOOC, we will consistently use the term "training error".
 # ```
 #
-# We trained a predictive model to minimize the training error but our aim is
-# to minimize the error on data that has not been seen during training.
+# We trained a predictive model to minimize the training error but our aim is to
+# minimize the error on data that has not been seen during training.
 #
-# This error is also called the **generalization error** or the "true"
-# **testing error**.
+# This error is also called the **generalization error** or the "true" **testing
+# error**.
 #
 # ```{note}
 # In this MOOC, we will consistently use the term "testing error".
@@ -118,7 +119,8 @@
 from sklearn.model_selection import train_test_split

 data_train, data_test, target_train, target_test = train_test_split(
-    data, target, random_state=0)
+    data, target, random_state=0
+)

 # %% [markdown]
 # Then, let's train our model.
@@ -145,8 +147,8 @@
 print(f"The testing error of our model is {score:.2f} k$")

 # %% [markdown]
-# This testing error is actually about what we would expect from our model if
-# it was used in a production environment.
+# This testing error is actually about what we would expect from our model if it
+# was used in a production environment.

 # %% [markdown]
 # ## Stability of the cross-validation estimates
@@ -205,7 +207,8 @@

 cv = ShuffleSplit(n_splits=40, test_size=0.3, random_state=0)
 cv_results = cross_validate(
-    regressor, data, target, cv=cv, scoring="neg_mean_absolute_error")
+    regressor, data, target, cv=cv, scoring="neg_mean_absolute_error"
+)

 # %% [markdown]
 # The results `cv_results` are stored in a Python dictionary. We will convert
@@ -269,12 +272,16 @@
 # 43 k\$ to 50 k\$.

 # %%
-print(f"The mean cross-validated testing error is: "
-      f"{cv_results['test_error'].mean():.2f} k$")
+print(
+    "The mean cross-validated testing error is: "
+    f"{cv_results['test_error'].mean():.2f} k$"
+)

 # %%
-print(f"The standard deviation of the testing error is: "
-      f"{cv_results['test_error'].std():.2f} k$")
+print(
+    "The standard deviation of the testing error is: "
+    f"{cv_results['test_error'].std():.2f} k$"
+)

 # %% [markdown]
 # Note that the standard deviation is much smaller than the mean: we could
@@ -326,8 +333,8 @@
 # ## More detail regarding `cross_validate`
 #
 # During cross-validation, many models are trained and evaluated. Indeed, the
-# number of elements in each array of the output of `cross_validate` is a
-# result from one of these `fit`/`score` procedures. To make it explicit, it is
+# number of elements in each array of the output of `cross_validate` is a result
+# from one of these `fit`/`score` procedures. To make it explicit, it is
 # possible to retrieve these fitted models for each of the splits/folds by
 # passing the option `return_estimator=True` in `cross_validate`.
diff --git a/python_scripts/cross_validation_validation_curve.py b/python_scripts/cross_validation_validation_curve.py
index 35b0d56cb..79297634f 100644
--- a/python_scripts/cross_validation_validation_curve.py
+++ b/python_scripts/cross_validation_validation_curve.py
@@ -9,8 +9,8 @@
 # # Overfit-generalization-underfit
 #
 # In the previous notebook, we presented the general cross-validation framework
-# and how it helps us quantify the training and testing errors as well
-# as their fluctuations.
+# and how it helps us quantify the training and testing errors as well as their
+# fluctuations.
 #
 # In this notebook, we will put these two errors into perspective and show how
 # they can help us know if our model generalizes, overfits, or underfits.
@@ -39,29 +39,36 @@
 # %% [markdown]
 # ## Overfitting vs. underfitting
 #
-# To better understand the generalization performance of our model and maybe find
-# insights on how to improve it, we will compare the testing error with the
-# training error. Thus, we need to compute the error on the training set,
-# which is possible using the `cross_validate` function.
+# To better understand the generalization performance of our model and maybe
+# find insights on how to improve it, we will compare the testing error with the
+# training error. Thus, we need to compute the error on the training set, which
+# is possible using the `cross_validate` function.

 # %%
 import pandas as pd
 from sklearn.model_selection import cross_validate, ShuffleSplit

 cv = ShuffleSplit(n_splits=30, test_size=0.2)
-cv_results = cross_validate(regressor, data, target,
-                            cv=cv, scoring="neg_mean_absolute_error",
-                            return_train_score=True, n_jobs=2)
+cv_results = cross_validate(
+    regressor,
+    data,
+    target,
+    cv=cv,
+    scoring="neg_mean_absolute_error",
+    return_train_score=True,
+    n_jobs=2,
+)
 cv_results = pd.DataFrame(cv_results)

 # %% [markdown]
-# The cross-validation used the negative mean absolute error. We transform
-# the negative mean absolute error into a positive mean absolute error.
+# The cross-validation used the negative mean absolute error. We transform the
+# negative mean absolute error into a positive mean absolute error.

 # %%
 scores = pd.DataFrame()
 scores[["train error", "test error"]] = -cv_results[
-    ["train_score", "test_score"]]
+    ["train_score", "test_score"]
+]

 # %%
 import matplotlib.pyplot as plt
@@ -71,26 +78,26 @@
 _ = plt.title("Train and test errors distribution via cross-validation")

 # %% [markdown]
-# By plotting the distribution of the training and testing errors, we
-# get information about whether our model is over-fitting, under-fitting (or
-# both at the same time).
+# By plotting the distribution of the training and testing errors, we get
+# information about whether our model is over-fitting, under-fitting (or both at
+# the same time).
 #
-# Here, we observe a **small training error** (actually zero), meaning that
-# the model is **not under-fitting**: it is flexible enough to capture any
+# Here, we observe a **small training error** (actually zero), meaning that the
+# model is **not under-fitting**: it is flexible enough to capture any
 # variations present in the training set.
 #
-# However the **significantly larger testing error** tells us that the
-# model is **over-fitting**: the model has memorized many variations of the
-# training set that could be considered "noisy" because they do not generalize
-# to help us make good prediction on the test set.
+# However the **significantly larger testing error** tells us that the model is
+# **over-fitting**: the model has memorized many variations of the training set
+# that could be considered "noisy" because they do not generalize to help us
+# make good predictions on the test set.
 #
 # ## Validation curve
 #
 # Some model hyperparameters are usually the key to go from a model that
 # underfits to a model that overfits, hopefully going through a region where we
-# can get a good balance between the two. We can acquire knowledge by plotting
-# a curve called the validation curve. This curve can also be applied to the
-# above experiment and varies the value of a hyperparameter.
+# can get a good balance between the two. We can acquire knowledge by plotting a
+# curve called the validation curve. This curve can also be applied to the above
+# experiment and varies the value of a hyperparameter.
 #
 # For the decision tree, the `max_depth` parameter is used to control the
 # tradeoff between under-fitting and over-fitting.
@@ -101,8 +108,15 @@
 max_depth = [1, 5, 10, 15, 20, 25]
 train_scores, test_scores = validation_curve(
-    regressor, data, target, param_name="max_depth", param_range=max_depth,
-    cv=cv, scoring="neg_mean_absolute_error", n_jobs=2)
+    regressor,
+    data,
+    target,
+    param_name="max_depth",
+    param_range=max_depth,
+    cv=cv,
+    scoring="neg_mean_absolute_error",
+    n_jobs=2,
+)
 train_errors, test_errors = -train_scores, -test_scores

 # %% [markdown]
@@ -122,38 +136,45 @@
 # The validation curve can be divided into three areas:
 #
 # - For `max_depth < 10`, the decision tree underfits. The training error and
-#   therefore the testing error are both high. The model is too
-#   constrained and cannot capture much of the variability of the target
-#   variable.
+#   therefore the testing error are both high. The model is too constrained and
+#   cannot capture much of the variability of the target variable.
 #
 # - The region around `max_depth = 10` corresponds to the parameter for which
 #   the decision tree generalizes the best. It is flexible enough to capture a
 #   fraction of the variability of the target that generalizes, while not
 #   memorizing all of the noise in the target.
 #
-# - For `max_depth > 10`, the decision tree overfits. The training error
-#   becomes very small, while the testing error increases. In this
-#   region, the models create decisions specifically for noisy samples harming
-#   its ability to generalize to test data.
+# - For `max_depth > 10`, the decision tree overfits. The training error becomes
+#   very small, while the testing error increases. In this region, the models
+#   create decisions specifically for noisy samples, harming their ability to
+#   generalize to test data.
 #
 # Note that for `max_depth = 10`, the model overfits a bit as there is a gap
-# between the training error and the testing error. It can also
-# potentially underfit also a bit at the same time, because the training error
-# is still far from zero (more than 30 k\$), meaning that the model might
-# still be too constrained to model interesting parts of the data. However, the
-# testing error is minimal, and this is what really matters. This is the
-# best compromise we could reach by just tuning this parameter.
+# between the training error and the testing error. It can also potentially
+# underfit a bit at the same time, because the training error is still far from
+# zero (more than 30 k\$), meaning that the model might still be too constrained
+# to model interesting parts of the data. However, the testing error is minimal,
+# and this is what really matters. This is the best compromise we could reach by
+# just tuning this parameter.
 #
 # Be aware that looking at the mean errors is quite limiting. We should also
-# look at the standard deviation to assess the dispersion of the score. We
-# can repeat the same plot as before but this time, we will add some
-# information to show the standard deviation of the errors as well.
+# look at the standard deviation to assess the dispersion of the score. We can
+# repeat the same plot as before but this time, we will add some information to
+# show the standard deviation of the errors as well.
 # %%
-plt.errorbar(max_depth, train_errors.mean(axis=1),
-             yerr=train_errors.std(axis=1), label='Training error')
-plt.errorbar(max_depth, test_errors.mean(axis=1),
-             yerr=test_errors.std(axis=1), label='Testing error')
+plt.errorbar(
+    max_depth,
+    train_errors.mean(axis=1),
+    yerr=train_errors.std(axis=1),
+    label="Training error",
+)
+plt.errorbar(
+    max_depth,
+    test_errors.mean(axis=1),
+    yerr=test_errors.std(axis=1),
+    label="Testing error",
+)
 plt.legend()

 plt.xlabel("Maximum depth of decision tree")
@@ -172,5 +193,4 @@
 #
 # * how to identify whether a model is generalizing, overfitting, or
 #   underfitting;
-# * how to check influence of a hyperparameter on the tradeoff
-#   underfit/overfit.
+# * how to check the influence of a hyperparameter on the underfit/overfit tradeoff.
diff --git a/python_scripts/datasets_ames_housing.py b/python_scripts/datasets_ames_housing.py
index 9f47c2acf..c69c236b1 100644
--- a/python_scripts/datasets_ames_housing.py
+++ b/python_scripts/datasets_ames_housing.py
@@ -9,9 +9,9 @@
 # # The Ames housing dataset
 #
 # In this notebook, we will quickly present the "Ames housing" dataset. We will
-# see that this dataset is similar to the "California housing" dataset.
-# However, it is more complex to handle: it contains missing data and both
-# numerical and categorical features.
+# see that this dataset is similar to the "California housing" dataset. However,
+# it is more complex to handle: it contains missing data and both numerical and
+# categorical features.
 #
 # This dataset is located in the `datasets` directory. It is stored in a comma
 # separated value (CSV) file. As previously mentioned, we are aware that the
@@ -24,7 +24,7 @@
 # %%
 import pandas as pd

-ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values='?')
+ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")
 ames_housing = ames_housing.drop(columns="Id")

 # %% [markdown]
@@ -40,7 +40,10 @@

 # %%
 target_name = "SalePrice"
-data, target = ames_housing.drop(columns=target_name), ames_housing[target_name]
+data, target = (
+    ames_housing.drop(columns=target_name),
+    ames_housing[target_name],
+)

 # %% [markdown]
 # Let's have a quick look at the target before focusing on the data.
@@ -54,6 +57,7 @@

 # %%
 import matplotlib.pyplot as plt

+
 target.plot.hist(bins=20, edgecolor="black")
 plt.xlabel("House price in $")
 _ = plt.title("Distribution of the house price \nin Ames")
@@ -83,18 +87,19 @@
 numerical_data.info()

 # %% [markdown]
-# We see that the data are mainly represented with integer number. Let's have
-# a look at the histogram for all these features.
+# We see that the data are mainly represented with integer numbers. Let's have a
+# look at the histogram for all these features.

 # %%
-numerical_data.hist(bins=20, figsize=(12, 22), edgecolor="black",
-                    layout=(9, 4))
+numerical_data.hist(
+    bins=20, figsize=(12, 22), edgecolor="black", layout=(9, 4)
+)
 plt.subplots_adjust(hspace=0.8, wspace=0.8)

 # %% [markdown]
 # We see that some features have high peaks at 0. This could mean that the value
 # 0 was assigned when the criterion did not apply, for instance the area of
 # the swimming pool when no swimming pool is available.
 #
 # We also have some features encoding dates (for instance, the year).
 #
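As a side note for readers exploring the dataset locally, the "peaks at 0" observation above can be quantified with a short, hypothetical check (not part of the patch), reusing the notebook's `numerical_data` dataframe:

```python
# Fraction of exactly-zero entries per numerical column, largest first.
zero_fraction = (numerical_data == 0).mean().sort_values(ascending=False)
print(zero_fraction.head(10))
```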
 #
@@ -147,14 +152,17 @@
 # ```

 # %%
-ames_housing_no_missing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
+ames_housing_no_missing = pd.read_csv(
+    "../datasets/ames_housing_no_missing.csv"
+)
 ames_housing_no_missing.head()

 # %% [markdown]
 # It contains the same information as the original dataset after using a
 # [`sklearn.impute.SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)
 # to replace missing values using the mean along each numerical column
-# (including the target), and the most frequent value along each categorical column.
+# (including the target), and the most frequent value along each categorical
+# column.

 # %%
 from sklearn.compose import make_column_transformer
@@ -203,5 +211,7 @@
     columns=categorical_features.tolist() + numerical_features,
 )
 ames_housing_preprocessed = ames_housing_preprocessed[ames_housing.columns]
-ames_housing_preprocessed = ames_housing_preprocessed.astype(ames_housing.dtypes)
+ames_housing_preprocessed = ames_housing_preprocessed.astype(
+    ames_housing.dtypes
+)
 (ames_housing_no_missing == ames_housing_preprocessed).all()
diff --git a/python_scripts/datasets_bike_rides.py b/python_scripts/datasets_bike_rides.py
index 6315f9aa7..7944a0723 100644
--- a/python_scripts/datasets_bike_rides.py
+++ b/python_scripts/datasets_bike_rides.py
@@ -20,29 +20,30 @@
 cycling.head()

 # %% [markdown]
-# The first column `timestamp` contains a specific information regarding the
-# the time and date of a record while other columns contain numerical value
-# of some specific measurements. Let's check the data type of the columns more
-# in details.
+# The first column `timestamp` contains information regarding the time and
+# date of a record, while the other columns contain the numerical values of
+# specific measurements. Let's check the data types of the columns in more
+# detail.

 # %%
 cycling.info()

 # %% [markdown]
-# Indeed, CSV format store data as text. Pandas tries to infer numerical type
-# by default. It is the reason why all features but `timestamp` are encoded as
+# Indeed, the CSV format stores data as text. Pandas tries to infer numerical
+# types by default. This is why all features but `timestamp` are encoded as
 # floating point values. However, we see that the `timestamp` is stored as an
 # `object` column. It means that the data in this column are stored as `str`
 # rather than a specialized `datetime` data type.
 #
-# In fact, one needs to set an option such that pandas is directed to infer
-# such data type when opening the file. In addition, we will want to use
-# `timestamp` as an index. Thus, we can reopen the file with some extra
-# arguments to help pandas at reading properly our CSV file.
+# In fact, one needs to set an option to direct pandas to infer such a data
+# type when opening the file. In addition, we want to use `timestamp` as an
+# index. Thus, we can reopen the file with some extra arguments to help pandas
+# read our CSV file properly.

 # %%
-cycling = pd.read_csv("../datasets/bike_rides.csv", index_col=0,
-                      parse_dates=True)
+cycling = pd.read_csv(
+    "../datasets/bike_rides.csv", index_col=0, parse_dates=True
+)
 cycling.index.name = ""
 cycling.head()

@@ -50,40 +51,40 @@
 cycling.info()

 # %% [markdown]
-# By specifying to pandas to parse the date, we obtain a `DatetimeIndex` that
-# is really handy when filtering data based on date.
+# By specifying to pandas to parse the date, we obtain a `DatetimeIndex` that is +# really handy when filtering data based on date. # -# We can now have a look at the data stored in our dataframe. It will help us -# to frame the data science problem that we try to solve. +# We can now have a look at the data stored in our dataframe. It will help us to +# frame the data science problem that we try to solve. # -# The records correspond at information derived from GPS recordings of a -# cyclist (`speed`, `acceleration`, `slope`) and some extra information -# acquired from other sensors: `heart-rate` that corresponds to the number of -# beats per minute of the cyclist heart, `cadence` that is the rate at which a -# cyclist is turning the pedals, and `power` that corresponds to the work -# required by the cyclist to go forward. +# The records correspond at information derived from GPS recordings of a cyclist +# (`speed`, `acceleration`, `slope`) and some extra information acquired from +# other sensors: `heart-rate` that corresponds to the number of beats per minute +# of the cyclist heart, `cadence` that is the rate at which a cyclist is turning +# the pedals, and `power` that corresponds to the work required by the cyclist +# to go forward. # # The power might be slightly an abstract quantity so let's give a more # intuitive explanation. # -# Let's take the example of a soup blender that one uses to blend vegetable. -# The engine of this blender develop an instantaneous power of ~300 Watts to -# blend the vegetable. Here, our cyclist is just the engine of the blender (at -# the difference that an average cyclist will develop an instantaneous power -# around ~150 Watts) and blending the vegetable corresponds to move the -# cyclist's bike forward. +# Let's take the example of a soup blender that one uses to blend vegetable. The +# engine of this blender develop an instantaneous power of ~300 Watts to blend +# the vegetable. Here, our cyclist is just the engine of the blender (at the +# difference that an average cyclist will develop an instantaneous power around +# ~150 Watts) and blending the vegetable corresponds to move the cyclist's bike +# forward. # # Professional cyclists are using power to calibrate their training and track # the energy spent during a ride. For instance, riding at a higher power # requires more energy and thus, you need to provide resources to create this -# energy. With human, this resource is food. For our soup blender, this -# resource can be uranium, petrol, natural gas, coal, etc. Our body serves as a -# power plant to transform the resources into energy. +# energy. With human, this resource is food. For our soup blender, this resource +# can be uranium, petrol, natural gas, coal, etc. Our body serves as a power +# plant to transform the resources into energy. # # The issue with measuring power is linked to the cost of the sensor: a cycling -# power meter. The cost of such sensor vary from $400 to $1000. Thus, our -# data science problem is quite easy: can we predict instantaneous cyclist -# power from other (cheaper) sensors. +# power meter. The cost of such sensor vary from $400 to $1000. Thus, our data +# science problem is quite easy: can we predict instantaneous cyclist power from +# other (cheaper) sensors. # %% target_name = "power" @@ -125,10 +126,10 @@ data.index.min(), data.index.max() # %% [markdown] -# The starting date is the August 18, 2020 and the ending date is -# September 13, 2020. 
However, it is obvious that our cyclist did not ride
-# every seconds between these dates. Indeed, only a couple of date should be
-# present in the dataframe, corresponding to the number of cycling rides.
+# The starting date is August 18, 2020 and the ending date is September 13,
+# 2020. However, it is obvious that our cyclist did not ride every second
+# between these dates. Indeed, only a handful of dates should be present in the
+# dataframe, corresponding to the number of cycling rides.

 # %%
 data.index.normalize().nunique()
@@ -144,18 +145,18 @@

 # %%
 data_ride.plot()
-plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
 _ = plt.title("Sensor values for different cyclist measurements")

 # %% [markdown]
 # Since the unit and range of each measurement (feature) is different, it is
 # rather difficult to interpret the plot. Also, the high temporal resolution
-# make it difficult to make any observation. We could resample the data to get
-# a smoother visualization.
+# makes it difficult to make any observation. We could resample the data to
+# get a smoother visualization.

 # %%
 data_ride.resample("60S").mean().plot()
-plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
 _ = plt.title("Sensor values for different cyclist measurements")

 # %% [markdown]
@@ -164,16 +165,21 @@
 # %%
 axs = data_ride.hist(figsize=(10, 12), bins=50, edgecolor="black", grid=False)
 # add the units to the plots
-units = ["beats per minute", "rotations per minute", "meters per second",
-         "meters per second squared", "%"]
+units = [
+    "beats per minute",
+    "rotations per minute",
+    "meters per second",
+    "meters per second squared",
+    "%",
+]
 for unit, ax in zip(units, axs.ravel()):
     ax.set_xlabel(unit)
 plt.subplots_adjust(hspace=0.6)

 # %% [markdown]
 # From these plots, we can see some interesting information: a cyclist is
-# spending some time without pedaling. This samples should be associated with
-# a null power. We also see that the slope have large extremum.
+# spending some time without pedaling. These samples should be associated with
+# a null power. We also see that the slope has large extrema.
 #
 # Let's make a pair plot on a subset of data samples to see if we can confirm
 # some of these intuitions.
@@ -196,9 +202,9 @@
 _ = sns.pairplot(data=subset, hue="power", palette="viridis")

 # %% [markdown]
-# Indeed, we see that low cadence is associated with low power. We can also
-# the a link between higher slope / high heart-rate and higher power: a cyclist
-# need to develop more energy to go uphill enforcing a stronger physiological
-# stimuli on the body. We can confirm this intuition by looking at the
-# interaction between the slope and the speed: a lower speed with a higher
-# slope is usually associated with higher power.
+# Indeed, we see that low cadence is associated with low power. We can also
+# see a link between higher slope / high heart-rate and higher power: a cyclist
+# needs to develop more energy to go uphill, enforcing a stronger physiological
+# stimulus on the body. We can confirm this intuition by looking at the
+# interaction between the slope and the speed: a lower speed combined with a
+# higher slope is usually associated with higher power.
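Since the stated problem is predicting the instantaneous cyclist power from the cheaper sensors, a minimal baseline sketch could look as follows. This is an editorial illustration, not part of the patch: it assumes the `data` and `target` variables defined above and, for simplicity, ignores the temporal grouping of the records into rides (a grouped or time-aware split would be more rigorous):

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated mean absolute error, expressed in Watts
model = HistGradientBoostingRegressor()
scores = cross_val_score(
    model, data, target, cv=5, scoring="neg_mean_absolute_error", n_jobs=2
)
print(f"MAE: {-scores.mean():.1f} ± {scores.std():.1f} Watts")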
diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py index 70fbafd0e..cca0c3439 100644 --- a/python_scripts/datasets_blood_transfusion.py +++ b/python_scripts/datasets_blood_transfusion.py @@ -9,8 +9,8 @@ # # The blood transfusion dataset # # In this notebook, we will present the "blood transfusion" dataset. This -# dataset is locally available in the directory `datasets` and it is stored as -# a comma separated value (CSV) file. We start by loading the entire dataset. +# dataset is locally available in the directory `datasets` and it is stored as a +# comma separated value (CSV) file. We start by loading the entire dataset. # %% import pandas as pd @@ -24,10 +24,10 @@ blood_transfusion.head() # %% [markdown] -# In this dataframe, we can see that the last column correspond to the target -# to be predicted called `"Class"`. We will create two variables, `data` and -# `target` to separate the data from which we could learn a predictive model -# and the `target` that should be predicted. +# In this dataframe, we can see that the last column correspond to the target to +# be predicted called `"Class"`. We will create two variables, `data` and +# `target` to separate the data from which we could learn a predictive model and +# the `target` that should be predicted. # %% data = blood_transfusion.drop(columns="Class") @@ -40,19 +40,18 @@ data.head() # %% [markdown] -# We observe four columns. Each record corresponds to a person that intended -# to give blood. The information stored in each column are: +# We observe four columns. Each record corresponds to a person that intended to +# give blood. The information stored in each column are: # -# * `Recency`: the time in months since the last time a person intended to -# give blood; -# * `Frequency`: the number of time a person intended to give blood in the -# past; +# * `Recency`: the time in months since the last time a person intended to give +# blood; +# * `Frequency`: the number of time a person intended to give blood in the past; # * `Monetary`: the amount of blood given in the past (in c.c.); # * `Time`: the time in months since the first time a person intended to give # blood. # -# Now, let's have a look regarding the type of data that we are dealing in -# these columns and if any missing values are present in our dataset. +# Now, let's have a look regarding the type of data that we are dealing in these +# columns and if any missing values are present in our dataset. # %% data.info() @@ -97,8 +96,8 @@ # important: a classifier that would predict always this `"not donated"` class # would achieve an accuracy of 76% of good classification without using any # information from the data itself. This issue is known as class imbalance. One -# should take care about the generalization performance metric used to evaluate a -# model as well as the predictive model chosen itself. +# should take care about the generalization performance metric used to evaluate +# a model as well as the predictive model chosen itself. # # Now, let's have a naive analysis to see if there is a link between features # and the target using a pair plot representation. @@ -111,11 +110,11 @@ # %% [markdown] # Looking at the diagonal plots, we don't see any feature that individually # could help at separating the two classes. When looking at a pair of feature, -# we don't see any striking combinations as well. 
However, we can note that
-# the `"Monetary"` and `"Frequency"` features are perfectly correlated: all the
-# data points are aligned on a diagonal.
+# we don't see any striking combination either. However, we can note that the
+# `"Monetary"` and `"Frequency"` features are perfectly correlated: all the data
+# points are aligned on a diagonal.
 #
 # As a conclusion, this dataset would be a challenging dataset: it suffers from
 # class imbalance, correlated features and thus very few features will be
-# available to learn a model, and none of the feature combinations were found
-# to help at predicting.
+# available to learn a model, and none of the feature combinations were found to
+# help with prediction.
diff --git a/python_scripts/datasets_california_housing.py b/python_scripts/datasets_california_housing.py
index 73733cfe8..03ec00ac9 100644
--- a/python_scripts/datasets_california_housing.py
+++ b/python_scripts/datasets_california_housing.py
@@ -8,8 +8,8 @@
 # %% [markdown]
 # # The California housing dataset
 #
-# In this notebook, we will quickly present the dataset known as the
-# "California housing dataset". This dataset can be fetched from internet using
+# In this notebook, we will quickly present the dataset known as the "California
+# housing dataset". This dataset can be fetched from the internet using
 # scikit-learn.

 # %%
@@ -41,8 +41,8 @@
 # In this dataset, we have information regarding the demography (income,
 # population, house occupancy) in the districts, the location of the districts
 # (latitude, longitude), and general information regarding the house in the
-# districts (number of rooms, number of bedrooms, age of the house). Since
-# these statistics are at the granularity of the district, they corresponds to
+# districts (number of rooms, number of bedrooms, age of the house). Since these
+# statistics are at the granularity of the district, they correspond to
 # averages or medians.
 #
 # Now, let's have a look at the target to be predicted.
@@ -81,9 +81,9 @@
 # We can first focus on features for which their distributions would be more or
 # less expected.
 #
-# The median income is a distribution with a long tail. It means that the
-# salary of people is more or less normally distributed but there is some
-# people getting a high salary.
+# The median income is a distribution with a long tail. It means that the salary
+# of people is more or less normally distributed but some people can earn
+# a high salary.
 #
 # Regarding the average house age, the distribution is more or less uniform.
 #
@@ -102,51 +102,61 @@
 california_housing.frame[features_of_interest].describe()

 # %% [markdown]
-# For each of these features, comparing the `max` and `75%` values, we can see
-# a huge difference. It confirms the intuitions that there are a couple of
-# extreme values.
+# For each of these features, comparing the `max` and `75%` values, we can see a
+# huge difference. It confirms the intuition that there are a couple of extreme
+# values.
 #
 # Up to now, we discarded the longitude and latitude that carry geographical
-# information. In short, the combination of this feature could help us to
-# decide if there are locations associated with high-valued houses. Indeed,
-# we could make a scatter plot where the x- and y-axis would be the latitude
-# and longitude and the circle size and color would be linked with the house
-# value in the district.
+# information. In short, the combination of these features could help us decide
+# if there are locations associated with high-valued houses. Indeed, we could
+# make a scatter plot where the x- and y-axis would be the latitude and
+# longitude and the circle size and color would be linked with the house value
+# in the district.

 # %%
 import seaborn as sns

-sns.scatterplot(data=california_housing.frame, x="Longitude", y="Latitude",
-                size="MedHouseVal", hue="MedHouseVal",
-                palette="viridis", alpha=0.5)
-plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 0.95),
-           loc="upper left")
+sns.scatterplot(
+    data=california_housing.frame,
+    x="Longitude",
+    y="Latitude",
+    size="MedHouseVal",
+    hue="MedHouseVal",
+    palette="viridis",
+    alpha=0.5,
+)
+plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 0.95), loc="upper left")
 _ = plt.title("Median house value depending on\n their spatial location")

 # %% [markdown]
 # If you are not familiar with the state of California, it is interesting to
-# notice that all datapoints show a graphical representation of this state.
-# We note that the high-valued houses will be located on the coast, where the
-# big cities from California are located: San Diego, Los Angeles, San Jose, or
-# San Francisco.
+# notice that the data points together draw the outline of this state. We
+# note that the high-valued houses tend to be located on the coast, where the
+# big cities of California are located: San Diego, Los Angeles, San Jose, or San
+# Francisco.
 #
-# We can do a random subsampling to have less data points to plot but that
-# could still allow us to see these specificities.
+# We can do a random subsampling to have fewer data points to plot while still
+# being able to see these specificities.

 # %%
 import numpy as np

 rng = np.random.RandomState(0)
-indices = rng.choice(np.arange(california_housing.frame.shape[0]), size=500,
-                     replace=False)
+indices = rng.choice(
+    np.arange(california_housing.frame.shape[0]), size=500, replace=False
+)

 # %%
-sns.scatterplot(data=california_housing.frame.iloc[indices],
-                x="Longitude", y="Latitude",
-                size="MedHouseVal", hue="MedHouseVal",
-                palette="viridis", alpha=0.5)
-plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 1),
-           loc="upper left")
+sns.scatterplot(
+    data=california_housing.frame.iloc[indices],
+    x="Longitude",
+    y="Latitude",
+    size="MedHouseVal",
+    hue="MedHouseVal",
+    palette="viridis",
+    alpha=0.5,
+)
+plt.legend(title="MedHouseVal", bbox_to_anchor=(1.05, 1), loc="upper left")
 _ = plt.title("Median house value depending on\n their spatial location")

 # %% [markdown]
@@ -168,8 +178,8 @@
 _ = sns.pairplot(data=subset, hue="MedHouseVal", palette="viridis")

 # %% [markdown]
-# While it is always complicated to interpret a pairplot since there is a lot
-# of data, here we can get a couple of intuitions. We can confirm that some
+# While it is always complicated to interpret a pairplot since there is a lot of
+# data, here we can get a couple of intuitions. We can confirm that some
 # features have extreme values (outliers?). We can as well see that the median
 # income is helpful to distinguish high-valued from low-valued houses.
 #
@@ -178,7 +188,7 @@
 # house values.
 #
 # If you are curious, we created a linear predictive model below and show the
-# values of the coefficients obtained via cross-validation
+# values of the coefficients obtained via cross-validation.
# %% from sklearn.preprocessing import StandardScaler @@ -189,8 +199,12 @@ alphas = np.logspace(-3, 1, num=30) model = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas)) cv_results = cross_validate( - model, california_housing.data, california_housing.target, - return_estimator=True, n_jobs=2) + model, + california_housing.data, + california_housing.target, + return_estimator=True, + n_jobs=2, +) # %% score = cv_results["test_score"] @@ -201,7 +215,7 @@ coefs = pd.DataFrame( [est[-1].coef_ for est in cv_results["estimator"]], - columns=california_housing.feature_names + columns=california_housing.feature_names, ) # %% @@ -212,6 +226,6 @@ # %% [markdown] # It seems that the three features that we earlier spotted are found important -# by this model. But be careful regarding interpreting these coefficients. -# We let you go into the module "Interpretation" to go in depth regarding such +# by this model. But be careful regarding interpreting these coefficients. We +# let you go into the module "Interpretation" to go in depth regarding such # experiment. diff --git a/python_scripts/dev_features_importance.py b/python_scripts/dev_features_importance.py index 09e98aac0..d92c2f04d 100644 --- a/python_scripts/dev_features_importance.py +++ b/python_scripts/dev_features_importance.py @@ -13,8 +13,8 @@ # # 1. interpreting the coefficients in a linear model; # 2. the attribute `feature_importances_` in RandomForest; -# 3. `permutation feature importance`, which is an inspection technique that -# can be used for any fitted model. +# 3. `permutation feature importance`, which is an inspection technique that can +# be used for any fitted model. # %% [markdown] # ## 0. Presentation of the dataset @@ -62,8 +62,8 @@ # Adding random features rng = np.random.RandomState(0) -bin_var = pd.Series(rng.randint(0, 1, X.shape[0]), name='rnd_bin') -num_var = pd.Series(np.arange(X.shape[0]), name='rnd_num') +bin_var = pd.Series(rng.randint(0, 1, X.shape[0]), name="rnd_bin") +num_var = pd.Series(np.arange(X.shape[0]), name="rnd_num") X_with_rnd_feat = pd.concat((X, bin_var, num_var), axis=1) @@ -73,11 +73,13 @@ # %% from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test = train_test_split(X_with_rnd_feat, y, - random_state=29) + +X_train, X_test, y_train, y_test = train_test_split( + X_with_rnd_feat, y, random_state=29 +) # %% [markdown] -# Let's quickly inspect some features and the target +# Let's quickly inspect some features and the target: # %% import seaborn as sns @@ -85,15 +87,20 @@ train_dataset = X_train.copy() train_dataset.insert(0, "MedHouseVal", y_train) _ = sns.pairplot( - train_dataset[['MedHouseVal', 'Latitude', 'AveRooms', 'AveBedrms', 'MedInc']], - kind='reg', diag_kind='kde', plot_kws={'scatter_kws': {'alpha': 0.1}}) + train_dataset[ + ["MedHouseVal", "Latitude", "AveRooms", "AveBedrms", "MedInc"] + ], + kind="reg", + diag_kind="kde", + plot_kws={"scatter_kws": {"alpha": 0.1}}, +) # %% [markdown] # We see in the upper right plot that the median income seems to be positively # correlated to the median house price (the target). # -# We can also see that the average number of rooms `AveRooms` is very -# correlated to the average number of bedrooms `AveBedrms`. +# We can also see that the average number of rooms `AveRooms` is very correlated +# to the average number of bedrooms `AveBedrms`. # %% [markdown] # ## 1. 
Linear model inspection @@ -104,10 +111,9 @@ # # Coefficients represent the relationship between the given feature $X_i$ and # the target $y$, assuming that all the other features remain constant -# (conditional dependence). This is different from plotting $X_i$ versus $y$ -# and fitting a linear relationship: in that case all possible values of the -# other features are taken into account in the estimation (marginal -# dependence). +# (conditional dependence). This is different from plotting $X_i$ versus $y$ and +# fitting a linear relationship: in that case all possible values of the other +# features are taken into account in the estimation (marginal dependence). # %% from sklearn.linear_model import RidgeCV @@ -116,26 +122,25 @@ model.fit(X_train, y_train) -print(f'model score on training data: {model.score(X_train, y_train)}') -print(f'model score on testing data: {model.score(X_test, y_test)}') +print(f"model score on training data: {model.score(X_train, y_train)}") +print(f"model score on testing data: {model.score(X_test, y_test)}") # %% [markdown] # Our linear model obtains a $R^2$ score of .60, so it explains a significant # part of the target. Its coefficient should be somehow relevant. Let's look at -# the coefficient learnt +# the coefficient learnt: # %% import matplotlib.pyplot as plt coefs = pd.DataFrame( - model.coef_, - columns=['Coefficients'], index=X_train.columns + model.coef_, columns=["Coefficients"], index=X_train.columns ) -coefs.plot(kind='barh', figsize=(9, 7)) -plt.title('Ridge model') -plt.axvline(x=0, color='.5') -plt.subplots_adjust(left=.3) +coefs.plot(kind="barh", figsize=(9, 7)) +plt.title("Ridge model") +plt.axvline(x=0, color=".5") +plt.subplots_adjust(left=0.3) # %% [markdown] @@ -147,24 +152,24 @@ # price of houses decreases with the number of rooms? # ``` # -# The coefficients of a linear model are a *conditional* association: -# they quantify the variation of a the output (the price) when the given -# feature is varied, **keeping all other features constant**. We should -# not interpret them as a *marginal* association, characterizing the link -# between the two quantities ignoring all the rest. +# The coefficients of a linear model are a *conditional* association: they +# quantify the variation of a the output (the price) when the given feature is +# varied, **keeping all other features constant**. We should not interpret them +# as a *marginal* association, characterizing the link between the two +# quantities ignoring all the rest. # -# The coefficient associated to `AveRooms` is negative because the number -# of rooms is strongly correlated with the number of bedrooms, -# `AveBedrms`. What we are seeing here is that for districts where the houses -# have the same number of bedrooms on average, when there are more rooms -# (hence non-bedroom rooms), the houses are worth comparatively less. +# The coefficient associated to `AveRooms` is negative because the number of +# rooms is strongly correlated with the number of bedrooms, `AveBedrms`. What we +# are seeing here is that for districts where the houses have the same number of +# bedrooms on average, when there are more rooms (hence non-bedroom rooms), the +# houses are worth comparatively less. # # ### Scale of coefficients # # The `AveBedrms` have the higher coefficient. However, we can't compare the # magnitude of these coefficients directly, since they are not scaled. Indeed, -# `Population` is an integer which can be thousands, while `AveBedrms` is -# around 4 and Latitude is in degree. 
+# `Population` is an integer which can be thousands, while `AveBedrms` is around +# 4 and Latitude is in degree. # # So the Population coefficient is expressed in "$100k\$$ / habitant" while the # AveBedrms is expressed in "$100k\$$ / nb of bedrooms" and the Latitude @@ -172,8 +177,8 @@ # # We see that changing population by one does not change the outcome, while as # we go south (latitude increase) the price becomes cheaper. Also, adding a -# bedroom (keeping all other feature constant) shall rise the price of the -# house by 80k$. +# bedroom (keeping all other feature constant) shall rise the price of the house +# by 80k$. # %% [markdown] # So looking at the coefficient plot to gauge feature importance can be @@ -184,9 +189,9 @@ # features. # %% -X_train.std(axis=0).plot(kind='barh', figsize=(9, 7)) -plt.title('Features std. dev.') -plt.subplots_adjust(left=.3) +X_train.std(axis=0).plot(kind="barh", figsize=(9, 7)) +plt.title("Features std. dev.") +plt.subplots_adjust(left=0.3) plt.xlim((0, 100)) # %% [markdown] @@ -201,32 +206,31 @@ model.fit(X_train, y_train) -print(f'model score on training data: {model.score(X_train, y_train)}') -print(f'model score on testing data: {model.score(X_test, y_test)}') +print(f"model score on training data: {model.score(X_train, y_train)}") +print(f"model score on testing data: {model.score(X_test, y_test)}") # %% coefs = pd.DataFrame( - model[1].coef_, - columns=['Coefficients'], index=X_train.columns + model[1].coef_, columns=["Coefficients"], index=X_train.columns ) -coefs.plot(kind='barh', figsize=(9, 7)) -plt.title('Ridge model') -plt.axvline(x=0, color='.5') -plt.subplots_adjust(left=.3) +coefs.plot(kind="barh", figsize=(9, 7)) +plt.title("Ridge model") +plt.axvline(x=0, color=".5") +plt.subplots_adjust(left=0.3) # %% [markdown] # Now that the coefficients have been scaled, we can safely compare them. # -# The median income feature, with longitude and latitude are the three -# variables that most influence the model. +# The median income feature, with longitude and latitude are the three variables +# that most influence the model. # # The plot above tells us about dependencies between a specific feature and the # target when all other features remain constant, i.e., conditional # dependencies. An increase of the `HouseAge` will induce an increase of the -# price when all other features remain constant. On the contrary, an increase -# of the average rooms will induce an decrease of the price when all other -# features remain constant. +# price when all other features remain constant. On the contrary, an increase of +# the average rooms will induce an decrease of the price when all other features +# remain constant. 
# %% [markdown] # ### Checking the variability of the coefficients @@ -244,20 +248,23 @@ from sklearn.model_selection import RepeatedKFold cv_model = cross_validate( - model, X_with_rnd_feat, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), - return_estimator=True, n_jobs=2 + model, + X_with_rnd_feat, + y, + cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, + n_jobs=2, ) coefs = pd.DataFrame( - [model[1].coef_ - for model in cv_model['estimator']], - columns=X_with_rnd_feat.columns + [model[1].coef_ for model in cv_model["estimator"]], + columns=X_with_rnd_feat.columns, ) plt.figure(figsize=(9, 7)) -sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) -plt.axvline(x=0, color='.5') -plt.xlabel('Coefficient importance') -plt.title('Coefficient importance and its variability') -plt.subplots_adjust(left=.3) +sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5) +plt.axvline(x=0, color=".5") +plt.xlabel("Coefficient importance") +plt.title("Coefficient importance and its variability") +plt.subplots_adjust(left=0.3) # %% [markdown] # Every coefficient looks pretty stable, which mean that different Ridge model @@ -267,68 +274,68 @@ # ### Linear models with sparse coefficients (Lasso) # %% [markdown] -# In it important to keep in mind that the associations extracted depend -# on the model. To illustrate this point we consider a Lasso model, that -# performs feature selection with a L1 penalty. Let us fit a Lasso model -# with a strong regularization parameters `alpha` +# In it important to keep in mind that the associations extracted depend on the +# model. To illustrate this point we consider a Lasso model, that performs +# feature selection with a L1 penalty. Let us fit a Lasso model with a strong +# regularization parameters `alpha` # %% from sklearn.linear_model import Lasso -model = make_pipeline(StandardScaler(), Lasso(alpha=.015)) +model = make_pipeline(StandardScaler(), Lasso(alpha=0.015)) model.fit(X_train, y_train) -print(f'model score on training data: {model.score(X_train, y_train)}') -print(f'model score on testing data: {model.score(X_test, y_test)}') +print(f"model score on training data: {model.score(X_train, y_train)}") +print(f"model score on testing data: {model.score(X_test, y_test)}") # %% coefs = pd.DataFrame( - model[1].coef_, - columns=['Coefficients'], index=X_train.columns + model[1].coef_, columns=["Coefficients"], index=X_train.columns ) -coefs.plot(kind='barh', figsize=(9, 7)) -plt.title('Lasso model, strong regularization') -plt.axvline(x=0, color='.5') -plt.subplots_adjust(left=.3) +coefs.plot(kind="barh", figsize=(9, 7)) +plt.title("Lasso model, strong regularization") +plt.axvline(x=0, color=".5") +plt.subplots_adjust(left=0.3) # %% [markdown] # Here the model score is a bit lower, because of the strong regularization. # However, it has zeroed out 3 coefficients, selecting a small number of # variables to make its prediction. # -# We can see that out of the two correlated features `AveRooms` and -# `AveBedrms`, the model has selected one. Note that this choice is -# partly arbitrary: choosing one does not mean that the other is not -# important for prediction. **Avoid over-interpreting models, as they are -# imperfect**. +# We can see that out of the two correlated features `AveRooms` and `AveBedrms`, +# the model has selected one. Note that this choice is partly arbitrary: +# choosing one does not mean that the other is not important for prediction. +# **Avoid over-interpreting models, as they are imperfect**. 
# # As above, we can look at the variability of the coefficients: # %% cv_model = cross_validate( - model, X_with_rnd_feat, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), - return_estimator=True, n_jobs=2 + model, + X_with_rnd_feat, + y, + cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, + n_jobs=2, ) coefs = pd.DataFrame( - [model[1].coef_ - for model in cv_model['estimator']], - columns=X_with_rnd_feat.columns + [model[1].coef_ for model in cv_model["estimator"]], + columns=X_with_rnd_feat.columns, ) plt.figure(figsize=(9, 7)) -sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) -plt.axvline(x=0, color='.5') -plt.xlabel('Coefficient importance') -plt.title('Coefficient importance and its variability') -plt.subplots_adjust(left=.3) +sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5) +plt.axvline(x=0, color=".5") +plt.xlabel("Coefficient importance") +plt.title("Coefficient importance and its variability") +plt.subplots_adjust(left=0.3) # %% [markdown] -# We can see that both the coefficients associated to `AveRooms` and -# `AveBedrms` have a strong variability and that they can both be non -# zero. Given that they are strongly correlated, the model can pick one -# or the other to predict well. This choice is a bit arbitrary, and must -# not be over-interpreted. +# We can see that both the coefficients associated to `AveRooms` and `AveBedrms` +# have a strong variability and that they can both be non zero. Given that they +# are strongly correlated, the model can pick one or the other to predict well. +# This choice is a bit arbitrary, and must not be over-interpreted. # %% [markdown] # ### Lessons learned @@ -348,9 +355,9 @@ # %% [markdown] # ## 2. RandomForest `feature_importances_` # -# On some algorithms, there are some feature importance methods, -# inherently built within the model. It is the case in RandomForest models. -# Let's investigate the built-in `feature_importances_` attribute. +# On some algorithms, there are some feature importance methods, inherently +# built within the model. It is the case in RandomForest models. Let's +# investigate the built-in `feature_importances_` attribute. # %% from sklearn.ensemble import RandomForestRegressor @@ -359,8 +366,8 @@ model.fit(X_train, y_train) -print(f'model score on training data: {model.score(X_train, y_train)}') -print(f'model score on testing data: {model.score(X_test, y_test)}') +print(f"model score on training data: {model.score(X_train, y_train)}") +print(f"model score on testing data: {model.score(X_test, y_test)}") # %% [markdown] # Contrary to the testing set, the score on the training set is almost perfect, @@ -394,8 +401,8 @@ # # We introduce here a new technique to evaluate the feature importance of any # given fitted model. It basically shuffles a feature and sees how the model -# changes its prediction. Thus, the change in prediction will correspond to -# the feature importance. +# changes its prediction. Thus, the change in prediction will correspond to the +# feature importance. # %% # Any model could be used here @@ -407,14 +414,14 @@ # %% model.fit(X_train, y_train) -print(f'model score on training data: {model.score(X_train, y_train)}') -print(f'model score on testing data: {model.score(X_test, y_test)}') +print(f"model score on training data: {model.score(X_train, y_train)}") +print(f"model score on testing data: {model.score(X_test, y_test)}") # %% [markdown] -# As the model gives a good prediction, it has captured well the link -# between X and y. 
Hence, it is reasonable to interpret what it has -# captured from the data. +# As the model gives a good prediction, it has captured well the link between X +# and y. Hence, it is reasonable to interpret what it has captured from the +# data. # %% [markdown] # ### Feature importance @@ -437,22 +444,24 @@ # On the contrary, if the feature is not used by the model, the score shall # remain the same, thus the feature importance will be close to 0. + # %% def get_score_after_permutation(model, X, y, curr_feat): - """ return the score of model when curr_feat is permuted """ + """return the score of model when curr_feat is permuted""" X_permuted = X.copy() col_idx = list(X.columns).index(curr_feat) # permute one column X_permuted.iloc[:, col_idx] = np.random.permutation( - X_permuted[curr_feat].values) + X_permuted[curr_feat].values + ) permuted_score = model.score(X_permuted, y) return permuted_score def get_feature_importance(model, X, y, curr_feat): - """ compare the score when curr_feat is permuted """ + """compare the score when curr_feat is permuted""" baseline_score_train = model.score(X, y) permuted_score_train = get_score_after_permutation(model, X, y, curr_feat) @@ -462,11 +471,13 @@ def get_feature_importance(model, X, y, curr_feat): return feature_importance -curr_feat = 'MedInc' +curr_feat = "MedInc" feature_importance = get_feature_importance(model, X_train, y_train, curr_feat) -print(f'feature importance of "{curr_feat}" on train set is ' - f'{feature_importance:.3}') +print( + f'feature importance of "{curr_feat}" on train set is ' + f"{feature_importance:.3}" +) # %% [markdown] # Since there is some randomness, it is advisable to run it multiple times and @@ -478,18 +489,21 @@ def get_feature_importance(model, X, y, curr_feat): list_feature_importance = [] for n_round in range(n_repeats): list_feature_importance.append( - get_feature_importance(model, X_train, y_train, curr_feat)) + get_feature_importance(model, X_train, y_train, curr_feat) + ) print( f'feature importance of "{curr_feat}" on train set is ' - f'{np.mean(list_feature_importance):.3} ' - f'ยฑ {np.std(list_feature_importance):.3}') + f"{np.mean(list_feature_importance):.3} " + f"ยฑ {np.std(list_feature_importance):.3}" +) # %% [markdown] -# 0.67 over 0.98 is very relevant (note the $R^2$ score could go below 0). So -# we can imagine our model relies heavily on this feature to predict the class. -# We can now compute the feature permutation importance for all the features. +# 0.67 over 0.98 is very relevant (note the $R^2$ score could go below 0). So we +# can imagine our model relies heavily on this feature to predict the class. We +# can now compute the feature permutation importance for all the features. 
+ # %% def permutation_importance(model, X, y, n_repeats=10): @@ -500,13 +514,17 @@ def permutation_importance(model, X, y, n_repeats=10): list_feature_importance = [] for n_round in range(n_repeats): list_feature_importance.append( - get_feature_importance(model, X, y, curr_feat)) + get_feature_importance(model, X, y, curr_feat) + ) importances.append(list_feature_importance) - return {'importances_mean': np.mean(importances, axis=1), - 'importances_std': np.std(importances, axis=1), - 'importances': importances} + return { + "importances_mean": np.mean(importances, axis=1), + "importances_std": np.std(importances, axis=1), + "importances": importances, + } + # This function could directly be access from sklearn # from sklearn.inspection import permutation_importance @@ -514,14 +532,16 @@ def permutation_importance(model, X, y, n_repeats=10): # %% def plot_feature_importances(perm_importance_result, feat_name): - """ bar plot the feature importance """ + """bar plot the feature importance""" fig, ax = plt.subplots() - indices = perm_importance_result['importances_mean'].argsort() - plt.barh(range(len(indices)), - perm_importance_result['importances_mean'][indices], - xerr=perm_importance_result['importances_std'][indices]) + indices = perm_importance_result["importances_mean"].argsort() + plt.barh( + range(len(indices)), + perm_importance_result["importances_mean"][indices], + xerr=perm_importance_result["importances_std"][indices], + ) ax.set_yticks(range(len(indices))) _ = ax.set_yticklabels(feat_name[indices]) @@ -532,7 +552,8 @@ def plot_feature_importances(perm_importance_result, feat_name): # %% perm_importance_result_train = permutation_importance( - model, X_train, y_train, n_repeats=10) + model, X_train, y_train, n_repeats=10 +) plot_feature_importances(perm_importance_result_train, X_train.columns) @@ -541,8 +562,8 @@ def plot_feature_importances(perm_importance_result, feat_name): # important for the model. # # We note that our random variable `rnd_num` is now very less important than -# latitude. Indeed, the feature importance built-in in RandomForest has bias -# for continuous data, such as `AveOccup` and `rnd_num`. +# latitude. Indeed, the feature importance built-in in RandomForest has bias for +# continuous data, such as `AveOccup` and `rnd_num`. # # However, the model still uses these `rnd_num` feature to compute the output. # It is in line with the overfitting we had noticed between the train and test @@ -562,11 +583,8 @@ def plot_feature_importances(perm_importance_result, feat_name): # %% [markdown] # # Take Away # -# - -# %% [markdown] -# * One could directly interpret the coefficient in linear model (if the -# feature have been scaled first) +# * One could directly interpret the coefficient in linear model (if the feature +# have been scaled first) # * Model like RandomForest have built-in feature importance # * `permutation_importance` gives feature importance by permutation for any # fitted model diff --git a/python_scripts/ensemble_adaboost.py b/python_scripts/ensemble_adaboost.py index 8349e715b..982084aed 100644 --- a/python_scripts/ensemble_adaboost.py +++ b/python_scripts/ensemble_adaboost.py @@ -31,8 +31,8 @@ # ``` # %% [markdown] -# We will purposefully train a shallow decision tree. Since it is shallow, -# it is unlikely to overfit and some of the training examples will even be +# We will purposefully train a shallow decision tree. Since it is shallow, it is +# unlikely to overfit and some of the training examples will even be # misclassified. 
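For reference, a minimal sketch (editorial, with assumed variable names) of how the misclassified samples used by the plotting cell below could be obtained; it assumes the fitted `tree` and the `data` and `target` objects from this notebook:

import numpy as np

# positional indices of the training samples the shallow tree gets wrong
target_predicted = tree.predict(data)
misclassified_samples_idx = np.flatnonzero(target != target_predicted)
data_misclassified = data.iloc[misclassified_samples_idx]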
# %% @@ -63,31 +63,42 @@ ) # plot the original dataset -sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, palette=palette) +sns.scatterplot( + data=penguins, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=palette, +) # plot the misclassified samples -sns.scatterplot(data=data_misclassified, x=culmen_columns[0], - y=culmen_columns[1], label="Misclassified samples", - marker="+", s=150, color="k") +sns.scatterplot( + data=data_misclassified, + x=culmen_columns[0], + y=culmen_columns[1], + label="Misclassified samples", + marker="+", + s=150, + color="k", +) plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left") -_ = plt.title("Decision tree predictions \nwith misclassified samples " - "highlighted") +_ = plt.title( + "Decision tree predictions \nwith misclassified samples highlighted" +) # %% [markdown] # We observe that several samples have been misclassified by the classifier. # # We mentioned that boosting relies on creating a new classifier which tries to -# correct these misclassifications. In scikit-learn, learners have a -# parameter `sample_weight` which forces it to pay more attention to -# samples with higher weights during the training. +# correct these misclassifications. In scikit-learn, learners have a parameter +# `sample_weight` which forces it to pay more attention to samples with higher +# weights during the training. # -# This parameter is set when calling -# `classifier.fit(X, y, sample_weight=weights)`. -# We will use this trick to create a new classifier by 'discarding' all -# correctly classified samples and only considering the misclassified samples. -# Thus, misclassified samples will be assigned a weight of 1 and well -# classified samples will be assigned a weight of 0. +# This parameter is set when calling `classifier.fit(X, y, +# sample_weight=weights)`. We will use this trick to create a new classifier by +# 'discarding' all correctly classified samples and only considering the +# misclassified samples. Thus, misclassified samples will be assigned a weight +# of 1 and well classified samples will be assigned a weight of 0. # %% sample_weight = np.zeros_like(target, dtype=int) @@ -100,12 +111,22 @@ DecisionBoundaryDisplay.from_estimator( tree, data, response_method="predict", cmap="RdBu", alpha=0.5 ) -sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, palette=palette) -sns.scatterplot(data=data_misclassified, x=culmen_columns[0], - y=culmen_columns[1], - label="Previously misclassified samples", - marker="+", s=150, color="k") +sns.scatterplot( + data=penguins, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=palette, +) +sns.scatterplot( + data=data_misclassified, + x=culmen_columns[0], + y=culmen_columns[1], + label="Previously misclassified samples", + marker="+", + s=150, + color="k", +) plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left") _ = plt.title("Decision tree by changing sample weights") @@ -121,14 +142,16 @@ misclassified_samples_idx, newly_misclassified_samples_idx ) -print(f"Number of samples previously misclassified and " - f"still misclassified: {len(remaining_misclassified_samples_idx)}") +print( + "Number of samples previously misclassified and " + f"still misclassified: {len(remaining_misclassified_samples_idx)}" +) # %% [markdown] # However, we are making mistakes on previously well classified samples. 
Thus, # we get the intuition that we should weight the predictions of each classifier -# differently, most probably by using the number of mistakes each classifier -# is making. +# differently, most probably by using the number of mistakes each classifier is +# making. # # So we could use the classification error to combine both trees. @@ -145,31 +168,30 @@ # slightly more than the second one. We could use these accuracy values to # weight the predictions of each learner. # -# To summarize, boosting learns several classifiers, each of which will -# focus more or less on specific samples of the dataset. Boosting is thus -# different from bagging: here we never resample our dataset, we just assign -# different weights to the original dataset. +# To summarize, boosting learns several classifiers, each of which will focus +# more or less on specific samples of the dataset. Boosting is thus different +# from bagging: here we never resample our dataset, we just assign different +# weights to the original dataset. # # Boosting requires some strategy to combine the learners together: # -# * one needs to define a way to compute the weights to be assigned -# to samples; +# * one needs to define a way to compute the weights to be assigned to samples; # * one needs to assign a weight to each learner when making predictions. # -# Indeed, we defined a really simple scheme to assign sample weights and -# learner weights. However, there are statistical theories (like in AdaBoost) -# for how these sample and learner weights can be optimally calculated. +# Indeed, we defined a really simple scheme to assign sample weights and learner +# weights. However, there are statistical theories (like in AdaBoost) for how +# these sample and learner weights can be optimally calculated. # -# We will use the AdaBoost classifier implemented in scikit-learn and -# look at the underlying decision tree classifiers trained. +# We will use the AdaBoost classifier implemented in scikit-learn and look at +# the underlying decision tree classifiers trained. # %% from sklearn.ensemble import AdaBoostClassifier estimator = DecisionTreeClassifier(max_depth=3, random_state=0) -adaboost = AdaBoostClassifier(estimator=estimator, - n_estimators=3, algorithm="SAMME", - random_state=0) +adaboost = AdaBoostClassifier( + estimator=estimator, n_estimators=3, algorithm="SAMME", random_state=0 +) adaboost.fit(data, target) # %% @@ -177,11 +199,19 @@ plt.figure() # we convert `data` into a NumPy array to avoid a warning raised in scikit-learn DecisionBoundaryDisplay.from_estimator( - tree, data.to_numpy(), response_method="predict", cmap="RdBu", alpha=0.5 + tree, + data.to_numpy(), + response_method="predict", + cmap="RdBu", + alpha=0.5, + ) + sns.scatterplot( + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + data=penguins, + palette=palette, ) - sns.scatterplot(x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, data=penguins, - palette=palette) plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left") _ = plt.title(f"Decision tree trained at round {boosting_round}") @@ -199,6 +229,6 @@ # classifier also has the highest classification generalization performance. # # While AdaBoost is a nice algorithm to demonstrate the internal machinery of -# boosting algorithms, it is not the most efficient. -# This title is handed to the gradient-boosting decision tree (GBDT) algorithm, -# which we will discuss in the next unit. +# boosting algorithms, it is not the most efficient. 
This title is handed to the +# gradient-boosting decision tree (GBDT) algorithm, which we will discuss in the +# next unit. diff --git a/python_scripts/ensemble_bagging.py b/python_scripts/ensemble_bagging.py index 70efc88c3..84696187c 100644 --- a/python_scripts/ensemble_bagging.py +++ b/python_scripts/ensemble_bagging.py @@ -8,8 +8,8 @@ # %% [markdown] # # Bagging # -# This notebook introduces a very natural strategy to build ensembles of -# machine learning models named "bagging". +# This notebook introduces a very natural strategy to build ensembles of machine +# learning models named "bagging". # # "Bagging" stands for Bootstrap AGGregatING. It uses bootstrap resampling # (random sampling with replacement) to learn several models on random @@ -33,12 +33,13 @@ def generate_data(n_samples=30): x_min, x_max = -3, 3 x = rng.uniform(x_min, x_max, size=n_samples) noise = 4.0 * rng.randn(n_samples) - y = x ** 3 - 0.5 * (x + 1) ** 2 + noise + y = x**3 - 0.5 * (x + 1) ** 2 + noise y /= y.std() data_train = pd.DataFrame(x, columns=["Feature"]) data_test = pd.DataFrame( - np.linspace(x_max, x_min, num=300), columns=["Feature"]) + np.linspace(x_max, x_min, num=300), columns=["Feature"] + ) target_train = pd.Series(y, name="Target") return data_train, data_test, target_train @@ -49,8 +50,9 @@ def generate_data(n_samples=30): import seaborn as sns data_train, data_test, target_train = generate_data(n_samples=30) -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) _ = plt.title("Synthetic regression dataset") # %% [markdown] @@ -68,12 +70,13 @@ def generate_data(n_samples=30): # %% [markdown] # Remember that the term "test" here refers to data that was not used for -# training and computing an evaluation metric on such a synthetic test set -# would be meaningless. +# training and computing an evaluation metric on such a synthetic test set would +# be meaningless. # %% -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) plt.plot(data_test["Feature"], y_pred, label="Fitted tree") plt.legend() _ = plt.title("Predictions by a single decision tree") @@ -86,9 +89,9 @@ def generate_data(n_samples=30): # Given a dataset with `n` data points, bootstrapping corresponds to resampling # with replacement `n` out of such `n` data points uniformly at random. # -# As a result, the output of the bootstrap sampling procedure is another -# dataset with also n data points, but likely with duplicates. As a consequence, -# there are also data points from the original dataset that are never selected to +# As a result, the output of the bootstrap sampling procedure is another dataset +# with also n data points, but likely with duplicates. As a consequence, there +# are also data points from the original dataset that are never selected to # appear in a bootstrap sample (by chance). Those data points that are left away # are often referred to as the out-of-bag sample. 
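As a back-of-the-envelope check of this out-of-bag effect (an editorial aside, not part of the patch): the probability that a given point is never drawn in `n` draws with replacement is (1 - 1/n)^n, which tends to exp(-1), about 0.368, for large `n`, so roughly 63.2% of the original points appear in each bootstrap sample:

import numpy as np

n = 100_000
never_drawn = (1 - 1 / n) ** n  # tends to exp(-1) for large n
print(f"never drawn: {never_drawn:.3f}, exp(-1): {np.exp(-1):.3f}")
print(f"expected fraction of unique samples: {1 - never_drawn:.3f}")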
# @@ -101,7 +104,9 @@ def bootstrap_sample(data, target): # Indices corresponding to a sampling with replacement of the same sample # size than the original data bootstrap_indices = rng.choice( - np.arange(target.shape[0]), size=target.shape[0], replace=True, + np.arange(target.shape[0]), + size=target.shape[0], + replace=True, ) # In pandas, we need to use `.iloc` to extract rows using an integer # position index: @@ -120,23 +125,36 @@ def bootstrap_sample(data, target): for bootstrap_idx in range(n_bootstraps): # draw a bootstrap from the original data data_bootstrap, target_bootstrap = bootstrap_sample( - data_train, target_train, + data_train, + target_train, ) plt.figure() - plt.scatter(data_bootstrap["Feature"], target_bootstrap, - color="tab:blue", facecolors="none", - alpha=0.5, label="Resampled data", s=180, linewidth=5) - plt.scatter(data_train["Feature"], target_train, - color="black", s=60, - alpha=1, label="Original data") + plt.scatter( + data_bootstrap["Feature"], + target_bootstrap, + color="tab:blue", + facecolors="none", + alpha=0.5, + label="Resampled data", + s=180, + linewidth=5, + ) + plt.scatter( + data_train["Feature"], + target_train, + color="black", + s=60, + alpha=1, + label="Original data", + ) plt.title(f"Resampled data #{bootstrap_idx}") plt.legend() # %% [markdown] # # Observe that the 3 variations all share common points with the original -# dataset. Some of the points are randomly resampled several times and appear -# as darker blue circles. +# dataset. Some of the points are randomly resampled several times and appear as +# darker blue circles. # # The 3 generated bootstrap samples are all different from the original dataset # and from each other. To confirm this intuition, we can check the number of @@ -144,14 +162,17 @@ def bootstrap_sample(data, target): # %% data_train_huge, data_test_huge, target_train_huge = generate_data( - n_samples=100_000) + n_samples=100_000 +) data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample( - data_train_huge, target_train_huge) + data_train_huge, target_train_huge +) -ratio_unique_sample = (np.unique(data_bootstrap_sample).size / - data_bootstrap_sample.size) +ratio_unique_sample = ( + np.unique(data_bootstrap_sample).size / data_bootstrap_sample.size +) print( - f"Percentage of samples present in the original dataset: " + "Percentage of samples present in the original dataset: " f"{ratio_unique_sample * 100:.1f}%" ) @@ -162,9 +183,9 @@ def bootstrap_sample(data, target): # the same size as the original dataset, there will be many samples that are in # the bootstrap sample multiple times. # -# Using bootstrap we are able to generate many datasets, all slightly -# different. We can fit a decision tree for each of these datasets and they all -# shall be slightly different as well. +# Using bootstrap we are able to generate many datasets, all slightly different. +# We can fit a decision tree for each of these datasets and they all shall be +# slightly different as well. # %% bag_of_trees = [] @@ -172,7 +193,8 @@ def bootstrap_sample(data, target): tree = DecisionTreeRegressor(max_depth=3, random_state=0) data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample( - data_train, target_train) + data_train, target_train + ) tree.fit(data_bootstrap_sample, target_bootstrap_sample) bag_of_trees.append(tree) @@ -183,12 +205,18 @@ def bootstrap_sample(data, target): # different predictions. 
# %% -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) for tree_idx, tree in enumerate(bag_of_trees): tree_predictions = tree.predict(data_test) - plt.plot(data_test["Feature"], tree_predictions, linestyle="--", alpha=0.8, - label=f"Tree #{tree_idx} predictions") + plt.plot( + data_test["Feature"], + tree_predictions, + linestyle="--", + alpha=0.8, + label=f"Tree #{tree_idx} predictions", + ) plt.legend() _ = plt.title("Predictions of trees trained on different bootstraps") @@ -196,40 +224,49 @@ def bootstrap_sample(data, target): # %% [markdown] # ## Aggregating # -# Once our trees are fitted, we are able to get predictions for each of -# them. In regression, the most straightforward way to combine those -# predictions is just to average them: for a given test data point, we feed the -# input feature values to each of the `n` trained models in the ensemble and as -# a result compute `n` predicted values for the target variable. The final -# prediction of the ensemble for the test data point is the average of those -# `n` values. +# Once our trees are fitted, we are able to get predictions for each of them. In +# regression, the most straightforward way to combine those predictions is just +# to average them: for a given test data point, we feed the input feature values +# to each of the `n` trained models in the ensemble and as a result compute `n` +# predicted values for the target variable. The final prediction of the ensemble +# for the test data point is the average of those `n` values. # # We can plot the averaged predictions from the previous example. # %% -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) bag_predictions = [] for tree_idx, tree in enumerate(bag_of_trees): tree_predictions = tree.predict(data_test) - plt.plot(data_test["Feature"], tree_predictions, linestyle="--", alpha=0.8, - label=f"Tree #{tree_idx} predictions") + plt.plot( + data_test["Feature"], + tree_predictions, + linestyle="--", + alpha=0.8, + label=f"Tree #{tree_idx} predictions", + ) bag_predictions.append(tree_predictions) bag_predictions = np.mean(bag_predictions, axis=0) -plt.plot(data_test["Feature"], bag_predictions, label="Averaged predictions", - linestyle="-") +plt.plot( + data_test["Feature"], + bag_predictions, + label="Averaged predictions", + linestyle="-", +) plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") _ = plt.title("Predictions of bagged trees") # %% [markdown] # -# The unbroken red line shows the averaged predictions, which would be the -# final predictions given by our 'bag' of decision tree regressors. Note that -# the predictions of the ensemble is more stable because of the averaging -# operation. As a result, the bag of trees as a whole is less likely to overfit -# than the individual trees. +# The unbroken red line shows the averaged predictions, which would be the final +# predictions given by our 'bag' of decision tree regressors. Note that the +# predictions of the ensemble is more stable because of the averaging operation. +# As a result, the bag of trees as a whole is less likely to overfit than the +# individual trees. 
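For completeness, the classification counterpart of this averaging step is a majority vote over the predicted classes (or an average of predicted probabilities). A sketch of the voting idea, assuming a list of already fitted classifiers named `bag_of_classifiers` (a hypothetical name, not defined in the notebook):

# %%
import numpy as np

def majority_vote(classifiers, X):
    # Stack the per-model class predictions: shape (n_models, n_samples).
    # Assumes non-negative integer class labels, as required by np.bincount.
    per_model = np.stack([clf.predict(X) for clf in classifiers])
    # For each sample, keep the most frequently predicted class.
    return np.asarray([np.bincount(column).argmax() for column in per_model.T])

# Hypothetical usage: majority_vote(bag_of_classifiers, data_test)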
# # ## Bagging in scikit-learn # @@ -251,11 +288,12 @@ def bootstrap_sample(data, target): _ = bagged_trees.fit(data_train, target_train) # %% [markdown] -# # Let us visualize the predictions of the ensemble on the same interval of data: + # %% -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) bagged_trees_predictions = bagged_trees.predict(data_test) plt.plot(data_test["Feature"], bagged_trees_predictions) @@ -263,7 +301,6 @@ def bootstrap_sample(data, target): _ = plt.title("Predictions from a bagging classifier") # %% [markdown] -# # Because we use 100 trees in the ensemble, the average prediction is indeed # slightly smoother but very similar to our previous average plot. # @@ -277,19 +314,29 @@ def bootstrap_sample(data, target): label = "Predictions of individual trees" if tree_idx == 0 else None # we convert `data_test` into a NumPy array to avoid a warning raised in scikit-learn tree_predictions = tree.predict(data_test.to_numpy()) - plt.plot(data_test["Feature"], tree_predictions, linestyle="--", alpha=0.1, - color="tab:blue", label=label) + plt.plot( + data_test["Feature"], + tree_predictions, + linestyle="--", + alpha=0.1, + color="tab:blue", + label=label, + ) -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) bagged_trees_predictions = bagged_trees.predict(data_test) -plt.plot(data_test["Feature"], bagged_trees_predictions, - color="tab:orange", label="Predictions of ensemble") +plt.plot( + data_test["Feature"], + bagged_trees_predictions, + color="tab:orange", + label="Predictions of ensemble", +) _ = plt.legend() # %% [markdown] -# # We used a low value of the opacity parameter `alpha` to better appreciate the # overlap in the prediction functions of the individual trees. # @@ -319,11 +366,10 @@ def bootstrap_sample(data, target): ) # %% [markdown] -# -# This pipeline first scales the data to the 0-1 range with `MinMaxScaler`. -# Then it extracts degree-4 polynomial features. The resulting features will -# all stay in the 0-1 range by construction: if `x` lies in the 0-1 range then -# `x ** n` also lies in the 0-1 range for any value of `n`. +# This pipeline first scales the data to the 0-1 range with `MinMaxScaler`. Then +# it extracts degree-4 polynomial features. The resulting features will all stay +# in the 0-1 range by construction: if `x` lies in the 0-1 range then `x ** n` +# also lies in the 0-1 range for any value of `n`. # # Then the pipeline feeds the resulting non-linear features to a regularized # linear regression model for the final prediction of the target variable. 
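The construction of this pipeline is unchanged by the patch and therefore not shown in the hunk; a plausible reconstruction, with an illustrative regularization strength, would be:

# %%
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

polynomial_regressor = make_pipeline(
    MinMaxScaler(),  # map the single feature to the 0-1 range
    PolynomialFeatures(degree=4),  # expand it into degree-4 polynomial features
    Ridge(alpha=1e-2),  # regularized linear regression; alpha is illustrative
)

Such a model can then be passed as the `estimator` of a `BaggingRegressor`, exactly like the decision tree earlier in the notebook.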
@@ -349,26 +395,32 @@ def bootstrap_sample(data, target): # we convert `data_test` into a NumPy array to avoid a warning raised in scikit-learn regressor_predictions = regressor.predict(data_test.to_numpy()) base_model_line = plt.plot( - data_test["Feature"], regressor_predictions, linestyle="--", alpha=0.2, + data_test["Feature"], + regressor_predictions, + linestyle="--", + alpha=0.2, label="Predictions of base models" if i == 0 else None, - color="tab:blue" + color="tab:blue", ) -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) bagging_predictions = bagging.predict(data_test) -plt.plot(data_test["Feature"], bagging_predictions, - color="tab:orange", label="Predictions of ensemble") +plt.plot( + data_test["Feature"], + bagging_predictions, + color="tab:orange", + label="Predictions of ensemble", +) plt.ylim(target_train.min(), target_train.max()) plt.legend() _ = plt.title("Bagged polynomial regression") # %% [markdown] -# -# The predictions of this bagged polynomial regression model looks -# qualitatively better than the bagged trees. This is somewhat expected since -# the base model better reflects our knowledge of the true data generating -# process. +# The predictions of this bagged polynomial regression model looks qualitatively +# better than the bagged trees. This is somewhat expected since the base model +# better reflects our knowledge of the true data generating process. # # Again the different shades induced by the overlapping blue lines let us # appreciate the uncertainty in the prediction of the bagged ensemble. diff --git a/python_scripts/ensemble_ex_01.py b/python_scripts/ensemble_ex_01.py index 382d9fe11..cad686f26 100644 --- a/python_scripts/ensemble_ex_01.py +++ b/python_scripts/ensemble_ex_01.py @@ -28,7 +28,8 @@ data, target = fetch_california_housing(as_frame=True, return_X_y=True) target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0, test_size=0.5) + data, target, random_state=0, test_size=0.5 +) # %% [markdown] # ```{note} diff --git a/python_scripts/ensemble_ex_02.py b/python_scripts/ensemble_ex_02.py index 20a2ac8ce..147d28c82 100644 --- a/python_scripts/ensemble_ex_02.py +++ b/python_scripts/ensemble_ex_02.py @@ -29,7 +29,8 @@ target_name = "Body Mass (g)" data, target = penguins[[feature_name]], penguins[target_name] data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0) + data, target, random_state=0 +) # %% [markdown] # ```{note} diff --git a/python_scripts/ensemble_ex_03.py b/python_scripts/ensemble_ex_03.py index df36d1f23..3be02f899 100644 --- a/python_scripts/ensemble_ex_03.py +++ b/python_scripts/ensemble_ex_03.py @@ -31,7 +31,8 @@ data, target = fetch_california_housing(return_X_y=True, as_frame=True) target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0, test_size=0.5) + data, target, random_state=0, test_size=0.5 +) # %% [markdown] # ```{note} diff --git a/python_scripts/ensemble_gradient_boosting.py b/python_scripts/ensemble_gradient_boosting.py index 703d44e54..874c3ed20 100644 --- a/python_scripts/ensemble_gradient_boosting.py +++ b/python_scripts/ensemble_gradient_boosting.py @@ -12,10 +12,10 @@ # algorithm and contrast it with AdaBoost. 
#
# Gradient-boosting differs from AdaBoost due to the following reason: instead
-# of assigning weights to specific samples, GBDT will fit a decision tree on
-# the residuals error (hence the name "gradient") of the previous tree.
-# Therefore, each new tree in the ensemble predicts the error made by the
-# previous learner instead of predicting the target directly.
+# of assigning weights to specific samples, GBDT will fit a decision tree on the
+# residual errors (hence the name "gradient") of the previous tree. Therefore,
+# each new tree in the ensemble predicts the error made by the previous learner
+# instead of predicting the target directly.
#
# In this section, we will provide some intuition about the way learners are
# combined to give the final prediction. In this regard, let's go back to our
@@ -37,11 +37,12 @@ def generate_data(n_samples=50):
    len_x = x_max - x_min
    x = rng.rand(n_samples) * len_x - len_x / 2
    noise = rng.randn(n_samples) * 0.3
-    y = x ** 3 - 0.5 * x ** 2 + noise
+    y = x**3 - 0.5 * x**2 + noise

    data_train = pd.DataFrame(x, columns=["Feature"])
-    data_test = pd.DataFrame(np.linspace(x_max, x_min, num=300),
-                             columns=["Feature"])
+    data_test = pd.DataFrame(
+        np.linspace(x_max, x_min, num=300), columns=["Feature"]
+    )
    target_train = pd.Series(y, name="Target")

    return data_train, data_test, target_train

@@ -53,14 +54,15 @@ def generate_data(n_samples=50):
import matplotlib.pyplot as plt
import seaborn as sns

-sns.scatterplot(x=data_train["Feature"], y=target_train, color="black",
-                alpha=0.5)
+sns.scatterplot(
+    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
+)
_ = plt.title("Synthetic regression dataset")

# %% [markdown]
-# As we previously discussed, boosting will be based on assembling a sequence
-# of learners. We will start by creating a decision tree regressor. We will set
-# the depth of the tree so that the resulting learner will underfit the data.
+# As we previously discussed, boosting will be based on assembling a sequence of
+# learners. We will start by creating a decision tree regressor. We will set the
+# depth of the tree so that the resulting learner will underfit the data.

# %%
from sklearn.tree import DecisionTreeRegressor

@@ -72,26 +74,28 @@ def generate_data(n_samples=50):
target_test_predicted = tree.predict(data_test)

# %% [markdown]
-# Using the term "test" here refers to data that was not used for training.
-# It should not be confused with data coming from a train-test split, as it
-# was generated in equally-spaced intervals for the visual evaluation of the
+# Here the term "test" refers to data that was not used for training. It
+# should not be confused with data coming from a train-test split, as it was
+# generated in equally-spaced intervals for the visual evaluation of the
# predictions.
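Coming back to the residual-fitting idea described at the top of this notebook: before it is unfolded tree by tree below, it can be summarized by a compact loop. This is a simplified sketch of squared-error boosting with a hypothetical `learning_rate`; it omits the initialization and other refinements of the real algorithm:

# %%
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_residual_boosting(X, y, n_trees=10, learning_rate=0.5):
    trees, residuals = [], np.asarray(y, dtype=float).copy()
    for _ in range(n_trees):
        tree = DecisionTreeRegressor(max_depth=3).fit(X, residuals)
        # Each new tree is trained on what the current ensemble still gets wrong.
        residuals -= learning_rate * tree.predict(X)
        trees.append(tree)
    return trees

def predict_residual_boosting(trees, X, learning_rate=0.5):
    # The ensemble prediction is the shrunk sum of all tree predictions.
    return learning_rate * np.sum([tree.predict(X) for tree in trees], axis=0)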
# %% # plot the data -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) # plot the predictions line_predictions = plt.plot(data_test["Feature"], target_test_predicted, "--") # plot the residuals -for value, true, predicted in zip(data_train["Feature"], - target_train, - target_train_predicted): +for value, true, predicted in zip( + data_train["Feature"], target_train, target_train_predicted +): lines_residuals = plt.plot([value, value], [true, predicted], color="red") -plt.legend([line_predictions[0], lines_residuals[0]], - ["Fitted tree", "Residuals"]) +plt.legend( + [line_predictions[0], lines_residuals[0]], ["Fitted tree", "Residuals"] +) _ = plt.title("Prediction function together \nwith errors on the training set") # %% [markdown] @@ -104,11 +108,11 @@ def generate_data(n_samples=50): # between the predictions and the ground-truth data. We represent these errors, # called "Residuals", by unbroken red lines. # -# Indeed, our initial tree was not expressive enough to handle the complexity -# of the data, as shown by the residuals. In a gradient-boosting algorithm, the -# idea is to create a second tree which, given the same data `data`, will try -# to predict the residuals instead of the vector `target`. We would therefore -# have a tree that is able to predict the errors made by the initial tree. +# Indeed, our initial tree was not expressive enough to handle the complexity of +# the data, as shown by the residuals. In a gradient-boosting algorithm, the +# idea is to create a second tree which, given the same data `data`, will try to +# predict the residuals instead of the vector `target`. We would therefore have +# a tree that is able to predict the errors made by the initial tree. # # Let's train such a tree. @@ -124,29 +128,33 @@ def generate_data(n_samples=50): # %% sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5) line_predictions = plt.plot( - data_test["Feature"], target_test_predicted_residuals, "--") + data_test["Feature"], target_test_predicted_residuals, "--" +) # plot the residuals of the predicted residuals -for value, true, predicted in zip(data_train["Feature"], - residuals, - target_train_predicted_residuals): +for value, true, predicted in zip( + data_train["Feature"], residuals, target_train_predicted_residuals +): lines_residuals = plt.plot([value, value], [true, predicted], color="red") -plt.legend([line_predictions[0], lines_residuals[0]], - ["Fitted tree", "Residuals"], bbox_to_anchor=(1.05, 0.8), - loc="upper left") +plt.legend( + [line_predictions[0], lines_residuals[0]], + ["Fitted tree", "Residuals"], + bbox_to_anchor=(1.05, 0.8), + loc="upper left", +) _ = plt.title("Prediction of the previous residuals") # %% [markdown] # We see that this new tree only manages to fit some of the residuals. We will -# focus on a specific sample from the training set (i.e. we know that the -# sample will be well predicted using two successive trees). We will use this -# sample to explain how the predictions of both trees are combined. Let's first -# select this sample in `data_train`. +# focus on a specific sample from the training set (i.e. we know that the sample +# will be well predicted using two successive trees). We will use this sample to +# explain how the predictions of both trees are combined. Let's first select +# this sample in `data_train`. 
# %% sample = data_train.iloc[[-2]] -x_sample = sample['Feature'].iloc[0] +x_sample = sample["Feature"].iloc[0] target_true = target_train.iloc[-2] target_true_residual = residuals.iloc[-2] @@ -161,24 +169,26 @@ def generate_data(n_samples=50): # * the predictions # * the residuals -sns.scatterplot(x=data_train["Feature"], y=target_train, color="black", - alpha=0.5) +sns.scatterplot( + x=data_train["Feature"], y=target_train, color="black", alpha=0.5 +) plt.plot(data_test["Feature"], target_test_predicted, "--") -for value, true, predicted in zip(data_train["Feature"], - target_train, - target_train_predicted): +for value, true, predicted in zip( + data_train["Feature"], target_train, target_train_predicted +): lines_residuals = plt.plot([value, value], [true, predicted], color="red") # Highlight the sample of interest -plt.scatter(sample, target_true, label="Sample of interest", - color="tab:orange", s=200) +plt.scatter( + sample, target_true, label="Sample of interest", color="tab:orange", s=200 +) plt.xlim([-1, 0]) plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") _ = plt.title("Tree predictions") # %% [markdown] -# Now, let's plot the residuals information. We will plot the residuals -# computed from the first decision tree and show the residual predictions. +# Now, let's plot the residuals information. We will plot the residuals computed +# from the first decision tree and show the residual predictions. # %% # Plot the previous information: @@ -186,17 +196,21 @@ def generate_data(n_samples=50): # * the residual predictions # * the residuals of the residual predictions -sns.scatterplot(x=data_train["Feature"], y=residuals, - color="black", alpha=0.5) +sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5) plt.plot(data_test["Feature"], target_test_predicted_residuals, "--") -for value, true, predicted in zip(data_train["Feature"], - residuals, - target_train_predicted_residuals): +for value, true, predicted in zip( + data_train["Feature"], residuals, target_train_predicted_residuals +): lines_residuals = plt.plot([value, value], [true, predicted], color="red") # Highlight the sample of interest -plt.scatter(sample, target_true_residual, label="Sample of interest", - color="tab:orange", s=200) +plt.scatter( + sample, + target_true_residual, + label="Sample of interest", + color="tab:orange", + s=200, +) plt.xlim([-1, 0]) plt.legend() _ = plt.title("Prediction of the residuals") @@ -209,12 +223,13 @@ def generate_data(n_samples=50): # and compare it with the true value. # %% -print(f"True value to predict for " - f"f(x={x_sample:.3f}) = {target_true:.3f}") +print(f"True value to predict for f(x={x_sample:.3f}) = {target_true:.3f}") y_pred_first_tree = tree.predict(sample)[0] -print(f"Prediction of the first decision tree for x={x_sample:.3f}: " - f"y={y_pred_first_tree:.3f}") +print( + f"Prediction of the first decision tree for x={x_sample:.3f}: " + f"y={y_pred_first_tree:.3f}" +) print(f"Error of the tree: {target_true - y_pred_first_tree:.3f}") # %% [markdown] @@ -222,8 +237,10 @@ def generate_data(n_samples=50): # tree to try to predict this residual. 
# %% -print(f"Prediction of the residual for x={x_sample:.3f}: " - f"{tree_residuals.predict(sample)[0]:.3f}") +print( + f"Prediction of the residual for x={x_sample:.3f}: " + f"{tree_residuals.predict(sample)[0]:.3f}" +) # %% [markdown] # We see that our second tree is capable of predicting the exact residual @@ -234,17 +251,19 @@ def generate_data(n_samples=50): y_pred_first_and_second_tree = ( y_pred_first_tree + tree_residuals.predict(sample)[0] ) -print(f"Prediction of the first and second decision trees combined for " - f"x={x_sample:.3f}: y={y_pred_first_and_second_tree:.3f}") +print( + "Prediction of the first and second decision trees combined for " + f"x={x_sample:.3f}: y={y_pred_first_and_second_tree:.3f}" +) print(f"Error of the tree: {target_true - y_pred_first_and_second_tree:.3f}") # %% [markdown] # We chose a sample for which only two trees were enough to make the perfect # prediction. However, we saw in the previous plot that two trees were not -# enough to correct the residuals of all samples. Therefore, one needs to -# add several trees to the ensemble to successfully correct the error -# (i.e. the second tree corrects the first tree's error, while the third tree -# corrects the second tree's error and so on). +# enough to correct the residuals of all samples. Therefore, one needs to add +# several trees to the ensemble to successfully correct the error (i.e. the +# second tree corrects the first tree's error, while the third tree corrects the +# second tree's error and so on). # # We will compare the generalization performance of random-forest and gradient # boosting on the California housing dataset. @@ -261,45 +280,53 @@ def generate_data(n_samples=50): gradient_boosting = GradientBoostingRegressor(n_estimators=200) cv_results_gbdt = cross_validate( - gradient_boosting, data, target, scoring="neg_mean_absolute_error", + gradient_boosting, + data, + target, + scoring="neg_mean_absolute_error", n_jobs=2, ) # %% print("Gradient Boosting Decision Tree") -print(f"Mean absolute error via cross-validation: " - f"{-cv_results_gbdt['test_score'].mean():.3f} ยฑ " - f"{cv_results_gbdt['test_score'].std():.3f} k$") -print(f"Average fit time: " - f"{cv_results_gbdt['fit_time'].mean():.3f} seconds") -print(f"Average score time: " - f"{cv_results_gbdt['score_time'].mean():.3f} seconds") +print( + "Mean absolute error via cross-validation: " + f"{-cv_results_gbdt['test_score'].mean():.3f} ยฑ " + f"{cv_results_gbdt['test_score'].std():.3f} k$" +) +print(f"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds") +print( + f"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds" +) # %% from sklearn.ensemble import RandomForestRegressor random_forest = RandomForestRegressor(n_estimators=200, n_jobs=2) cv_results_rf = cross_validate( - random_forest, data, target, scoring="neg_mean_absolute_error", + random_forest, + data, + target, + scoring="neg_mean_absolute_error", n_jobs=2, ) # %% print("Random Forest") -print(f"Mean absolute error via cross-validation: " - f"{-cv_results_rf['test_score'].mean():.3f} ยฑ " - f"{cv_results_rf['test_score'].std():.3f} k$") -print(f"Average fit time: " - f"{cv_results_rf['fit_time'].mean():.3f} seconds") -print(f"Average score time: " - f"{cv_results_rf['score_time'].mean():.3f} seconds") +print( + "Mean absolute error via cross-validation: " + f"{-cv_results_rf['test_score'].mean():.3f} ยฑ " + f"{cv_results_rf['test_score'].std():.3f} k$" +) +print(f"Average fit time: {cv_results_rf['fit_time'].mean():.3f} seconds") 
+print(f"Average score time: {cv_results_rf['score_time'].mean():.3f} seconds") # %% [markdown] # In term of computation performance, the forest can be parallelized and will -# benefit from using multiple cores of the CPU. In terms of scoring -# performance, both algorithms lead to very close results. +# benefit from using multiple cores of the CPU. In terms of scoring performance, +# both algorithms lead to very close results. # -# However, we see that the gradient boosting is a very fast algorithm to -# predict compared to random forest. This is due to the fact that gradient -# boosting uses shallow trees. We will go into details in the next notebook -# about the hyperparameters to consider when optimizing ensemble methods. +# However, we see that the gradient boosting is a very fast algorithm to predict +# compared to random forest. This is due to the fact that gradient boosting uses +# shallow trees. We will go into details in the next notebook about the +# hyperparameters to consider when optimizing ensemble methods. diff --git a/python_scripts/ensemble_hist_gradient_boosting.py b/python_scripts/ensemble_hist_gradient_boosting.py index 2b233051d..7a40a569d 100644 --- a/python_scripts/ensemble_hist_gradient_boosting.py +++ b/python_scripts/ensemble_hist_gradient_boosting.py @@ -7,6 +7,7 @@ # %% [markdown] # # Speeding-up gradient-boosting +# # In this notebook, we present a modified version of gradient boosting which # uses a reduced number of splits when building the different trees. This # algorithm is called "histogram gradient boosting" in scikit-learn. @@ -16,18 +17,18 @@ # Therefore, the algorithm scales efficiently with both the number of cores and # the number of samples. # -# In gradient-boosting, the algorithm is a sequential algorithm. It requires -# the `N-1` trees to have been fit to be able to fit the tree at stage `N`. +# In gradient-boosting, the algorithm is a sequential algorithm. It requires the +# `N-1` trees to have been fit to be able to fit the tree at stage `N`. # Therefore, the algorithm is quite computationally expensive. The most -# expensive part in this algorithm is the search for the best split in the -# tree which is a brute-force approach: all possible split are evaluated and -# the best one is picked. We explained this process in the notebook "tree in -# depth", which you can refer to. +# expensive part in this algorithm is the search for the best split in the tree +# which is a brute-force approach: all possible split are evaluated and the best +# one is picked. We explained this process in the notebook "tree in depth", +# which you can refer to. # # To accelerate the gradient-boosting algorithm, one could reduce the number of -# splits to be evaluated. As a consequence, the generalization performance of such -# a tree would be reduced. However, since we are combining several trees in a -# gradient-boosting, we can add more estimators to overcome this issue. +# splits to be evaluated. As a consequence, the generalization performance of +# such a tree would be reduced. However, since we are combining several trees in +# a gradient-boosting, we can add more estimators to overcome this issue. # # We will make a naive implementation of such algorithm using building blocks # from scikit-learn. First, we will load the California housing dataset. 
@@ -53,26 +54,31 @@ gradient_boosting = GradientBoostingRegressor(n_estimators=200) cv_results_gbdt = cross_validate( - gradient_boosting, data, target, scoring="neg_mean_absolute_error", - n_jobs=2 + gradient_boosting, + data, + target, + scoring="neg_mean_absolute_error", + n_jobs=2, ) # %% print("Gradient Boosting Decision Tree") -print(f"Mean absolute error via cross-validation: " - f"{-cv_results_gbdt['test_score'].mean():.3f} ยฑ " - f"{cv_results_gbdt['test_score'].std():.3f} k$") -print(f"Average fit time: " - f"{cv_results_gbdt['fit_time'].mean():.3f} seconds") -print(f"Average score time: " - f"{cv_results_gbdt['score_time'].mean():.3f} seconds") +print( + "Mean absolute error via cross-validation: " + f"{-cv_results_gbdt['test_score'].mean():.3f} ยฑ " + f"{cv_results_gbdt['test_score'].std():.3f} k$" +) +print(f"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds") +print( + f"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds" +) # %% [markdown] # We recall that a way of accelerating the gradient boosting is to reduce the # number of split considered within the tree building. One way is to bin the # data before to give them into the gradient boosting. A transformer called -# `KBinsDiscretizer` is doing such transformation. Thus, we can pipeline -# this preprocessing with the gradient boosting. +# `KBinsDiscretizer` is doing such transformation. Thus, we can pipeline this +# preprocessing with the gradient boosting. # # We can first demonstrate the transformation done by the `KBinsDiscretizer`. @@ -81,7 +87,8 @@ from sklearn.preprocessing import KBinsDiscretizer discretizer = KBinsDiscretizer( - n_bins=256, encode="ordinal", strategy="quantile") + n_bins=256, encode="ordinal", strategy="quantile" +) data_trans = discretizer.fit_transform(data) data_trans @@ -91,48 +98,54 @@ # the features, we requested too much bins in regard of the data dispersion # for those features. The smallest bins will be removed. # ``` -# We see that the discretizer transforms the original data into integral -# values (even though they are encoded using a floating-point representation). -# Each value represents the bin index when the distribution by quantile is -# performed. We can check the number of bins per feature. +# We see that the discretizer transforms the original data into integral values +# (even though they are encoded using a floating-point representation). Each +# value represents the bin index when the distribution by quantile is performed. +# We can check the number of bins per feature. # %% [len(np.unique(col)) for col in data_trans.T] # %% [markdown] # After this transformation, we see that we have at most 256 unique values per -# features. Now, we will use this transformer to discretize data before -# training the gradient boosting regressor. +# features. Now, we will use this transformer to discretize data before training +# the gradient boosting regressor. 
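If one wants to see where the bin boundaries fall, the fitted transformer exposes them through its `bin_edges_` attribute, one array of edges per feature. For instance:

# %%
# Quantile-based bin edges of the first feature (at most n_bins + 1 values).
print(discretizer.bin_edges_[0])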
# %% from sklearn.pipeline import make_pipeline gradient_boosting = make_pipeline( - discretizer, GradientBoostingRegressor(n_estimators=200)) + discretizer, GradientBoostingRegressor(n_estimators=200) +) cv_results_gbdt = cross_validate( - gradient_boosting, data, target, scoring="neg_mean_absolute_error", + gradient_boosting, + data, + target, + scoring="neg_mean_absolute_error", n_jobs=2, ) # %% print("Gradient Boosting Decision Tree with KBinsDiscretizer") -print(f"Mean absolute error via cross-validation: " - f"{-cv_results_gbdt['test_score'].mean():.3f} ยฑ " - f"{cv_results_gbdt['test_score'].std():.3f} k$") -print(f"Average fit time: " - f"{cv_results_gbdt['fit_time'].mean():.3f} seconds") -print(f"Average score time: " - f"{cv_results_gbdt['score_time'].mean():.3f} seconds") +print( + "Mean absolute error via cross-validation: " + f"{-cv_results_gbdt['test_score'].mean():.3f} ยฑ " + f"{cv_results_gbdt['test_score'].std():.3f} k$" +) +print(f"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds") +print( + f"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds" +) # %% [markdown] -# Here, we see that the fit time has been reduced but that the -# generalization performance of the model is identical. Scikit-learn provides -# specific classes which are even more optimized for large dataset, called +# Here, we see that the fit time has been reduced but that the generalization +# performance of the model is identical. Scikit-learn provides specific classes +# which are even more optimized for large dataset, called # `HistGradientBoostingClassifier` and `HistGradientBoostingRegressor`. Each # feature in the dataset `data` is first binned by computing histograms, which # are later used to evaluate the potential splits. The number of splits to -# evaluate is then much smaller. This algorithm becomes much more efficient -# than gradient boosting when the dataset has over 10,000 samples. +# evaluate is then much smaller. This algorithm becomes much more efficient than +# gradient boosting when the dataset has over 10,000 samples. # # Below we will give an example for a large dataset and we will compare # computation times with the experiment of the previous section. @@ -141,23 +154,29 @@ from sklearn.ensemble import HistGradientBoostingRegressor histogram_gradient_boosting = HistGradientBoostingRegressor( - max_iter=200, random_state=0) + max_iter=200, random_state=0 +) cv_results_hgbdt = cross_validate( - histogram_gradient_boosting, data, target, - scoring="neg_mean_absolute_error", n_jobs=2, + histogram_gradient_boosting, + data, + target, + scoring="neg_mean_absolute_error", + n_jobs=2, ) # %% print("Histogram Gradient Boosting Decision Tree") -print(f"Mean absolute error via cross-validation: " - f"{-cv_results_hgbdt['test_score'].mean():.3f} ยฑ " - f"{cv_results_hgbdt['test_score'].std():.3f} k$") -print(f"Average fit time: " - f"{cv_results_hgbdt['fit_time'].mean():.3f} seconds") -print(f"Average score time: " - f"{cv_results_hgbdt['score_time'].mean():.3f} seconds") +print( + "Mean absolute error via cross-validation: " + f"{-cv_results_hgbdt['test_score'].mean():.3f} ยฑ " + f"{cv_results_hgbdt['test_score'].std():.3f} k$" +) +print(f"Average fit time: {cv_results_hgbdt['fit_time'].mean():.3f} seconds") +print( + f"Average score time: {cv_results_hgbdt['score_time'].mean():.3f} seconds" +) # %% [markdown] -# The histogram gradient-boosting is the best algorithm in terms of score. 
-# It will also scale when the number of samples increases, while the normal +# The histogram gradient-boosting is the best algorithm in terms of score. It +# will also scale when the number of samples increases, while the normal # gradient-boosting will not. diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 6c90bc940..7fd079558 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -32,7 +32,8 @@ data, target = fetch_california_housing(return_X_y=True, as_frame=True) target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0) + data, target, random_state=0 +) # %% [markdown] # ## Random forest @@ -95,8 +96,12 @@ "min_samples_leaf": [1, 2, 5, 10, 20, 50, 100], } search_cv = RandomizedSearchCV( - RandomForestRegressor(n_jobs=2), param_distributions=param_distributions, - scoring="neg_mean_absolute_error", n_iter=10, random_state=0, n_jobs=2, + RandomForestRegressor(n_jobs=2), + param_distributions=param_distributions, + scoring="neg_mean_absolute_error", + n_iter=10, + random_state=0, + n_jobs=2, ) search_cv.fit(data_train, target_train) @@ -126,7 +131,9 @@ # %% error = -search_cv.score(data_test, target_test) -print(f"On average, our random forest regressor makes an error of {error:.2f} k$") +print( + f"On average, our random forest regressor makes an error of {error:.2f} k$" +) # %% [markdown] # ## Gradient-boosting decision trees @@ -136,8 +143,8 @@ # `learning_rate`, and `max_depth` or `max_leaf_nodes` (as previously discussed # random forest). # -# Let's first discuss the `max_depth` (or `max_leaf_nodes`) parameter. We saw -# in the section on gradient-boosting that the algorithm fits the error of the +# Let's first discuss the `max_depth` (or `max_leaf_nodes`) parameter. We saw in +# the section on gradient-boosting that the algorithm fits the error of the # previous tree in the ensemble. Thus, fitting fully grown trees would be # detrimental. Indeed, the first tree of the ensemble would perfectly fit # (overfit) the data and thus no subsequent tree would be required, since there @@ -170,8 +177,12 @@ "learning_rate": loguniform(0.01, 1), } search_cv = RandomizedSearchCV( - GradientBoostingRegressor(), param_distributions=param_distributions, - scoring="neg_mean_absolute_error", n_iter=20, random_state=0, n_jobs=2 + GradientBoostingRegressor(), + param_distributions=param_distributions, + scoring="neg_mean_absolute_error", + n_iter=20, + random_state=0, + n_jobs=2, ) search_cv.fit(data_train, target_train) @@ -191,10 +202,10 @@ # # In this search, we see that the `learning_rate` is required to be large # enough, i.e. > 0.1. We also observe that for the best ranked models, having a -# smaller `learning_rate`, will require more trees or a larger number of -# leaves for each tree. However, it is particularly difficult to draw -# more detailed conclusions since the best value of an hyperparameter depends -# on the other hyperparameter values. +# smaller `learning_rate`, will require more trees or a larger number of leaves +# for each tree. However, it is particularly difficult to draw more detailed +# conclusions since the best value of an hyperparameter depends on the other +# hyperparameter values. 
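One way to inspect this interplay, assuming the `search_cv` object fitted above, is to load its `cv_results_` attribute into a dataframe and sort by rank:

# %%
import pandas as pd

cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
columns = [
    "param_n_estimators",
    "param_max_leaf_nodes",
    "param_learning_rate",
    "mean_test_error",
]
# Best-ranked parameter combinations first.
print(cv_results.sort_values("rank_test_score")[columns].head())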
# %% [markdown] # Now we estimate the generalization performance of the best model using the diff --git a/python_scripts/ensemble_introduction.py b/python_scripts/ensemble_introduction.py index 3b8e91d6b..21a9dce41 100644 --- a/python_scripts/ensemble_introduction.py +++ b/python_scripts/ensemble_introduction.py @@ -13,8 +13,8 @@ # models result in more powerful and robust models with less hassle. # # We will start by loading the california housing dataset. We recall that the -# goal in this dataset is to predict the median house value in some district -# in California based on demographic and geographic data. +# goal in this dataset is to predict the median house value in some district in +# California based on demographic and geographic data. # %% [markdown] # ```{note} @@ -40,8 +40,10 @@ cv_results = cross_validate(tree, data, target, n_jobs=2) scores = cv_results["test_score"] -print(f"R2 score obtained by cross-validation: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f}") +print( + "R2 score obtained by cross-validation: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f}" +) # %% [markdown] # We obtain fair results. However, as we previously presented in the "tree in @@ -67,21 +69,29 @@ param_grid = { "max_depth": [5, 8, None], "min_samples_split": [2, 10, 30, 50], - "min_samples_leaf": [0.01, 0.05, 0.1, 1]} + "min_samples_leaf": [0.01, 0.05, 0.1, 1], +} cv = 3 -tree = GridSearchCV(DecisionTreeRegressor(random_state=0), - param_grid=param_grid, cv=cv, n_jobs=2) -cv_results = cross_validate(tree, data, target, n_jobs=2, - return_estimator=True) +tree = GridSearchCV( + DecisionTreeRegressor(random_state=0), + param_grid=param_grid, + cv=cv, + n_jobs=2, +) +cv_results = cross_validate( + tree, data, target, n_jobs=2, return_estimator=True +) scores = cv_results["test_score"] -print(f"R2 score obtained by cross-validation: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f}") +print( + "R2 score obtained by cross-validation: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f}" +) # %% [markdown] -# We see that optimizing the hyperparameters will have a positive effect -# on the generalization performance. However, it comes with a higher computational +# We see that optimizing the hyperparameters will have a positive effect on the +# generalization performance. However, it comes with a higher computational # cost. # %% [markdown] @@ -89,14 +99,14 @@ # the tuning of the parameters and investigate the results. # # Now we will use an ensemble method called bagging. More details about this -# method will be discussed in the next section. In short, this method will use -# a base regressor (i.e. decision tree regressors) and will train several of -# them on a slightly modified version of the training set. Then, the -# predictions of all these base regressors will be combined by averaging. +# method will be discussed in the next section. In short, this method will use a +# base regressor (i.e. decision tree regressors) and will train several of them +# on a slightly modified version of the training set. Then, the predictions of +# all these base regressors will be combined by averaging. # # Here, we will use 20 decision trees and check the fitting time as well as the -# generalization performance on the left-out testing data. It is important to note -# that we are not going to tune any parameter of the decision tree. +# generalization performance on the left-out testing data. It is important to +# note that we are not going to tune any parameter of the decision tree. 
# %% # %%time @@ -104,26 +114,28 @@ estimator = DecisionTreeRegressor(random_state=0) bagging_regressor = BaggingRegressor( - estimator=estimator, n_estimators=20, random_state=0) + estimator=estimator, n_estimators=20, random_state=0 +) cv_results = cross_validate(bagging_regressor, data, target, n_jobs=2) scores = cv_results["test_score"] -print(f"R2 score obtained by cross-validation: " - f"{scores.mean():.3f} ยฑ {scores.std():.3f}") +print( + "R2 score obtained by cross-validation: " + f"{scores.mean():.3f} ยฑ {scores.std():.3f}" +) # %% [markdown] # Without searching for optimal hyperparameters, the overall generalization -# performance of the bagging regressor is better than a single decision tree. -# In addition, the computational cost is reduced in comparison of seeking -# for the optimal hyperparameters. +# performance of the bagging regressor is better than a single decision tree. In +# addition, the computational cost is reduced in comparison of seeking for the +# optimal hyperparameters. # # This shows the motivation behind the use of an ensemble learner: it gives a # relatively good baseline with decent generalization performance without any # parameter tuning. # -# Now, we will discuss in detail two ensemble families: bagging and -# boosting: +# Now, we will discuss in detail two ensemble families: bagging and boosting: # # * ensemble using bootstrap (e.g. bagging and random-forest); # * ensemble using boosting (e.g. adaptive boosting and gradient-boosting diff --git a/python_scripts/ensemble_random_forest.py b/python_scripts/ensemble_random_forest.py index fa1aca90f..9d8234dab 100644 --- a/python_scripts/ensemble_random_forest.py +++ b/python_scripts/ensemble_random_forest.py @@ -12,9 +12,9 @@ # differences with the bagging ensembles. # # Random forests are a popular model in machine learning. They are a -# modification of the bagging algorithm. In bagging, any classifier or -# regressor can be used. In random forests, the base classifier or regressor -# is always a decision tree. +# modification of the bagging algorithm. In bagging, any classifier or regressor +# can be used. In random forests, the base classifier or regressor is always a +# decision tree. # # Random forests have another particularity: when training a tree, the search # for the best split is done only on a subset of the original features taken at @@ -30,8 +30,8 @@ # # ## A look at random forests # -# We will illustrate the usage of a random forest classifier on the adult -# census dataset. +# We will illustrate the usage of a random forest classifier on the adult census +# dataset. # %% import pandas as pd @@ -53,12 +53,12 @@ # features using an `OrdinalEncoder` since tree-based models can work very # efficiently with such a naive representation of categorical variables. # -# Since there are rare categories in this dataset we need to specifically -# encode unknown categories at prediction time in order to be able to use +# Since there are rare categories in this dataset we need to specifically encode +# unknown categories at prediction time in order to be able to use # cross-validation. Otherwise some rare categories could only be present on the # validation side of the cross-validation split and the `OrdinalEncoder` would -# raise an error when calling its `transform` method with the data points -# of the validation set. +# raise an error when calling its `transform` method with the data points of the +# validation set. 
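The failure mode described above is easy to reproduce on a toy example. With `handle_unknown="use_encoded_value"`, a category unseen at fit time is mapped to the chosen sentinel instead of raising an error (a minimal sketch with made-up categories):

# %%
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train = pd.DataFrame({"workclass": ["Private", "State-gov", "Private"]})
valid = pd.DataFrame({"workclass": ["Never-worked"]})  # absent at fit time

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(train)
print(encoder.transform(valid))  # [[-1.]] instead of a ValueError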
# %% from sklearn.preprocessing import OrdinalEncoder @@ -69,13 +69,13 @@ ) preprocessor = make_column_transformer( (categorical_encoder, make_column_selector(dtype_include=object)), - remainder="passthrough" + remainder="passthrough", ) # %% [markdown] # -# We will first give a simple example where we will train a single decision -# tree classifier and check its generalization performance via cross-validation. +# We will first give a simple example where we will train a single decision tree +# classifier and check its generalization performance via cross-validation. # %% from sklearn.pipeline import make_pipeline @@ -88,8 +88,10 @@ scores_tree = cross_val_score(tree, data, target) -print(f"Decision tree classifier: " - f"{scores_tree.mean():.3f} ยฑ {scores_tree.std():.3f}") +print( + "Decision tree classifier: " + f"{scores_tree.mean():.3f} ยฑ {scores_tree.std():.3f}" +) # %% [markdown] # @@ -105,42 +107,46 @@ preprocessor, BaggingClassifier( estimator=DecisionTreeClassifier(random_state=0), - n_estimators=50, n_jobs=2, random_state=0, - ) + n_estimators=50, + n_jobs=2, + random_state=0, + ), ) # %% scores_bagged_trees = cross_val_score(bagged_trees, data, target) -print(f"Bagged decision tree classifier: " - f"{scores_bagged_trees.mean():.3f} ยฑ {scores_bagged_trees.std():.3f}") +print( + "Bagged decision tree classifier: " + f"{scores_bagged_trees.mean():.3f} ยฑ {scores_bagged_trees.std():.3f}" +) # %% [markdown] -# # Note that the generalization performance of the bagged trees is already much # better than the performance of a single tree. # # Now, we will use a random forest. You will observe that we do not need to -# specify any `estimator` because the estimator is forced to be a decision -# tree. Thus, we just specify the desired number of trees in the forest. +# specify any `estimator` because the estimator is forced to be a decision tree. +# Thus, we just specify the desired number of trees in the forest. # %% from sklearn.ensemble import RandomForestClassifier random_forest = make_pipeline( preprocessor, - RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=0) + RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=0), ) # %% scores_random_forest = cross_val_score(random_forest, data, target) -print(f"Random forest classifier: " - f"{scores_random_forest.mean():.3f} ยฑ " - f"{scores_random_forest.std():.3f}") +print( + "Random forest classifier: " + f"{scores_random_forest.mean():.3f} ยฑ " + f"{scores_random_forest.std():.3f}" +) # %% [markdown] -# # It seems that the random forest is performing slightly better than the bagged # trees possibly due to the randomized selection of the features which # decorrelates the prediction errors of individual trees and as a consequence @@ -162,17 +168,17 @@ # # However, `max_features` is one of the hyperparameters to consider when tuning # a random forest: -# - too much randomness in the trees can lead to underfitted base models and -# can be detrimental for the ensemble as a whole, +# - too much randomness in the trees can lead to underfitted base models and can +# be detrimental for the ensemble as a whole, # - too few randomness in the trees leads to more correlation of the prediction -# errors and as a result reduce the benefits of the averaging step in terms -# of overfitting control. +# errors and as a result reduce the benefits of the averaging step in terms of +# overfitting control. # # In scikit-learn, the bagging classes also expose a `max_features` parameter. 
# However, `BaggingClassifier` and `BaggingRegressor` are agnostic with respect # to their base model and therefore random feature subsampling can only happen -# once before fitting each base model instead of several times per base model -# as is the case when adding splits to a given tree. +# once before fitting each base model instead of several times per base model as +# is the case when adding splits to a given tree. # # We summarize these details in the following table: # diff --git a/python_scripts/ensemble_sol_01.py b/python_scripts/ensemble_sol_01.py index 57d675788..c32ca1cc0 100644 --- a/python_scripts/ensemble_sol_01.py +++ b/python_scripts/ensemble_sol_01.py @@ -11,8 +11,8 @@ # The aim of this notebook is to investigate if we can tune the hyperparameters # of a bagging regressor and evaluate the gain obtained. # -# We will load the California housing dataset and split it into a training and -# a testing set. +# We will load the California housing dataset and split it into a training and a +# testing set. # %% from sklearn.datasets import fetch_california_housing @@ -21,7 +21,8 @@ data, target = fetch_california_housing(as_frame=True, return_X_y=True) target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0, test_size=0.5) + data, target, random_state=0, test_size=0.5 +) # %% [markdown] # ```{note} @@ -30,9 +31,9 @@ # ``` # %% [markdown] -# Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` -# to its parameter `estimator`. Train the regressor and evaluate its -# generalization performance on the testing set using the mean absolute error. +# Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its +# parameter `estimator`. Train the regressor and evaluate its generalization +# performance on the testing set using the mean absolute error. # %% # solution @@ -44,19 +45,19 @@ bagging = BaggingRegressor(estimator=tree, n_jobs=2) bagging.fit(data_train, target_train) target_predicted = bagging.predict(data_test) -print(f"Basic mean absolute error of the bagging regressor:\n" - f"{mean_absolute_error(target_test, target_predicted):.2f} k$") +print( + "Basic mean absolute error of the bagging regressor:\n" + f"{mean_absolute_error(target_test, target_predicted):.2f} k$" +) # %% [markdown] -# Now, create a `RandomizedSearchCV` instance using the previous model and -# tune the important parameters of the bagging regressor. Find the best -# parameters and check if you are able to find a set of parameters that -# improve the default regressor still using the mean absolute error as a -# metric. +# Now, create a `RandomizedSearchCV` instance using the previous model and tune +# the important parameters of the bagging regressor. Find the best parameters +# and check if you are able to find a set of parameters that improve the default +# regressor still using the mean absolute error as a metric. # ```{tip} -# You can list the bagging regressor's parameters using the `get_params` -# method. +# You can list the bagging regressor's parameters using the `get_params` method. 
# ``` # %% @@ -91,9 +92,11 @@ # %% tags=["solution"] target_predicted = search.predict(data_test) -print(f"Mean absolute error after tuning of the bagging regressor:\n" - f"{mean_absolute_error(target_test, target_predicted):.2f} k$") +print( + "Mean absolute error after tuning of the bagging regressor:\n" + f"{mean_absolute_error(target_test, target_predicted):.2f} k$" +) # %% [markdown] tags=["solution"] -# We see that the predictor provided by the bagging regressor does not need -# much hyperparameter tuning compared to a single decision tree. +# We see that the predictor provided by the bagging regressor does not need much +# hyperparameter tuning compared to a single decision tree. diff --git a/python_scripts/ensemble_sol_02.py b/python_scripts/ensemble_sol_02.py index 66a07dab3..232ec2c04 100644 --- a/python_scripts/ensemble_sol_02.py +++ b/python_scripts/ensemble_sol_02.py @@ -22,7 +22,8 @@ target_name = "Body Mass (g)" data, target = penguins[[feature_name]], penguins[target_name] data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0) + data, target, random_state=0 +) # %% [markdown] # ```{note} @@ -31,9 +32,8 @@ # ``` # %% [markdown] -# Create a random forest containing three trees. Train the forest and -# check the generalization performance on the testing set in terms of mean -# absolute error. +# Create a random forest containing three trees. Train the forest and check the +# generalization performance on the testing set in terms of mean absolute error. # %% # solution @@ -43,8 +43,10 @@ forest = RandomForestRegressor(n_estimators=3) forest.fit(data_train, target_train) target_predicted = forest.predict(data_test) -print(f"Mean absolute error: " - f"{mean_absolute_error(target_test, target_predicted):.3f} grams") +print( + "Mean absolute error: " + f"{mean_absolute_error(target_test, target_predicted):.3f} grams" +) # %% [markdown] # We now aim to plot the predictions from the individual trees in the forest. 
@@ -56,8 +58,7 @@ # solution import numpy as np -data_range = pd.DataFrame(np.linspace(170, 235, num=300), - columns=data.columns) +data_range = pd.DataFrame(np.linspace(170, 235, num=300), columns=data.columns) # %% [markdown] # The trees contained in the forest that you created can be accessed with the @@ -86,13 +87,19 @@ import matplotlib.pyplot as plt import seaborn as sns -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) # plot tree predictions for tree_idx, predictions in enumerate(tree_predictions): - plt.plot(data_range[feature_name], predictions, label=f"Tree #{tree_idx}", - linestyle="--", alpha=0.8) - -plt.plot(data_range[feature_name], forest_predictions, label=f"Random forest") + plt.plot( + data_range[feature_name], + predictions, + label=f"Tree #{tree_idx}", + linestyle="--", + alpha=0.8, + ) + +plt.plot(data_range[feature_name], forest_predictions, label="Random forest") _ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") diff --git a/python_scripts/ensemble_sol_03.py b/python_scripts/ensemble_sol_03.py index 67f13b42a..302eb6864 100644 --- a/python_scripts/ensemble_sol_03.py +++ b/python_scripts/ensemble_sol_03.py @@ -10,10 +10,10 @@ # # The aim of this exercise is to: # -# * verifying if a random forest or a gradient-boosting decision tree overfit -# if the number of estimators is not properly chosen; -# * use the early-stopping strategy to avoid adding unnecessary trees, to -# get the best generalization performances. +# * verifying if a random forest or a gradient-boosting decision tree overfit if +# the number of estimators is not properly chosen; +# * use the early-stopping strategy to avoid adding unnecessary trees, to get +# the best generalization performances. # # We will use the California housing dataset to conduct our experiments. @@ -24,7 +24,8 @@ data, target = fetch_california_housing(return_X_y=True, as_frame=True) target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=0, test_size=0.5) + data, target, random_state=0, test_size=0.5 +) # %% [markdown] # ```{note} @@ -73,7 +74,10 @@ scoring="neg_mean_absolute_error", n_jobs=2, ) -gbdt_train_errors, gbdt_validation_errors = -gbdt_train_scores, -gbdt_validation_scores +gbdt_train_errors, gbdt_validation_errors = ( + -gbdt_train_scores, + -gbdt_validation_scores, +) forest_train_scores, forest_validation_scores = validation_curve( forest, @@ -138,8 +142,8 @@ # improving for several iterations, it will stop adding trees. # # Now, create a gradient-boosting model with `n_estimators=1_000`. This number -# of trees will be too large. Change the parameter `n_iter_no_change` such -# that the gradient boosting fitting will stop after adding 5 trees that do not +# of trees will be too large. Change the parameter `n_iter_no_change` such that +# the gradient boosting fitting will stop after adding 5 trees that do not # improve the overall generalization performance. # %% @@ -150,19 +154,19 @@ # %% [markdown] tags=["solution"] # We see that the number of trees used is far below 1000 with the current -# dataset. Training the gradient boosting model with the entire 1000 trees -# would have been useless. +# dataset. Training the gradient boosting model with the entire 1000 trees would +# have been useless. 
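For reference, a configuration along the lines of the exercise statement could look as follows; the number of trees actually kept is exposed by the fitted `n_estimators_` attribute. This is a sketch, assuming the train split defined earlier in the exercise:

# %%
from sklearn.ensemble import GradientBoostingRegressor

gbdt = GradientBoostingRegressor(n_estimators=1_000, n_iter_no_change=5)
gbdt.fit(data_train, target_train)
# Early stopping kicks in well before the 1000-tree budget is exhausted.
print(gbdt.n_estimators_)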
# %% [markdown] -# Estimate the generalization performance of this model again using -# the `sklearn.metrics.mean_absolute_error` metric but this time using -# the test set that we held out at the beginning of the notebook. -# Compare the resulting value with the values observed in the validation -# curve. +# Estimate the generalization performance of this model again using the +# `sklearn.metrics.mean_absolute_error` metric but this time using the test set +# that we held out at the beginning of the notebook. Compare the resulting value +# with the values observed in the validation curve. # %% # solution from sklearn.metrics import mean_absolute_error + error = mean_absolute_error(target_test, gbdt.predict(data_test)) print(f"On average, our GBDT regressor makes an error of {error:.2f} k$") diff --git a/python_scripts/ensemble_sol_04.py b/python_scripts/ensemble_sol_04.py index 5c7f7aba6..071e5877a 100644 --- a/python_scripts/ensemble_sol_04.py +++ b/python_scripts/ensemble_sol_04.py @@ -9,9 +9,9 @@ # # ๐Ÿ“ƒ Solution for Exercise M6.04 # # The aim of the exercise is to get familiar with the histogram -# gradient-boosting in scikit-learn. Besides, we will use this model within -# a cross-validation framework in order to inspect internal parameters found -# via grid-search. +# gradient-boosting in scikit-learn. Besides, we will use this model within a +# cross-validation framework in order to inspect internal parameters found via +# grid-search. # # We will use the California housing dataset. @@ -22,26 +22,27 @@ target *= 100 # rescale the target in k$ # %% [markdown] -# First, create a histogram gradient boosting regressor. You can set the -# trees number to be large, and configure the model to use early-stopping. +# First, create a histogram gradient boosting regressor. You can set the trees +# number to be large, and configure the model to use early-stopping. # %% # solution from sklearn.ensemble import HistGradientBoostingRegressor hist_gbdt = HistGradientBoostingRegressor( - max_iter=1000, early_stopping=True, random_state=0) + max_iter=1000, early_stopping=True, random_state=0 +) # %% [markdown] -# We will use a grid-search to find some optimal parameter for this model. -# In this grid-search, you should search for the following parameters: +# We will use a grid-search to find some optimal parameter for this model. In +# this grid-search, you should search for the following parameters: # # * `max_depth: [3, 8]`; # * `max_leaf_nodes: [15, 31]`; # * `learning_rate: [0.1, 1]`. # -# Feel free to explore the space with additional values. Create the -# grid-search providing the previous gradient boosting instance as the model. +# Feel free to explore the space with additional values. Create the grid-search +# providing the previous gradient boosting instance as the model. # %% # solution @@ -58,9 +59,9 @@ # %% [markdown] # Finally, we will run our experiment through cross-validation. In this regard, # define a 5-fold cross-validation. Besides, be sure to shuffle the data. -# Subsequently, use the function `sklearn.model_selection.cross_validate` -# to run the cross-validation. You should also set `return_estimator=True`, -# so that we can investigate the inner model trained via cross-validation. +# Subsequently, use the function `sklearn.model_selection.cross_validate` to run +# the cross-validation. You should also set `return_estimator=True`, so that we +# can investigate the inner model trained via cross-validation. 
 # %%
 # solution
@@ -69,17 +70,20 @@
 cv = KFold(n_splits=5, shuffle=True, random_state=0)
 results = cross_validate(
-    search, data, target, cv=cv, return_estimator=True, n_jobs=2)
+    search, data, target, cv=cv, return_estimator=True, n_jobs=2
+)
 
 # %% [markdown]
-# Now that we got the cross-validation results, print out the mean and
-# standard deviation score.
+# Now that we got the cross-validation results, print out the mean and standard
+# deviation score.
 
 # %%
 # solution
-print(f"R2 score with cross-validation:\n"
-      f"{results['test_score'].mean():.3f} ± "
-      f"{results['test_score'].std():.3f}")
+print(
+    "R2 score with cross-validation:\n"
+    f"{results['test_score'].mean():.3f} ± "
+    f"{results['test_score'].std():.3f}"
+)
 
 # %% [markdown]
 # Then inspect the `estimator` entry of the results and check the best
@@ -112,7 +116,8 @@
     search_cv_results = pd.DataFrame(estimator.cv_results_)
     search_cv_results = search_cv_results[columns].set_index(index_columns)
     search_cv_results = search_cv_results.rename(
-        columns={"mean_test_score": f"CV {cv_idx}"})
+        columns={"mean_test_score": f"CV {cv_idx}"}
+    )
     inner_cv_results.append(search_cv_results)
 
 inner_cv_results = pd.concat(inner_cv_results, axis=1).T
@@ -123,11 +128,12 @@
 inner_cv_results.plot.box(vert=False, color=color)
 plt.xlabel("R2 score")
 plt.ylabel("Parameters")
-_ = plt.title("Inner CV results with parameters\n"
-              "(max_depth, max_leaf_nodes, learning_rate)")
+_ = plt.title(
+    "Inner CV results with parameters\n"
+    "(max_depth, max_leaf_nodes, learning_rate)"
+)
 
 # %% [markdown] tags=["solution"]
-# We see that the first 4 ranked set of parameters are very close.
-# We could select any of these 4 combinations.
-# It coincides with the results we observe when inspecting the
-# best parameters of the outer CV.
+# We see that the first 4 ranked sets of parameters are very close. We could
+# select any of these 4 combinations. This coincides with the results we observe
+# when inspecting the best parameters of the outer CV.
diff --git a/python_scripts/feature_selection_ex_01.py b/python_scripts/feature_selection_ex_01.py
index dc07cb733..dd80e3513 100644
--- a/python_scripts/feature_selection_ex_01.py
+++ b/python_scripts/feature_selection_ex_01.py
@@ -16,17 +16,17 @@
 # # 📝 Exercise 01
 #
 # The aim of this exercise is to highlight caveats to have in mind when using
-# feature selection. You have to be extremely careful regarding the set of
-# data on which you will compute the statistic that helps your feature
-# selection algorithm to decide which feature to select.
+# feature selection. You have to be extremely careful regarding the set of data
+# on which you will compute the statistic that helps your feature selection
+# algorithm to decide which feature to select.
 #
 # On purpose, we will make you program the wrong way of doing feature selection
 # to gain insights.
 #
 # First, you will create a completely random dataset using NumPy. Using the
 # function `np.random.randn`, generate a matrix `data` containing 100 samples
-# and 100,000 features. Then, using the function `np.random.randint`, generate
-# a vector `target` with 100 samples containing either 0 or 1.
+# and 100,000 features. Then, using the function `np.random.randint`, generate a
+# vector `target` with 100 samples containing either 0 or 1.
 #
 # This type of dimensionality is typical in bioinformatics when dealing with
 # RNA-seq. However, we will use completely randomized features such that we
@@ -40,8 +40,8 @@
 # Write your code here.
 # %% [markdown]
-# Now, create a logistic regression model and use cross-validation to check
-# the score of such a model. It will allow use to confirm that our model cannot
+# Now, create a logistic regression model and use cross-validation to check the
+# score of such a model. It will allow us to confirm that our model cannot
 # predict anything meaningful from random data.
 
 # %%
@@ -51,8 +51,8 @@
 # Now, we will ask you to program the **wrong** pattern to select feature.
 # Select the feature by using the entire dataset. We will choose ten features
 # with the highest ANOVA F-score computed on the full dataset. Subsequently,
-# subsample the dataset `data` by selecting the features' subset. Finally,
-# train and test a logistic regression model.
+# subsample the dataset `data` by selecting the features' subset. Finally, train
+# and test a logistic regression model.
 #
 # You should get some surprising results.
 
@@ -69,11 +69,11 @@
 # Write your code here.
 
 # %% [markdown]
-# However, the previous case is not perfect. For instance, if we were asking
-# to perform cross-validation, the manual `fit`/`transform` of the datasets
-# will make our life hard. Indeed, the solution here is to use a scikit-learn
-# pipeline in which the feature selection will be a pre processing stage
-# before to train the model.
+# However, the previous case is not perfect. For instance, if we were asking to
+# perform cross-validation, the manual `fit`/`transform` of the datasets will
+# make our life hard. Indeed, the solution here is to use a scikit-learn
+# pipeline in which the feature selection will be a preprocessing stage before
+# training the model.
 #
 # Thus, start by creating a pipeline with the feature selector and the logistic
 # regression. Then, use cross-validation to get an estimate of the uncertainty
diff --git a/python_scripts/feature_selection_introduction.py b/python_scripts/feature_selection_introduction.py
index a66131eec..c3c4af0bd 100644
--- a/python_scripts/feature_selection_introduction.py
+++ b/python_scripts/feature_selection_introduction.py
@@ -8,12 +8,12 @@
 # %% [markdown]
 # # Benefits of using feature selection
 #
-# In this notebook, we aim at introducing the main benefits that can be
-# gained when using feature selection.
+# In this notebook, we aim at introducing the main benefits that can be gained
+# when using feature selection.
 #
 # Indeed, the principal advantage of selecting features within a machine
-# learning pipeline is to reduce the time to train this pipeline and its time
-# to predict. We will give an example to highlights these advantages. First, we
+# learning pipeline is to reduce the time to train this pipeline and its time to
+# predict. We will give an example to highlight these advantages. First, we
 # generate a synthetic dataset to control the number of features that will be
 # informative, redundant, repeated, and random.
 
@@ -36,12 +36,12 @@
 #
 # We will create two machine learning pipelines. The former will be a random
 # forest that will use all available features. The latter will also be a random
-# forest, but we will add a feature selection step to train this classifier.
-# The feature selection is based on a univariate test (ANOVA F-value) between
-# each feature and the target that we want to predict. The features with the
-# two most significant scores are selected.
+# forest, but we will add a feature selection step to train this classifier. The
+# feature selection is based on a univariate test (ANOVA F-value) between each
+# feature and the target that we want to predict. The features with the two most
+# significant scores are selected.
 #
-# Let's create the model without any feature selection
+# Let's create the model without any feature selection:
 
 # %%
 from sklearn.ensemble import RandomForestClassifier
@@ -65,8 +65,8 @@
 
 # %% [markdown]
 # We will measure the average time spent to train each pipeline and make it
-# predict. Besides, we will compute the testing score of the model. We
-# will collect these results via cross-validation.
+# predict. Besides, we will compute the testing score of the model. We will
+# collect these results via cross-validation.
 #
 # Let's start with the random forest without feature selection. We will store
 # the results into a dataframe.
 
@@ -75,8 +75,9 @@
 import pandas as pd
 from sklearn.model_selection import cross_validate
 
-cv_results_without_selection = cross_validate(model_without_selection, data,
-                                              target)
+cv_results_without_selection = cross_validate(
+    model_without_selection, data, target
+)
 cv_results_without_selection = pd.DataFrame(cv_results_without_selection)
 
 # %% [markdown]
@@ -85,12 +86,13 @@
 
 # %%
 cv_results_with_selection = cross_validate(
-    model_with_selection, data, target, return_estimator=True)
+    model_with_selection, data, target, return_estimator=True
+)
 cv_results_with_selection = pd.DataFrame(cv_results_with_selection)
 
 # %% [markdown]
-# To analyze the results, we will merge the results from the two pipeline in
-# a single pandas dataframe.
+# To analyze the results, we will merge the results from the two pipelines in a
+# single pandas dataframe.
 
 # %%
 cv_results = pd.concat(
@@ -121,8 +123,8 @@
 # We can draw the same conclusions for both training and scoring elapsed time:
 # selecting the most informative features speed-up our pipeline.
 #
-# Of course, such speed-up is beneficial only if the generalization performance in
-# terms of metrics remain the same. Let's check the testing score.
+# Of course, such speed-up is beneficial only if the generalization performance
+# in terms of metrics remains the same. Let's check the testing score.
 
 # %%
 cv_results["test_score"].plot.box(color=color, vert=False)
@@ -130,8 +132,8 @@
 _ = plt.title("Test score via cross-validation")
 
 # %% [markdown]
-# We can observe that the model's generalization performance selecting a subset of
-# features decreases compared with the model using all available features.
+# We can observe that the model's generalization performance when selecting a
+# subset of features decreases compared with the model using all available features.
 # Since we generated the dataset, we can infer that the decrease is because of
 # the selection. The feature selection algorithm did not choose the two
 # informative features.
@@ -149,8 +151,8 @@
 )
 
 # %% [markdown]
-# We see that the feature `53` is always selected while the other feature
-# varies depending on the cross-validation fold.
+# We see that the feature `53` is always selected while the other feature varies
+# depending on the cross-validation fold.
 #
 # If we would like to keep our score with similar generalization performance, we
 # could choose another metric to perform the test or select more features. For
@@ -161,9 +163,9 @@
 # harder.
 #
 # Therefore, we could come with a much more complicated procedure that could
-# tune (via cross-validation) the number of selected features and change
-# the way feature is selected (e.g. using a machine-learning model). However,
-# going towards these solutions alienates the feature selection's primary
-# purpose to get a significant train/test speed-up. Also, if the primary goal
-# was to get a more performant model, performant models exclude non-informative
-# features natively.
+# tune (via cross-validation) the number of selected features and change the way
+# features are selected (e.g. using a machine-learning model). However, going
+# towards these solutions alienates the feature selection's primary purpose to
+# get a significant train/test speed-up. Also, if the primary goal was to get a
+# more performant model, performant models exclude non-informative features
+# natively.
diff --git a/python_scripts/feature_selection_limitation_model.py b/python_scripts/feature_selection_limitation_model.py
index f46e307f2..4da031d2c 100644
--- a/python_scripts/feature_selection_limitation_model.py
+++ b/python_scripts/feature_selection_limitation_model.py
@@ -42,20 +42,21 @@
 model_without_selection = RandomForestClassifier()
 
 # %% [markdown]
-# We will evaluate this model by a k-fold cross validation and store the
-# results in a pandas dataframe.
+# We will evaluate this model by a k-fold cross validation and store the results
+# in a pandas dataframe.
 
 # %%
 import pandas as pd
 from sklearn.model_selection import cross_validate
 
 cv_results_without_selection = cross_validate(
-    model_without_selection, data, target, cv=5)
+    model_without_selection, data, target, cv=5
+)
 cv_results_without_selection = pd.DataFrame(cv_results_without_selection)
 
 # %% [markdown]
-# Then, we will build another model which will include a feature selection
-# step based on a random forest and evaluate it as well with cross-validation.
+# Then, we will build another model which will include a feature selection step
+# based on a random forest and evaluate it as well with cross-validation.
 
 # %%
 from sklearn.pipeline import make_pipeline
@@ -63,16 +64,18 @@
 
 feature_selector = SelectFromModel(RandomForestClassifier())
 model_with_selection = make_pipeline(
-    feature_selector, RandomForestClassifier())
+    feature_selector, RandomForestClassifier()
+)
 
 # %%
-cv_results_with_selection = cross_validate(model_with_selection, data, target,
-                                           cv=5)
+cv_results_with_selection = cross_validate(
+    model_with_selection, data, target, cv=5
+)
 cv_results_with_selection = pd.DataFrame(cv_results_with_selection)
 
 # %% [markdown]
-# We can compare the testing score of the two models. For this matter,
-# we are combining results in a single dataframe.
+# We can compare the testing score of the two models. For this matter, we are
+# combining results in a single dataframe.
 
 # %%
 cv_results = pd.concat(
@@ -93,8 +96,8 @@
 _ = plt.title("Limitation of using a random forest for feature selection")
 
 # %% [markdown]
-# The model that selected a subset of feature is less performant than a
-# random forest fitted on the full dataset.
+# The model that selected a subset of features is less performant than a random
+# forest fitted on the full dataset.
 #
 # We can rely on some aspects tackled in the notebook presenting the model
 # inspection to explain this behaviour. The decision tree's relative feature
diff --git a/python_scripts/feature_selection_sol_01.py b/python_scripts/feature_selection_sol_01.py
index 84e4cb94c..0c24f2476 100644
--- a/python_scripts/feature_selection_sol_01.py
+++ b/python_scripts/feature_selection_sol_01.py
@@ -9,17 +9,17 @@
 # # 📃 Solution for Exercise 01
 #
 # The aim of this exercise is to highlight caveats to have in mind when using
-# feature selection. You have to be extremely careful regarding the set of
-# data on which you will compute the statistic that helps your feature
-# selection algorithm to decide which feature to select.
+# feature selection. You have to be extremely careful regarding the set of data
+# on which you will compute the statistic that helps your feature selection
+# algorithm to decide which feature to select.
 #
 # On purpose, we will make you program the wrong way of doing feature selection
 # to gain insights.
 #
 # First, you will create a completely random dataset using NumPy. Using the
 # function `np.random.randn`, generate a matrix `data` containing 100 samples
-# and 100,000 features. Then, using the function `np.random.randint`, generate
-# a vector `target` with 100 samples containing either 0 or 1.
+# and 100,000 features. Then, using the function `np.random.randint`, generate a
+# vector `target` with 100 samples containing either 0 or 1.
 #
 # This type of dimensionality is typical in bioinformatics when dealing with
 # RNA-seq. However, we will use completely randomized features such that we
@@ -35,8 +35,8 @@
 data, target = rng.randn(100, 100000), rng.randint(0, 2, size=100)
 
 # %% [markdown]
-# Now, create a logistic regression model and use cross-validation to check
-# the score of such a model. It will allow use to confirm that our model cannot
+# Now, create a logistic regression model and use cross-validation to check the
+# score of such a model. It will allow us to confirm that our model cannot
 # predict anything meaningful from random data.
 
 # %%
@@ -57,13 +57,12 @@
 # Now, we will ask you to program the **wrong** pattern to select feature.
 # Select the feature by using the entire dataset. We will choose ten features
 # with the highest ANOVA F-score computed on the full dataset. Subsequently,
-# subsample the dataset `data` by selecting the features' subset. Finally,
-# train and test a logistic regression model.
+# subsample the dataset `data` by selecting the features' subset. Finally, train
+# and test a logistic regression model.
 #
 # You should get some surprising results.
 
 # %%
-# solution
 from sklearn.feature_selection import SelectKBest, f_classif
 
 # solution
@@ -73,9 +72,9 @@
 print(f"The mean accuracy is: {test_score.mean():.3f}")
 
 # %% [markdown] tags=["solution"]
-# Surprisingly, the logistic regression succeeded in having a fantastic
-# accuracy using data that did not have any link with the target in the first
-# place. We therefore know that these results are not legit.
+# Surprisingly, the logistic regression succeeded in having a fantastic accuracy
+# using data that did not have any link with the target in the first place. We
+# therefore know that these results are not legit.
 #
 # The reasons for obtaining these results are two folds: the pool of available
 # features is large compared to the number of samples. It is possible to find a
@@ -90,12 +89,12 @@
 # testing sets before you train and test the logistic regression.
 # %%
-# solution
 from sklearn.model_selection import train_test_split
 
 # solution
 data_train, data_test, target_train, target_test = train_test_split(
-    data, target, random_state=0)
+    data, target, random_state=0
+)
 feature_selector.fit(data_train, target_train)
 data_train_subset = feature_selector.transform(data_train)
 data_test_subset = feature_selector.transform(data_test)
@@ -108,24 +107,23 @@
 # features only on the training set will not help when testing our model. In
 # this case, we obtained the expected results.
 #
-# Therefore, as with hyperparameters optimization or model selection, tuning
-# the feature space should be done solely on the training set, keeping a part
-# of the data left-out.
+# Therefore, as with hyperparameter optimization or model selection, tuning the
+# feature space should be done solely on the training set, keeping a part of the
+# data left-out.
 #
 
 # %% [markdown]
-# However, the previous case is not perfect. For instance, if we were asking
-# to perform cross-validation, the manual `fit`/`transform` of the datasets
-# will make our life hard. Indeed, the solution here is to use a scikit-learn
-# pipeline in which the feature selection will be a pre processing stage
-# before to train the model.
+# However, the previous case is not perfect. For instance, if we were asking to
+# perform cross-validation, the manual `fit`/`transform` of the datasets will
+# make our life hard. Indeed, the solution here is to use a scikit-learn
+# pipeline in which the feature selection will be a preprocessing stage before
+# training the model.
 #
 # Thus, start by creating a pipeline with the feature selector and the logistic
 # regression. Then, use cross-validation to get an estimate of the uncertainty
 # of your model generalization performance.
 
 # %%
-# solution
 from sklearn.pipeline import make_pipeline
 
 # solution
@@ -134,6 +132,6 @@
 print(f"The mean accuracy is: {test_score.mean():.3f}")
 
 # %% [markdown] tags=["solution"]
-# We see that using a scikit-learn pipeline removes a lot of boilerplate
-# code and helps avoid mistakes when calling `fit` and `transform` on the
-# different set of data.
+# We see that using a scikit-learn pipeline removes a lot of boilerplate code
+# and helps avoid mistakes when calling `fit` and `transform` on the different
+# sets of data.
diff --git a/python_scripts/linear_models_ex_02.py b/python_scripts/linear_models_ex_02.py
index 50393ebc2..626de7483 100644
--- a/python_scripts/linear_models_ex_02.py
+++ b/python_scripts/linear_models_ex_02.py
@@ -28,16 +28,17 @@
 
 # %%
 import numpy as np
+
 # Set the seed for reproduction
 rng = np.random.RandomState(0)
 
 # Generate data
 n_sample = 100
 data_max, data_min = 1.4, -1.4
-len_data = (data_max - data_min)
+len_data = data_max - data_min
 data = rng.rand(n_sample) * len_data - len_data / 2
-noise = rng.randn(n_sample) * .3
-target = data ** 3 - 0.5 * data ** 2 + noise
+noise = rng.randn(n_sample) * 0.3
+target = data**3 - 0.5 * data**2 + noise
 
 # %% [markdown]
 # ```{note}
@@ -47,13 +48,15 @@
 
 # %%
 import pandas as pd
+
 full_data = pd.DataFrame({"data": data, "target": target})
 
 # %%
 import seaborn as sns
 
-_ = sns.scatterplot(data=full_data, x="data", y="target", color="black",
-                    alpha=0.5)
+_ = sns.scatterplot(
+    data=full_data, x="data", y="target", color="black", alpha=0.5
+)
 
 # %% [markdown]
 # We observe that the link between the data `data` and vector `target` is
@@ -65,11 +68,13 @@
 # `intercept` that you think will lead to a good linear model. Plot both the
 # data and the predictions of this model.
+
 # %%
 def f(data, weight=0, intercept=0):
     target_predict = weight * data + intercept
     return target_predict
 
+
 # %%
 # Write your code here.
diff --git a/python_scripts/linear_models_ex_05.py b/python_scripts/linear_models_ex_05.py
index db708b9f6..9951ebafa 100644
--- a/python_scripts/linear_models_ex_05.py
+++ b/python_scripts/linear_models_ex_05.py
@@ -35,8 +35,9 @@
 penguins = pd.read_csv("../datasets/penguins_classification.csv")
 # only keep the Adelie and Chinstrap classes
-penguins = penguins.set_index("Species").loc[
-    ["Adelie", "Chinstrap"]].reset_index()
+penguins = (
+    penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index()
+)
 
 culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
 target_column = "Species"
@@ -61,7 +62,8 @@
 from sklearn.linear_model import LogisticRegression
 
 logistic_regression = make_pipeline(
-    StandardScaler(), LogisticRegression(penalty="l2"))
+    StandardScaler(), LogisticRegression(penalty="l2")
+)
 
 # %% [markdown]
 # Given the following candidates for the `C` parameter, find out the impact of
diff --git a/python_scripts/linear_models_regularization.py b/python_scripts/linear_models_regularization.py
index 8d527afd0..494192ab7 100644
--- a/python_scripts/linear_models_regularization.py
+++ b/python_scripts/linear_models_regularization.py
@@ -11,12 +11,12 @@
 # In this notebook, we will see the limitations of linear regression models and
 # the advantage of using regularized models instead.
 #
-# Besides, we will also present the preprocessing required when dealing
-# with regularized models, furthermore when the regularization parameter
-# needs to be tuned.
+# Besides, we will also present the preprocessing required when dealing with
+# regularized models, in particular when the regularization parameter needs to
+# be tuned.
 #
-# We will start by highlighting the over-fitting issue that can arise with
-# a simple linear regression model.
+# We will start by highlighting the over-fitting issue that can arise with a
+# simple linear regression model.
 #
 # ## Effect of regularization
 #
@@ -42,10 +42,9 @@
 # We showed that one can use the `PolynomialFeatures` transformer to create
 # additional features encoding non-linear interactions between features.
 #
-# Here, we will use this transformer to augment the feature space.
-# Subsequently, we will train a linear regression model. We will use the
-# out-of-sample test set to evaluate the generalization capabilities of our
-# model.
+# Here, we will use this transformer to augment the feature space. Subsequently,
+# we will train a linear regression model. We will use the out-of-sample test
+# set to evaluate the generalization capabilities of our model.
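For readers skimming the patch, here is a tiny self-contained illustration of
what the degree-2 augmentation does to a single sample (an editor's sketch
added for clarity, not taken from the patch):

# %%
# Editor's sketch (not part of the patch): degree-2 polynomial expansion of
# one sample with two features.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

toy = np.array([[2.0, 3.0]])
poly = PolynomialFeatures(degree=2)
# Columns are 1, x0, x1, x0^2, x0*x1, x1^2 -> [[1. 2. 3. 4. 6. 9.]]
print(poly.fit_transform(toy))
print(poly.get_feature_names_out())  # ['1' 'x0' 'x1' 'x0^2' 'x0 x1' 'x1^2']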
 # %%
 from sklearn.model_selection import cross_validate
@@ -53,12 +52,18 @@
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.linear_model import LinearRegression
 
-linear_regression = make_pipeline(PolynomialFeatures(degree=2),
-                                  LinearRegression())
-cv_results = cross_validate(linear_regression, data, target,
-                            cv=10, scoring="neg_mean_squared_error",
-                            return_train_score=True,
-                            return_estimator=True)
+linear_regression = make_pipeline(
+    PolynomialFeatures(degree=2), LinearRegression()
+)
+cv_results = cross_validate(
+    linear_regression,
+    data,
+    target,
+    cv=10,
+    scoring="neg_mean_squared_error",
+    return_train_score=True,
+    return_estimator=True,
+)
 
 # %% [markdown]
 # We can compare the mean squared error on the training and testing set to
@@ -66,41 +71,45 @@
 
 # %%
 train_error = -cv_results["train_score"]
-print(f"Mean squared error of linear regression model on the train set:\n"
-      f"{train_error.mean():.3f} ± {train_error.std():.3f}")
+print(
+    "Mean squared error of linear regression model on the train set:\n"
+    f"{train_error.mean():.3f} ± {train_error.std():.3f}"
+)
 
 # %%
 test_error = -cv_results["test_score"]
-print(f"Mean squared error of linear regression model on the test set:\n"
-      f"{test_error.mean():.3f} ± {test_error.std():.3f}")
+print(
+    "Mean squared error of linear regression model on the test set:\n"
+    f"{test_error.mean():.3f} ± {test_error.std():.3f}"
+)
 
 # %% [markdown]
 # The score on the training set is much better. This generalization performance
 # gap between the training and testing score is an indication that our model
 # overfitted our training set.
 #
-# Indeed, this is one of the danger when augmenting the number of features
-# with a `PolynomialFeatures` transformer. Our model will focus on some
-# specific features. We can check the weights of the model to have a
-# confirmation. Let's create a dataframe: the columns will contain the name
-# of the feature while the line the coefficients values stored by each model
-# during the cross-validation.
+# Indeed, this is one of the dangers when augmenting the number of features with
+# a `PolynomialFeatures` transformer. Our model will focus on some specific
+# features. We can check the weights of the model to have a confirmation. Let's
+# create a dataframe: the columns will contain the names of the features while
+# the rows will contain the coefficient values stored by each model during the
+# cross-validation.
 #
 # Since we used a `PolynomialFeatures` to augment the data, we will create
-# feature names representative of the feature combination. Scikit-learn
-# provides a `get_feature_names_out` method for this purpose. First, let's get
-# the first fitted model from the cross-validation.
+# feature names representative of the feature combination. Scikit-learn provides
+# a `get_feature_names_out` method for this purpose. First, let's get the first
+# fitted model from the cross-validation.
 # %%
 model_first_fold = cv_results["estimator"][0]
 
 # %% [markdown]
 # Now, we can access to the fitted `PolynomialFeatures` to generate the feature
-# names
+# names:
 
 # %%
 feature_names = model_first_fold[0].get_feature_names_out(
-    input_features=data.columns)
+    input_features=data.columns
+)
 feature_names
 
 # %% [markdown]
@@ -131,12 +140,16 @@
 # %%
 from sklearn.linear_model import Ridge
 
-ridge = make_pipeline(PolynomialFeatures(degree=2),
-                      Ridge(alpha=100))
-cv_results = cross_validate(ridge, data, target,
-                            cv=10, scoring="neg_mean_squared_error",
-                            return_train_score=True,
-                            return_estimator=True)
+ridge = make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=100))
+cv_results = cross_validate(
+    ridge,
+    data,
+    target,
+    cv=10,
+    scoring="neg_mean_squared_error",
+    return_train_score=True,
+    return_estimator=True,
+)
 
 # %% [markdown]
 # The code cell above will generate a couple of warnings because the features
@@ -147,13 +160,17 @@
 
 # %%
 train_error = -cv_results["train_score"]
-print(f"Mean squared error of linear regression model on the train set:\n"
-      f"{train_error.mean():.3f} ± {train_error.std():.3f}")
+print(
+    "Mean squared error of linear regression model on the train set:\n"
+    f"{train_error.mean():.3f} ± {train_error.std():.3f}"
+)
 
 # %%
 test_error = -cv_results["test_score"]
-print(f"Mean squared error of linear regression model on the test set:\n"
-      f"{test_error.mean():.3f} ± {test_error.std():.3f}")
+print(
+    "Mean squared error of linear regression model on the test set:\n"
+    f"{test_error.mean():.3f} ± {test_error.std():.3f}"
+)
 
 # %% [markdown]
 # We see that the training and testing scores are much closer, indicating that
@@ -181,30 +198,28 @@
 # ## Feature scaling and regularization
 #
 # On the one hand, weights define the link between feature values and the
-# predicted target.
-# On the other hand, regularization adds constraints on the weights of the
-# model through the `alpha` parameter. Therefore, the effect that feature
-# rescaling has on the final weights also interacts with regularization.
+# predicted target. On the other hand, regularization adds constraints on the
+# weights of the model through the `alpha` parameter. Therefore, the effect that
+# feature rescaling has on the final weights also interacts with regularization.
 #
-# Let's consider the case where features live on the same scale/units: if
-# two features are found to be equally important by the model, they will be
-# affected similarly by regularization strength.
+# Let's consider the case where features live on the same scale/units: if two
+# features are found to be equally important by the model, they will be affected
+# similarly by regularization strength.
 #
-# Now, let's consider the scenario where features have completely different
-# data scale (for instance age in years and annual revenue in dollars).
-# If two features are as important, our model will boost the weights of
-# features with small scale and reduce the weights of features with
-# high scale.
+# Now, let's consider the scenario where features have completely different data
+# scales (for instance age in years and annual revenue in dollars). If two
+# features are equally important, our model will boost the weights of features
+# with small scale and reduce the weights of features with high scale.
 #
 # We recall that regularization forces weights to be closer.
Therefore, we get -# an intuition that if we want to use regularization, dealing with rescaled -# data would make it easier to find an optimal regularization parameter and -# thus an adequate model. +# an intuition that if we want to use regularization, dealing with rescaled data +# would make it easier to find an optimal regularization parameter and thus an +# adequate model. # # As a side note, some solvers based on gradient computation are expecting such # rescaled data. Unscaled data will be detrimental when computing the optimal -# weights. Therefore, when working with a linear model and numerical data, it -# is generally good practice to scale the data. +# weights. Therefore, when working with a linear model and numerical data, it is +# generally good practice to scale the data. # # Thus, we will add a `StandardScaler` in the machine learning pipeline. This # scaler will be placed just before the regressor. @@ -212,22 +227,32 @@ # %% from sklearn.preprocessing import StandardScaler -ridge = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), - Ridge(alpha=0.5)) -cv_results = cross_validate(ridge, data, target, - cv=10, scoring="neg_mean_squared_error", - return_train_score=True, - return_estimator=True) +ridge = make_pipeline( + PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=0.5) +) +cv_results = cross_validate( + ridge, + data, + target, + cv=10, + scoring="neg_mean_squared_error", + return_train_score=True, + return_estimator=True, +) # %% train_error = -cv_results["train_score"] -print(f"Mean squared error of linear regression model on the train set:\n" - f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}") +print( + "Mean squared error of linear regression model on the train set:\n" + f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}" +) # %% test_error = -cv_results["test_score"] -print(f"Mean squared error of linear regression model on the test set:\n" - f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}") +print( + "Mean squared error of linear regression model on the test set:\n" + f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}" +) # %% [markdown] # We observe that scaling data has a positive impact on the test score and that @@ -249,16 +274,22 @@ # Compare to the previous plots, we see that now all weight magnitudes are # closer and that all features are more equally contributing. # -# In the previous example, we fixed `alpha=0.5`. We will now check the impact -# of the value of `alpha` by increasing its value. +# In the previous example, we fixed `alpha=0.5`. We will now check the impact of +# the value of `alpha` by increasing its value. # %% -ridge = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), - Ridge(alpha=1_000_000)) -cv_results = cross_validate(ridge, data, target, - cv=10, scoring="neg_mean_squared_error", - return_train_score=True, - return_estimator=True) +ridge = make_pipeline( + PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=1_000_000) +) +cv_results = cross_validate( + ridge, + data, + target, + cv=10, + scoring="neg_mean_squared_error", + return_train_score=True, + return_estimator=True, +) # %% coefs = [est[-1].coef_ for est in cv_results["estimator"]] @@ -280,9 +311,9 @@ # # However, this choice can be questioned since scaling interacts with # regularization as well. For instance, scaling categorical features that are -# imbalanced (e.g. more occurrences of a specific category) would even out -# the impact of regularization to each category. 
However, scaling such features -# in the presence of rare categories could be problematic (i.e. division by a very +# imbalanced (e.g. more occurrences of a specific category) would even out the +# impact of regularization to each category. However, scaling such features in +# the presence of rare categories could be problematic (i.e. division by a very # small standard deviation) and it can therefore introduce numerical issues. # ``` # @@ -290,8 +321,8 @@ # an effect on the performance. We chose the parameter beforehand and fixed it # for the analysis. # -# In the next section, we will check the impact of the regularization -# parameter `alpha` and how it should be tuned. +# In the next section, we will check the impact of the regularization parameter +# `alpha` and how it should be tuned. # # ## Fine tuning the regularization parameter # @@ -299,9 +330,9 @@ # The default parameter will not lead to the optimal model. Therefore, we need # to tune the `alpha` parameter. # -# Model hyperparameter tuning should be done with care. Indeed, we want to -# find an optimal parameter that maximizes some metrics. Thus, it requires both -# a training set and testing set. +# Model hyperparameter tuning should be done with care. Indeed, we want to find +# an optimal parameter that maximizes some metrics. Thus, it requires both a +# training set and testing set. # # However, this testing set should be different from the out-of-sample testing # set that we used to evaluate our model: if we use the same one, we are using @@ -309,44 +340,56 @@ # out-of-sample rule. # # Therefore, we should include search of the hyperparameter `alpha` within the -# cross-validation. As we saw in previous notebooks, we could use a -# grid-search. However, some predictor in scikit-learn are available with -# an integrated hyperparameter search, more efficient than using a grid-search. -# The name of these predictors finishes by `CV`. In the case of `Ridge`, -# scikit-learn provides a `RidgeCV` regressor. +# cross-validation. As we saw in previous notebooks, we could use a grid-search. +# However, some predictor in scikit-learn are available with an integrated +# hyperparameter search, more efficient than using a grid-search. The name of +# these predictors finishes by `CV`. In the case of `Ridge`, scikit-learn +# provides a `RidgeCV` regressor. # # Therefore, we can use this predictor as the last step of the pipeline. # Including the pipeline a cross-validation allows to make a nested -# cross-validation: the inner cross-validation will search for the best -# alpha, while the outer cross-validation will give an estimate of the -# testing score. +# cross-validation: the inner cross-validation will search for the best alpha, +# while the outer cross-validation will give an estimate of the testing score. 
# %% import numpy as np from sklearn.linear_model import RidgeCV alphas = np.logspace(-2, 0, num=21) -ridge = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), - RidgeCV(alphas=alphas, store_cv_values=True)) +ridge = make_pipeline( + PolynomialFeatures(degree=2), + StandardScaler(), + RidgeCV(alphas=alphas, store_cv_values=True), +) # %% from sklearn.model_selection import ShuffleSplit cv = ShuffleSplit(n_splits=5, random_state=1) -cv_results = cross_validate(ridge, data, target, - cv=cv, scoring="neg_mean_squared_error", - return_train_score=True, - return_estimator=True, n_jobs=2) +cv_results = cross_validate( + ridge, + data, + target, + cv=cv, + scoring="neg_mean_squared_error", + return_train_score=True, + return_estimator=True, + n_jobs=2, +) # %% train_error = -cv_results["train_score"] -print(f"Mean squared error of linear regression model on the train set:\n" - f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}") +print( + "Mean squared error of linear regression model on the train set:\n" + f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}" +) # %% test_error = -cv_results["test_score"] -print(f"Mean squared error of linear regression model on the test set:\n" - f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}") +print( + "Mean squared error of linear regression model on the test set:\n" + f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}" +) # %% [markdown] # By optimizing `alpha`, we see that the training and testing scores are close. @@ -355,19 +398,19 @@ # When fitting the ridge regressor, we also requested to store the error found # during cross-validation (by setting the parameter `store_cv_values=True`). We # will plot the mean squared error for the different `alphas` regularization -# strength that we tried. The error bars represent one standard deviation of -# the average mean square error across folds for a given value of `alpha`. +# strength that we tried. The error bars represent one standard deviation of the +# average mean square error across folds for a given value of `alpha`. # %% -mse_alphas = [est[-1].cv_values_.mean(axis=0) - for est in cv_results["estimator"]] +mse_alphas = [ + est[-1].cv_values_.mean(axis=0) for est in cv_results["estimator"] +] cv_alphas = pd.DataFrame(mse_alphas, columns=alphas) cv_alphas = cv_alphas.aggregate(["mean", "std"]).T cv_alphas # %% -plt.errorbar(cv_alphas.index, cv_alphas["mean"], - yerr=cv_alphas["std"]) +plt.errorbar(cv_alphas.index, cv_alphas["mean"], yerr=cv_alphas["std"]) plt.xlim((0.0, 1.0)) plt.ylim((4_500, 11_000)) plt.ylabel("Mean squared error\n (lower is better)") @@ -376,8 +419,8 @@ # %% [markdown] # As we can see, regularization is just like salt in cooking: one must balance -# its amount to get the best generalization performance. We can check if the best -# `alpha` found is stable across the cross-validation fold. +# its amount to get the best generalization performance. We can check if the +# best `alpha` found is stable across the cross-validation fold. # %% best_alphas = [est[-1].alpha_ for est in cv_results["estimator"]] @@ -386,17 +429,19 @@ # %% [markdown] # The optimal regularization strength is not necessarily the same on all # cross-validation iterations. 
But since we expect each cross-validation -# resampling to stem from the same data distribution, it is common practice -# to choose the best `alpha` to put into production as lying in the range -# defined by: +# resampling to stem from the same data distribution, it is common practice to +# choose the best `alpha` to put into production as lying in the range defined +# by: # %% -print(f"Min optimal alpha: {np.min(best_alphas):.2f} and " - f"Max optimal alpha: {np.max(best_alphas):.2f}") +print( + f"Min optimal alpha: {np.min(best_alphas):.2f} and " + f"Max optimal alpha: {np.max(best_alphas):.2f}" +) # %% [markdown] # This range can be reduced by decreasing the spacing between the grid of # `alphas`. # -# In this notebook, you learned about the concept of regularization and -# the importance of preprocessing and parameter tuning. +# In this notebook, you learned about the concept of regularization and the +# importance of preprocessing and parameter tuning. diff --git a/python_scripts/linear_models_sol_01.py b/python_scripts/linear_models_sol_01.py index 525af896e..0f98303be 100644 --- a/python_scripts/linear_models_sol_01.py +++ b/python_scripts/linear_models_sol_01.py @@ -52,12 +52,12 @@ def linear_model_flipper_mass( # %% [markdown] # ## Main exercise # -# Define a vector `weights = [...]` and a vector `intercepts = [...]` of -# the same length. Each pair of entries `(weights[i], intercepts[i])` tags a +# Define a vector `weights = [...]` and a vector `intercepts = [...]` of the +# same length. Each pair of entries `(weights[i], intercepts[i])` tags a # different model. Use these vectors along with the vector -# `flipper_length_range` to plot several linear models that could possibly -# fit our data. Use the above helper function to visualize both the models and -# the real samples. +# `flipper_length_range` to plot several linear models that could possibly fit +# our data. Use the above helper function to visualize both the models and the +# real samples. # %% import numpy as np @@ -72,22 +72,27 @@ def linear_model_flipper_mass( weights = [-40, 45, 90] intercepts = [15000, -5000, -14000] -ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +ax = sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) label = "{0:.2f} (g / mm) * flipper length + {1:.2f} (g)" for weight, intercept in zip(weights, intercepts): predicted_body_mass = linear_model_flipper_mass( - flipper_length_range, weight, intercept) + flipper_length_range, weight, intercept + ) - ax.plot(flipper_length_range, predicted_body_mass, - label=label.format(weight, intercept)) -_ = ax.legend(loc='center left', bbox_to_anchor=(-0.25, 1.25), ncol=1) + ax.plot( + flipper_length_range, + predicted_body_mass, + label=label.format(weight, intercept), + ) +_ = ax.legend(loc="center left", bbox_to_anchor=(-0.25, 1.25), ncol=1) # %% [markdown] -# In the previous question, you were asked to create several linear models. -# The visualization allowed you to qualitatively assess if a model was better -# than another. +# In the previous question, you were asked to create several linear models. The +# visualization allowed you to qualitatively assess if a model was better than +# another. # # Now, you should come up with a quantitative measure which indicates the # goodness of fit of each linear model and allows you to select the best model. @@ -111,9 +116,10 @@ def goodness_fit_measure(true_values, predictions): # as an example. 
return np.mean(np.abs(errors)) + # %% [markdown] -# You can now copy and paste the code below to show the goodness of fit for -# each model. +# You can now copy and paste the code below to show the goodness of fit for each +# model. # # ```python # for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)): diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py index 19cbe27e5..d62a4b983 100644 --- a/python_scripts/linear_models_sol_02.py +++ b/python_scripts/linear_models_sol_02.py @@ -21,16 +21,17 @@ # %% import numpy as np + # Set the seed for reproduction rng = np.random.RandomState(0) # Generate data n_sample = 100 data_max, data_min = 1.4, -1.4 -len_data = (data_max - data_min) +len_data = data_max - data_min data = rng.rand(n_sample) * len_data - len_data / 2 -noise = rng.randn(n_sample) * .3 -target = data ** 3 - 0.5 * data ** 2 + noise +noise = rng.randn(n_sample) * 0.3 +target = data**3 - 0.5 * data**2 + noise # %% [markdown] # ```{note} @@ -40,36 +41,41 @@ # %% import pandas as pd + full_data = pd.DataFrame({"data": data, "target": target}) # %% import seaborn as sns -_ = sns.scatterplot(data=full_data, x="data", y="target", color="black", - alpha=0.5) +_ = sns.scatterplot( + data=full_data, x="data", y="target", color="black", alpha=0.5 +) # %% [markdown] # We observe that the link between the data `data` and vector `target` is -# non-linear. For instance, `data` could represent the years of -# experience (normalized) and `target` the salary (normalized). Therefore, the -# problem here would be to infer the salary given the years of experience. +# non-linear. For instance, `data` could represent the years of experience +# (normalized) and `target` the salary (normalized). Therefore, the problem here +# would be to infer the salary given the years of experience. # # Using the function `f` defined below, find both the `weight` and the # `intercept` that you think will lead to a good linear model. Plot both the # data and the predictions of this model. + # %% def f(data, weight=0, intercept=0): target_predict = weight * data + intercept return target_predict + # %% # solution predictions = f(data, weight=1.2, intercept=-0.2) # %% tags=["solution"] -ax = sns.scatterplot(data=full_data, x="data", y="target", color="black", - alpha=0.5) +ax = sns.scatterplot( + data=full_data, x="data", y="target", color="black", alpha=0.5 +) _ = ax.plot(data, predictions) # %% [markdown] @@ -110,8 +116,9 @@ def f(data, weight=0, intercept=0): predictions = linear_regression.predict(data_2d) # %% tags=["solution"] -ax = sns.scatterplot(data=full_data, x="data", y="target", color="black", - alpha=0.5) +ax = sns.scatterplot( + data=full_data, x="data", y="target", color="black", alpha=0.5 +) _ = ax.plot(data, predictions) # %% [markdown] diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py index 7d5ee27ab..7fadc8468 100644 --- a/python_scripts/linear_models_sol_03.py +++ b/python_scripts/linear_models_sol_03.py @@ -15,11 +15,10 @@ # The aim of this notebook is to train a linear regression algorithm on a # dataset with more than a single feature. # -# We will load a dataset about house prices in California. -# The dataset consists of 8 features regarding the demography and geography of -# districts in California and the aim is to predict the median house price of -# each district. We will use all 8 features to predict the target, the median -# house price. 
+# We will load a dataset about house prices in California. The dataset consists +# of 8 features regarding the demography and geography of districts in +# California and the aim is to predict the median house price of each district. +# We will use all 8 features to predict the target, the median house price. # %% [markdown] # ```{note} @@ -35,8 +34,8 @@ data.head() # %% [markdown] -# Now it is your turn to train a linear regression model on this dataset. -# First, create a linear regression model. +# Now it is your turn to train a linear regression model on this dataset. First, +# create a linear regression model. # %% # solution @@ -45,30 +44,38 @@ linear_regression = LinearRegression() # %% [markdown] -# Execute a cross-validation with 10 folds and use the mean absolute error -# (MAE) as metric. Be sure to *return* the fitted *estimators*. +# Execute a cross-validation with 10 folds and use the mean absolute error (MAE) +# as metric. Be sure to *return* the fitted *estimators*. # %% # solution from sklearn.model_selection import cross_validate -cv_results = cross_validate(linear_regression, data, target, - scoring="neg_mean_absolute_error", - return_estimator=True, cv=10, n_jobs=2) +cv_results = cross_validate( + linear_regression, + data, + target, + scoring="neg_mean_absolute_error", + return_estimator=True, + cv=10, + n_jobs=2, +) # %% [markdown] # Compute the mean and std of the MAE in thousands of dollars (k$). # %% # solution -print(f"Mean absolute error on testing set: " - f"{-cv_results['test_score'].mean():.3f} k$ ยฑ " - f"{cv_results['test_score'].std():.3f}") +print( + "Mean absolute error on testing set: " + f"{-cv_results['test_score'].mean():.3f} k$ ยฑ " + f"{cv_results['test_score'].std():.3f}" +) # %% [markdown] # Inspect the fitted model using a box plot to show the distribution of values -# for the coefficients returned from the cross-validation. Hint: -# use the function +# for the coefficients returned from the cross-validation. Hint: use the +# function # [`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html) # to create a box plot. @@ -78,7 +85,8 @@ import pandas as pd weights = pd.DataFrame( - [est.coef_ for est in cv_results["estimator"]], columns=data.columns) + [est.coef_ for est in cv_results["estimator"]], columns=data.columns +) # %% tags=["solution"] import matplotlib.pyplot as plt diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py index 76b0c1d3e..a759c3d24 100644 --- a/python_scripts/linear_models_sol_04.py +++ b/python_scripts/linear_models_sol_04.py @@ -8,8 +8,8 @@ # %% [markdown] # # ๐Ÿ“ƒ Solution for Exercise M4.04 # -# In the previous notebook, we saw the effect of applying some regularization -# on the coefficient of a linear model. +# In the previous notebook, we saw the effect of applying some regularization on +# the coefficient of a linear model. # # In this exercise, we will study the advantage of using some regularization # when dealing with correlated features. @@ -31,8 +31,8 @@ ) # %% [markdown] -# When creating the dataset, `make_regression` returns the true coefficient -# used to generate the dataset. Let's plot this information. +# When creating the dataset, `make_regression` returns the true coefficient used +# to generate the dataset. Let's plot this information. # %% import pandas as pd @@ -49,9 +49,9 @@ coef # %% [markdown] -# Create a `LinearRegression` regressor and fit on the entire dataset and -# check the value of the coefficients. 
Are the coefficients of the linear
-# regressor close to the coefficients used to generate the dataset?
+# Create a `LinearRegression` regressor and fit it on the entire dataset and
+# check the value of the coefficients. Are the coefficients of the linear
+# regressor close to the coefficients used to generate the dataset?
 
 # %%
 # solution
@@ -89,8 +89,8 @@
 data = np.concatenate([data, data[:, [0, 1]], data[:, [0, 1]]], axis=1)
 
 # %% [markdown]
-# Fit again the linear regressor on this new dataset and check the
-# coefficients. What do you observe?
+# Fit the linear regressor again on this new dataset and check the
+# coefficients. What do you observe?
 
 # %%
 # solution
@@ -114,12 +114,12 @@
 _ = coef.plot.barh()
 
 # %% [markdown] tags=["solution"]
-# We see that the coefficient values are far from what one could expect.
-# By repeating the informative features, one would have expected these
-# coefficients to be similarly informative.
+# We see that the coefficient values are far from what one could expect. By
+# repeating the informative features, one would have expected these coefficients
+# to be similarly informative.
 #
-# Instead, we see that some coefficients have a huge norm ~1e14. It indeed
-# means that we try to solve an mathematical ill-posed problem. Indeed, finding
+# Instead, we see that some coefficients have a huge norm ~1e14. It indeed means
+# that we try to solve a mathematically ill-posed problem. Indeed, finding
 # coefficients in a linear regression involves inverting the matrix
 # `np.dot(data.T, data)` which is not possible (or lead to high numerical
 # errors).
@@ -143,8 +143,8 @@
 # %% [markdown] tags=["solution"]
 # We see that the penalty applied on the weights give a better results: the
 # values of the coefficients do not suffer from numerical issues. Indeed, the
-# matrix to be inverted internally is `np.dot(data.T, data) + alpha * I`.
-# Adding this penalty `alpha` allow the inversion without numerical issue.
+# matrix to be inverted internally is `np.dot(data.T, data) + alpha * I`. Adding
+# this penalty `alpha` allows the inversion without numerical issue.
 
 # %% [markdown]
 # Can you find the relationship between the ridge coefficients and the original
@@ -155,8 +155,8 @@
 ridge.coef_[:5] * 3
 
 # %% [markdown] tags=["solution"]
-# Repeating three times each informative features induced to divide the
-# ridge coefficients by three.
+# Repeating each informative feature three times caused the ridge coefficients
+# to be divided by three.
 
 # %% [markdown] tags=["solution"]
 # ```{tip}
@@ -180,7 +180,7 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 
-ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values='?')
+ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")
 ames_housing = ames_housing.drop(columns="Id")
 
 categorical_columns = ["Street", "Foundation", "CentralAir", "PavedDrive"]
@@ -195,8 +195,8 @@
 #
 # We previously presented that a `OneHotEncoder` creates as many columns as
 # categories. Therefore, there is always one column (i.e. one encoded category)
-# that can be inferred from the others. Thus, `OneHotEncoder` creates
-# collinear features.
+# that can be inferred from the others. Thus, `OneHotEncoder` creates collinear
+# features.
 #
 # We illustrate this behaviour by considering the "CentralAir" feature that
 # contains only two categories:
@@ -221,20 +221,19 @@
 # Here, we see that the encoded category "CentralAir_N" is the opposite of the
 # encoded category "CentralAir_Y". Therefore, we observe that using a
 # `OneHotEncoder` creates two features having the problematic pattern observed
-# earlier in this exercise. Training a linear regression model on such a
-# of one-hot encoded binary feature can therefore lead to numerical
-# problems, especially without regularization. Furthermore, the two one-hot
-# features are redundant as they encode exactly the same information in
-# opposite ways.
+# earlier in this exercise. Training a linear regression model on such a pair of
+# one-hot encoded binary features can therefore lead to numerical problems,
+# especially without regularization. Furthermore, the two one-hot features are
+# redundant as they encode exactly the same information in opposite ways.
 #
-# Using regularization helps to overcome the numerical issues that we highlighted
-# earlier in this exercise.
+# Using regularization helps to overcome the numerical issues that we
+# highlighted earlier in this exercise.
 #
 # Another strategy is to arbitrarily drop one of the encoded categories.
 # Scikit-learn provides such an option by setting the parameter `drop` in the
-# `OneHotEncoder`. This parameter can be set to `first` to always drop the
-# first encoded category or `binary_only` to only drop a column in the case of
-# binary categories.
+# `OneHotEncoder`. This parameter can be set to `first` to always drop the first
+# encoded category or `if_binary` to only drop a column in the case of binary
+# categories.
 
 # %% tags=["solution"]
 encoder = OneHotEncoder(drop="first", sparse_output=False, dtype=np.int32)
@@ -248,13 +247,14 @@
 # %% [markdown] tags=["solution"]
 #
 # We see that only the second column of the previous encoded data is kept.
-# Dropping one of the one-hot encoded column is a common practice,
-# especially for binary categorical features. Note however that this breaks
-# symmetry between categories and impacts the number of coefficients of the
-# model, their values, and thus their meaning, especially when applying
-# strong regularization.
+# Dropping one of the one-hot encoded columns is a common practice, especially
+# for binary categorical features. Note however that this breaks symmetry
+# between categories and impacts the number of coefficients of the model, their
+# values, and thus their meaning, especially when applying strong
+# regularization.
 #
-# Let's finally illustrate how to use this option is a machine-learning pipeline:
+# Let's finally illustrate how to use this option in a machine-learning
+# pipeline:
 
 # %% tags=["solution"]
 from sklearn.pipeline import make_pipeline
 
 model = make_pipeline(OneHotEncoder(drop="first", dtype=np.int32), Ridge())
 model.fit(X_train, y_train)
 n_categories = [X_train[col].nunique() for col in X_train.columns]
-print(
-    f"R2 score on the testing set: {model.score(X_test, y_test):.2f}"
-)
+print(f"R2 score on the testing set: {model.score(X_test, y_test):.2f}")
 print(
     f"Our model contains {model[-1].coef_.size} features while "
     f"{sum(n_categories)} categories are originally available."
diff --git a/python_scripts/linear_models_sol_05.py b/python_scripts/linear_models_sol_05.py
index fb57c5f48..bc4a15df1 100644
--- a/python_scripts/linear_models_sol_05.py
+++ b/python_scripts/linear_models_sol_05.py
@@ -7,13 +7,14 @@
 
 # %% [markdown]
 # # 📃 Solution for Exercise M4.05
+#
 # In the previous notebook we set `penalty="none"` to disable regularization
-# entirely. This parameter can also control the **type** of regularization to use,
-# whereas the regularization **strength** is set using the parameter `C`.
-# Setting`penalty="none"` is equivalent to an infinitely large value of `C`.
-# In this exercise, we ask you to train a logistic regression classifier using the
-# `penalty="l2"` regularization (which happens to be the default in scikit-learn)
-# to find by yourself the effect of the parameter `C`.
+# entirely. This parameter can also control the **type** of regularization to
+# use, whereas the regularization **strength** is set using the parameter `C`.
+# Setting `penalty="none"` is equivalent to an infinitely large value of `C`. In
+# this exercise, we ask you to train a logistic regression classifier using the
+# `penalty="l2"` regularization (which happens to be the default in
+# scikit-learn) to find by yourself the effect of the parameter `C`.
 #
 # We will start by loading the dataset.
 
@@ -28,8 +29,9 @@
 penguins = pd.read_csv("../datasets/penguins_classification.csv")
 # only keep the Adelie and Chinstrap classes
-penguins = penguins.set_index("Species").loc[
-    ["Adelie", "Chinstrap"]].reset_index()
+penguins = (
+    penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index()
+)
 
 culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
 target_column = "Species"
@@ -54,7 +56,8 @@
 from sklearn.linear_model import LogisticRegression
 
 logistic_regression = make_pipeline(
-    StandardScaler(), LogisticRegression(penalty="l2"))
+    StandardScaler(), LogisticRegression(penalty="l2")
+)
 
 # %% [markdown]
 # Given the following candidates for the `C` parameter, find out the impact of
@@ -83,8 +86,12 @@
         alpha=0.5,
     )
     sns.scatterplot(
-        data=penguins_test, x=culmen_columns[0], y=culmen_columns[1],
-        hue=target_column, palette=["tab:red", "tab:blue"])
+        data=penguins_test,
+        x=culmen_columns[0],
+        y=culmen_columns[1],
+        hue=target_column,
+        palette=["tab:red", "tab:blue"],
+    )
     plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
     plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}")
 
@@ -101,8 +108,7 @@
         weights_ridge.append(pd.Series(coefs, index=culmen_columns))
 
 # %% tags=["solution"]
-weights_ridge = pd.concat(
-    weights_ridge, axis=1, keys=[f"C: {C}" for C in Cs])
+weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f"C: {C}" for C in Cs])
 weights_ridge.plot.barh()
 _ = plt.title("LogisticRegression weights depending of C")
diff --git a/python_scripts/linear_regression_in_sklearn.py b/python_scripts/linear_regression_in_sklearn.py
index 49447eb1e..199b0d976 100644
--- a/python_scripts/linear_regression_in_sklearn.py
+++ b/python_scripts/linear_regression_in_sklearn.py
@@ -9,14 +9,14 @@
 # # Linear regression using scikit-learn
 #
 # In the previous notebook, we presented the parametrization of a linear model.
-# During the exercise, you saw that varying parameters will give different models
-# that will fit better or worse the data. To evaluate quantitatively this
+# During the exercise, you saw that varying parameters will give different
+# models that will fit the data better or worse. To evaluate quantitatively this
 # goodness of fit, you implemented a so-called metric.
 #
 # When doing machine learning, you are interested in selecting the model which
-# will minimize the error on the data available the most.
-# From the previous exercise, we could implement a brute-force approach,
-# varying the weights and intercept and select the model with the lowest error.
diff --git a/python_scripts/linear_regression_in_sklearn.py b/python_scripts/linear_regression_in_sklearn.py
index 49447eb1e..199b0d976 100644
--- a/python_scripts/linear_regression_in_sklearn.py
+++ b/python_scripts/linear_regression_in_sklearn.py
@@ -9,14 +9,14 @@
 # # Linear regression using scikit-learn
 #
 # In the previous notebook, we presented the parametrization of a linear model.
-# During the exercise, you saw that varying parameters will give different models
-# that will fit better or worse the data. To evaluate quantitatively this
-# goodness of fit, you implemented a so-called metric.
+# During the exercise, you saw that varying parameters will give different
+# models that fit the data better or worse. To evaluate quantitatively this
+# goodness of fit, you implemented a so-called metric.
 #
 # When doing machine learning, you are interested in selecting the model which
-# will minimize the error on the data available the most.
-# From the previous exercise, we could implement a brute-force approach,
-# varying the weights and intercept and select the model with the lowest error.
+# will minimize the error on the data available the most. From the previous
+# exercise, we could implement a brute-force approach, varying the weights and
+# intercept and selecting the model with the lowest error.
 #
 # Fortunately, this problem of finding the best parameter values (i.e. that
 # result in the lowest error) can be solved without the need to check every
@@ -47,8 +47,8 @@

 # %% [markdown]
 # The instance `linear_regression` will store the parameter values in the
-# attributes `coef_` and `intercept_`. We can check what the optimal model
-# found is:
+# attributes `coef_` and `intercept_`. We can check what the optimal model found
+# is:

 # %%
 weight_flipper_length = linear_regression.coef_[0]
@@ -67,7 +67,8 @@
 flipper_length_range = np.linspace(data.min(), data.max(), num=300)
 predicted_body_mass = (
-    weight_flipper_length * flipper_length_range + intercept_body_mass)
+    weight_flipper_length * flipper_length_range + intercept_body_mass
+)

 # %%
 import matplotlib.pyplot as plt
@@ -78,10 +79,10 @@
 _ = plt.title("Model using LinearRegression from scikit-learn")

 # %% [markdown]
-# In the solution of the previous exercise, we implemented a function to
-# compute the goodness of fit of a model. Indeed, we mentioned two metrics: (i)
-# the mean squared error and (ii) the mean absolute error. These metrics are
-# implemented in scikit-learn and we do not need to use our own implementation.
+# In the solution of the previous exercise, we implemented a function to compute
+# the goodness of fit of a model. Indeed, we mentioned two metrics: (i) the mean
+# squared error and (ii) the mean absolute error. These metrics are implemented
+# in scikit-learn and we do not need to use our own implementation.
 #
 # We can first compute the mean squared error.

 # %%
 from sklearn.metrics import mean_squared_error
@@ -100,8 +101,8 @@
 # a higher mean squared error on the training set.
 #
 # However, the mean squared error is difficult to interpret. The mean absolute
-# error is more intuitive since it provides an error in the same unit as the
-# one of the target.
+# error is more intuitive since it provides an error in the same unit as the one
+# of the target.

 # %%
 from sklearn.metrics import mean_absolute_error

 model_error = mean_absolute_error(target, target_predicted)
 print(f"The mean absolute error of the optimal model is {model_error:.2f} g")

 # %% [markdown]
-# A mean absolute error of 313 means that in average, our model make an error
-# of ± 313 grams when predicting the body mass of a penguin given its flipper
-# length.
+# A mean absolute error of 313 means that on average, our model makes an error
+# of ± 313 grams when predicting the body mass of a penguin given its flipper
+# length.


diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py
index 69ca2878a..2fc6699ac 100644
--- a/python_scripts/linear_regression_non_linear_link.py
+++ b/python_scripts/linear_regression_non_linear_link.py
@@ -8,12 +8,12 @@
 # %% [markdown]
 # # Linear regression for a non-linear features-target relationship
 #
-# In the previous exercise, you were asked to train a linear regression model
-# on a dataset where the matrix `data` and the vector `target` do not have a
-# linear link.
+# In the previous exercise, you were asked to train a linear regression model on
+# a dataset where the matrix `data` and the vector `target` do not have a linear
+# link.
# -# In this notebook, we show that even if the parametrization of linear models -# is not natively adapted to the problem at hand, it is still possible to make +# In this notebook, we show that even if the parametrization of linear models is +# not natively adapted to the problem at hand, it is still possible to make # linear models more expressive by engineering additional features. # # A machine learning pipeline that combines a non-linear feature engineering @@ -30,11 +30,11 @@ n_sample = 100 data_max, data_min = 1.4, -1.4 -len_data = (data_max - data_min) +len_data = data_max - data_min # sort the data to make plotting easier later data = np.sort(rng.rand(n_sample) * len_data - len_data / 2) -noise = rng.randn(n_sample) * .3 -target = data ** 3 - 0.5 * data ** 2 + noise +noise = rng.randn(n_sample) * 0.3 +target = data**3 - 0.5 * data**2 + noise # %% [markdown] # ```{note} @@ -50,12 +50,13 @@ # %% import seaborn as sns -_ = sns.scatterplot(data=full_data, x="input_feature", y="target", - color="black", alpha=0.5) +_ = sns.scatterplot( + data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +) # %% [markdown] -# We will highlight the limitations of fitting a linear regression model as -# done in the previous exercise. +# We will highlight the limitations of fitting a linear regression model as done +# in the previous exercise. # # ```{warning} # In scikit-learn, by convention `data` (also called `X` in the scikit-learn @@ -82,26 +83,28 @@ mse = mean_squared_error(target, target_predicted) # %% -ax = sns.scatterplot(data=full_data, x="input_feature", y="target", - color="black", alpha=0.5) +ax = sns.scatterplot( + data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +) ax.plot(data, target_predicted) _ = ax.set_title(f"Mean squared error = {mse:.2f}") # %% [markdown] # # Here the coefficient and intercept learnt by `LinearRegression` define the -# best "straight line" that fits the data. We can inspect the coefficients -# using the attributes of the model learnt as follows: +# best "straight line" that fits the data. We can inspect the coefficients using +# the attributes of the model learnt as follows: # %% -print(f"weight: {linear_regression.coef_[0]:.2f}, " - f"intercept: {linear_regression.intercept_:.2f}") +print( + f"weight: {linear_regression.coef_[0]:.2f}, " + f"intercept: {linear_regression.intercept_:.2f}" +) # %% [markdown] -# # It is important to note that the learnt model will not be able to handle the -# non-linear relationship between `data` and `target` since linear models -# assume the relationship between `data` and `target` to be linear. +# non-linear relationship between `data` and `target` since linear models assume +# the relationship between `data` and `target` to be linear. # # Indeed, there are 3 possibilities to solve this issue: # @@ -122,18 +125,18 @@ mse = mean_squared_error(target, target_predicted) # %% -ax = sns.scatterplot(data=full_data, x="input_feature", y="target", - color="black", alpha=0.5) +ax = sns.scatterplot( + data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +) ax.plot(data, target_predicted) _ = ax.set_title(f"Mean squared error = {mse:.2f}") # %% [markdown] -# -# Instead of having a model which can natively deal with non-linearity, we -# could also modify our data: we could create new features, derived from the -# original features, using some expert knowledge. 
In this example, we know that
-# we have a cubic and squared relationship between `data` and `target` (because
-# we generated the data).
+# Instead of having a model which can natively deal with non-linearity, we could
+# also modify our data: we could create new features, derived from the original
+# features, using some expert knowledge. In this example, we know that we have a
+# cubic and squared relationship between `data` and `target` (because we
+# generated the data).
 #
 # Indeed, we could create two new features (`data ** 2` and `data ** 3`) using
 # this information as follows. This kind of transformation is called a
 # polynomial feature expansion:

 # %%
 data.shape

 # %%
-data_expanded = np.concatenate([data, data ** 2, data ** 3], axis=1)
+data_expanded = np.concatenate([data, data**2, data**3], axis=1)
 data_expanded.shape

@@ -153,13 +156,13 @@
 mse = mean_squared_error(target, target_predicted)

 # %%
-ax = sns.scatterplot(data=full_data, x="input_feature", y="target",
-                     color="black", alpha=0.5)
+ax = sns.scatterplot(
+    data=full_data, x="input_feature", y="target", color="black", alpha=0.5
+)
 ax.plot(data, target_predicted)
 _ = ax.set_title(f"Mean squared error = {mse:.2f}")

 # %% [markdown]
-#
 # We can see that even with a linear model, we can overcome the linearity
 # limitation of the model by adding the non-linear components in the design of
 # additional features. Here, we created new features by knowing the way the
@@ -185,11 +188,10 @@
 mse = mean_squared_error(target, target_predicted)

 # %% [markdown]
-#
 # In the previous cell we had to set `include_bias=False` as otherwise we would
 # create a column perfectly correlated to the `intercept_` introduced by the
-# `LinearRegression`. We can verify that this procedure is equivalent to creating
-# the features by hand up to numerical error by computing the maximum
+# `LinearRegression`. We can verify that this procedure is equivalent to
+# creating the features by hand up to numerical error by computing the maximum
 # of the absolute values of the differences between the features generated by
 # both methods and checking that it is close to zero:

 # %%
 np.abs(polynomial_regression[0].fit_transform(data) - data_expanded).max()

 # %% [markdown]
-#
 # Then it should not be surprising that the predictions of the
 # `PolynomialFeatures` pipeline match the predictions of the linear model fit on
 # manually engineered features.

 # %%
-ax = sns.scatterplot(data=full_data, x="input_feature", y="target",
-                     color="black", alpha=0.5)
+ax = sns.scatterplot(
+    data=full_data, x="input_feature", y="target", color="black", alpha=0.5
+)
 ax.plot(data, target_predicted)
 _ = ax.set_title(f"Mean squared error = {mse:.2f}")

 # %% [markdown]
-#
 # The last possibility to make a linear model more expressive is to use a
 # "kernel". Instead of learning a weight per feature as we previously
-# emphasized, a weight will be assigned to each sample. However, not all
-# samples will be used. This is the base of the support vector machine
-# algorithm.
+# emphasized, a weight will be assigned to each sample. However, not all samples
+# will be used. This is the base of the support vector machine algorithm.
 #
 # The mathematical definition of "kernels" and "support vector machines" is
 # beyond the scope of this course. We encourage interested readers with a
 # mathematical training to read the [documentation on
 # SVMs](https://scikit-learn.org/stable/modules/svm.html) for more details.
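# %% [markdown]
# ```{note}
# For a first intuition without the theory: a kernel is a function measuring the
# similarity between two samples. The sketch below (gamma value chosen
# arbitrarily) evaluates the RBF kernel between a reference point and a few
# others; the similarity is 1 for identical points and decays with distance:
# ```

# %%
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

reference = np.array([[0.0]])
others = np.array([[0.0], [0.5], [1.0], [2.0]])
# each row holds the similarity between one point of `others` and `reference`
print(rbf_kernel(others, reference, gamma=1.0))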
# # For the rest of us, let us just develop some intuitions on the relative -# expressive power of support vector machines with linear and non-linear -# kernels by fitting them on the same dataset. +# expressive power of support vector machines with linear and non-linear kernels +# by fitting them on the same dataset. # # First, consider a support vector machine with a linear kernel: @@ -236,13 +236,13 @@ mse = mean_squared_error(target, target_predicted) # %% -ax = sns.scatterplot(data=full_data, x="input_feature", y="target", - color="black", alpha=0.5) +ax = sns.scatterplot( + data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +) ax.plot(data, target_predicted) _ = ax.set_title(f"Mean squared error = {mse:.2f}") # %% [markdown] -# # The predictions of our SVR with a linear kernel are all aligned on a straight # line. `SVR(kernel="linear")` is indeed yet another example of a linear model. # @@ -261,13 +261,13 @@ mse = mean_squared_error(target, target_predicted) # %% -ax = sns.scatterplot(data=full_data, x="input_feature", y="target", - color="black", alpha=0.5) +ax = sns.scatterplot( + data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +) ax.plot(data, target_predicted) _ = ax.set_title(f"Mean squared error = {mse:.2f}") # %% [markdown] -# # Kernel methods such as SVR are very efficient for small to medium datasets. # # For larger datasets with `n_samples >> 10_000`, it is often computationally @@ -278,22 +278,24 @@ # or # [Nystroem](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html). # -# Here again we refer the interested reader to the documentation to get a -# proper definition of those methods. The following just gives an intuitive -# overview of the predictions we would get using those on our toy dataset: +# Here again we refer the interested reader to the documentation to get a proper +# definition of those methods. 
The following just gives an intuitive overview of
+# the predictions we would get using those on our toy dataset:

 # %%
 from sklearn.preprocessing import KBinsDiscretizer

 binned_regression = make_pipeline(
-    KBinsDiscretizer(n_bins=8), LinearRegression(),
+    KBinsDiscretizer(n_bins=8),
+    LinearRegression(),
 )
 binned_regression.fit(data, target)
 target_predicted = binned_regression.predict(data)
 mse = mean_squared_error(target, target_predicted)

-ax = sns.scatterplot(data=full_data, x="input_feature", y="target",
-                     color="black", alpha=0.5)
+ax = sns.scatterplot(
+    data=full_data, x="input_feature", y="target", color="black", alpha=0.5
+)
 ax.plot(data, target_predicted)
 _ = ax.set_title(f"Mean squared error = {mse:.2f}")

 # %%
 from sklearn.kernel_approximation import Nystroem

 nystroem_regression = make_pipeline(
-    Nystroem(n_components=5), LinearRegression(),
+    Nystroem(n_components=5),
+    LinearRegression(),
 )
 nystroem_regression.fit(data, target)
 target_predicted = nystroem_regression.predict(data)
 mse = mean_squared_error(target, target_predicted)

-ax = sns.scatterplot(data=full_data, x="input_feature", y="target",
-                     color="black", alpha=0.5)
+ax = sns.scatterplot(
+    data=full_data, x="input_feature", y="target", color="black", alpha=0.5
+)
 ax.plot(data, target_predicted)
 _ = ax.set_title(f"Mean squared error = {mse:.2f}")
diff --git a/python_scripts/linear_regression_without_sklearn.py b/python_scripts/linear_regression_without_sklearn.py
index a834a68fd..3ce72bc2d 100644
--- a/python_scripts/linear_regression_without_sklearn.py
+++ b/python_scripts/linear_regression_without_sklearn.py
@@ -25,8 +25,8 @@
 penguins.head()

 # %% [markdown]
-# We will formulate the following problem: using the flipper length of a penguin,
-# we would like to infer its mass.
+# We will formulate the following problem: using the flipper length of a
+# penguin, we would like to infer its mass.

 # %%
 import seaborn as sns

@@ -35,44 +35,45 @@
 feature_name = "Flipper Length (mm)"
 target_name = "Body Mass (g)"
 data, target = penguins[[feature_name]], penguins[target_name]

-ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,
-                     color="black", alpha=0.5)
+ax = sns.scatterplot(
+    data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5
+)
 ax.set_title("Body Mass as a function of the Flipper Length")

 # %% [markdown]
 # ```{tip}
-# The function `scatterplot` from seaborn take as input the full dataframe
-# and the parameter `x` and `y` allows to specify the name of the columns to
-# be plotted. Note that this function returns a matplotlib axis
-# (named `ax` in the example above) that can be further used to add elements on
-# the same matplotlib axis (such as a title).
+# The function `scatterplot` from seaborn takes as input the full dataframe and
+# the parameters `x` and `y` allow specifying the names of the columns to be
+# plotted. Note that this function returns a matplotlib axis (named `ax` in the
+# example above) that can be further used to add elements on the same matplotlib
+# axis (such as a title).
 # ```

 # %% [markdown]
-# In this problem, penguin mass is our target. It is a continuous
-# variable that roughly varies between 2700 g and 6300 g. Thus, this is a
-# regression problem (in contrast to classification). We also see that there is
-# almost a linear relationship between the body mass of the penguin and its
-# flipper length. The longer the flipper, the heavier the penguin.
+# In this problem, penguin mass is our target. It is a continuous variable that
+# roughly varies between 2700 g and 6300 g.
Thus, this is a regression problem +# (in contrast to classification). We also see that there is almost a linear +# relationship between the body mass of the penguin and its flipper length. The +# longer the flipper, the heavier the penguin. # -# Thus, we could come up with a simple formula, where given a flipper length -# we could compute the body mass of a penguin using a linear relationship -# of the form `y = a * x + b` where `a` and `b` are the 2 parameters of our -# model. +# Thus, we could come up with a simple formula, where given a flipper length we +# could compute the body mass of a penguin using a linear relationship of the +# form `y = a * x + b` where `a` and `b` are the 2 parameters of our model. # %% -def linear_model_flipper_mass(flipper_length, weight_flipper_length, - intercept_body_mass): +def linear_model_flipper_mass( + flipper_length, weight_flipper_length, intercept_body_mass +): """Linear model of the form y = a * x + b""" body_mass = weight_flipper_length * flipper_length + intercept_body_mass return body_mass # %% [markdown] -# Using the model we defined above, we can check the body mass values -# predicted for a range of flipper lengths. We will set `weight_flipper_length` -# to be 45 and `intercept_body_mass` to be -5000. +# Using the model we defined above, we can check the body mass values predicted +# for a range of flipper lengths. We will set `weight_flipper_length` to be 45 +# and `intercept_body_mass` to be -5000. # %% import numpy as np @@ -82,7 +83,8 @@ def linear_model_flipper_mass(flipper_length, weight_flipper_length, flipper_length_range = np.linspace(data.min(), data.max(), num=300) predicted_body_mass = linear_model_flipper_mass( - flipper_length_range, weight_flipper_length, intercept_body_mass) + flipper_length_range, weight_flipper_length, intercept_body_mass +) # %% [markdown] # We can now plot all samples and the linear model prediction. @@ -90,8 +92,9 @@ def linear_model_flipper_mass(flipper_length, weight_flipper_length, # %% label = "{0:.2f} (g / mm) * flipper length + {1:.2f} (g)" -ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +ax = sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) ax.plot(flipper_length_range, predicted_body_mass) _ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass)) @@ -100,50 +103,55 @@ def linear_model_flipper_mass(flipper_length, weight_flipper_length, # `flipper_length` in order to make the inference. When this coefficient is # positive, it means that penguins with longer flipper lengths will have larger # body masses. If the coefficient is negative, it means that penguins with -# shorter flipper lengths have larger body masses. Graphically, this -# coefficient is represented by the slope of the curve in the plot. Below we -# show what the curve would look like when the `weight_flipper_length` -# coefficient is negative. +# shorter flipper lengths have larger body masses. Graphically, this coefficient +# is represented by the slope of the curve in the plot. Below we show what the +# curve would look like when the `weight_flipper_length` coefficient is +# negative. # %% weight_flipper_length = -40 intercept_body_mass = 13000 predicted_body_mass = linear_model_flipper_mass( - flipper_length_range, weight_flipper_length, intercept_body_mass) + flipper_length_range, weight_flipper_length, intercept_body_mass +) # %% [markdown] # We can now plot all samples and the linear model prediction. 
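# %% [markdown]
# ```{note}
# As a quick numeric check to complement the plot below, we can evaluate this
# negative-slope model at both ends of the observed flipper-length range
# (roughly 170 mm and 230 mm in this dataset):
# ```

# %%
for flipper_length in [170, 230]:
    mass = linear_model_flipper_mass(
        flipper_length, weight_flipper_length=-40, intercept_body_mass=13000
    )
    print(f"flipper length: {flipper_length} mm -> predicted mass: {mass} g")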
 # %%
-ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,
-                     color="black", alpha=0.5)
+ax = sns.scatterplot(
+    data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5
+)
 ax.plot(flipper_length_range, predicted_body_mass)
 _ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass))

 # %% [markdown]
-# In our case, this coefficient has a meaningful unit: g/mm.
-# For instance, a coefficient of 40 g/mm, means that for each
-# additional millimeter in flipper length, the body weight predicted will
-# increase by 40 g.
+# In our case, this coefficient has a meaningful unit: g/mm. For instance, a
+# coefficient of 40 g/mm means that for each additional millimeter in flipper
+# length, the body weight predicted will increase by 40 g.

 # %%
 body_mass_180 = linear_model_flipper_mass(
-    flipper_length=180, weight_flipper_length=40, intercept_body_mass=0)
+    flipper_length=180, weight_flipper_length=40, intercept_body_mass=0
+)
 body_mass_181 = linear_model_flipper_mass(
-    flipper_length=181, weight_flipper_length=40, intercept_body_mass=0)
+    flipper_length=181, weight_flipper_length=40, intercept_body_mass=0
+)

-print(f"The body mass for a flipper length of 180 mm "
-      f"is {body_mass_180} g and {body_mass_181} g "
-      f"for a flipper length of 181 mm")
+print(
+    "The body mass for a flipper length of 180 mm "
+    f"is {body_mass_180} g and {body_mass_181} g "
+    "for a flipper length of 181 mm"
+)

 # %% [markdown]
 # We can also see that we have a parameter `intercept_body_mass` in our model.
 # This parameter corresponds to the value on the y-axis if `flipper_length=0`
-# (which in our case is only a mathematical consideration, as in our data,
-# the value of `flipper_length` only goes from 170mm to 230mm). This y-value
-# when x=0 is called the y-intercept. If `intercept_body_mass` is 0, the curve
-# will pass through the origin:
+# (which in our case is only a mathematical consideration, as in our data, the
+# value of `flipper_length` only goes from 170mm to 230mm). This y-value when
+# x=0 is called the y-intercept.
If `intercept_body_mass` is 0, the curve will +# pass through the origin: # %% weight_flipper_length = 25 @@ -152,11 +160,13 @@ def linear_model_flipper_mass(flipper_length, weight_flipper_length, # redefined the flipper length to start at 0 to plot the intercept value flipper_length_range = np.linspace(0, data.max(), num=300) predicted_body_mass = linear_model_flipper_mass( - flipper_length_range, weight_flipper_length, intercept_body_mass) + flipper_length_range, weight_flipper_length, intercept_body_mass +) # %% -ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +ax = sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) ax.plot(flipper_length_range, predicted_body_mass) _ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass)) @@ -168,11 +178,13 @@ def linear_model_flipper_mass(flipper_length, weight_flipper_length, intercept_body_mass = -5000 predicted_body_mass = linear_model_flipper_mass( - flipper_length_range, weight_flipper_length, intercept_body_mass) + flipper_length_range, weight_flipper_length, intercept_body_mass +) # %% -ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +ax = sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) ax.plot(flipper_length_range, predicted_body_mass) _ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass)) diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py index 5664349f4..3156ebda0 100644 --- a/python_scripts/logistic_regression.py +++ b/python_scripts/logistic_regression.py @@ -7,13 +7,14 @@ # %% [markdown] # # Linear model for classification +# # In regression, we saw that the target to be predicted was a continuous # variable. In classification, this target will be discrete (e.g. categorical). # # We will go back to our penguin dataset. However, this time we will try to # predict the penguin species using the culmen information. We will also -# simplify our classification problem by selecting only 2 of the penguin -# species to solve a binary classification problem. +# simplify our classification problem by selecting only 2 of the penguin species +# to solve a binary classification problem. # %% [markdown] # ```{note} @@ -27,8 +28,9 @@ penguins = pd.read_csv("../datasets/penguins_classification.csv") # only keep the Adelie and Chinstrap classes -penguins = penguins.set_index("Species").loc[ - ["Adelie", "Chinstrap"]].reset_index() +penguins = ( + penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index() +) culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] target_column = "Species" @@ -45,13 +47,12 @@ plt.xlabel(feature_name) # %% [markdown] -# We can observe that we have quite a simple problem. When the culmen -# length increases, the probability that the penguin is a Chinstrap is closer -# to 1. However, the culmen depth is not helpful for predicting the penguin -# species. +# We can observe that we have quite a simple problem. When the culmen length +# increases, the probability that the penguin is a Chinstrap is closer to 1. +# However, the culmen depth is not helpful for predicting the penguin species. # -# For model fitting, we will separate the target from the data and -# we will create a training and a testing set. +# For model fitting, we will separate the target from the data and we will +# create a training and a testing set. 
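# %% [markdown]
# ```{note}
# A variant worth knowing for classification problems: `train_test_split`
# accepts a `stratify` argument that preserves the class proportions in both
# splits. Whether stratification is needed here is left open; this is only a
# sketch reusing the `penguins` frame loaded above:
# ```

# %%
from sklearn.model_selection import train_test_split

train_strat, test_strat = train_test_split(
    penguins, random_state=0, stratify=penguins[target_column]
)
print(train_strat[target_column].value_counts(normalize=True))
print(test_strat[target_column].value_counts(normalize=True))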
# %% from sklearn.model_selection import train_test_split @@ -65,11 +66,9 @@ target_test = penguins_test[target_column] # %% [markdown] -# -# The linear regression that we previously saw will predict a continuous -# output. When the target is a binary outcome, one can use the logistic -# function to model the probability. This model is known as logistic -# regression. +# The linear regression that we previously saw will predict a continuous output. +# When the target is a binary outcome, one can use the logistic function to +# model the probability. This model is known as logistic regression. # # Scikit-learn provides the class `LogisticRegression` which implements this # algorithm. @@ -87,11 +86,10 @@ print(f"Accuracy on test set: {accuracy:.3f}") # %% [markdown] -# -# Since we are dealing with a classification problem containing only 2 -# features, it is then possible to observe the decision function boundary. -# The boundary is the rule used by our predictive model to affect a class label -# given the feature values of the sample. +# Since we are dealing with a classification problem containing only 2 features, +# it is then possible to observe the decision function boundary. The boundary is +# the rule used by our predictive model to affect a class label given the +# feature values of the sample. # # ```{note} # Here, we will use the class `DecisionBoundaryDisplay`. This educational tool @@ -109,11 +107,19 @@ from sklearn.inspection import DecisionBoundaryDisplay DecisionBoundaryDisplay.from_estimator( - logistic_regression, data_test, response_method="predict", cmap="RdBu_r", alpha=0.5 + logistic_regression, + data_test, + response_method="predict", + cmap="RdBu_r", + alpha=0.5, ) sns.scatterplot( - data=penguins_test, x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, palette=["tab:red", "tab:blue"]) + data=penguins_test, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Decision boundary of the trained\n LogisticRegression") # %% [markdown] diff --git a/python_scripts/logistic_regression_non_linear.py b/python_scripts/logistic_regression_non_linear.py index 0a51c87e7..d28a4a9e6 100644 --- a/python_scripts/logistic_regression_non_linear.py +++ b/python_scripts/logistic_regression_non_linear.py @@ -8,16 +8,15 @@ # %% [markdown] # # Beyond linear separation in classification # -# As we saw in the regression section, the linear classification model -# expects the data to be linearly separable. When this assumption does not -# hold, the model is not expressive enough to properly fit the data. -# Therefore, we need to apply the same tricks as in regression: feature -# augmentation (potentially using expert-knowledge) or using a -# kernel-based method. +# As we saw in the regression section, the linear classification model expects +# the data to be linearly separable. When this assumption does not hold, the +# model is not expressive enough to properly fit the data. Therefore, we need to +# apply the same tricks as in regression: feature augmentation (potentially +# using expert-knowledge) or using a kernel-based method. # -# We will provide examples where we will use a kernel support vector machine -# to perform classification on some toy-datasets where it is impossible to -# find a perfect linear separation. +# We will provide examples where we will use a kernel support vector machine to +# perform classification on some toy-datasets where it is impossible to find a +# perfect linear separation. 
 #
 # We will generate a first dataset where the data are represented as two
 # interlaced half circles. This dataset is generated using the function
 # [`make_moons`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html).

 # %%
 import numpy as np
 import pandas as pd
 from sklearn.datasets import make_moons

 feature_names = ["Feature #0", "Feature #1"]
 target_name = "class"

 X, y = make_moons(n_samples=100, noise=0.13, random_state=42)

 # We store both the data and target in a dataframe to ease plotting
-moons = pd.DataFrame(np.concatenate([X, y[:, np.newaxis]], axis=1),
-                     columns=feature_names + [target_name])
+moons = pd.DataFrame(
+    np.concatenate([X, y[:, np.newaxis]], axis=1),
+    columns=feature_names + [target_name],
+)
 data_moons, target_moons = moons[feature_names], moons[target_name]

 # %% [markdown]
 # Since the dataset contains only two features, we can make a scatter plot to
 # have a look at it.

 # %%
 import matplotlib.pyplot as plt
 import seaborn as sns

-sns.scatterplot(data=moons, x=feature_names[0], y=feature_names[1],
-                hue=target_moons, palette=["tab:red", "tab:blue"])
+sns.scatterplot(
+    data=moons,
+    x=feature_names[0],
+    y=feature_names[1],
+    hue=target_moons,
+    palette=["tab:red", "tab:blue"],
+)
 _ = plt.title("Illustration of the moons dataset")

 # %% [markdown]
-# From the intuitions that we got by studying linear model, it should be
-# obvious that a linear classifier will not be able to find a perfect decision
-# function to separate the two classes.
+# From the intuitions that we got by studying linear models, it should be
+# obvious that a linear classifier will not be able to find a perfect decision
+# function to separate the two classes.
 #
-# Let's try to see what is the decision boundary of such a linear classifier.
-# We will create a predictive model by standardizing the dataset followed by
-# a linear support vector machine classifier.
+# Let's try to see what the decision boundary of such a linear classifier looks
+# like. We will create a predictive model by standardizing the dataset followed
+# by a linear support vector machine classifier.

 # %%
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC

 linear_model = make_pipeline(StandardScaler(), SVC(kernel="linear"))
 linear_model.fit(data_moons, target_moons)

 # %% [markdown]
 # ```{warning}
-# Be aware that we fit and will check the boundary decision of the classifier
-# on the same dataset without splitting the dataset into a training set and a
+# Be aware that we fit and will check the decision boundary of the classifier on
+# the same dataset without splitting the dataset into a training set and a
 # testing set. While this is a bad practice, we use it for the sake of
-# simplicity to depict the model behavior. Always use cross-validation when
-# you want to assess the generalization performance of a machine-learning model.
+# simplicity to depict the model behavior. Always use cross-validation when you
+# want to assess the generalization performance of a machine-learning model.
# ``` # %% [markdown] @@ -85,8 +91,13 @@ DecisionBoundaryDisplay.from_estimator( linear_model, data_moons, response_method="predict", cmap="RdBu", alpha=0.5 ) -sns.scatterplot(data=moons, x=feature_names[0], y=feature_names[1], - hue=target_moons, palette=["tab:red", "tab:blue"]) +sns.scatterplot( + data=moons, + x=feature_names[0], + y=feature_names[1], + hue=target_moons, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Decision boundary of a linear model") # %% [markdown] @@ -103,14 +114,22 @@ target_name = "class" X, y = make_gaussian_quantiles( - n_samples=100, n_features=2, n_classes=2, random_state=42) -gauss = pd.DataFrame(np.concatenate([X, y[:, np.newaxis]], axis=1), - columns=feature_names + [target_name]) + n_samples=100, n_features=2, n_classes=2, random_state=42 +) +gauss = pd.DataFrame( + np.concatenate([X, y[:, np.newaxis]], axis=1), + columns=feature_names + [target_name], +) data_gauss, target_gauss = gauss[feature_names], gauss[target_name] # %% -ax = sns.scatterplot(data=gauss, x=feature_names[0], y=feature_names[1], - hue=target_gauss, palette=["tab:red", "tab:blue"]) +ax = sns.scatterplot( + data=gauss, + x=feature_names[0], + y=feature_names[1], + hue=target_gauss, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Illustration of the Gaussian quantiles dataset") # %% [markdown] @@ -123,19 +142,24 @@ DecisionBoundaryDisplay.from_estimator( linear_model, data_gauss, response_method="predict", cmap="RdBu", alpha=0.5 ) -sns.scatterplot(data=gauss, x=feature_names[0], y=feature_names[1], - hue=target_gauss, palette=["tab:red", "tab:blue"]) +sns.scatterplot( + data=gauss, + x=feature_names[0], + y=feature_names[1], + hue=target_gauss, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Decision boundary of a linear model") # %% [markdown] # As expected, a linear separation cannot be used to separate the classes -# properly: the model will under-fit as it will make errors even on -# the training set. +# properly: the model will under-fit as it will make errors even on the training +# set. # # In the section about linear regression, we saw that we could use several -# tricks to make a linear model more flexible by augmenting features or -# using a kernel. Here, we will use the later solution by using a radial basis -# function (RBF) kernel together with a support vector machine classifier. +# tricks to make a linear model more flexible by augmenting features or using a +# kernel. Here, we will use the later solution by using a radial basis function +# (RBF) kernel together with a support vector machine classifier. # # We will repeat the two previous experiments and check the obtained decision # function. @@ -148,13 +172,18 @@ DecisionBoundaryDisplay.from_estimator( kernel_model, data_moons, response_method="predict", cmap="RdBu", alpha=0.5 ) -sns.scatterplot(data=moons, x=feature_names[0], y=feature_names[1], - hue=target_moons, palette=["tab:red", "tab:blue"]) +sns.scatterplot( + data=moons, + x=feature_names[0], + y=feature_names[1], + hue=target_moons, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Decision boundary with a model using an RBF kernel") # %% [markdown] -# We see that the decision boundary is not anymore a straight line. Indeed, -# an area is defined around the red samples and we could imagine that this +# We see that the decision boundary is not anymore a straight line. Indeed, an +# area is defined around the red samples and we could imagine that this # classifier should be able to generalize on unseen data. 
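# %% [markdown]
# ```{note}
# Following the earlier warning about inspecting a model on the very data it was
# fit on, here is a quick hedged check of this kernel model with 5-fold
# cross-validation (default accuracy scoring; it reuses the `kernel_model`,
# `data_moons` and `target_moons` variables defined above):
# ```

# %%
from sklearn.model_selection import cross_val_score

scores = cross_val_score(kernel_model, data_moons, target_moons, cv=5)
print(f"Accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")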
# # Let's check the decision function on the second dataset. @@ -164,8 +193,13 @@ DecisionBoundaryDisplay.from_estimator( kernel_model, data_gauss, response_method="predict", cmap="RdBu", alpha=0.5 ) -ax = sns.scatterplot(data=gauss, x=feature_names[0], y=feature_names[1], - hue=target_gauss, palette=["tab:red", "tab:blue"]) +ax = sns.scatterplot( + data=gauss, + x=feature_names[0], + y=feature_names[1], + hue=target_gauss, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Decision boundary with a model using an RBF kernel") # %% [markdown] @@ -177,7 +211,7 @@ # # Keep in mind that adding flexibility to a model can also risk increasing # overfitting by making the decision function to be sensitive to individual -# (possibly noisy) data points of the training set. Here we can observe that -# the decision functions remain smooth enough to preserve good generalization. -# If you are curious, you can try to repeat the above experiment with -# `gamma=100` and look at the decision functions. +# (possibly noisy) data points of the training set. Here we can observe that the +# decision functions remain smooth enough to preserve good generalization. If +# you are curious, you can try to repeat the above experiment with `gamma=100` +# and look at the decision functions. diff --git a/python_scripts/matplotlibrc b/python_scripts/matplotlibrc index f05e0c23b..5b66f14e6 100644 --- a/python_scripts/matplotlibrc +++ b/python_scripts/matplotlibrc @@ -28,4 +28,4 @@ ytick.labelsize: 16.5 ytick.major.size: 9.0 ytick.major.width: 1.875 ytick.minor.size: 6.0 -ytick.minor.width: 1.5 \ No newline at end of file +ytick.minor.width: 1.5 diff --git a/python_scripts/metrics_classification.py b/python_scripts/metrics_classification.py index 5e57c1e28..ce91a3483 100644 --- a/python_scripts/metrics_classification.py +++ b/python_scripts/metrics_classification.py @@ -12,13 +12,13 @@ # its minimum or maximum. It is important to understand that this objective # function is usually decoupled from the evaluation metric that we want to # optimize in practice. The objective function serves as a proxy for the -# evaluation metric. Therefore, in the upcoming notebooks, we will present -# the different evaluation metrics used in machine learning. +# evaluation metric. Therefore, in the upcoming notebooks, we will present the +# different evaluation metrics used in machine learning. # # This notebook aims at giving an overview of the classification metrics that -# can be used to evaluate the predictive model generalization performance. We can -# recall that in a classification setting, the vector `target` is categorical -# rather than continuous. +# can be used to evaluate the predictive model generalization performance. We +# can recall that in a classification setting, the vector `target` is +# categorical rather than continuous. # # We will load the blood transfusion dataset. @@ -57,7 +57,8 @@ from sklearn.model_selection import train_test_split data_train, data_test, target_train, target_test = train_test_split( - data, target, shuffle=True, random_state=0, test_size=0.5) + data, target, shuffle=True, random_state=0, test_size=0.5 +) # %% [markdown] # We will use a logistic regression classifier as a base model. We will train @@ -72,6 +73,7 @@ # %% [markdown] # ## Classifier predictions +# # Before we go into details regarding the metrics, we will recall what type of # predictions a classifier can provide. 
# @@ -97,13 +99,12 @@ classifier.predict(new_donor) # %% [markdown] -# With this information, our classifier predicts that this synthetic subject -# is more likely to not donate blood again. +# With this information, our classifier predicts that this synthetic subject is +# more likely to not donate blood again. # -# However, we cannot check whether the prediction is correct (we do not know -# the true target value). That's the purpose of the testing set. First, we -# predict whether a subject will give blood with the help of the trained -# classifier. +# However, we cannot check whether the prediction is correct (we do not know the +# true target value). That's the purpose of the testing set. First, we predict +# whether a subject will give blood with the help of the trained classifier. # %% target_predicted = classifier.predict(data_test) @@ -111,6 +112,7 @@ # %% [markdown] # ## Accuracy as a baseline +# # Now that we have these predictions, we can compare them with the true # predictions (sometimes called ground-truth) which we did not use until now. @@ -120,8 +122,8 @@ # %% [markdown] # In the comparison above, a `True` value means that the value predicted by our # classifier is identical to the real value, while a `False` means that our -# classifier made a mistake. One way of getting an overall rate representing -# the generalization performance of our classifier would be to compute how many +# classifier made a mistake. One way of getting an overall rate representing the +# generalization performance of our classifier would be to compute how many # times our classifier was right and divide it by the number of samples in our # set. @@ -131,9 +133,9 @@ np.mean(target_test == target_predicted) # %% [markdown] -# This measure is called the accuracy. Here, our classifier is 78% -# accurate at classifying if a subject will give blood. `scikit-learn` provides -# a function that computes this metric in the module `sklearn.metrics`. +# This measure is called the accuracy. Here, our classifier is 78% accurate at +# classifying if a subject will give blood. `scikit-learn` provides a function +# that computes this metric in the module `sklearn.metrics`. # %% from sklearn.metrics import accuracy_score @@ -150,6 +152,7 @@ # %% [markdown] # ## Confusion matrix and derived metrics +# # The comparison that we did above and the accuracy that we calculated did not # take into account the type of error our classifier was making. Accuracy is an # aggregate of the errors made by the classifier. We may be interested in finer @@ -165,20 +168,19 @@ _ = ConfusionMatrixDisplay.from_estimator(classifier, data_test, target_test) # %% [markdown] -# The in-diagonal numbers are related to predictions that were correct -# while off-diagonal numbers are related to incorrect predictions +# The in-diagonal numbers are related to predictions that were correct while +# off-diagonal numbers are related to incorrect predictions # (misclassifications). 
We now know the four types of correct and erroneous
 # predictions:
 #
-# * the top left corner are true positives (TP) and corresponds to people
-#   who gave blood and were predicted as such by the classifier;
-# * the bottom right corner are true negatives (TN) and correspond to
-#   people who did not give blood and were predicted as such by the
-#   classifier;
-# * the top right corner are false negatives (FN) and correspond to
-#   people who gave blood but were predicted to not have given blood;
-# * the bottom left corner are false positives (FP) and correspond to
-#   people who did not give blood but were predicted to have given blood.
+# * the top left corner are true positives (TP) and correspond to people who
+#   gave blood and were predicted as such by the classifier;
+# * the bottom right corner are true negatives (TN) and correspond to people who
+#   did not give blood and were predicted as such by the classifier;
+# * the top right corner are false negatives (FN) and correspond to people who
+#   gave blood but were predicted to not have given blood;
+# * the bottom left corner are false positives (FP) and correspond to people who
+#   did not give blood but were predicted to have given blood.
 #
 # Once we have split this information, we can compute metrics to highlight the
 # generalization performance of our classifier in a particular setting. For
 # instance, we could be interested in the fraction of people who really gave
 # blood when the classifier predicted so or the fraction of people predicted to
 # have given blood out of the total population that actually did so.
 #
-# The former metric, known as the precision, is defined as TP / (TP + FP)
-# and represents how likely the person actually gave blood when the classifier
-# predicted that they did.
-# The latter, known as the recall, defined as TP / (TP + FN) and
-# assesses how well the classifier is able to correctly identify people who
-# did give blood.
-# We could, similarly to accuracy, manually compute these values,
-# however scikit-learn provides functions to compute these statistics.
+# The former metric, known as the precision, is defined as TP / (TP + FP) and
+# represents how likely the person actually gave blood when the classifier
+# predicted that they did. The latter, known as the recall, is defined as TP /
+# (TP + FN) and assesses how well the classifier is able to correctly identify
+# people who did give blood. We could, similarly to accuracy, manually compute
+# these values, however scikit-learn provides functions to compute these
+# statistics.

 # %%
 from sklearn.metrics import precision_score, recall_score

 # %% [markdown]
 # These results are in line with what was seen in the confusion matrix. Looking
 # at the left column, more than half of the "donated" predictions were correct,
 # leading to a precision above 0.5. However, our classifier mislabeled a lot of
-# people who gave blood as "not donated", leading to a very low recall of
-# around 0.1.
+# people who gave blood as "not donated", leading to a very low recall of around
+# 0.1.
 #
 # ## The issue of class imbalance
 # At this stage, we could ask ourselves a reasonable question. While the accuracy
 # did not look bad (i.e. 77%), the recall score is relatively low (i.e. 12%).
 #
 # As we mentioned, precision and recall only focus on samples predicted to be
 # positive, while accuracy takes both into account.
In addition, we did not look +# at the ratio of classes (labels). We could check this ratio in the training +# set. # %% target_train.value_counts(normalize=True).plot.barh() @@ -237,8 +238,10 @@ dummy_classifier = DummyClassifier(strategy="most_frequent") dummy_classifier.fit(data_train, target_train) -print(f"Accuracy of the dummy classifier: " - f"{dummy_classifier.score(data_test, target_test):.3f}") +print( + "Accuracy of the dummy classifier: " + f"{dummy_classifier.score(data_test, target_test):.3f}" +) # %% [markdown] # With the dummy classifier, which always predicts the negative class `'not @@ -264,15 +267,15 @@ # # All statistics that we presented up to now rely on `classifier.predict` which # outputs the most likely label. We haven't made use of the probability -# associated with this prediction, which gives the confidence of the -# classifier in this prediction. By default, the prediction of a classifier -# corresponds to a threshold of 0.5 probability in a binary classification -# problem. We can quickly check this relationship with the classifier that -# we trained. +# associated with this prediction, which gives the confidence of the classifier +# in this prediction. By default, the prediction of a classifier corresponds to +# a threshold of 0.5 probability in a binary classification problem. We can +# quickly check this relationship with the classifier that we trained. # %% -target_proba_predicted = pd.DataFrame(classifier.predict_proba(data_test), - columns=classifier.classes_) +target_proba_predicted = pd.DataFrame( + classifier.predict_proba(data_test), columns=classifier.classes_ +) target_proba_predicted[:5] # %% @@ -280,33 +283,39 @@ target_predicted[:5] # %% [markdown] -# Since probabilities sum to 1 we can get the class with the highest -# probability without using the threshold 0.5. +# Since probabilities sum to 1 we can get the class with the highest probability +# without using the threshold 0.5. # %% equivalence_pred_proba = ( - target_proba_predicted.idxmax(axis=1).to_numpy() == target_predicted) + target_proba_predicted.idxmax(axis=1).to_numpy() == target_predicted +) np.all(equivalence_pred_proba) # %% [markdown] # The default decision threshold (0.5) might not be the best threshold that -# leads to optimal generalization performance of our classifier. In this case, one -# can vary the decision threshold, and therefore the underlying prediction, and -# compute the same statistics presented earlier. Usually, the two metrics +# leads to optimal generalization performance of our classifier. In this case, +# one can vary the decision threshold, and therefore the underlying prediction, +# and compute the same statistics presented earlier. Usually, the two metrics # recall and precision are computed and plotted on a graph. Each metric plotted -# on a graph axis and each point on the graph corresponds to a specific -# decision threshold. Let's start by computing the precision-recall curve. +# on a graph axis and each point on the graph corresponds to a specific decision +# threshold. Let's start by computing the precision-recall curve. 
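# %% [markdown]
# ```{note}
# To make the effect of moving the threshold concrete before drawing the full
# curve, here is a minimal sketch with an arbitrary threshold of 0.3 (the
# columns of `predict_proba` follow `classifier.classes_`, which is sorted
# alphabetically, so column 0 corresponds to "donated"):
# ```

# %%
threshold = 0.3  # arbitrary value, for illustration only
proba_donated = classifier.predict_proba(data_test)[:, 0]
prediction_at_threshold = np.where(
    proba_donated >= threshold, "donated", "not donated"
)
print(precision_score(target_test, prediction_at_threshold, pos_label="donated"))
print(recall_score(target_test, prediction_at_threshold, pos_label="donated"))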
# %% from sklearn.metrics import PrecisionRecallDisplay disp = PrecisionRecallDisplay.from_estimator( - classifier, data_test, target_test, pos_label='donated', - marker="+" + classifier, data_test, target_test, pos_label="donated", marker="+" ) disp = PrecisionRecallDisplay.from_estimator( - dummy_classifier, data_test, target_test, pos_label='donated', - color="tab:orange", linestyle="--", ax=disp.ax_) + dummy_classifier, + data_test, + target_test, + pos_label="donated", + color="tab:orange", + linestyle="--", + ax=disp.ax_, +) plt.xlabel("Recall (also known as TPR or sensitivity)") plt.ylabel("Precision (also known as PPV)") plt.xlim(0, 1) @@ -338,9 +347,7 @@ # of the positive class). # %% -prevalence = ( - target_test.value_counts()[1] / target_test.value_counts().sum() -) +prevalence = target_test.value_counts()[1] / target_test.value_counts().sum() print(f"Prevalence of the class 'donated': {prevalence:.2f}") # %% [markdown] @@ -349,8 +356,8 @@ # positive class and accurately discriminating the negative classes. The # statistics used for this are sensitivity and specificity. Sensitivity is just # another name for recall. However, specificity measures the proportion of -# correctly classified samples in the negative class defined as: TN / (TN + -# FP). Similar to the precision-recall curve, sensitivity and specificity are +# correctly classified samples in the negative class defined as: TN / (TN + FP). +# Similar to the precision-recall curve, sensitivity and specificity are # generally plotted as a curve called the Receiver Operating Characteristic # (ROC) curve. Below is such a curve: @@ -358,11 +365,17 @@ from sklearn.metrics import RocCurveDisplay disp = RocCurveDisplay.from_estimator( - classifier, data_test, target_test, pos_label='donated', - marker="+") + classifier, data_test, target_test, pos_label="donated", marker="+" +) disp = RocCurveDisplay.from_estimator( - dummy_classifier, data_test, target_test, pos_label='donated', - color="tab:orange", linestyle="--", ax=disp.ax_) + dummy_classifier, + data_test, + target_test, + pos_label="donated", + color="tab:orange", + linestyle="--", + ax=disp.ax_, +) plt.xlabel("False positive rate") plt.ylabel("True positive rate\n(also known as sensitivity or recall)") plt.xlim(0, 1) diff --git a/python_scripts/metrics_regression.py b/python_scripts/metrics_regression.py index 292cd9627..fc1b148d2 100644 --- a/python_scripts/metrics_regression.py +++ b/python_scripts/metrics_regression.py @@ -16,9 +16,9 @@ # it is a continuous variable in regression, while a discrete variable in # classification. # -# We will use the Ames housing dataset. The goal is to predict the price -# of houses in the city of Ames, Iowa. As with classification, we will only use -# a single train-test split to focus solely on the regression metrics. +# We will use the Ames housing dataset. The goal is to predict the price of +# houses in the city of Ames, Iowa. As with classification, we will only use a +# single train-test split to focus solely on the regression metrics. # %% import pandas as pd @@ -49,10 +49,9 @@ # %% [markdown] # Some machine learning models are designed to be solved as an optimization # problem: minimizing an error (also known as the loss function) using a -# training set. -# A basic loss function used in regression is the mean squared error (MSE). -# Thus, this metric is sometimes used to evaluate the model since it is -# optimized by said model. +# training set. A basic loss function used in regression is the mean squared +# error (MSE). 
Thus, this metric is sometimes used to evaluate the model since +# it is optimized by said model. # # We will give an example using a linear regression model. @@ -64,27 +63,31 @@ regressor.fit(data_train, target_train) target_predicted = regressor.predict(data_train) -print(f"Mean squared error on the training set: " - f"{mean_squared_error(target_train, target_predicted):.3f}") +print( + "Mean squared error on the training set: " + f"{mean_squared_error(target_train, target_predicted):.3f}" +) # %% [markdown] # Our linear regression model is minimizing the mean squared error on the -# training set. It means that there is no other set of coefficients which -# will decrease the error. +# training set. It means that there is no other set of coefficients which will +# decrease the error. # # Then, we can compute the mean squared error on the test set. # %% target_predicted = regressor.predict(data_test) -print(f"Mean squared error on the testing set: " - f"{mean_squared_error(target_test, target_predicted):.3f}") +print( + "Mean squared error on the testing set: " + f"{mean_squared_error(target_test, target_predicted):.3f}" +) # %% [markdown] -# The raw MSE can be difficult to interpret. One way is to rescale the MSE -# by the variance of the target. This score is known as the $R^2$ also called -# the coefficient of determination. Indeed, this is the default score used -# in scikit-learn by calling the method `score`. +# The raw MSE can be difficult to interpret. One way is to rescale the MSE by +# the variance of the target. This score is known as the $R^2$ also called the +# coefficient of determination. Indeed, this is the default score used in +# scikit-learn by calling the method `score`. # %% regressor.score(data_test, target_test) @@ -100,8 +103,10 @@ dummy_regressor = DummyRegressor(strategy="mean") dummy_regressor.fit(data_train, target_train) -print(f"R2 score for a regressor predicting the mean:" - f"{dummy_regressor.score(data_test, target_test):.3f}") +print( + "R2 score for a regressor predicting the mean:" + f"{dummy_regressor.score(data_test, target_test):.3f}" +) # %% [markdown] # The $R^2$ score gives insight into the quality of the model's fit. However, @@ -114,8 +119,10 @@ from sklearn.metrics import mean_absolute_error target_predicted = regressor.predict(data_test) -print(f"Mean absolute error: " - f"{mean_absolute_error(target_test, target_predicted):.3f} k$") +print( + "Mean absolute error: " + f"{mean_absolute_error(target_test, target_predicted):.3f} k$" +) # %% [markdown] # By computing the mean absolute error, we can interpret that our model is @@ -127,8 +134,10 @@ # %% from sklearn.metrics import median_absolute_error -print(f"Median absolute error: " - f"{median_absolute_error(target_test, target_predicted):.3f} k$") +print( + "Median absolute error: " + f"{median_absolute_error(target_test, target_predicted):.3f} k$" +) # %% [markdown] # The mean absolute error (or median absolute error) still have a known @@ -141,33 +150,41 @@ # %% from sklearn.metrics import mean_absolute_percentage_error -print(f"Mean absolute percentage error: " - f"{mean_absolute_percentage_error(target_test, target_predicted) * 100:.3f} %") +print( + "Mean absolute percentage error: " + f"{mean_absolute_percentage_error(target_test, target_predicted) * 100:.3f} %" +) # %% [markdown] -# In addition of metrics, we can visually represent the results by plotting -# the predicted values versus the true values. 
+# In addition to metrics, we can visually represent the results by plotting the
+# predicted values versus the true values.

 # %%
 predicted_actual = {
-    "True values (k$)": target_test, "Predicted values (k$)": target_predicted}
+    "True values (k$)": target_test,
+    "Predicted values (k$)": target_predicted,
+}
 predicted_actual = pd.DataFrame(predicted_actual)

 # %%
 import matplotlib.pyplot as plt
 import seaborn as sns

-sns.scatterplot(data=predicted_actual,
-                x="True values (k$)", y="Predicted values (k$)",
-                color="black", alpha=0.5)
+sns.scatterplot(
+    data=predicted_actual,
+    x="True values (k$)",
+    y="Predicted values (k$)",
+    color="black",
+    alpha=0.5,
+)
 plt.axline((0, 0), slope=1, label="Perfect fit")
-plt.axis('square')
+plt.axis("square")
 _ = plt.title("Regression using a model without \ntarget transformation")

 # %% [markdown]
 # On this plot, correct predictions would lie on the diagonal line. This plot
-# allows us to detect if the model makes errors in a consistent way, i.e.
-# has some bias.
+# allows us to detect if the model makes errors in a consistent way, i.e. has
+# some bias.
 #
 # On this plot, we see that for the large True price values, our model tends to
 # under-estimate the price of the house. Typically, this issue arises when the

 # %%
 from sklearn.preprocessing import QuantileTransformer
 from sklearn.compose import TransformedTargetRegressor

-transformer = QuantileTransformer(
-    n_quantiles=900, output_distribution="normal")
+transformer = QuantileTransformer(
+    n_quantiles=900, output_distribution="normal"
+)
 model_transformed_target = TransformedTargetRegressor(
-    regressor=regressor, transformer=transformer)
+    regressor=regressor, transformer=transformer
+)
 model_transformed_target.fit(data_train, target_train)
 target_predicted = model_transformed_target.predict(data_test)

 # %%
 predicted_actual = {
-    "True values (k$)": target_test, "Predicted values (k$)": target_predicted}
+    "True values (k$)": target_test,
+    "Predicted values (k$)": target_predicted,
+}
 predicted_actual = pd.DataFrame(predicted_actual)

 # %%
-sns.scatterplot(data=predicted_actual,
-                x="True values (k$)", y="Predicted values (k$)",
-                color="black", alpha=0.5)
+sns.scatterplot(
+    data=predicted_actual,
+    x="True values (k$)",
+    y="Predicted values (k$)",
+    color="black",
+    alpha=0.5,
+)
 plt.axline((0, 0), slope=1, label="Perfect fit")
-plt.axis('square')
+plt.axis("square")
 plt.legend()
-_ = plt.title("Regression using a model that\n transform the target before "
-              "fitting")
+_ = plt.title(
+    "Regression using a model that\ntransforms the target before fitting"
+)

 # %% [markdown]
 # Thus, once we transformed the target, we see that we corrected some of the
diff --git a/python_scripts/metrics_sol_01.py b/python_scripts/metrics_sol_01.py
index 6beb959d9..b81238605 100644
--- a/python_scripts/metrics_sol_01.py
+++ b/python_scripts/metrics_sol_01.py
@@ -39,9 +39,9 @@
 # %% [markdown]
 # Create a `StratifiedKFold` cross-validation object. Then use it inside the
 # `cross_val_score` function to evaluate the decision tree. We will first use
-# the accuracy as a score function. Explicitly use the `scoring` parameter
-# of `cross_val_score` to compute the accuracy (even if this is the default
-# score). Check its documentation to learn how to do that.
+# the accuracy as a score function. Explicitly use the `scoring` parameter of
+# `cross_val_score` to compute the accuracy (even if this is the default score).
+# Check its documentation to learn how to do that.
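# %% [markdown]
# ```{note}
# Before writing the solution, a quick illustration of what `StratifiedKFold`
# guarantees: each training fold keeps roughly the same class proportions as the
# full dataset. This sketch reuses the `data` and `target` variables loaded
# earlier in this notebook; the number of splits is arbitrary.
# ```

# %%
from sklearn.model_selection import StratifiedKFold

cv_illustration = StratifiedKFold(n_splits=3)
for fold_idx, (train_idx, _) in enumerate(cv_illustration.split(data, target)):
    fractions = target.iloc[train_idx].value_counts(normalize=True)
    print(f"Fold #{fold_idx}:", fractions.round(3).to_dict())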
 # %%
 # solution
@@ -56,17 +56,18 @@
 
 # %%
 # solution
-scores = cross_val_score(tree, data, target, cv=cv,
-                         scoring="balanced_accuracy")
+scores = cross_val_score(
+    tree, data, target, cv=cv, scoring="balanced_accuracy"
+)
 print(f"Balanced accuracy score: {scores.mean():.3f} ยฑ {scores.std():.3f}")
 
 # %% [markdown]
-# We will now add a bit of complexity. We would like to compute the precision
-# of our model. However, during the course we saw that we need to mention the
+# We will now add a bit of complexity. We would like to compute the precision of
+# our model. However, during the course we saw that we need to specify the
 # positive label, which in our case we consider to be the class `donated`.
 #
-# We will show that computing the precision without providing the positive
-# label will not be supported by scikit-learn because it is indeed ambiguous.
+# We will show that computing the precision without providing the positive label
+# is not supported by scikit-learn because it is indeed ambiguous.
 
 # %%
 from sklearn.model_selection import cross_val_score
@@ -89,9 +90,8 @@
 #
 # So, import `sklearn.metrics.make_scorer` and
 # `sklearn.metrics.precision_score`. Check their documentation for more
-# information.
-# Finally, create a scorer by calling `make_scorer` using the score function
-# `precision_score` and pass the extra parameter `pos_label="donated"`.
+# information. Finally, create a scorer by calling `make_scorer` using the score
+# function `precision_score` and pass the extra parameter `pos_label="donated"`.
 
 # %%
 # solution
@@ -111,8 +111,8 @@
 # %% [markdown]
 # `cross_val_score` will only compute a single score provided to the `scoring`
 # parameter. The function `cross_validate` allows the computation of multiple
-# scores by passing a list of string or scorer to the parameter `scoring`,
-# which could be handy.
+# scores by passing a list of strings or scorers to the parameter `scoring`,
+# which could be handy.
 #
 # Import `sklearn.model_selection.cross_validate` and compute the accuracy and
 # balanced accuracy through cross-validation. Plot the cross-validation score
@@ -121,8 +121,8 @@
 # %%
 # solution
 from sklearn.model_selection import cross_validate
-scoring = ["accuracy", "balanced_accuracy"]
 
+scoring = ["accuracy", "balanced_accuracy"]
 scores = cross_validate(tree, data, target, cv=cv, scoring=scoring)
 scores
 
@@ -133,7 +133,7 @@
 
 metrics = pd.DataFrame(
     [scores["test_accuracy"], scores["test_balanced_accuracy"]],
-    index=["Accuracy", "Balanced accuracy"]
+    index=["Accuracy", "Balanced accuracy"],
 ).T
 
 # %% tags=["solution"]
diff --git a/python_scripts/metrics_sol_02.py b/python_scripts/metrics_sol_02.py
index 30d45a20d..6a4520811 100644
--- a/python_scripts/metrics_sol_02.py
+++ b/python_scripts/metrics_sol_02.py
@@ -58,11 +58,11 @@
 
 # %%
 # solution
-scores = cross_val_score(model, data, target, cv=10,
-                         scoring="neg_mean_absolute_error")
+scores = cross_val_score(
+    model, data, target, cv=10, scoring="neg_mean_absolute_error"
+)
 errors = -scores
-print(f"Mean absolute error: "
-      f"{errors.mean():.3f} k$ ยฑ {errors.std():.3f}")
+print(f"Mean absolute error: {errors.mean():.3f} k$ ยฑ {errors.std():.3f}")
 
 # %% [markdown] tags=["solution"]
 # The `scoring` parameter in scikit-learn expects a score. It means that the
@@ -70,7 +70,7 @@
 # Therefore, the error should be multiplied by -1. That's why the string given
 # to the `scoring` parameter starts with `neg_` when dealing with metrics
 # which are errors.
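+# %% [markdown]
+# To make the `neg_` convention concrete, here is a minimal sketch (on a
+# synthetic dataset, not part of the original solution) showing that
+# scikit-learn returns the error with a flipped sign so that greater is always
+# better:
+
+# %%
+from sklearn.datasets import make_regression
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import cross_val_score
+
+X_toy, y_toy = make_regression(n_samples=200, n_features=5, random_state=0)
+neg_scores = cross_val_score(
+    LinearRegression(), X_toy, y_toy, cv=5, scoring="neg_mean_absolute_error"
+)
+# the returned scores are the negated errors; flipping the sign back
+# recovers the mean absolute error on each fold
+print("Scores returned by scikit-learn:", neg_scores.round(3))
+print("Mean absolute errors:           ", (-neg_scores).round(3))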
-# %% [markdown] +# %% [markdown] # Finally, use the `cross_validate` function and compute multiple scores/errors # at once by passing a list of scorers to the `scoring` parameter. You can # compute the $R^2$ score and the mean absolute error for instance. @@ -85,7 +85,9 @@ # %% tags=["solution"] import pandas as pd -scores = {"R2": cv_results["test_r2"], - "MAE": -cv_results["test_neg_mean_absolute_error"]} +scores = { + "R2": cv_results["test_r2"], + "MAE": -cv_results["test_neg_mean_absolute_error"], +} scores = pd.DataFrame(scores) scores diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py index 0f92bc226..893b18414 100644 --- a/python_scripts/parameter_tuning_ex_02.py +++ b/python_scripts/parameter_tuning_ex_02.py @@ -35,27 +35,38 @@ data = adult_census.drop(columns=[target_name, "education-num"]) data_train, data_test, target_train, target_test = train_test_split( - data, target, train_size=0.2, random_state=42) + data, target, train_size=0.2, random_state=42 +) # %% from sklearn.compose import ColumnTransformer from sklearn.compose import make_column_selector as selector from sklearn.preprocessing import OrdinalEncoder -categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", - unknown_value=-1) +categorical_preprocessor = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 +) preprocessor = ColumnTransformer( - [('cat_preprocessor', categorical_preprocessor, - selector(dtype_include=object))], - remainder='passthrough', sparse_threshold=0) + [ + ( + "cat_preprocessor", + categorical_preprocessor, + selector(dtype_include=object), + ) + ], + remainder="passthrough", + sparse_threshold=0, +) from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.pipeline import Pipeline -model = Pipeline([ - ("preprocessor", preprocessor), - ("classifier", HistGradientBoostingClassifier(random_state=42)) -]) +model = Pipeline( + [ + ("preprocessor", preprocessor), + ("classifier", HistGradientBoostingClassifier(random_state=42)), + ] +) # %% [markdown] # diff --git a/python_scripts/parameter_tuning_ex_03.py b/python_scripts/parameter_tuning_ex_03.py index 514edb6c3..25bc5f73e 100644 --- a/python_scripts/parameter_tuning_ex_03.py +++ b/python_scripts/parameter_tuning_ex_03.py @@ -26,7 +26,8 @@ target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=42) + data, target, random_state=42 +) # %% [markdown] # In this exercise, we will progressively define the regression pipeline diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py index fc81136f5..1cf89cdd9 100644 --- a/python_scripts/parameter_tuning_grid_search.py +++ b/python_scripts/parameter_tuning_grid_search.py @@ -45,7 +45,8 @@ from sklearn.model_selection import train_test_split data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=42) + data, target, random_state=42 +) # %% [markdown] # We will define a pipeline as seen in the first module. It will handle both @@ -60,13 +61,13 @@ categorical_columns = categorical_columns_selector(data) # %% [markdown] -# Here we will use a tree-based model as a classifier -# (i.e. `HistGradientBoostingClassifier`). That means: +# Here we will use a tree-based model as a classifier (i.e. +# `HistGradientBoostingClassifier`). 
That means:
 #
 # * Numerical variables don't need scaling;
-# * Categorical variables can be dealt with an `OrdinalEncoder` even if the
+# * Categorical variables can be handled with an `OrdinalEncoder` even if the
 #   coding order is not meaningful;
-# * For tree-based models, the `OrdinalEncoder` avoids having high-dimensional
+# * For tree-based models, the `OrdinalEncoder` avoids having high-dimensional
 #   representations.
 #
 # We now build our `OrdinalEncoder` by passing it the known categories.
@@ -74,19 +75,22 @@
 # %%
 from sklearn.preprocessing import OrdinalEncoder
 
-categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
-                                          unknown_value=-1)
+categorical_preprocessor = OrdinalEncoder(
+    handle_unknown="use_encoded_value", unknown_value=-1
+)
 
 # %% [markdown]
-# We then use a `ColumnTransformer` to select the categorical columns and
-# apply the `OrdinalEncoder` to them.
+# We then use a `ColumnTransformer` to select the categorical columns and apply
+# the `OrdinalEncoder` to them.
 
 # %%
 from sklearn.compose import ColumnTransformer
 
-preprocessor = ColumnTransformer([
-    ('cat_preprocessor', categorical_preprocessor, categorical_columns)],
-    remainder='passthrough', sparse_threshold=0)
+preprocessor = ColumnTransformer(
+    [("cat_preprocessor", categorical_preprocessor, categorical_columns)],
+    remainder="passthrough",
+    sparse_threshold=0,
+)
 
 # %% [markdown]
 # Finally, we use a tree-based classifier (i.e. histogram gradient-boosting) to
@@ -96,21 +100,27 @@
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.pipeline import Pipeline
 
-model = Pipeline([
-    ("preprocessor", preprocessor),
-    ("classifier",
-     HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4))])
+model = Pipeline(
+    [
+        ("preprocessor", preprocessor),
+        (
+            "classifier",
+            HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4),
+        ),
+    ]
+)
 model
 
 # %% [markdown]
 # ## Tuning using a grid-search
 #
-# In the previous exercise we used one `for` loop for each hyperparameter to find the
-# best combination over a fixed grid of values. `GridSearchCV` is a scikit-learn class
-# that implements a very similar logic with less repetitive code.
+# In the previous exercise we used one `for` loop for each hyperparameter to
+# find the best combination over a fixed grid of values. `GridSearchCV` is a
+# scikit-learn class that implements a very similar logic with less repetitive
+# code.
 #
-# Let's see how to use the `GridSearchCV` estimator for doing such search.
-# Since the grid-search will be costly, we will only explore the combination
+# Let's see how to use the `GridSearchCV` estimator for doing such a search.
+# Since the grid-search will be costly, we will only explore the combination of
 # the learning rate and the maximum number of leaf nodes.
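+# %% [markdown]
+# As a quick aside (an illustrative sketch, not part of the original notebook),
+# the `<step_name>__<parameter_name>` convention used below to address the
+# hyperparameters of a pipeline step can be verified by listing the parameter
+# names exposed by the pipeline itself:
+
+# %%
+# list the tunable parameter names of the "classifier" step of `model`
+for name in model.get_params():
+    if name.startswith("classifier__"):
+        print(name)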
 # %%
-#
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 
 param_grid = {
-    'classifier__learning_rate': (0.01, 0.1, 1, 10),
-    'classifier__max_leaf_nodes': (3, 10, 30)}
-model_grid_search = GridSearchCV(model, param_grid=param_grid,
-                                 n_jobs=2, cv=2)
+    "classifier__learning_rate": (0.01, 0.1, 1, 10),
+    "classifier__max_leaf_nodes": (3, 10, 30),
+}
+model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=2)
 model_grid_search.fit(data_train, target_train)
 
 # %% [markdown]
@@ -130,8 +140,7 @@
 # %%
 accuracy = model_grid_search.score(data_test, target_test)
 print(
-    f"The test accuracy score of the grid-searched pipeline is: "
-    f"{accuracy:.2f}"
+    f"The test accuracy score of the grid-searched pipeline is: {accuracy:.2f}"
 )
 
 # %% [markdown]
@@ -140,21 +149,20 @@
 # cross-validation by providing `model_grid_search` as a model to the
 # `cross_validate` function.
 #
-# Here, we used a single train-test split to to evaluate `model_grid_search`.
-# In a future notebook will go into more detail about nested cross-validation,
-# when you use cross-validation both for hyperparameter tuning and model
-# evaluation.
+# Here, we used a single train-test split to evaluate `model_grid_search`. In a
+# future notebook we will go into more detail about nested cross-validation,
+# when you use cross-validation both for hyperparameter tuning and model
+# evaluation.
 # ```

 # %% [markdown]
-# The `GridSearchCV` estimator takes a `param_grid` parameter which defines
-# all hyperparameters and their associated values. The grid-search will be in
-# charge of creating all possible combinations and test them.
+# The `GridSearchCV` estimator takes a `param_grid` parameter which defines all
+# hyperparameters and their associated values. The grid-search will be in charge
+# of creating all possible combinations and testing them.
 #
-# The number of combinations will be equal to the product of the
-# number of values to explore for each parameter (e.g. in our example 4 x 3
-# combinations). Thus, adding new parameters with their associated values to be
-# explored become rapidly computationally expensive.
+# The number of combinations will be equal to the product of the number of
+# values to explore for each parameter (e.g. in our example 4 x 3 combinations).
+# Thus, adding new parameters with their associated values to be explored
+# rapidly becomes computationally expensive.
 #
 # Once the grid-search is fitted, it can be used as any other predictor by
 # calling `predict` and `predict_proba`. Internally, it will use the model with
@@ -171,32 +179,31 @@
 # attribute.
 
 # %%
-print(f"The best set of parameters is: "
-      f"{model_grid_search.best_params_}")
+print(f"The best set of parameters is: {model_grid_search.best_params_}")
 
 # %% [markdown]
-# The accuracy and the best parameters of the grid-searched pipeline are
-# similar to the ones we found in the previous exercise, where we searched the
-# best parameters "by hand" through a double for loop.
+# The accuracy and the best parameters of the grid-searched pipeline are similar
+# to the ones we found in the previous exercise, where we searched the best
+# parameters "by hand" through a double for loop.
 #
 # In addition, we can inspect all results which are stored in the attribute
-# `cv_results_` of the grid-search. We will filter some specific columns
-# from these results.
+# `cv_results_` of the grid-search. We will filter some specific columns from
+# these results.
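+# %% [markdown]
+# As a small sanity check on the cost of the search (an illustrative sketch,
+# not part of the original notebook), the number of candidates explored by the
+# grid-search can be computed directly from the `param_grid` defined above:
+
+# %%
+import math
+
+# product of the number of values per parameter:
+# 4 learning rates x 3 leaf node settings = 12 candidates
+n_candidates = math.prod(len(values) for values in param_grid.values())
+print(f"Number of parameter combinations: {n_candidates}")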
 # %%
 cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
-    "mean_test_score", ascending=False)
+    "mean_test_score", ascending=False
+)
 cv_results.head()
 
 # %% [markdown]
-# Let us focus on the most interesting columns and shorten the parameter
-# names to remove the `"param_classifier__"` prefix for readability:
+# Let us focus on the most interesting columns and shorten the parameter names
+# to remove the `"param_classifier__"` prefix for readability:
 
 # %%
 # get the parameter names
 column_results = [f"param_{name}" for name in param_grid.keys()]
-column_results += [
-    "mean_test_score", "std_test_score", "rank_test_score"]
+column_results += ["mean_test_score", "std_test_score", "rank_test_score"]
 
 cv_results = cv_results[column_results]
 
@@ -220,8 +227,10 @@ def shorten_param(param_name):
 
 # %%
 pivoted_cv_results = cv_results.pivot_table(
-    values="mean_test_score", index=["learning_rate"],
-    columns=["max_leaf_nodes"])
+    values="mean_test_score",
+    index=["learning_rate"],
+    columns=["max_leaf_nodes"],
+)
 
 pivoted_cv_results
 
@@ -231,22 +240,23 @@ def shorten_param(param_name):
 # %%
 import seaborn as sns
 
-ax = sns.heatmap(pivoted_cv_results, annot=True, cmap="YlGnBu", vmin=0.7,
-                 vmax=0.9)
+ax = sns.heatmap(
+    pivoted_cv_results, annot=True, cmap="YlGnBu", vmin=0.7, vmax=0.9
+)
 ax.invert_yaxis()
 
 # %% [markdown]
 # The above table highlights the following things:
 #
-# * for too high values of `learning_rate`, the generalization performance of the
-#   model is degraded and adjusting the value of `max_leaf_nodes` cannot fix
+# * for too high values of `learning_rate`, the generalization performance of
+#   the model is degraded and adjusting the value of `max_leaf_nodes` cannot fix
 #   that problem;
-# * outside of this pathological region, we observe that the optimal choice
-#   of `max_leaf_nodes` depends on the value of `learning_rate`;
-# * in particular, we observe a "diagonal" of good models with an accuracy
-#   close to the maximal of 0.87: when the value of `max_leaf_nodes` is
-#   increased, one should decrease the value of `learning_rate` accordingly
-#   to preserve a good accuracy.
+# * outside of this pathological region, we observe that the optimal choice of
+#   `max_leaf_nodes` depends on the value of `learning_rate`;
+# * in particular, we observe a "diagonal" of good models with an accuracy close
+#   to the maximum of 0.87: when the value of `max_leaf_nodes` is increased, one
+#   should decrease the value of `learning_rate` accordingly to preserve a good
+#   accuracy.
 #
 # The precise meaning of those two parameters will be explained later.
 #
@@ -258,7 +268,6 @@
 # %% [markdown]
 # In this notebook we have seen:
 #
-# * how to optimize the hyperparameters of a predictive model via a
-#   grid-search;
+# * how to optimize the hyperparameters of a predictive model via a grid-search;
 # * that searching for more than two hyperparameters is too costly;
 # * that a grid-search does not necessarily find an optimal solution.
diff --git a/python_scripts/parameter_tuning_manual.py b/python_scripts/parameter_tuning_manual.py
index d7c6371d7..59072bca8 100644
--- a/python_scripts/parameter_tuning_manual.py
+++ b/python_scripts/parameter_tuning_manual.py
@@ -19,9 +19,9 @@
 # interchangeably.
 # ```
 #
-# This notebook shows how one can get and set the value of a hyperparameter in
-# a scikit-learn estimator. We recall that hyperparameters refer to the
-# parameter that will control the learning process.
+# This notebook shows how one can get and set the value of a hyperparameter in a
+# scikit-learn estimator. We recall that hyperparameters refer to the parameters
+# that control the learning process.
 #
 # They should not be confused with the fitted parameters, resulting from the
 # training. These fitted parameters are recognizable in scikit-learn because
@@ -36,8 +36,7 @@
 adult_census = pd.read_csv("../datasets/adult-census.csv")
 
 target_name = "class"
-numerical_columns = [
-    "age", "capital-gain", "capital-loss", "hours-per-week"]
+numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"]
 
 target = adult_census[target_name]
 data = adult_census[numerical_columns]
@@ -49,22 +48,24 @@
 data.head()
 
 # %% [markdown]
-# Let's create a simple predictive model made of a scaler followed by a
-# logistic regression classifier.
+# Let's create a simple predictive model made of a scaler followed by a logistic
+# regression classifier.
 #
-# As mentioned in previous notebooks, many models, including linear ones,
-# work better if all features have a similar scaling. For this purpose,
-# we use a `StandardScaler`, which transforms the data by rescaling features.
+# As mentioned in previous notebooks, many models, including linear ones, work
+# better if all features have a similar scaling. For this purpose, we use a
+# `StandardScaler`, which transforms the data by rescaling features.
 
 # %%
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.linear_model import LogisticRegression
 
-model = Pipeline(steps=[
-    ("preprocessor", StandardScaler()),
-    ("classifier", LogisticRegression())
-])
+model = Pipeline(
+    steps=[
+        ("preprocessor", StandardScaler()),
+        ("classifier", LogisticRegression()),
+    ]
+)
 
 # %% [markdown]
 # We can evaluate the generalization performance of the model via
@@ -75,13 +76,15 @@
 
 cv_results = cross_validate(model, data, target)
 scores = cv_results["test_score"]
-print(f"Accuracy score via cross-validation:\n"
-      f"{scores.mean():.3f} ยฑ {scores.std():.3f}")
+print(
+    "Accuracy score via cross-validation:\n"
+    f"{scores.mean():.3f} ยฑ {scores.std():.3f}"
+)
 
 # %% [markdown]
-# We created a model with the default `C` value that is equal to 1. If we
-# wanted to use a different `C` parameter we could have done so when we created
-# the `LogisticRegression` object with something like `LogisticRegression(C=1e-3)`.
+# We created a model with the default `C` value that is equal to 1. If we wanted
+# to use a different `C` parameter we could have done so when we created the
+# `LogisticRegression` object with something like `LogisticRegression(C=1e-3)`.
 #
 # ```{note}
 # For more information on the model hyperparameter `C`, refer to the
@@ -89,16 +92,18 @@
 # Be aware that we will focus on linear models in an upcoming module.
 # ```
 #
-# We can also change the parameter of a model after it has been created with
-# the `set_params` method, which is available for all scikit-learn estimators.
-# For example, we can set `C=1e-3`, fit and evaluate the model:
+# We can also change the parameter of a model after it has been created with the
+# `set_params` method, which is available for all scikit-learn estimators.
For
+# example, we can set `C=1e-3`, fit and evaluate the model:
 
 # %%
 model.set_params(classifier__C=1e-3)
 cv_results = cross_validate(model, data, target)
 scores = cv_results["test_score"]
-print(f"Accuracy score via cross-validation:\n"
-      f"{scores.mean():.3f} ยฑ {scores.std():.3f}")
+print(
+    "Accuracy score via cross-validation:\n"
+    f"{scores.mean():.3f} ยฑ {scores.std():.3f}"
+)
 
 # %% [markdown]
 # When the model of interest is a `Pipeline`, the parameter names are of the
@@ -106,9 +111,9 @@
 # middle). In our case, `classifier` comes from the `Pipeline` definition and
 # `C` is the parameter name of `LogisticRegression`.
 #
-# In general, you can use the `get_params` method on scikit-learn models to
-# list all the parameters with their values. For example, if you want to
-# get all the parameter names, you can use:
+# In general, you can use the `get_params` method on scikit-learn models to list
+# all the parameters with their values. For example, if you want to get all the
+# parameter names, you can use:
 
 # %%
 for parameter in model.get_params():
@@ -120,7 +125,7 @@
 # parameter, for example `classifier__C`, you can use:
 
 # %%
-model.get_params()['classifier__C']
+model.get_params()["classifier__C"]
 
 # %% [markdown]
 # We can systematically vary the value of C to see if there is an optimal
@@ -131,23 +136,23 @@
     model.set_params(classifier__C=C)
     cv_results = cross_validate(model, data, target)
     scores = cv_results["test_score"]
-    print(f"Accuracy score via cross-validation with C={C}:\n"
-          f"{scores.mean():.3f} ยฑ {scores.std():.3f}")
+    print(
+        f"Accuracy score via cross-validation with C={C}:\n"
+        f"{scores.mean():.3f} ยฑ {scores.std():.3f}"
+    )
 
 # %% [markdown]
-# We can see that as long as C is high enough, the model seems to perform
-# well.
+# We can see that as long as C is high enough, the model seems to perform well.
 #
-# What we did here is very manual: it involves scanning the values for C
-# and picking the best one manually. In the next lesson, we will see how
-# to do this automatically.
+# What we did here is very manual: it involves scanning the values for C and
+# picking the best one manually. In the next lesson, we will see how to do this
+# automatically.
 #
 # ```{warning}
-# When we evaluate a family of models on test data and pick the best
-# performer, we can not trust the corresponding prediction accuracy, and
-# we need to apply the selected model to new data. Indeed, the test data
-# has been used to select the model, and it is thus no longer independent
-# from this model.
+# When we evaluate a family of models on test data and pick the best performer,
+# we cannot trust the corresponding prediction accuracy, and we need to apply
+# the selected model to new data. Indeed, the test data has been used to select
+# the model, and it is thus no longer independent of this model.
 # ```
 
 # %% [markdown]
diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py
index ca262a6ac..137825a01 100644
--- a/python_scripts/parameter_tuning_nested.py
+++ b/python_scripts/parameter_tuning_nested.py
@@ -16,8 +16,8 @@
 # "Selecting the best model" to show how to evaluate models where
 # hyperparameters need to be tuned.
 #
-# Thus, we will first load the dataset and create the predictive model that
-# we want to optimize and later on, evaluate.
+# Thus, we will first load the dataset and create the predictive model that we
+# want to optimize and, later on, evaluate.
#
 # ## Loading the dataset
 #
@@ -37,8 +37,8 @@
 # %% [markdown]
 # ## Our predictive model
 #
-# We now create the predictive model that we want to optimize. Note that
-# this pipeline is identical to the one we used in the previous notebook.
+# We now create the predictive model that we want to optimize. Note that this
+# pipeline is identical to the one we used in the previous notebook.
 
 # %%
 from sklearn.compose import ColumnTransformer
@@ -53,9 +53,9 @@
 )
 preprocessor = ColumnTransformer(
     [
-        ('cat_preprocessor', categorical_preprocessor, categorical_columns),
+        ("cat_preprocessor", categorical_preprocessor, categorical_columns),
     ],
-    remainder='passthrough',
+    remainder="passthrough",
     sparse_threshold=0,
 )
 
@@ -63,15 +63,15 @@
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.pipeline import Pipeline
 
-model = Pipeline([
-    ("preprocessor", preprocessor),
-    (
-        "classifier",
-        HistGradientBoostingClassifier(
-            random_state=42, max_leaf_nodes=4
-        )
-    ),
-])
+model = Pipeline(
+    [
+        ("preprocessor", preprocessor),
+        (
+            "classifier",
+            HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4),
+        ),
+    ]
+)
 model
 
 # %% [markdown]
@@ -95,18 +95,19 @@
 cv_results
 
 # %% [markdown]
-# The cross-validation scores are coming from a 5-fold cross-validation. So
-# we can compute the mean and standard deviation of the generalization score.
+# The cross-validation scores come from a 5-fold cross-validation. So we
+# can compute the mean and standard deviation of the generalization score.
 
 # %%
 print(
-    "Generalization score without hyperparameters tuning:\n"
-    f"{cv_results['test_score'].mean():.3f} ยฑ {cv_results['test_score'].std():.3f}"
+    "Generalization score without hyperparameter"
+    f" tuning:\n{cv_results['test_score'].mean():.3f} ยฑ"
+    f" {cv_results['test_score'].std():.3f}"
 )
 
 # %% [markdown]
-# We now present how to evaluate the model with hyperparameter tuning,
-# where an extra step is required to select the best set of parameters.
+# We now present how to evaluate the model with hyperparameter tuning, where an
+# extra step is required to select the best set of parameters.
 #
 # ### With hyperparameter tuning
 #
@@ -121,39 +122,39 @@
 from sklearn.model_selection import GridSearchCV
 
 param_grid = {
-    'classifier__learning_rate': (0.05, 0.5),
-    'classifier__max_leaf_nodes': (10, 30),
+    "classifier__learning_rate": (0.05, 0.5),
+    "classifier__max_leaf_nodes": (10, 30),
 }
-model_grid_search = GridSearchCV(
-    model, param_grid=param_grid, n_jobs=2, cv=2
-)
+model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=2)
 model_grid_search.fit(data, target)
 
 # %% [markdown]
 # As previously seen, when calling the `fit` method, the model embedded in the
-# grid-search is trained with every possible combination of parameters
-# resulting from the parameter grid. The best combination is selected by
-# keeping the combination leading to the best mean cross-validated score.
+# grid-search is trained with every possible combination of parameters resulting
+# from the parameter grid. The best combination is selected by keeping the
+# combination leading to the best mean cross-validated score.
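+# %% [markdown]
+# As an illustrative aside (a sketch, not part of the original notebook), we can
+# count the models fitted by this grid-search: one per candidate and fold during
+# the search, plus the final refit on the whole data that scikit-learn performs
+# by default (`refit=True`):
+
+# %%
+n_candidates = 1
+for values in param_grid.values():
+    n_candidates *= len(values)
+n_folds = 2  # the `cv` value passed to `GridSearchCV` above
+print(f"Fits during the search: {n_candidates * n_folds}")
+print(f"Total fits including the final refit: {n_candidates * n_folds + 1}")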
 # %%
 cv_results = pd.DataFrame(model_grid_search.cv_results_)
-cv_results[[
-    "param_classifier__learning_rate",
-    "param_classifier__max_leaf_nodes",
-    "mean_test_score",
-    "std_test_score",
-    "rank_test_score"
-]]
+cv_results[
+    [
+        "param_classifier__learning_rate",
+        "param_classifier__max_leaf_nodes",
+        "mean_test_score",
+        "std_test_score",
+        "rank_test_score",
+    ]
+]
 
 # %%
 model_grid_search.best_params_
 
 # %% [markdown]
 # One important caveat here concerns the evaluation of the generalization
-# performance. Indeed, the mean and standard deviation of the scores computed
-# by the cross-validation in the grid-search are potentially not good estimates
-# of the generalization performance we would obtain by refitting a model with
-# the best combination of hyper-parameter values on the full dataset. Note that
+# performance. Indeed, the mean and standard deviation of the scores computed by
+# the cross-validation in the grid-search are potentially not good estimates of
+# the generalization performance we would obtain by refitting a model with the
+# best combination of hyper-parameter values on the full dataset. Note that
 # scikit-learn automatically performs this refit by default when calling
 # `model_grid_search.fit`. This refitted model is trained with more data than
 # the different models trained internally during the cross-validation of the
@@ -180,22 +181,23 @@
 
 # %% [markdown]
 # The score measured on the final test set is almost within the range of the
 # internal CV score for the best hyper-parameter combination. This is reassuring
-# as it means that the tuning procedure did not cause significant overfitting
-# in itself (other-wise the final test score would have been lower than the
+# as it means that the tuning procedure did not cause significant overfitting in
+# itself (otherwise the final test score would have been lower than the
 # internal CV scores). That is expected because our grid search explored very
 # few hyper-parameter combinations for the sake of speed. The test score of the
-# final model is actually a bit higher than what we could have expected from
-# the internal cross-validation. This is also expected because the refitted
-# model is trained on a larger dataset than the models evaluated in the
-# internal CV loop of the grid-search procedure. This is often the case that
-# models trained on a larger number of samples tend to generalize better.
+# final model is actually a bit higher than what we could have expected from the
+# internal cross-validation. This is also expected because the refitted model is
+# trained on a larger dataset than the models evaluated in the internal CV loop
+# of the grid-search procedure. It is often the case that models trained on a
+# larger number of samples tend to generalize better.
 #
 # In the code above, the selection of the best hyperparameters was done only on
 # the train set from the initial train-test split. Then, we evaluated the
 # generalization performance of our tuned model on the left out test set. This
 # can be shown schematically as follows
 #
-# ![Cross-validation tuning diagram](../figures/cross_validation_train_test_diagram.png)
+# ![Cross-validation tuning
+# diagram](../figures/cross_validation_train_test_diagram.png)
 #
 # ```{note}
 # This figure shows the particular case of **K-fold** cross-validation
@@ -214,8 +216,8 @@
 # ```
 #
 # However, this evaluation only provides us with a single point estimate of the
-# generalization performance.
As recall at the beginning of this notebook, it
-# is beneficial to have a rough idea of the uncertainty of our estimated
+# generalization performance. As recalled at the beginning of this notebook, it
+# is beneficial to have a rough idea of the uncertainty of our estimated
 # generalization performance. Therefore, we should instead use an additional
 # cross-validation for this evaluation.
 #
@@ -234,7 +236,7 @@
 
 # %%
 cv_results = pd.DataFrame(cv_results)
-cv_test_scores = cv_results['test_score']
+cv_test_scores = cv_results["test_score"]
 print(
     "Generalization score with hyperparameter tuning:\n"
     f"{cv_test_scores.mean():.3f} ยฑ {cv_test_scores.std():.3f}"
 )
 
 # %% [markdown]
 # This result is compatible with the test score measured with the single outer
 # train-test split.
 #
-# However, in this case, we can apprehend the variability of our estimate of
-# the generalization performance thanks to the measure of the
-# standard-deviation of the scores measured in the outer cross-validation.
+# However, in this case, we can assess the variability of our estimate of the
+# generalization performance thanks to the measure of the standard-deviation of
+# the scores measured in the outer cross-validation.
 #
 # Here is a schematic representation of the complete nested cross-validation
 # procedure:
@@ -289,18 +291,17 @@
 # expect that it will have an actual predictive performance close to what we
 # measured in the outer cross-validation.
 #
-# But it is also possible that some hyperparameters do not matter at all, and
-# as a result in different tuning sessions give different results. In this
-# case, any value will do. This can typically be confirmed by doing a parallel
-# coordinate plot of the results of a large hyperparameter search as seen in
-# the exercises.
+# But it is also possible that some hyperparameters do not matter at all, and as
+# a result different tuning sessions give different results. In this case, any
+# value will do. This can typically be confirmed by doing a parallel
+# coordinate plot of the results of a large hyperparameter search as seen in the
+# exercises.
 #
 # From a deployment point of view, one could also choose to deploy all the
 # models found by the outer cross-validation loop and make them vote to get the
-# final predictions. However this can cause operational problems because it
-# uses more memory and makes computing prediction slower, resulting in a higher
+# final predictions. However this can cause operational problems because it uses
+# more memory and makes computing predictions slower, resulting in a higher
 # computational resource usage per prediction.
 #
-# In this notebook, we have seen how to evaluate the predictive performance of
-# a model with tuned hyper-parameters using the nested cross-validation
-# procedure.
+# In this notebook, we have seen how to evaluate the predictive performance of a
+# model with tuned hyper-parameters using the nested cross-validation procedure.
diff --git a/python_scripts/parameter_tuning_parallel_plot.py b/python_scripts/parameter_tuning_parallel_plot.py
index 2eaefd475..304585cb0 100644
--- a/python_scripts/parameter_tuning_parallel_plot.py
+++ b/python_scripts/parameter_tuning_parallel_plot.py
@@ -9,24 +9,27 @@
 # # Analysis of hyperparameter search results
 
 # %% [markdown]
-# In the previous notebook we showed how to implement a randomized
-# search for tuning the hyperparameters of a `HistGradientBoostingClassifier`
-# to fit the `adult_census` dataset.
In practice, a randomized hyperparameter -# search is usually run with a large number of iterations. +# In the previous notebook we showed how to implement a randomized search for +# tuning the hyperparameters of a `HistGradientBoostingClassifier` to fit the +# `adult_census` dataset. In practice, a randomized hyperparameter search is +# usually run with a large number of iterations. # %% [markdown] -# In order to avoid the computational cost and still make a decent analysis, -# we load the results obtained from a similar search with 500 iterations. +# In order to avoid the computational cost and still make a decent analysis, we +# load the results obtained from a similar search with 500 iterations. # %% import pandas as pd -cv_results = pd.read_csv("../figures/randomized_search_results.csv", index_col=0) +cv_results = pd.read_csv( + "../figures/randomized_search_results.csv", index_col=0 +) cv_results # %% [markdown] -# We define a function to remove the prefixes in the hyperparameters -# column names. +# We define a function to remove the prefixes in the hyperparameters column +# names. + # %% def shorten_param(param_name): @@ -34,15 +37,15 @@ def shorten_param(param_name): return param_name.rsplit("__", 1)[1] return param_name + cv_results = cv_results.rename(shorten_param, axis=1) cv_results # %% [markdown] -# As we have more than 2 parameters in our randomized-search, we -# cannot visualize the results using a heatmap. We could still do -# it pair-wise, but having a two-dimensional projection of a -# multi-dimensional problem can lead to a wrong interpretation of -# the scores. +# As we have more than 2 parameters in our randomized-search, we cannot +# visualize the results using a heatmap. We could still do it pair-wise, but +# having a two-dimensional projection of a multi-dimensional problem can lead to +# a wrong interpretation of the scores. # %% import seaborn as sns @@ -70,7 +73,9 @@ def shorten_param(param_name): ax.set_xscale("log") ax.set_yscale("log") -_ = ax.legend(title="mean_test_score", loc="center left", bbox_to_anchor=(1, 0.5)) +_ = ax.legend( + title="mean_test_score", loc="center left", bbox_to_anchor=(1, 0.5) +) # %% [markdown] # In the previous plot we see that the top performing values are located in a @@ -131,9 +136,8 @@ def shorten_param(param_name): # models, whatever the values of the other hyperparameters. # %% [markdown] -# -# In this notebook, we saw how to interactively explore the results of a -# large randomized search with multiple interacting hyperparameters. -# In particular we observed that some hyperparameters have very little -# impact on the cross-validation score, while others have to be adjusted -# within a specific range to get models with good predictive accuracy. +# In this notebook, we saw how to interactively explore the results of a large +# randomized search with multiple interacting hyperparameters. In particular we +# observed that some hyperparameters have very little impact on the +# cross-validation score, while others have to be adjusted within a specific +# range to get models with good predictive accuracy. diff --git a/python_scripts/parameter_tuning_randomized_search.py b/python_scripts/parameter_tuning_randomized_search.py index 7fbf1c8cc..81d45786d 100644 --- a/python_scripts/parameter_tuning_randomized_search.py +++ b/python_scripts/parameter_tuning_randomized_search.py @@ -12,9 +12,9 @@ # search for the best hyperparameters maximizing the generalization performance # of a predictive model. 
#
-# However, a grid-search approach has limitations. It does not scale when
-# the number of parameters to tune is increasing. Also, the grid will impose
-# a regularity during the search which might be problematic.
+# However, a grid-search approach has limitations. It does not scale when the
+# number of parameters to tune increases. Also, the grid will impose a
+# regularity during the search which might be problematic.
 #
 # In this notebook, we will present another method to tune hyperparameters
 # called randomized search.
@@ -52,7 +52,8 @@
 from sklearn.model_selection import train_test_split
 
 data_train, data_test, target_train, target_test = train_test_split(
-    data, target, random_state=42)
+    data, target, random_state=42
+)
 
 # %% [markdown]
 # We will create the same predictive pipeline as seen in the grid-search
@@ -66,20 +67,28 @@
 categorical_columns_selector = selector(dtype_include=object)
 categorical_columns = categorical_columns_selector(data)
 
-categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
-                                          unknown_value=-1)
-preprocessor = ColumnTransformer([
-    ('cat_preprocessor', categorical_preprocessor, categorical_columns)],
-    remainder='passthrough', sparse_threshold=0)
+categorical_preprocessor = OrdinalEncoder(
+    handle_unknown="use_encoded_value", unknown_value=-1
+)
+preprocessor = ColumnTransformer(
+    [("cat_preprocessor", categorical_preprocessor, categorical_columns)],
+    remainder="passthrough",
+    sparse_threshold=0,
+)
 
 # %%
 from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.pipeline import Pipeline
 
-model = Pipeline([
-    ("preprocessor", preprocessor),
-    ("classifier", HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4)),
-])
+model = Pipeline(
+    [
+        ("preprocessor", preprocessor),
+        (
+            "classifier",
+            HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4),
+        ),
+    ]
+)
 model
 
@@ -90,40 +99,39 @@
 # explicitly. We already mentioned that exploring a large number of values for
 # different parameters will be quickly intractable.
 #
-# Instead, we can randomly generate the parameter candidates. Indeed,
-# such approach avoids the regularity of the grid. Hence, adding more
-# evaluations can increase the resolution in each direction. This is the
-# case in the frequent situation where the choice of some hyperparameters
-# is not very important, as for hyperparameter 2 in the figure below.
+# Instead, we can randomly generate the parameter candidates. Indeed, such an
+# approach avoids the regularity of the grid. Hence, adding more evaluations can
+# increase the resolution in each direction. This is the case in the frequent
+# situation where the choice of some hyperparameters is not very important, as
+# for hyperparameter 2 in the figure below.
 #
 # ![Randomized vs grid search](../figures/grid_vs_random_search.svg)
 #
-# Indeed, the number of evaluation points needs to be divided across the
-# two different hyperparameters. With a grid, the danger is that the
-# region of good hyperparameters fall between the line of the grid: this
-# region is aligned with the grid given that hyperparameter 2 has a weak
-# influence. Rather, stochastic search will sample hyperparameter 1
-# independently from hyperparameter 2 and find the optimal region.
+# Indeed, the number of evaluation points needs to be divided across the two
+# different hyperparameters.
With a grid, the danger is that the region of good
+# hyperparameters falls between the lines of the grid: this region is aligned
+# with the grid given that hyperparameter 2 has a weak influence. Rather,
+# stochastic search will sample hyperparameter 1 independently from
+# hyperparameter 2 and find the optimal region.
 #
-# The `RandomizedSearchCV` class allows for such stochastic search. It is
-# used similarly to the `GridSearchCV` but the sampling distributions
-# need to be specified instead of the parameter values. For instance, we
-# will draw candidates using a log-uniform distribution because the parameters
-# we are interested in take positive values with a natural log scaling (.1 is
-# as close to 1 as 10 is).
+# The `RandomizedSearchCV` class allows for such a stochastic search. It is used
+# similarly to the `GridSearchCV` but the sampling distributions need to be
+# specified instead of the parameter values. For instance, we will draw
+# candidates using a log-uniform distribution because the parameters we are
+# interested in take positive values with a natural log scaling (.1 is as close
+# to 1 as 10 is).
 #
 # ```{note}
-# Random search (with `RandomizedSearchCV`) is typically beneficial compared
-# to grid search (with `GridSearchCV`) to optimize 3 or more
-# hyperparameters.
+# Random search (with `RandomizedSearchCV`) is typically beneficial compared to
+# grid search (with `GridSearchCV`) to optimize 3 or more hyperparameters.
 # ```
 #
-# We will optimize 3 other parameters in addition to the ones we
-# optimized in the notebook presenting the `GridSearchCV`:
+# We will optimize 3 other parameters in addition to the ones we optimized in
+# the notebook presenting the `GridSearchCV`:
 #
 # * `l2_regularization`: it corresponds to the strength of the regularization;
-# * `min_samples_leaf`: it corresponds to the minimum number of samples
-#   required in a leaf;
+# * `min_samples_leaf`: it corresponds to the minimum number of samples required
+#   in a leaf;
 # * `max_bins`: it corresponds to the maximum number of bins to construct the
 #   histograms.
 #
@@ -135,9 +143,9 @@
 # tree in the ensemble.
 #
 # ```{note}
-# `scipy.stats.loguniform` can be used to generate floating numbers. To
-# generate random values for integer-valued parameters (e.g.
-# `min_samples_leaf`) we can adapt is as follows:
+# `scipy.stats.loguniform` can be used to generate floating-point numbers. To
+# generate random values for integer-valued parameters (e.g. `min_samples_leaf`)
+# we can adapt it as follows:
 # ```
 
 # %%
@@ -146,6 +154,7 @@
 
 class loguniform_int:
     """Integer valued version of the log-uniform distribution"""
+
     def __init__(self, a, b):
         self._distribution = loguniform(a, b)
 
@@ -155,28 +164,30 @@ def rvs(self, *args, **kwargs):
 
 
 # %% [markdown]
-#
 # Now, we can define the randomized search using the different distributions.
-# Executing 10 iterations of 5-fold cross-validation for random
-# parametrizations of this model on this dataset can take from 10 seconds to
-# several minutes, depending on the speed of the host computer and the number
-# of available processors.
+# Executing 10 iterations of 5-fold cross-validation for random parametrizations
+# of this model on this dataset can take from 10 seconds to several minutes,
+# depending on the speed of the host computer and the number of available
+# processors.
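+# %% [markdown]
+# Before launching the search, as a quick check (an illustrative sketch, not
+# part of the original notebook), we can draw a few samples to see that
+# `loguniform_int` indeed produces integers spread on a log scale:
+
+# %%
+# floating-point samples from the log-uniform distribution
+print(loguniform(0.001, 10).rvs(size=5, random_state=0))
+# integer-valued samples from our adapted distribution
+print(loguniform_int(2, 256).rvs(size=10, random_state=0))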
 # %%
-%%time
+# %%time
 from sklearn.model_selection import RandomizedSearchCV
 
 param_distributions = {
-    'classifier__l2_regularization': loguniform(1e-6, 1e3),
-    'classifier__learning_rate': loguniform(0.001, 10),
-    'classifier__max_leaf_nodes': loguniform_int(2, 256),
-    'classifier__min_samples_leaf': loguniform_int(1, 100),
-    'classifier__max_bins': loguniform_int(2, 255),
+    "classifier__l2_regularization": loguniform(1e-6, 1e3),
+    "classifier__learning_rate": loguniform(0.001, 10),
+    "classifier__max_leaf_nodes": loguniform_int(2, 256),
+    "classifier__min_samples_leaf": loguniform_int(1, 100),
+    "classifier__max_bins": loguniform_int(2, 255),
 }
 
 model_random_search = RandomizedSearchCV(
-    model, param_distributions=param_distributions, n_iter=10,
-    cv=5, verbose=1,
+    model,
+    param_distributions=param_distributions,
+    n_iter=10,
+    cv=5,
+    verbose=1,
 )
 model_random_search.fit(data_train, target_train)
 
@@ -186,8 +197,7 @@
 
 # %%
 accuracy = model_random_search.score(data_test, target_test)
-print(f"The test accuracy score of the best model is "
-      f"{accuracy:.2f}")
+print(f"The test accuracy score of the best model is {accuracy:.2f}")
 
 # %%
 from pprint import pprint
@@ -196,39 +206,38 @@
 pprint(model_random_search.best_params_)
 
 # %% [markdown]
-#
 # We can inspect the results using the attribute `cv_results_` as we did
 # previously.
 
 # %%
 # get the parameter names
-column_results = [
-    f"param_{name}" for name in param_distributions.keys()]
-column_results += [
-    "mean_test_score", "std_test_score", "rank_test_score"]
+column_results = [f"param_{name}" for name in param_distributions.keys()]
+column_results += ["mean_test_score", "std_test_score", "rank_test_score"]
 
 cv_results = pd.DataFrame(model_random_search.cv_results_)
 cv_results = cv_results[column_results].sort_values(
-    "mean_test_score", ascending=False)
+    "mean_test_score", ascending=False
+)
+
 
 def shorten_param(param_name):
     if "__" in param_name:
         return param_name.rsplit("__", 1)[1]
     return param_name
 
+
 cv_results = cv_results.rename(shorten_param, axis=1)
 cv_results
 
 # %% [markdown]
-# Keep in mind that tuning is limited by the number of different combinations
-# of parameters that are scored by the randomized search. In fact, there might
-# be other sets of parameters leading to similar or better generalization
-# performances but that were not tested in the search.
-# In practice, a randomized hyperparameter search is usually run with a large
-# number of iterations. In order to avoid the computation cost and still make a
-# decent analysis, we load the results obtained from a similar search with 500
-# iterations.
+# Keep in mind that tuning is limited by the number of different combinations of
+# parameters that are scored by the randomized search. In fact, there might be
+# other sets of parameters leading to similar or better generalization
+# performances but that were not tested in the search. In practice, a randomized
+# hyperparameter search is usually run with a large number of iterations. In
+# order to avoid the computational cost and still make a decent analysis, we
+# load the results obtained from a similar search with 500 iterations.
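+# %% [markdown]
+# As a side note before loading those results (an illustrative sketch, not part
+# of the original notebook): since `refit=True` by default, the search object
+# exposes the pipeline refitted on the full training set with the best
+# parameters found, ready for predictions:
+
+# %%
+best_model = model_random_search.best_estimator_
+# predictions from the refitted best pipeline on a few test rows
+print(best_model.predict(data_test[:5]))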
 # %%
 # model_random_search = RandomizedSearchCV(
@@ -239,25 +248,29 @@
 # cv_results.to_csv("../figures/randomized_search_results.csv")
 
 # %%
-cv_results = pd.read_csv("../figures/randomized_search_results.csv",
-                         index_col=0)
+cv_results = pd.read_csv(
+    "../figures/randomized_search_results.csv", index_col=0
+)
 
-(cv_results[column_results].rename(
-    shorten_param, axis=1).sort_values("mean_test_score", ascending=False))
+(
+    cv_results[column_results]
+    .rename(shorten_param, axis=1)
+    .sort_values("mean_test_score", ascending=False)
+)
 
 # %% [markdown]
-# In this case the top performing models have test scores with a high
-# overlap between each other, meaning that indeed, the set of parameters
-# leading to the best generalization performance is not unique.
+# In this case the top performing models have test scores with a high overlap
+# between each other, meaning that indeed, the set of parameters leading to the
+# best generalization performance is not unique.
 
 # %% [markdown]
 #
-# In this notebook, we saw how a randomized search offers a valuable
-# alternative to grid-search when the number of hyperparameters to tune is more
-# than two. It also alleviates the regularity imposed by the grid that might be
-# problematic sometimes.
+# In this notebook, we saw how a randomized search offers a valuable alternative
+# to grid-search when the number of hyperparameters to tune is more than two. It
+# also alleviates the regularity imposed by the grid that might be problematic
+# sometimes.
 #
 # In the following, we will see how to use interactive plotting tools to explore
-# the results of large hyperparameter search sessions and gain some
-# insights on range of parameter values that lead to the highest performing
-# models and how different hyperparameter are coupled or not.
+# the results of large hyperparameter search sessions and gain some insights on
+# the range of parameter values that lead to the highest performing models and
+# how different hyperparameters are coupled or not.
diff --git a/python_scripts/parameter_tuning_sol_02.py b/python_scripts/parameter_tuning_sol_02.py
index d82243f97..ed9aeaf3f 100644
--- a/python_scripts/parameter_tuning_sol_02.py
+++ b/python_scripts/parameter_tuning_sol_02.py
@@ -11,10 +11,9 @@
 # The goal is to write an exhaustive search to find the best parameter
 # combination maximizing the model generalization performance.
 #
-# Here we use a small subset of the Adult Census dataset to make the code
-# faster to execute. Once your code works on the small subset, try to
-# change `train_size` to a larger value (e.g. 0.8 for 80% instead of
-# 20%).
+# Here we use a small subset of the Adult Census dataset to make the code faster
+# to execute. Once your code works on the small subset, try to change
+# `train_size` to a larger value (e.g. 0.8 for 80% instead of 20%).
# %% import pandas as pd @@ -28,30 +27,40 @@ data = adult_census.drop(columns=[target_name, "education-num"]) data_train, data_test, target_train, target_test = train_test_split( - data, target, train_size=0.2, random_state=42) + data, target, train_size=0.2, random_state=42 +) # %% from sklearn.compose import ColumnTransformer from sklearn.compose import make_column_selector as selector from sklearn.preprocessing import OrdinalEncoder -categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", - unknown_value=-1) +categorical_preprocessor = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 +) preprocessor = ColumnTransformer( - [('cat_preprocessor', categorical_preprocessor, - selector(dtype_include=object))], - remainder='passthrough', sparse_threshold=0) + [ + ( + "cat_preprocessor", + categorical_preprocessor, + selector(dtype_include=object), + ) + ], + remainder="passthrough", + sparse_threshold=0, +) from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.pipeline import Pipeline -model = Pipeline([ - ("preprocessor", preprocessor), - ("classifier", HistGradientBoostingClassifier(random_state=42)) -]) +model = Pipeline( + [ + ("preprocessor", preprocessor), + ("classifier", HistGradientBoostingClassifier(random_state=42)), + ] +) # %% [markdown] -# # Use the previously defined model (called `model`) and using two nested `for` # loops, make a search of the best combinations of the `learning_rate` and # `max_leaf_nodes` parameters. In this regard, you will need to train and test @@ -61,8 +70,8 @@ # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls # the ability of a new tree to correct the error of the previous sequence of # trees -# - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the -# depth of each tree. +# - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth +# of each tree. # %% # solution @@ -75,18 +84,22 @@ best_params = {} for lr in learning_rate: for mln in max_leaf_nodes: - print(f"Evaluating model with learning rate {lr:.3f}" - f" and max leaf nodes {mln}... ", end="") + print( + ( + f"Evaluating model with learning rate {lr:.3f}" + f" and max leaf nodes {mln}... 
" + ), + end="", + ) model.set_params( - classifier__learning_rate=lr, - classifier__max_leaf_nodes=mln + classifier__learning_rate=lr, classifier__max_leaf_nodes=mln ) scores = cross_val_score(model, data_train, target_train, cv=2) mean_score = scores.mean() print(f"score: {mean_score:.3f}") if mean_score > best_score: best_score = mean_score - best_params = {'learning_rate': lr, 'max_leaf_nodes': mln} + best_params = {"learning_rate": lr, "max_leaf_nodes": mln} print(f"Found new best model with score {best_score:.3f}!") print(f"The best accuracy obtained is {best_score:.3f}") @@ -99,11 +112,12 @@ # %% # solution -best_lr = best_params['learning_rate'] -best_mln = best_params['max_leaf_nodes'] +best_lr = best_params["learning_rate"] +best_mln = best_params["max_leaf_nodes"] -model.set_params(classifier__learning_rate=best_lr, - classifier__max_leaf_nodes=best_mln) +model.set_params( + classifier__learning_rate=best_lr, classifier__max_leaf_nodes=best_mln +) model.fit(data_train, target_train) test_score = model.score(data_test, target_test) diff --git a/python_scripts/parameter_tuning_sol_03.py b/python_scripts/parameter_tuning_sol_03.py index a73925467..149cc0de1 100644 --- a/python_scripts/parameter_tuning_sol_03.py +++ b/python_scripts/parameter_tuning_sol_03.py @@ -19,11 +19,12 @@ target *= 100 # rescale the target in k$ data_train, data_test, target_train, target_test = train_test_split( - data, target, random_state=42) + data, target, random_state=42 +) # %% [markdown] -# In this exercise, we will progressively define the regression pipeline -# and later tune its hyperparameters. +# In this exercise, we will progressively define the regression pipeline and +# later tune its hyperparameters. # # Start by defining a pipeline that: # * uses a `StandardScaler` to normalize the numerical data; @@ -46,8 +47,8 @@ # `np.logspace(0, 3, num=10).astype(np.int32)`; # - the parameter `with_mean` of the `StandardScaler` with possible values # `True` or `False`; -# - the parameter `with_std` of the `StandardScaler` with possible values -# `True` or `False`. +# - the parameter `with_std` of the `StandardScaler` with possible values `True` +# or `False`. # # Notice that in the notebook "Hyperparameter tuning by randomized-search" we # pass distributions to be sampled by the `RandomizedSearchCV`. In this case we @@ -68,14 +69,21 @@ from sklearn.model_selection import RandomizedSearchCV param_distributions = { - "kneighborsregressor__n_neighbors": np.logspace(0, 3, num=10).astype(np.int32), + "kneighborsregressor__n_neighbors": np.logspace(0, 3, num=10).astype( + np.int32 + ), "standardscaler__with_mean": [True, False], "standardscaler__with_std": [True, False], } model_random_search = RandomizedSearchCV( - model, param_distributions=param_distributions, - n_iter=20, n_jobs=2, verbose=1, random_state=1) + model, + param_distributions=param_distributions, + n_iter=20, + n_jobs=2, + verbose=1, + random_state=1, +) model_random_search.fit(data_train, target_train) model_random_search.best_params_ @@ -90,9 +98,9 @@ # conduct such an interactive analysis for this this pipeline using a parallel # coordinate plot using the `plotly` library. # -# We could use `cv_results = model_random_search.cv_results_` to make a -# parallel coordinate plot as we did in the previous notebook (you are more -# than welcome to try!). +# We could use `cv_results = model_random_search.cv_results_` to make a parallel +# coordinate plot as we did in the previous notebook (you are more than welcome +# to try!). 
 # %% tags=["solution"]
 import pandas as pd
 
@@ -113,7 +121,8 @@
 
 cv_results = cv_results.rename(columns=column_name_mapping)
 cv_results = cv_results[column_name_mapping.values()].sort_values(
-    "mean test score", ascending=False)
+    "mean test score", ascending=False
+)
 
 # %% [markdown] tags=["solution"]
 # In addition, the parallel coordinate plot from `plotly` expects all data to be
@@ -141,8 +150,8 @@
 
 # %% [markdown] tags=["solution"]
 # We recall that it is possible to select a range of results by clicking and
-# holding on any axis of the parallel coordinate plot. You can then slide
-# (move) the range selection and cross two selections to see the intersections.
+# holding on any axis of the parallel coordinate plot. You can then slide (move)
+# the range selection and cross two selections to see the intersections.
 #
 # Selecting the best performing models (i.e. above an accuracy of ~0.68), we
 # observe that **in this case**:
@@ -166,10 +175,9 @@
 # the values of A and B will be approximately between -3 and 3 and the neighbor
 # structure will be impacted more or less equivalently by both variables.
 #
-# Note that **in this case** the models with scaled features perform better
-# than the models with non-scaled features because all the variables are
-# expected to be predictive and we rather avoid some of them being comparatively
-# ignored.
+# Note that **in this case** the models with scaled features perform better than
+# the models with non-scaled features because all the variables are expected to
+# be predictive and we would rather avoid some of them being comparatively
+# ignored.
 #
 # If the variables with smaller scales were not predictive, one may experience a
 # decrease in performance after scaling the features: noisy features would
diff --git a/python_scripts/trees_classification.py b/python_scripts/trees_classification.py
index 9d0d473ea..21b772666 100644
--- a/python_scripts/trees_classification.py
+++ b/python_scripts/trees_classification.py
@@ -33,16 +33,16 @@
 data, target = penguins[culmen_columns], penguins[target_column]
 
 data_train, data_test, target_train, target_test = train_test_split(
-    data, target, random_state=0)
+    data, target, random_state=0
+)
 
 # %% [markdown]
-#
 # In a previous notebook, we learnt that a linear classifier will define a
 # linear separation to split classes using a linear combination of the input
 # features. In our 2-dimensional space, it means that a linear classifier will
-# define some oblique lines that best separate our classes. We define a
-# function below that, given a set of data points and a classifier, will plot
-# the decision boundaries learnt by the classifier.
+# define some oblique lines that best separate our classes. We define a function
+# below that, given a set of data points and a classifier, will plot the
+# decision boundaries learnt by the classifier.
 #
 # Thus, for a linear classifier, we will obtain the following decision
 # boundaries.
These boundaries lines indicate where the model changes its @@ -66,10 +66,15 @@ DecisionBoundaryDisplay.from_estimator( linear_model, data_train, response_method="predict", cmap="RdBu", alpha=0.5 ) -sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, palette=palette) +sns.scatterplot( + data=penguins, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=palette, +) # put the legend outside the plot -plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') +plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") _ = plt.title("Decision boundary using a logistic regression") # %% [markdown] @@ -78,8 +83,8 @@ # parametrization that we saw in the previous notebook, controlled by the # model's weights and intercept. # -# Besides, it seems that the linear model would be a good candidate for -# such problem as it gives good accuracy. +# Besides, it seems that the linear model would be a good candidate for such +# problem as it gives good accuracy. # %% linear_model.fit(data_train, target_train) @@ -92,8 +97,8 @@ # intercept to be optimized. # # Indeed, decision trees will partition the space by considering a single -# feature at a time. Let's illustrate this behaviour by having a decision -# tree make a single split to partition the feature space. +# feature at a time. Let's illustrate this behaviour by having a decision tree +# make a single split to partition the feature space. # %% from sklearn.tree import DecisionTreeClassifier @@ -105,9 +110,14 @@ DecisionBoundaryDisplay.from_estimator( tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5 ) -sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, palette=palette) -plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') +sns.scatterplot( + data=penguins, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=palette, +) +plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") _ = plt.title("Decision boundary using a decision tree") # %% [markdown] @@ -120,8 +130,13 @@ from sklearn.tree import plot_tree _, ax = plt.subplots(figsize=(8, 6)) -_ = plot_tree(tree, feature_names=culmen_columns, - class_names=tree.classes_, impurity=False, ax=ax) +_ = plot_tree( + tree, + feature_names=culmen_columns, + class_names=tree.classes_, + impurity=False, + ax=ax, +) # %% [markdown] # ```{tip} @@ -132,12 +147,12 @@ # %% [markdown] # We see that the split was done on the culmen depth feature. The original -# dataset was subdivided into 2 sets based on the culmen depth -# (inferior or superior to 16.45 mm). +# dataset was subdivided into 2 sets based on the culmen depth (inferior or +# superior to 16.45 mm). # # This partition of the dataset minimizes the class diversities in each -# sub-partitions. This measure is also known as a **criterion**, -# and is a settable parameter. +# sub-partitions. This measure is also known as a **criterion**, and is a +# settable parameter. # # If we look more closely at the partition, we see that the sample superior to # 16.45 belongs mainly to the Adelie class. Looking at the values, we indeed @@ -150,19 +165,15 @@ # where the culmen depth is inferior to the threshold. # %% -sample_1 = pd.DataFrame( - {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]} -) +sample_1 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]}) tree.predict(sample_1) # %% [markdown] -# The class predicted is the Gentoo. 
We can now check what happens if we pass -# a culmen depth superior to the threshold. +# The class predicted is the Gentoo. We can now check what happens if we pass a +# culmen depth superior to the threshold. # %% -sample_2 = pd.DataFrame( - {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]} -) +sample_2 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]}) tree.predict(sample_2) # %% [markdown] @@ -171,8 +182,8 @@ # Thus, we can conclude that a decision tree classifier will predict the most # represented class within a partition. # -# During the training, we have a count of samples in each partition, we can -# also compute the probability of belonging to a specific class within this +# During the training, we have a count of samples in each partition, we can also +# compute the probability of belonging to a specific class within this # partition. # %% @@ -185,15 +196,15 @@ _ = plt.title("Probability to belong to a penguin class") # %% [markdown] -# We can also compute the different probabilities manually directly from the tree -# structure. +# We can also compute the different probabilities manually directly from the +# tree structure. # %% adelie_proba = 103 / 161 chinstrap_proba = 52 / 161 gentoo_proba = 6 / 161 print( - f"Probabilities for the different classes:\n" + "Probabilities for the different classes:\n" f"Adelie: {adelie_proba:.3f}\n" f"Chinstrap: {chinstrap_proba:.3f}\n" f"Gentoo: {gentoo_proba:.3f}\n" @@ -201,8 +212,8 @@ # %% [markdown] # It is also important to note that the culmen length has been disregarded for -# the moment. It means that whatever the value given, it will not be used -# during the prediction. +# the moment. It means that whatever the value given, it will not be used during +# the prediction. # %% sample_3 = pd.DataFrame( @@ -211,8 +222,8 @@ tree.predict_proba(sample_3) # %% [markdown] -# Going back to our classification problem, the split found with a maximum -# depth of 1 is not powerful enough to separate the three species and the model +# Going back to our classification problem, the split found with a maximum depth +# of 1 is not powerful enough to separate the three species and the model # accuracy is low when compared to the linear model. # %% @@ -221,9 +232,9 @@ print(f"Accuracy of the DecisionTreeClassifier: {test_score:.2f}") # %% [markdown] -# Indeed, it is not a surprise. We saw earlier that a single feature will not -# be able to separate all three species. However, from the previous analysis we -# saw that by using both features we should be able to get fairly good results. +# Indeed, it is not a surprise. We saw earlier that a single feature will not be +# able to separate all three species. However, from the previous analysis we saw +# that by using both features we should be able to get fairly good results. # # In the next exercise, you will increase the size of the tree depth. You will # get intuitions on how the space partitioning is repeated over time. diff --git a/python_scripts/trees_dataset.py b/python_scripts/trees_dataset.py index 64441eab5..708c61b29 100644 --- a/python_scripts/trees_dataset.py +++ b/python_scripts/trees_dataset.py @@ -8,11 +8,10 @@ # %% [markdown] # # The penguins datasets # -# In this notebook, we make a quick presentation of the -# [Palmer penguins dataset](https://allisonhorst.github.io/palmerpenguins/) -# dataset. We use this dataset for both classification and regression -# problems by selecting a subset of the features to make our explanations -# intuitive. 
+# In this notebook, we make a quick presentation of the [Palmer penguins +# dataset](https://allisonhorst.github.io/palmerpenguins/) dataset. We use this +# dataset for both classification and regression problems by selecting a subset +# of the features to make our explanations intuitive. # # ## Classification dataset # @@ -23,15 +22,17 @@ # Chinstrap. See the illustration below depicting the three different penguin # species: # -# ![Image of penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png) +# ![Image of +# penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png) # -# This problem is a classification problem since the target is categorical. -# We will limit our input data to a subset of the original features -# to simplify our explanations when presenting the decision tree algorithm. -# Indeed, we will use features based on penguins' culmen measurement. You can -# learn more about the penguins' culmen with the illustration below: +# This problem is a classification problem since the target is categorical. We +# will limit our input data to a subset of the original features to simplify our +# explanations when presenting the decision tree algorithm. Indeed, we will use +# features based on penguins' culmen measurement. You can learn more about the +# penguins' culmen with the illustration below: # -# ![Image of culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png) +# ![Image of +# culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png) # # We will start by loading this subset of the dataset. @@ -90,6 +91,5 @@ # %% [markdown] # Here, we deal with a regression problem because our target is a continuous # variable ranging from 2.7 kg to 6.3 kg. From the scatter plot above, we -# observe that we have a linear relationship between the flipper length -# and the body mass. The longer the flipper of a penguin, the heavier the -# penguin. +# observe that we have a linear relationship between the flipper length and the +# body mass. The longer the flipper of a penguin, the heavier the penguin. diff --git a/python_scripts/trees_ex_01.py b/python_scripts/trees_ex_01.py index c9ac9d1e6..6b2d8f4e5 100644 --- a/python_scripts/trees_ex_01.py +++ b/python_scripts/trees_ex_01.py @@ -16,9 +16,9 @@ # # ๐Ÿ“ Exercise M5.01 # # In the previous notebook, we showed how a tree with a depth of 1 level was -# working. The aim of this exercise is to repeat part of the previous -# experiment for a depth with 2 levels to show how the process of partitioning -# is repeated over time. +# working. The aim of this exercise is to repeat part of the previous experiment +# for a depth with 2 levels to show how the process of partitioning is repeated +# over time. # # Before to start, we will: # @@ -48,18 +48,18 @@ ) # %% [markdown] -# Create a decision tree classifier with a maximum depth of 2 levels and fit -# the training data. Once this classifier trained, plot the data and the -# decision boundary to see the benefit of increasing the depth. To plot the -# decision boundary, you should import the class `DecisionBoundaryDisplay` -# from the module `sklearn.inspection` as shown in the previous course notebook. +# Create a decision tree classifier with a maximum depth of 2 levels and fit the +# training data. Once this classifier trained, plot the data and the decision +# boundary to see the benefit of increasing the depth. 
To plot the decision +# boundary, you should import the class `DecisionBoundaryDisplay` from the +# module `sklearn.inspection` as shown in the previous course notebook. # %% # Write your code here. # %% [markdown] -# Did we make use of the feature "Culmen Length"? -# Plot the tree using the function `sklearn.tree.plot_tree` to find out! +# Did we make use of the feature "Culmen Length"? Plot the tree using the +# function `sklearn.tree.plot_tree` to find out! # %% # Write your code here. diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py index bf21d825e..007998851 100644 --- a/python_scripts/trees_hyperparameters.py +++ b/python_scripts/trees_hyperparameters.py @@ -9,8 +9,8 @@ # # Importance of decision tree hyperparameters on generalization # # In this notebook, we will illustrate the importance of some key -# hyperparameters on the decision tree; we will demonstrate their effects on -# the classification and regression problems we saw previously. +# hyperparameters on the decision tree; we will demonstrate their effects on the +# classification and regression problems we saw previously. # # First, we will load the classification and regression datasets. @@ -52,12 +52,20 @@ def fit_and_plot_classification(model, data, feature_names, target_names): else: palette = ["tab:red", "tab:blue", "black"] DecisionBoundaryDisplay.from_estimator( - model, data[feature_names], response_method="predict", - cmap="RdBu", alpha=0.5 + model, + data[feature_names], + response_method="predict", + cmap="RdBu", + alpha=0.5, ) - sns.scatterplot(data=data, x=feature_names[0], y=feature_names[1], - hue=target_names, palette=palette) - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + sns.scatterplot( + data=data, + x=feature_names[0], + y=feature_names[1], + hue=target_names, + palette=palette, + ) + plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") def fit_and_plot_regression(model, data, feature_names, target_names): @@ -69,7 +77,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names): target_predicted = model.predict(data_test) sns.scatterplot( - x=data.iloc[:, 0], y=data[target_names], color="black", alpha=0.5) + x=data.iloc[:, 0], y=data[target_names], color="black", alpha=0.5 + ) plt.plot(data_test.iloc[:, 0], target_predicted, linewidth=4) @@ -93,17 +102,19 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% fit_and_plot_classification( - tree_clf, data_clf, data_clf_columns, target_clf_column) + tree_clf, data_clf, data_clf_columns, target_clf_column +) _ = plt.title(f"Shallow classification tree with max-depth of {max_depth}") # %% fit_and_plot_regression( - tree_reg, data_reg, data_reg_columns, target_reg_column) + tree_reg, data_reg, data_reg_columns, target_reg_column +) _ = plt.title(f"Shallow regression tree with max-depth of {max_depth}") # %% [markdown] -# Now, let's increase the `max_depth` parameter value to check the difference -# by observing the decision function. +# Now, let's increase the `max_depth` parameter value to check the difference by +# observing the decision function. 
# %% max_depth = 30 @@ -112,21 +123,22 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% fit_and_plot_classification( - tree_clf, data_clf, data_clf_columns, target_clf_column) + tree_clf, data_clf, data_clf_columns, target_clf_column +) _ = plt.title(f"Deep classification tree with max-depth of {max_depth}") # %% fit_and_plot_regression( - tree_reg, data_reg, data_reg_columns, target_reg_column) + tree_reg, data_reg, data_reg_columns, target_reg_column +) _ = plt.title(f"Deep regression tree with max-depth of {max_depth}") # %% [markdown] -# For both classification and regression setting, we observe that -# increasing the depth will make the tree model more expressive. However, a -# tree that is too deep will overfit the training data, creating partitions -# which are only correct for "outliers" (noisy samples). The `max_depth` is one -# of the hyperparameters that one should optimize via cross-validation and -# grid-search. +# For both classification and regression setting, we observe that increasing the +# depth will make the tree model more expressive. However, a tree that is too +# deep will overfit the training data, creating partitions which are only +# correct for "outliers" (noisy samples). The `max_depth` is one of the +# hyperparameters that one should optimize via cross-validation and grid-search. # %% from sklearn.model_selection import GridSearchCV @@ -137,19 +149,23 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% fit_and_plot_classification( - tree_clf, data_clf, data_clf_columns, target_clf_column) -_ = plt.title(f"Optimal depth found via CV: " - f"{tree_clf.best_params_['max_depth']}") + tree_clf, data_clf, data_clf_columns, target_clf_column +) +_ = plt.title( + f"Optimal depth found via CV: {tree_clf.best_params_['max_depth']}" +) # %% fit_and_plot_regression( - tree_reg, data_reg, data_reg_columns, target_reg_column) -_ = plt.title(f"Optimal depth found via CV: " - f"{tree_reg.best_params_['max_depth']}") + tree_reg, data_reg, data_reg_columns, target_reg_column +) +_ = plt.title( + f"Optimal depth found via CV: {tree_reg.best_params_['max_depth']}" +) # %% [markdown] -# With this example, we see that there is not a single value that is optimal -# for any dataset. Thus, this parameter is required to be optimized for each +# With this example, we see that there is not a single value that is optimal for +# any dataset. Thus, this parameter is required to be optimized for each # application. # # ## Other hyperparameters in decision trees @@ -162,10 +178,9 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # # We will build a dataset where we will illustrate this asymmetry. We will # generate a dataset composed of 2 subsets: one subset where a clear separation -# should be found by the tree and another subset where samples from both -# classes will be mixed. It implies that a decision tree will need more splits -# to classify properly samples from the second subset than from the first -# subset. +# should be found by the tree and another subset where samples from both classes +# will be mixed. It implies that a decision tree will need more splits to +# classify properly samples from the second subset than from the first subset. 
# %% from sklearn.datasets import make_blobs @@ -175,21 +190,27 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # Blobs that will be interlaced X_1, y_1 = make_blobs( - n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0) + n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0 +) # Blobs that will be easily separated -X_2, y_2 = make_blobs( - n_samples=300, centers=[[3, 6], [7, 0]], random_state=0) +X_2, y_2 = make_blobs(n_samples=300, centers=[[3, 6], [7, 0]], random_state=0) X = np.concatenate([X_1, X_2], axis=0) y = np.concatenate([y_1, y_2]) data_clf = np.concatenate([X, y[:, np.newaxis]], axis=1) data_clf = pd.DataFrame( - data_clf, columns=data_clf_columns + [target_clf_column]) + data_clf, columns=data_clf_columns + [target_clf_column] +) data_clf[target_clf_column] = data_clf[target_clf_column].astype(np.int32) # %% -sns.scatterplot(data=data_clf, x=data_clf_columns[0], y=data_clf_columns[1], - hue=target_clf_column, palette=["tab:red", "tab:blue"]) +sns.scatterplot( + data=data_clf, + x=data_clf_columns[0], + y=data_clf_columns[1], + hue=target_clf_column, + palette=["tab:red", "tab:blue"], +) _ = plt.title("Synthetic dataset") # %% [markdown] @@ -201,7 +222,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names): max_depth = 2 tree_clf = DecisionTreeClassifier(max_depth=max_depth) fit_and_plot_classification( - tree_clf, data_clf, data_clf_columns, target_clf_column) + tree_clf, data_clf, data_clf_columns, target_clf_column +) _ = plt.title(f"Decision tree with max-depth of {max_depth}") # %% [markdown] @@ -209,9 +231,9 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # top are easily separated. However, more splits will be required to better # split the blob were both blue and red data points are mixed. # -# Indeed, we see that red blob on the top and the blue blob on the right of -# the plot are perfectly separated. However, the tree is still making mistakes -# in the area where the blobs are mixed together. Let's check the tree +# Indeed, we see that red blob on the top and the blue blob on the right of the +# plot are perfectly separated. However, the tree is still making mistakes in +# the area where the blobs are mixed together. Let's check the tree # representation. # %% @@ -221,14 +243,15 @@ def fit_and_plot_regression(model, data, feature_names, target_names): _ = plot_tree(tree_clf, ax=ax, feature_names=data_clf_columns) # %% [markdown] -# We see that the right branch achieves perfect classification. Now, we -# increase the depth to check how the tree will grow. +# We see that the right branch achieves perfect classification. Now, we increase +# the depth to check how the tree will grow. # %% max_depth = 6 tree_clf = DecisionTreeClassifier(max_depth=max_depth) fit_and_plot_classification( - tree_clf, data_clf, data_clf_columns, target_clf_column) + tree_clf, data_clf, data_clf_columns, target_clf_column +) _ = plt.title(f"Decision tree with max-depth of {max_depth}") # %% @@ -238,28 +261,29 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # As expected, the left branch of the tree continue to grow while no further # splits were done on the right branch. Fixing the `max_depth` parameter would -# cut the tree horizontally at a specific level, whether or not it would -# be more beneficial that a branch continue growing. +# cut the tree horizontally at a specific level, whether or not it would be more +# beneficial that a branch continue growing. 
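A fitted tree also exposes the structure it actually grew, which gives a quick quantitative view of this asymmetry. A minimal sketch, assuming the `tree_clf` with `max_depth=6` fitted just above:

```python
# A complete tree of depth d has 2**d leaves; an asymmetric tree has far
# fewer, because only some branches kept splitting.
print(f"Effective depth: {tree_clf.get_depth()}")
print(f"Number of leaves: {tree_clf.get_n_leaves()}")
```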
# -# The hyperparameters `min_samples_leaf`, `min_samples_split`, -# `max_leaf_nodes`, or `min_impurity_decrease` allows growing asymmetric trees -# and apply a constraint at the leaves or nodes level. We will check the effect -# of `min_samples_leaf`. +# The hyperparameters `min_samples_leaf`, `min_samples_split`, `max_leaf_nodes`, +# or `min_impurity_decrease` allows growing asymmetric trees and apply a +# constraint at the leaves or nodes level. We will check the effect of +# `min_samples_leaf`. # %% min_samples_leaf = 60 tree_clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf) fit_and_plot_classification( - tree_clf, data_clf, data_clf_columns, target_clf_column) + tree_clf, data_clf, data_clf_columns, target_clf_column +) _ = plt.title( - f"Decision tree with leaf having at least {min_samples_leaf} samples") + f"Decision tree with leaf having at least {min_samples_leaf} samples" +) # %% _, ax = plt.subplots(figsize=(10, 7)) _ = plot_tree(tree_clf, ax=ax, feature_names=data_clf_columns) # %% [markdown] -# This hyperparameter allows to have leaves with a minimum number of samples -# and no further splits will be searched otherwise. Therefore, these -# hyperparameters could be an alternative to fix the `max_depth` -# hyperparameter. +# This hyperparameter allows to have leaves with a minimum number of samples and +# no further splits will be searched otherwise. Therefore, these hyperparameters +# could be an alternative to fix the `max_depth` hyperparameter. diff --git a/python_scripts/trees_regression.py b/python_scripts/trees_regression.py index c97c7d211..56fbfd3f0 100644 --- a/python_scripts/trees_regression.py +++ b/python_scripts/trees_regression.py @@ -38,14 +38,15 @@ # %% import numpy as np -data_test = pd.DataFrame(np.arange(data_train[feature_name].min(), - data_train[feature_name].max()), - columns=[feature_name]) +data_test = pd.DataFrame( + np.arange(data_train[feature_name].min(), data_train[feature_name].max()), + columns=[feature_name], +) # %% [markdown] -# Using the term "test" here refers to data that was not used for training. -# It should not be confused with data coming from a train-test split, as it -# was generated in equally-spaced intervals for the visual evaluation of the +# Using the term "test" here refers to data that was not used for training. It +# should not be confused with data coming from a train-test split, as it was +# generated in equally-spaced intervals for the visual evaluation of the # predictions. # # Note that this is methodologically valid here because our objective is to get @@ -60,8 +61,9 @@ import matplotlib.pyplot as plt import seaborn as sns -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) _ = plt.title("Illustration of the regression dataset used") # %% [markdown] @@ -76,8 +78,9 @@ target_predicted = linear_model.predict(data_test) # %% -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) plt.plot(data_test[feature_name], target_predicted, label="Linear regression") plt.legend() _ = plt.title("Prediction function using a LinearRegression") @@ -89,12 +92,21 @@ # the line. 
# %% -ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) -plt.plot(data_test[feature_name], target_predicted, label="Linear regression", - linestyle="--") -plt.scatter(data_test[::3], target_predicted[::3], label="Predictions", - color="tab:orange") +ax = sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) +plt.plot( + data_test[feature_name], + target_predicted, + label="Linear regression", + linestyle="--", +) +plt.scatter( + data_test[::3], + target_predicted[::3], + label="Predictions", + color="tab:orange", +) plt.legend() _ = plt.title("Prediction function using a LinearRegression") @@ -112,8 +124,9 @@ target_predicted = tree.predict(data_test) # %% -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) plt.plot(data_test[feature_name], target_predicted, label="Decision tree") plt.legend() _ = plt.title("Prediction function using a DecisionTreeRegressor") @@ -140,8 +153,8 @@ # partition. # # In classification, we saw that increasing the depth of the tree allowed us to -# get more complex decision boundaries. -# Let's check the effect of increasing the depth in a regression setting: +# get more complex decision boundaries. Let's check the effect of increasing the +# depth in a regression setting: # %% tree = DecisionTreeRegressor(max_depth=3) @@ -149,8 +162,9 @@ target_predicted = tree.predict(data_test) # %% -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) plt.plot(data_test[feature_name], target_predicted, label="Decision tree") plt.legend() _ = plt.title("Prediction function using a DecisionTreeRegressor") diff --git a/python_scripts/trees_sol_01.py b/python_scripts/trees_sol_01.py index 32916f5dd..bdf10b00e 100644 --- a/python_scripts/trees_sol_01.py +++ b/python_scripts/trees_sol_01.py @@ -9,9 +9,9 @@ # # ๐Ÿ“ƒ Solution for Exercise M5.01 # # In the previous notebook, we showed how a tree with a depth of 1 level was -# working. The aim of this exercise is to repeat part of the previous -# experiment for a depth with 2 levels to show how the process of partitioning -# is repeated over time. +# working. The aim of this exercise is to repeat part of the previous experiment +# for a depth with 2 levels to show how the process of partitioning is repeated +# over time. # # Before to start, we will: # @@ -41,11 +41,11 @@ ) # %% [markdown] -# Create a decision tree classifier with a maximum depth of 2 levels and fit -# the training data. Once this classifier trained, plot the data and the -# decision boundary to see the benefit of increasing the depth. To plot the -# decision boundary, you should import the class `DecisionBoundaryDisplay` -# from the module `sklearn.inspection` as shown in the previous course notebook. +# Create a decision tree classifier with a maximum depth of 2 levels and fit the +# training data. Once this classifier trained, plot the data and the decision +# boundary to see the benefit of increasing the depth. To plot the decision +# boundary, you should import the class `DecisionBoundaryDisplay` from the +# module `sklearn.inspection` as shown in the previous course notebook. 
# %% # solution @@ -64,29 +64,38 @@ DecisionBoundaryDisplay.from_estimator( tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5 ) -ax = sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1], - hue=target_column, palette=palette) -plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') +ax = sns.scatterplot( + data=penguins, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=palette, +) +plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") _ = plt.title("Decision boundary using a decision tree") # %% [markdown] -# Did we make use of the feature "Culmen Length"? -# Plot the tree using the function `sklearn.tree.plot_tree` to find out! +# Did we make use of the feature "Culmen Length"? Plot the tree using the +# function `sklearn.tree.plot_tree` to find out! # %% # solution from sklearn.tree import plot_tree _, ax = plt.subplots(figsize=(16, 12)) -_ = plot_tree(tree, feature_names=culmen_columns, - class_names=tree.classes_, impurity=False, ax=ax) +_ = plot_tree( + tree, + feature_names=culmen_columns, + class_names=tree.classes_, + impurity=False, + ax=ax, +) # %% [markdown] tags=["solution"] -# The resulting tree has 7 nodes: 3 of them are "split nodes" and 4 -# are "leaf nodes" (or simply "leaves"), organized in 2 levels. -# We see that the second tree level used the "Culmen Length" to make -# two new decisions. Qualitatively, we saw that such a simple tree was enough -# to classify the penguins' species. +# The resulting tree has 7 nodes: 3 of them are "split nodes" and 4 are "leaf +# nodes" (or simply "leaves"), organized in 2 levels. We see that the second +# tree level used the "Culmen Length" to make two new decisions. Qualitatively, +# we saw that such a simple tree was enough to classify the penguins' species. # %% [markdown] # Compute the accuracy of the decision tree on the testing data. @@ -103,5 +112,5 @@ # # We predict an Adelie penguin if the feature value is below the threshold, # which is not surprising since this partition was almost pure. If the feature -# value is above the threshold, we predict the Gentoo penguin, the class that -# is most probable. +# value is above the threshold, we predict the Gentoo penguin, the class that is +# most probable. diff --git a/python_scripts/trees_sol_02.py b/python_scripts/trees_sol_02.py index cd2c4ef12..cc7d5dbce 100644 --- a/python_scripts/trees_sol_02.py +++ b/python_scripts/trees_sol_02.py @@ -8,11 +8,11 @@ # %% [markdown] # # ๐Ÿ“ƒ Solution for Exercise M5.02 # -# The aim of this exercise is to find out whether a decision tree -# model is able to extrapolate. +# The aim of this exercise is to find out whether a decision tree model is able +# to extrapolate. # -# By extrapolation, we refer to values predicted by a model outside of the -# range of feature values seen during the training. +# By extrapolation, we refer to values predicted by a model outside of the range +# of feature values seen during the training. # # We will first load the regression data. @@ -33,8 +33,8 @@ # %% [markdown] # First, create two models, a linear regression model and a decision tree -# regression model, and fit them on the training data. Limit the depth at -# 3 levels for the decision tree. +# regression model, and fit them on the training data. Limit the depth at 3 +# levels for the decision tree. 
# %% # solution @@ -48,17 +48,18 @@ tree.fit(data_train, target_train) # %% [markdown] -# Create a synthetic dataset containing all possible flipper length from -# the minimum to the maximum of the training dataset. Get the predictions of -# each model using this dataset. +# Create a synthetic dataset containing all possible flipper length from the +# minimum to the maximum of the training dataset. Get the predictions of each +# model using this dataset. # %% # solution import numpy as np -data_test = pd.DataFrame(np.arange(data_train[feature_name].min(), - data_train[feature_name].max()), - columns=[feature_name]) +data_test = pd.DataFrame( + np.arange(data_train[feature_name].min(), data_train[feature_name].max()), + columns=[feature_name], +) # %% tags=["solution"] target_predicted_linear_regression = linear_regression.predict(data_test) @@ -73,10 +74,14 @@ import matplotlib.pyplot as plt import seaborn as sns -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) -plt.plot(data_test[feature_name], target_predicted_linear_regression, - label="Linear regression") +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) +plt.plot( + data_test[feature_name], + target_predicted_linear_regression, + label="Linear regression", +) plt.plot(data_test[feature_name], target_predicted_tree, label="Decision tree") plt.legend() _ = plt.title("Prediction of linear model and a decision tree") @@ -88,16 +93,20 @@ # %% [markdown] # Now, we will check the extrapolation capabilities of each model. Create a -# dataset containing a broader range of values than your previous dataset, -# in other words, add values below and above the minimum and the maximum of -# the flipper length seen during training. +# dataset containing a broader range of values than your previous dataset, in +# other words, add values below and above the minimum and the maximum of the +# flipper length seen during training. # %% # solution offset = 30 -data_test = pd.DataFrame(np.arange(data_train[feature_name].min() - offset, - data_train[feature_name].max() + offset), - columns=[feature_name]) +data_test = pd.DataFrame( + np.arange( + data_train[feature_name].min() - offset, + data_train[feature_name].max() + offset, + ), + columns=[feature_name], +) # %% [markdown] # Finally, make predictions with both models on this new interval of data. @@ -109,18 +118,22 @@ target_predicted_tree = tree.predict(data_test) # %% tags=["solution"] -sns.scatterplot(data=penguins, x=feature_name, y=target_name, - color="black", alpha=0.5) -plt.plot(data_test[feature_name], target_predicted_linear_regression, - label="Linear regression") +sns.scatterplot( + data=penguins, x=feature_name, y=target_name, color="black", alpha=0.5 +) +plt.plot( + data_test[feature_name], + target_predicted_linear_regression, + label="Linear regression", +) plt.plot(data_test[feature_name], target_predicted_tree, label="Decision tree") plt.legend() _ = plt.title("Prediction of linear model and a decision tree") # %% [markdown] tags=["solution"] -# The linear model will extrapolate using the fitted model for flipper lengths -# < 175 mm and > 235 mm. In fact, we are using the model parametrization to -# make this predictions. +# The linear model will extrapolate using the fitted model for flipper lengths < +# 175 mm and > 235 mm. In fact, we are using the model parametrization to make +# this predictions. 
# # As mentioned, decision trees are non-parametric models and we observe that # they cannot extrapolate. For flipper lengths below the minimum, the mass of From 424bc74d964d6c7560363afbf866135d6a7c3ca5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 5 Jun 2023 14:18:06 +0200 Subject: [PATCH 027/108] MAINT: synchronize exercises (#694) --- .../01_tabular_data_exploration_ex_01.py | 3 +- python_scripts/02_numerical_pipeline_ex_00.py | 13 ++++---- python_scripts/02_numerical_pipeline_ex_01.py | 27 ++++++++-------- .../03_categorical_pipeline_ex_01.py | 32 +++++++++---------- .../03_categorical_pipeline_ex_02.py | 21 ++++++------ python_scripts/cross_validation_ex_01.py | 3 +- python_scripts/cross_validation_ex_02.py | 31 +++++++++--------- python_scripts/ensemble_ex_01.py | 25 +++++++-------- python_scripts/ensemble_ex_02.py | 8 ++--- python_scripts/ensemble_ex_03.py | 24 +++++++------- python_scripts/ensemble_ex_04.py | 31 +++++++++--------- python_scripts/feature_selection_ex_01.py | 9 ++++-- python_scripts/linear_models_ex_01.py | 23 +++++++------ python_scripts/linear_models_ex_02.py | 9 +++--- python_scripts/linear_models_ex_03.py | 24 +++++++------- python_scripts/linear_models_ex_04.py | 21 ++++++------ python_scripts/linear_models_ex_05.py | 16 +++++----- python_scripts/metrics_ex_01.py | 26 +++++++-------- python_scripts/metrics_ex_02.py | 3 +- python_scripts/parameter_tuning_ex_02.py | 15 ++++----- python_scripts/parameter_tuning_ex_03.py | 11 +++---- python_scripts/trees_ex_01.py | 3 +- python_scripts/trees_ex_02.py | 27 ++++++++-------- 23 files changed, 190 insertions(+), 215 deletions(-) diff --git a/python_scripts/01_tabular_data_exploration_ex_01.py b/python_scripts/01_tabular_data_exploration_ex_01.py index 79f90ad2b..b09b00dc3 100644 --- a/python_scripts/01_tabular_data_exploration_ex_01.py +++ b/python_scripts/01_tabular_data_exploration_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 diff --git a/python_scripts/02_numerical_pipeline_ex_00.py b/python_scripts/02_numerical_pipeline_ex_00.py index 5d41ab982..0436dfc50 100644 --- a/python_scripts/02_numerical_pipeline_ex_00.py +++ b/python_scripts/02_numerical_pipeline_ex_00.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -32,12 +31,12 @@ # %% [markdown] # In the previous notebook we used `model = KNeighborsClassifier()`. All # scikit-learn models can be created without arguments. This is convenient -# because it means that you don't need to understand the full details of a -# model before starting to use it. +# because it means that you don't need to understand the full details of a model +# before starting to use it. # -# One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls -# the number of neighbors we are going to use to make a prediction for a new -# data point. +# One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the +# number of neighbors we are going to use to make a prediction for a new data +# point. # # What is the default value of the `n_neighbors` parameter? 
Hint: Look at the # documentation on the [scikit-learn diff --git a/python_scripts/02_numerical_pipeline_ex_01.py b/python_scripts/02_numerical_pipeline_ex_01.py index 826f99759..7654753d4 100644 --- a/python_scripts/02_numerical_pipeline_ex_01.py +++ b/python_scripts/02_numerical_pipeline_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -16,19 +15,19 @@ # # ๐Ÿ“ Exercise M1.03 # # The goal of this exercise is to compare the performance of our classifier in -# the previous notebook (roughly 81% accuracy with `LogisticRegression`) to -# some simple baseline classifiers. The simplest baseline classifier is one -# that always predicts the same class, irrespective of the input data. +# the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some +# simple baseline classifiers. The simplest baseline classifier is one that +# always predicts the same class, irrespective of the input data. # # - What would be the score of a model that always predicts `' >50K'`? # - What would be the score of a model that always predicts `' <=50K'`? # - Is 81% or 82% accuracy a good score for this problem? # -# Use a `DummyClassifier` and do a train-test split to evaluate -# its accuracy on the test set. This +# Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on +# the test set. This # [link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators) -# shows a few examples of how to evaluate the generalization performance of these -# baseline models. +# shows a few examples of how to evaluate the generalization performance of +# these baseline models. # %% import pandas as pd @@ -62,12 +61,12 @@ # Write your code here. # %% [markdown] -# Use a `DummyClassifier` such that the resulting classifier will always -# predict the class `' >50K'`. What is the accuracy score on the test set? -# Repeat the experiment by always predicting the class `' <=50K'`. +# Use a `DummyClassifier` such that the resulting classifier will always predict +# the class `' >50K'`. What is the accuracy score on the test set? Repeat the +# experiment by always predicting the class `' <=50K'`. # -# Hint: you can set the `strategy` parameter of the `DummyClassifier` to -# achieve the desired behavior. +# Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve +# the desired behavior. # %% from sklearn.dummy import DummyClassifier diff --git a/python_scripts/03_categorical_pipeline_ex_01.py b/python_scripts/03_categorical_pipeline_ex_01.py index eaf14d270..ae19eab2f 100644 --- a/python_scripts/03_categorical_pipeline_ex_01.py +++ b/python_scripts/03_categorical_pipeline_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -16,8 +15,8 @@ # # ๐Ÿ“ Exercise M1.04 # # The goal of this exercise is to evaluate the impact of using an arbitrary -# integer encoding for categorical variables along with a linear -# classification model such as Logistic Regression. +# integer encoding for categorical variables along with a linear classification +# model such as Logistic Regression. 
# # To do so, let's try to use `OrdinalEncoder` to preprocess the categorical # variables. This preprocessor is assembled in a pipeline with @@ -57,8 +56,8 @@ # # Because `OrdinalEncoder` can raise errors if it sees an unknown category at # prediction time, you can set the `handle_unknown="use_encoded_value"` and -# `unknown_value` parameters. You can refer to the -# [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) +# `unknown_value` parameters. You can refer to the [scikit-learn +# documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) # for more details regarding these parameters. # %% @@ -74,13 +73,12 @@ # # ```{note} # Be aware that if an error happened during the cross-validation, -# `cross_validate` will raise a warning and return NaN (Not a Number) -# as scores. To make it raise a standard Python exception with a traceback, -# you can pass the `error_score="raise"` argument in the call to -# `cross_validate`. An exception will be raised instead of a warning at the first -# encountered problem and `cross_validate` will stop right away instead of -# returning NaN values. This is particularly handy when developing -# complex machine learning pipelines. +# `cross_validate` will raise a warning and return NaN (Not a Number) as scores. +# To make it raise a standard Python exception with a traceback, you can pass +# the `error_score="raise"` argument in the call to `cross_validate`. An +# exception will be raised instead of a warning at the first encountered problem +# and `cross_validate` will stop right away instead of returning NaN values. +# This is particularly handy when developing complex machine learning pipelines. # ``` # %% @@ -90,10 +88,10 @@ # %% [markdown] # Now, we would like to compare the generalization performance of our previous -# model with a new model where instead of using an `OrdinalEncoder`, we will -# use a `OneHotEncoder`. Repeat the model evaluation using cross-validation. -# Compare the score of both models and conclude on the impact of choosing a -# specific encoding strategy when using a linear model. +# model with a new model where instead of using an `OrdinalEncoder`, we will use +# a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare +# the score of both models and conclude on the impact of choosing a specific +# encoding strategy when using a linear model. # %% from sklearn.preprocessing import OneHotEncoder diff --git a/python_scripts/03_categorical_pipeline_ex_02.py b/python_scripts/03_categorical_pipeline_ex_02.py index 6211844c8..7daacfbd4 100644 --- a/python_scripts/03_categorical_pipeline_ex_02.py +++ b/python_scripts/03_categorical_pipeline_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -36,9 +35,9 @@ data = adult_census.drop(columns=[target_name, "education-num"]) # %% [markdown] -# As in the previous notebooks, we use the utility `make_column_selector` -# to select only columns with a specific data type. Besides, we list in -# advance all categories for the categorical columns. +# As in the previous notebooks, we use the utility `make_column_selector` to +# select only columns with a specific data type. 
Besides, we list in advance all +# categories for the categorical columns. # %% from sklearn.compose import make_column_selector as selector @@ -99,15 +98,15 @@ # # We observed that integer coding of categorical variables can be very # detrimental for linear models. However, it does not seem to be the case for -# `HistGradientBoostingClassifier` models, as the cross-validation score -# of the reference pipeline with `OrdinalEncoder` is reasonably good. +# `HistGradientBoostingClassifier` models, as the cross-validation score of the +# reference pipeline with `OrdinalEncoder` is reasonably good. # # Let's see if we can get an even better accuracy with `OneHotEncoder`. # -# Hint: `HistGradientBoostingClassifier` does not yet support sparse input -# data. You might want to use -# `OneHotEncoder(handle_unknown="ignore", sparse_output=False)` to force the use of a -# dense representation as a workaround. +# Hint: `HistGradientBoostingClassifier` does not yet support sparse input data. +# You might want to use `OneHotEncoder(handle_unknown="ignore", +# sparse_output=False)` to force the use of a dense representation as a +# workaround. # %% # Write your code here. diff --git a/python_scripts/cross_validation_ex_01.py b/python_scripts/cross_validation_ex_01.py index c5f4d1182..d671b8fef 100644 --- a/python_scripts/cross_validation_ex_01.py +++ b/python_scripts/cross_validation_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 diff --git a/python_scripts/cross_validation_ex_02.py b/python_scripts/cross_validation_ex_02.py index 07a100bf6..5912c2f11 100644 --- a/python_scripts/cross_validation_ex_02.py +++ b/python_scripts/cross_validation_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -15,12 +14,12 @@ # %% [markdown] # # ๐Ÿ“ Exercise M7.01 # -# In this exercise we will define dummy classification baselines and use them -# as reference to assess the relative predictive performance of a given model -# of interest. +# In this exercise we will define dummy classification baselines and use them as +# reference to assess the relative predictive performance of a given model of +# interest. # -# We illustrate those baselines with the help of the Adult Census dataset, -# using only the numerical features for the sake of simplicity. +# We illustrate those baselines with the help of the Adult Census dataset, using +# only the numerical features for the sake of simplicity. # %% import pandas as pd @@ -43,16 +42,16 @@ # Write your code here. # %% [markdown] -# Compute the cross-validation (test) scores for the classifier on this -# dataset. Store the results pandas Series as we did in the previous notebook. +# Compute the cross-validation (test) scores for the classifier on this dataset. +# Store the results pandas Series as we did in the previous notebook. # %% # Write your code here. # %% [markdown] -# Now, compute the cross-validation scores of a dummy classifier that -# constantly predicts the most frequent class observed the training set. 
Please -# refer to the online documentation for the [sklearn.dummy.DummyClassifier +# Now, compute the cross-validation scores of a dummy classifier that constantly +# predicts the most frequent class observed the training set. Please refer to +# the online documentation for the [sklearn.dummy.DummyClassifier # ](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html) # class. # @@ -62,16 +61,16 @@ # Write your code here. # %% [markdown] -# Now that we collected the results from the baseline and the model, -# concatenate the test scores as columns a single pandas dataframe. +# Now that we collected the results from the baseline and the model, concatenate +# the test scores as columns a single pandas dataframe. # %% # Write your code here. # %% [markdown] # -# Next, plot the histogram of the cross-validation test scores for both -# models with the help of [pandas built-in plotting +# Next, plot the histogram of the cross-validation test scores for both models +# with the help of [pandas built-in plotting # function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms). # # What conclusions do you draw from the results? diff --git a/python_scripts/ensemble_ex_01.py b/python_scripts/ensemble_ex_01.py index cad686f26..b0f454c5c 100644 --- a/python_scripts/ensemble_ex_01.py +++ b/python_scripts/ensemble_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -18,8 +17,8 @@ # The aim of this notebook is to investigate if we can tune the hyperparameters # of a bagging regressor and evaluate the gain obtained. # -# We will load the California housing dataset and split it into a training and -# a testing set. +# We will load the California housing dataset and split it into a training and a +# testing set. # %% from sklearn.datasets import fetch_california_housing @@ -38,23 +37,21 @@ # ``` # %% [markdown] -# Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` -# to its parameter `estimator`. Train the regressor and evaluate its -# generalization performance on the testing set using the mean absolute error. +# Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its +# parameter `estimator`. Train the regressor and evaluate its generalization +# performance on the testing set using the mean absolute error. # %% # Write your code here. # %% [markdown] -# Now, create a `RandomizedSearchCV` instance using the previous model and -# tune the important parameters of the bagging regressor. Find the best -# parameters and check if you are able to find a set of parameters that -# improve the default regressor still using the mean absolute error as a -# metric. +# Now, create a `RandomizedSearchCV` instance using the previous model and tune +# the important parameters of the bagging regressor. Find the best parameters +# and check if you are able to find a set of parameters that improve the default +# regressor still using the mean absolute error as a metric. # # ```{tip} -# You can list the bagging regressor's parameters using the `get_params` -# method. +# You can list the bagging regressor's parameters using the `get_params` method. 
# ``` # %% diff --git a/python_scripts/ensemble_ex_02.py b/python_scripts/ensemble_ex_02.py index 147d28c82..1cdd45cd8 100644 --- a/python_scripts/ensemble_ex_02.py +++ b/python_scripts/ensemble_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -39,9 +38,8 @@ # ``` # %% [markdown] -# Create a random forest containing three trees. Train the forest and -# check the generalization performance on the testing set in terms of mean -# absolute error. +# Create a random forest containing three trees. Train the forest and check the +# generalization performance on the testing set in terms of mean absolute error. # %% # Write your code here. diff --git a/python_scripts/ensemble_ex_03.py b/python_scripts/ensemble_ex_03.py index 3be02f899..35d0cf66e 100644 --- a/python_scripts/ensemble_ex_03.py +++ b/python_scripts/ensemble_ex_03.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -17,10 +16,10 @@ # # The aim of this exercise is to: # -# * verifying if a random forest or a gradient-boosting decision tree overfit -# if the number of estimators is not properly chosen; -# * use the early-stopping strategy to avoid adding unnecessary trees, to -# get the best generalization performances. +# * verifying if a random forest or a gradient-boosting decision tree overfit if +# the number of estimators is not properly chosen; +# * use the early-stopping strategy to avoid adding unnecessary trees, to get +# the best generalization performances. # # We will use the California housing dataset to conduct our experiments. @@ -76,19 +75,18 @@ # improving for several iterations, it will stop adding trees. # # Now, create a gradient-boosting model with `n_estimators=1_000`. This number -# of trees will be too large. Change the parameter `n_iter_no_change` such -# that the gradient boosting fitting will stop after adding 5 trees that do not +# of trees will be too large. Change the parameter `n_iter_no_change` such that +# the gradient boosting fitting will stop after adding 5 trees that do not # improve the overall generalization performance. # %% # Write your code here. # %% [markdown] -# Estimate the generalization performance of this model again using -# the `sklearn.metrics.mean_absolute_error` metric but this time using -# the test set that we held out at the beginning of the notebook. -# Compare the resulting value with the values observed in the validation -# curve. +# Estimate the generalization performance of this model again using the +# `sklearn.metrics.mean_absolute_error` metric but this time using the test set +# that we held out at the beginning of the notebook. Compare the resulting value +# with the values observed in the validation curve. # %% # Write your code here. 
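For readers following along, one possible sketch of the last two steps of this exercise, assuming the usual `data_train`/`data_test`/`target_train`/`target_test` split defined at the top of the notebook (an illustration only, not the official solution):

```python
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Early stopping: stop adding trees once 5 consecutive iterations fail to
# improve the score on an internal validation split.
gbdt = GradientBoostingRegressor(n_estimators=1_000, n_iter_no_change=5)
gbdt.fit(data_train, target_train)

print(f"Trees actually added before stopping: {gbdt.n_estimators_}")
error = mean_absolute_error(target_test, gbdt.predict(data_test))
print(f"Mean absolute error on the held-out test set: {error:.3f}")
```

The `n_estimators_` attribute reports how many trees were kept, which can then be compared with the sweet spot suggested by the validation curve.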
diff --git a/python_scripts/ensemble_ex_04.py b/python_scripts/ensemble_ex_04.py index 227978436..aed9e2fb9 100644 --- a/python_scripts/ensemble_ex_04.py +++ b/python_scripts/ensemble_ex_04.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -16,9 +15,9 @@ # # ๐Ÿ“ Exercise M6.04 # # The aim of the exercise is to get familiar with the histogram -# gradient-boosting in scikit-learn. Besides, we will use this model within -# a cross-validation framework in order to inspect internal parameters found -# via grid-search. +# gradient-boosting in scikit-learn. Besides, we will use this model within a +# cross-validation framework in order to inspect internal parameters found via +# grid-search. # # We will use the California housing dataset. @@ -29,22 +28,22 @@ target *= 100 # rescale the target in k$ # %% [markdown] -# First, create a histogram gradient boosting regressor. You can set the -# trees number to be large, and configure the model to use early-stopping. +# First, create a histogram gradient boosting regressor. You can set the trees +# number to be large, and configure the model to use early-stopping. # %% # Write your code here. # %% [markdown] -# We will use a grid-search to find some optimal parameter for this model. -# In this grid-search, you should search for the following parameters: +# We will use a grid-search to find some optimal parameter for this model. In +# this grid-search, you should search for the following parameters: # # * `max_depth: [3, 8]`; # * `max_leaf_nodes: [15, 31]`; # * `learning_rate: [0.1, 1]`. # -# Feel free to explore the space with additional values. Create the -# grid-search providing the previous gradient boosting instance as the model. +# Feel free to explore the space with additional values. Create the grid-search +# providing the previous gradient boosting instance as the model. # %% # Write your code here. @@ -52,16 +51,16 @@ # %% [markdown] # Finally, we will run our experiment through cross-validation. In this regard, # define a 5-fold cross-validation. Besides, be sure to shuffle the data. -# Subsequently, use the function `sklearn.model_selection.cross_validate` -# to run the cross-validation. You should also set `return_estimator=True`, -# so that we can investigate the inner model trained via cross-validation. +# Subsequently, use the function `sklearn.model_selection.cross_validate` to run +# the cross-validation. You should also set `return_estimator=True`, so that we +# can investigate the inner model trained via cross-validation. # %% # Write your code here. # %% [markdown] -# Now that we got the cross-validation results, print out the mean and -# standard deviation score. +# Now that we got the cross-validation results, print out the mean and standard +# deviation score. # %% # Write your code here. 
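A compact sketch of how the pieces of this exercise could fit together, reusing the `data` and `target` loaded above (an illustration under those assumptions, not the official solution):

```python
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_validate

# Many boosting iterations, capped in practice by early stopping.
hgbdt = HistGradientBoostingRegressor(
    max_iter=1_000, early_stopping=True, random_state=0
)
param_grid = {
    "max_depth": [3, 8],
    "max_leaf_nodes": [15, 31],
    "learning_rate": [0.1, 1],
}
search = GridSearchCV(hgbdt, param_grid=param_grid)

# 5-fold shuffled cross-validation around the grid-search; keeping the
# fitted estimators allows inspecting what each fold selected.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
results = cross_validate(
    search, data, target, cv=cv, return_estimator=True, n_jobs=2
)
scores = results["test_score"]
print(f"R2 score: {scores.mean():.3f} +/- {scores.std():.3f}")

for estimator in results["estimator"]:
    print(estimator.best_params_)
```

With this nesting, each of the 5 outer folds runs its own inner grid-search, so the `best_params_` found can legitimately differ from fold to fold.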
diff --git a/python_scripts/feature_selection_ex_01.py b/python_scripts/feature_selection_ex_01.py index dd80e3513..d137e2b58 100644 --- a/python_scripts/feature_selection_ex_01.py +++ b/python_scripts/feature_selection_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -57,6 +56,8 @@ # You should get some surprising results. # %% +from sklearn.feature_selection import SelectKBest, f_classif + # Write your code here. # %% [markdown] @@ -66,6 +67,8 @@ # testing sets before you train and test the logistic regression. # %% +from sklearn.model_selection import train_test_split + # Write your code here. # %% [markdown] @@ -80,4 +83,6 @@ # of your model generalization performance. # %% +from sklearn.pipeline import make_pipeline + # Write your code here. diff --git a/python_scripts/linear_models_ex_01.py b/python_scripts/linear_models_ex_01.py index 584357c4e..c1ace0f45 100644 --- a/python_scripts/linear_models_ex_01.py +++ b/python_scripts/linear_models_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -59,12 +58,12 @@ def linear_model_flipper_mass( # %% [markdown] # ## Main exercise # -# Define a vector `weights = [...]` and a vector `intercepts = [...]` of -# the same length. Each pair of entries `(weights[i], intercepts[i])` tags a +# Define a vector `weights = [...]` and a vector `intercepts = [...]` of the +# same length. Each pair of entries `(weights[i], intercepts[i])` tags a # different model. Use these vectors along with the vector -# `flipper_length_range` to plot several linear models that could possibly -# fit our data. Use the above helper function to visualize both the models and -# the real samples. +# `flipper_length_range` to plot several linear models that could possibly fit +# our data. Use the above helper function to visualize both the models and the +# real samples. # %% import numpy as np @@ -75,9 +74,9 @@ def linear_model_flipper_mass( # Write your code here. # %% [markdown] -# In the previous question, you were asked to create several linear models. -# The visualization allowed you to qualitatively assess if a model was better -# than another. +# In the previous question, you were asked to create several linear models. The +# visualization allowed you to qualitatively assess if a model was better than +# another. # # Now, you should come up with a quantitative measure which indicates the # goodness of fit of each linear model and allows you to select the best model. @@ -90,8 +89,8 @@ def linear_model_flipper_mass( # Write your code here. # %% [markdown] -# You can now copy and paste the code below to show the goodness of fit for -# each model. +# You can now copy and paste the code below to show the goodness of fit for each +# model. 
# # ```python # for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)): diff --git a/python_scripts/linear_models_ex_02.py b/python_scripts/linear_models_ex_02.py index 626de7483..640c44046 100644 --- a/python_scripts/linear_models_ex_02.py +++ b/python_scripts/linear_models_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -60,9 +59,9 @@ # %% [markdown] # We observe that the link between the data `data` and vector `target` is -# non-linear. For instance, `data` could represent the years of -# experience (normalized) and `target` the salary (normalized). Therefore, the -# problem here would be to infer the salary given the years of experience. +# non-linear. For instance, `data` could represent the years of experience +# (normalized) and `target` the salary (normalized). Therefore, the problem here +# would be to infer the salary given the years of experience. # # Using the function `f` defined below, find both the `weight` and the # `intercept` that you think will lead to a good linear model. Plot both the diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 5f5d3d079..07ca53ac7 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -22,11 +21,10 @@ # The aim of this notebook is to train a linear regression algorithm on a # dataset with more than a single feature. # -# We will load a dataset about house prices in California. -# The dataset consists of 8 features regarding the demography and geography of -# districts in California and the aim is to predict the median house price of -# each district. We will use all 8 features to predict the target, the median -# house price. +# We will load a dataset about house prices in California. The dataset consists +# of 8 features regarding the demography and geography of districts in +# California and the aim is to predict the median house price of each district. +# We will use all 8 features to predict the target, the median house price. # %% [markdown] # ```{note} @@ -42,15 +40,15 @@ data.head() # %% [markdown] -# Now it is your turn to train a linear regression model on this dataset. -# First, create a linear regression model. +# Now it is your turn to train a linear regression model on this dataset. First, +# create a linear regression model. # %% # Write your code here. # %% [markdown] -# Execute a cross-validation with 10 folds and use the mean absolute error -# (MAE) as metric. Be sure to *return* the fitted *estimators*. +# Execute a cross-validation with 10 folds and use the mean absolute error (MAE) +# as metric. Be sure to *return* the fitted *estimators*. # %% # Write your code here. @@ -63,8 +61,8 @@ # %% [markdown] # Inspect the fitted model using a box plot to show the distribution of values -# for the coefficients returned from the cross-validation. Hint: -# use the function +# for the coefficients returned from the cross-validation. 
Hint: use the +# function # [`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html) # to create a box plot. diff --git a/python_scripts/linear_models_ex_04.py b/python_scripts/linear_models_ex_04.py index 2ca56fd52..18191bccf 100644 --- a/python_scripts/linear_models_ex_04.py +++ b/python_scripts/linear_models_ex_04.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -15,8 +14,8 @@ # %% [markdown] # # ๐Ÿ“ Exercise M4.04 # -# In the previous notebook, we saw the effect of applying some regularization -# on the coefficient of a linear model. +# In the previous notebook, we saw the effect of applying some regularization on +# the coefficient of a linear model. # # In this exercise, we will study the advantage of using some regularization # when dealing with correlated features. @@ -38,8 +37,8 @@ ) # %% [markdown] -# When creating the dataset, `make_regression` returns the true coefficient -# used to generate the dataset. Let's plot this information. +# When creating the dataset, `make_regression` returns the true coefficient used +# to generate the dataset. Let's plot this information. # %% import pandas as pd @@ -56,9 +55,9 @@ coef # %% [markdown] -# Create a `LinearRegression` regressor and fit on the entire dataset and -# check the value of the coefficients. Are the coefficients of the linear -# regressor close to the coefficients used to generate the dataset? +# Create a `LinearRegression` regressor and fit on the entire dataset and check +# the value of the coefficients. Are the coefficients of the linear regressor +# close to the coefficients used to generate the dataset? # %% # Write your code here. @@ -72,8 +71,8 @@ # Write your code here. # %% [markdown] -# Fit again the linear regressor on this new dataset and check the -# coefficients. What do you observe? +# Fit again the linear regressor on this new dataset and check the coefficients. +# What do you observe? # %% # Write your code here. diff --git a/python_scripts/linear_models_ex_05.py b/python_scripts/linear_models_ex_05.py index 9951ebafa..1c36b83c2 100644 --- a/python_scripts/linear_models_ex_05.py +++ b/python_scripts/linear_models_ex_05.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -14,13 +13,14 @@ # %% [markdown] # # ๐Ÿ“ Exercise M4.05 +# # In the previous notebook we set `penalty="none"` to disable regularization -# entirely. This parameter can also control the **type** of regularization to use, -# whereas the regularization **strength** is set using the parameter `C`. -# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. -# In this exercise, we ask you to train a logistic regression classifier using the -# `penalty="l2"` regularization (which happens to be the default in scikit-learn) -# to find by yourself the effect of the parameter `C`. +# entirely. This parameter can also control the **type** of regularization to +# use, whereas the regularization **strength** is set using the parameter `C`. +# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. 
In +# this exercise, we ask you to train a logistic regression classifier using the +# `penalty="l2"` regularization (which happens to be the default in +# scikit-learn) to find by yourself the effect of the parameter `C`. # # We will start by loading the dataset. diff --git a/python_scripts/metrics_ex_01.py b/python_scripts/metrics_ex_01.py index 31d394ebd..2c0dfeebb 100644 --- a/python_scripts/metrics_ex_01.py +++ b/python_scripts/metrics_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -43,9 +42,9 @@ # %% [markdown] # Create a `StratifiedKFold` cross-validation object. Then use it inside the # `cross_val_score` function to evaluate the decision tree. We will first use -# the accuracy as a score function. Explicitly use the `scoring` parameter -# of `cross_val_score` to compute the accuracy (even if this is the default -# score). Check its documentation to learn how to do that. +# the accuracy as a score function. Explicitly use the `scoring` parameter of +# `cross_val_score` to compute the accuracy (even if this is the default score). +# Check its documentation to learn how to do that. # %% # Write your code here. @@ -57,12 +56,12 @@ # Write your code here. # %% [markdown] -# We will now add a bit of complexity. We would like to compute the precision -# of our model. However, during the course we saw that we need to mention the +# We will now add a bit of complexity. We would like to compute the precision of +# our model. However, during the course we saw that we need to mention the # positive label which in our case we consider to be the class `donated`. # -# We will show that computing the precision without providing the positive -# label will not be supported by scikit-learn because it is indeed ambiguous. +# We will show that computing the precision without providing the positive label +# will not be supported by scikit-learn because it is indeed ambiguous. # %% from sklearn.model_selection import cross_val_score @@ -85,9 +84,8 @@ # # So, import `sklearn.metrics.make_scorer` and # `sklearn.metrics.precision_score`. Check their documentations for more -# information. -# Finally, create a scorer by calling `make_scorer` using the score function -# `precision_score` and pass the extra parameter `pos_label="donated"`. +# information. Finally, create a scorer by calling `make_scorer` using the score +# function `precision_score` and pass the extra parameter `pos_label="donated"`. # %% # Write your code here. @@ -102,8 +100,8 @@ # %% [markdown] # `cross_val_score` will only compute a single score provided to the `scoring` # parameter. The function `cross_validate` allows the computation of multiple -# scores by passing a list of string or scorer to the parameter `scoring`, -# which could be handy. +# scores by passing a list of string or scorer to the parameter `scoring`, which +# could be handy. # # Import `sklearn.model_selection.cross_validate` and compute the accuracy and # balanced accuracy through cross-validation. 
Plot the cross-validation score diff --git a/python_scripts/metrics_ex_02.py b/python_scripts/metrics_ex_02.py index f0bcbd409..2bd3ec05c 100644 --- a/python_scripts/metrics_ex_02.py +++ b/python_scripts/metrics_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py index 893b18414..2d99c4bfb 100644 --- a/python_scripts/parameter_tuning_ex_02.py +++ b/python_scripts/parameter_tuning_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -18,10 +17,9 @@ # The goal is to write an exhaustive search to find the best parameters # combination maximizing the model generalization performance. # -# Here we use a small subset of the Adult Census dataset to make the code -# faster to execute. Once your code works on the small subset, try to -# change `train_size` to a larger value (e.g. 0.8 for 80% instead of -# 20%). +# Here we use a small subset of the Adult Census dataset to make the code faster +# to execute. Once your code works on the small subset, try to change +# `train_size` to a larger value (e.g. 0.8 for 80% instead of 20%). # %% import pandas as pd @@ -69,7 +67,6 @@ ) # %% [markdown] -# # Use the previously defined model (called `model`) and using two nested `for` # loops, make a search of the best combinations of the `learning_rate` and # `max_leaf_nodes` parameters. In this regard, you will need to train and test @@ -79,8 +76,8 @@ # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls # the ability of a new tree to correct the error of the previous sequence of # trees -# - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the -# depth of each tree. +# - `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth +# of each tree. # %% # Write your code here. diff --git a/python_scripts/parameter_tuning_ex_03.py b/python_scripts/parameter_tuning_ex_03.py index 25bc5f73e..48c9a5c41 100644 --- a/python_scripts/parameter_tuning_ex_03.py +++ b/python_scripts/parameter_tuning_ex_03.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -30,8 +29,8 @@ ) # %% [markdown] -# In this exercise, we will progressively define the regression pipeline -# and later tune its hyperparameters. +# In this exercise, we will progressively define the regression pipeline and +# later tune its hyperparameters. # # Start by defining a pipeline that: # * uses a `StandardScaler` to normalize the numerical data; @@ -48,8 +47,8 @@ # `np.logspace(0, 3, num=10).astype(np.int32)`; # - the parameter `with_mean` of the `StandardScaler` with possible values # `True` or `False`; -# - the parameter `with_std` of the `StandardScaler` with possible values -# `True` or `False`. +# - the parameter `with_std` of the `StandardScaler` with possible values `True` +# or `False`. 
# # Notice that in the notebook "Hyperparameter tuning by randomized-search" we # pass distributions to be sampled by the `RandomizedSearchCV`. In this case we diff --git a/python_scripts/trees_ex_01.py b/python_scripts/trees_ex_01.py index 6b2d8f4e5..ecfd6bf55 100644 --- a/python_scripts/trees_ex_01.py +++ b/python_scripts/trees_ex_01.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 diff --git a/python_scripts/trees_ex_02.py b/python_scripts/trees_ex_02.py index 58350c978..6c7d3b1b1 100644 --- a/python_scripts/trees_ex_02.py +++ b/python_scripts/trees_ex_02.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # --- # jupyter: # jupytext: @@ -6,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.11.5 +# jupytext_version: 1.14.5 # kernelspec: # display_name: Python 3 # name: python3 @@ -15,11 +14,11 @@ # %% [markdown] # # ๐Ÿ“ Exercise M5.02 # -# The aim of this exercise is to find out whether a decision tree -# model is able to extrapolate. +# The aim of this exercise is to find out whether a decision tree model is able +# to extrapolate. # -# By extrapolation, we refer to values predicted by a model outside of the -# range of feature values seen during the training. +# By extrapolation, we refer to values predicted by a model outside of the range +# of feature values seen during the training. # # We will first load the regression data. @@ -40,16 +39,16 @@ # %% [markdown] # First, create two models, a linear regression model and a decision tree -# regression model, and fit them on the training data. Limit the depth at -# 3 levels for the decision tree. +# regression model, and fit them on the training data. Limit the depth at 3 +# levels for the decision tree. # %% # Write your code here. # %% [markdown] -# Create a synthetic dataset containing all possible flipper length from -# the minimum to the maximum of the training dataset. Get the predictions of -# each model using this dataset. +# Create a synthetic dataset containing all possible flipper length from the +# minimum to the maximum of the training dataset. Get the predictions of each +# model using this dataset. # %% # Write your code here. @@ -63,9 +62,9 @@ # %% [markdown] # Now, we will check the extrapolation capabilities of each model. Create a -# dataset containing a broader range of values than your previous dataset, -# in other words, add values below and above the minimum and the maximum of -# the flipper length seen during training. +# dataset containing a broader range of values than your previous dataset, in +# other words, add values below and above the minimum and the maximum of the +# flipper length seen during training. # %% # Write your code here. 
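For the M5.02 extrapolation exercise above, a possible sketch is shown below; the dataset path and column names repeat the exercise's loading cell and should be treated as assumptions about the notebook state.

```python
# Hypothetical sketch for the extrapolation exercise.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Assumed loading step, following the exercise script.
penguins = pd.read_csv("../datasets/penguins_regression.csv")
feature_name = "Flipper Length (mm)"
data_train = penguins[[feature_name]]
target_train = penguins["Body Mass (g)"]

# Fit both models; the tree depth is limited to 3 levels as requested.
linear_regression = LinearRegression().fit(data_train, target_train)
tree = DecisionTreeRegressor(max_depth=3).fit(data_train, target_train)

# A synthetic range extending 30 mm below and above the training range to
# probe extrapolation: the tree predicts a constant value beyond the range
# seen during training, while the linear model keeps extending its line.
offset = 30
test_range = np.arange(
    data_train[feature_name].min() - offset,
    data_train[feature_name].max() + offset,
)
data_test = pd.DataFrame(test_range, columns=[feature_name])
predictions = pd.DataFrame(
    {
        "linear regression": linear_regression.predict(data_test),
        "decision tree": tree.predict(data_test),
    },
    index=data_test[feature_name],
)
print(predictions.head())
```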
From 6aa53743c88344e24a083bac29396139f618f0b4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 5 Jun 2023 15:45:14 +0200 Subject: [PATCH 028/108] MAINT: update build_tools/convert-python-script-to-notebook.py to myst-parser 1.0.0 (#695) --- .pre-commit-config.yaml | 6 +- .../convert-python-script-to-notebook.py | 25 +- notebooks/01_tabular_data_exploration.ipynb | 73 ++-- .../01_tabular_data_exploration_ex_01.ipynb | 1 - ..._numerical_pipeline_cross_validation.ipynb | 19 +- notebooks/02_numerical_pipeline_ex_00.ipynb | 12 +- notebooks/02_numerical_pipeline_ex_01.ipynb | 28 +- .../02_numerical_pipeline_hands_on.ipynb | 89 ++--- .../02_numerical_pipeline_introduction.ipynb | 103 +++--- notebooks/02_numerical_pipeline_scaling.ipynb | 126 +++---- notebooks/02_numerical_pipeline_sol_00.ipynb | 17 +- notebooks/02_numerical_pipeline_sol_01.ipynb | 54 +-- notebooks/03_categorical_pipeline.ipynb | 181 +++++----- ...egorical_pipeline_column_transformer.ipynb | 104 +++--- notebooks/03_categorical_pipeline_ex_01.ipynb | 30 +- notebooks/03_categorical_pipeline_ex_02.ipynb | 39 ++- .../03_categorical_pipeline_sol_01.ipynb | 63 ++-- .../03_categorical_pipeline_sol_02.ipynb | 112 ++++--- ...3_categorical_pipeline_visualization.ipynb | 65 ++-- notebooks/cross_validation_baseline.ipynb | 22 +- notebooks/cross_validation_ex_01.ipynb | 1 - notebooks/cross_validation_ex_02.ipynb | 29 +- notebooks/cross_validation_grouping.ipynb | 79 +++-- .../cross_validation_learning_curve.ipynb | 49 ++- notebooks/cross_validation_nested.ipynb | 74 ++-- notebooks/cross_validation_sol_01.ipynb | 15 +- notebooks/cross_validation_sol_02.ipynb | 75 +++-- .../cross_validation_stratification.ipynb | 87 ++--- notebooks/cross_validation_time.ipynb | 47 +-- notebooks/cross_validation_train_test.ipynb | 51 +-- .../cross_validation_validation_curve.ipynb | 116 ++++--- notebooks/datasets_ames_housing.ipynb | 38 ++- notebooks/datasets_bike_rides.ipynb | 110 +++--- notebooks/datasets_blood_transfusion.ipynb | 41 ++- notebooks/datasets_california_housing.ipynb | 98 +++--- notebooks/dev_features_importance.ipynb | 317 ++++++++++-------- notebooks/ensemble_adaboost.ipynb | 128 ++++--- notebooks/ensemble_bagging.ipynb | 217 +++++++----- notebooks/ensemble_ex_01.ipynb | 26 +- notebooks/ensemble_ex_02.ipynb | 9 +- notebooks/ensemble_ex_03.ipynb | 25 +- notebooks/ensemble_ex_04.ipynb | 29 +- notebooks/ensemble_gradient_boosting.ipynb | 205 ++++++----- .../ensemble_hist_gradient_boosting.ipynb | 125 ++++--- notebooks/ensemble_hyperparameters.ipynb | 35 +- notebooks/ensemble_introduction.ipynb | 66 ++-- notebooks/ensemble_random_forest.ipynb | 70 ++-- notebooks/ensemble_sol_01.ipynb | 41 +-- notebooks/ensemble_sol_02.ipynb | 33 +- notebooks/ensemble_sol_03.ipynb | 34 +- notebooks/ensemble_sol_04.ipynb | 58 ++-- notebooks/feature_selection_ex_01.ipynb | 35 +- .../feature_selection_introduction.ipynb | 58 ++-- .../feature_selection_limitation_model.ipynb | 27 +- notebooks/feature_selection_sol_01.ipynb | 52 ++- notebooks/linear_models_ex_01.ipynb | 21 +- notebooks/linear_models_ex_02.ipynb | 32 +- notebooks/linear_models_ex_03.ipynb | 22 +- notebooks/linear_models_ex_04.ipynb | 19 +- notebooks/linear_models_ex_05.ipynb | 22 +- notebooks/linear_models_regularization.ipynb | 277 ++++++++------- notebooks/linear_models_sol_01.ipynb | 41 +-- notebooks/linear_models_sol_02.ipynb | 41 +-- notebooks/linear_models_sol_03.ipynb | 44 ++- notebooks/linear_models_sol_04.ipynb | 78 +++-- notebooks/linear_models_sol_05.ipynb | 32 +- 
notebooks/linear_regression_in_sklearn.ipynb | 33 +- .../linear_regression_non_linear_link.ipynb | 124 +++---- .../linear_regression_without_sklearn.ipynb | 116 ++++--- notebooks/logistic_regression.ipynb | 52 +-- .../logistic_regression_non_linear.ipynb | 128 ++++--- notebooks/matplotlibrc | 2 +- notebooks/metrics_classification.ipynb | 155 +++++---- notebooks/metrics_ex_01.ipynb | 24 +- notebooks/metrics_ex_02.ipynb | 1 - notebooks/metrics_regression.ipynb | 112 ++++--- notebooks/metrics_sol_01.ipynb | 32 +- notebooks/metrics_sol_02.ipynb | 14 +- notebooks/parameter_tuning_ex_02.ipynb | 44 ++- notebooks/parameter_tuning_ex_03.ipynb | 12 +- notebooks/parameter_tuning_grid_search.ipynb | 137 ++++---- notebooks/parameter_tuning_manual.ipynb | 85 ++--- notebooks/parameter_tuning_nested.ipynb | 129 +++---- .../parameter_tuning_parallel_plot.ipynb | 53 +-- .../parameter_tuning_randomized_search.ipynb | 177 +++++----- notebooks/parameter_tuning_sol_02.ipynb | 66 ++-- notebooks/parameter_tuning_sol_03.ipynb | 44 ++- notebooks/trees_classification.ipynb | 93 ++--- notebooks/trees_dataset.ipynb | 30 +- notebooks/trees_ex_01.ipynb | 21 +- notebooks/trees_ex_02.ipynb | 25 +- notebooks/trees_hyperparameters.ipynb | 138 ++++---- notebooks/trees_regression.ipynb | 58 ++-- notebooks/trees_sol_01.ipynb | 53 +-- notebooks/trees_sol_02.ipynb | 71 ++-- 95 files changed, 3498 insertions(+), 2829 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bcea7193e..4c1606a68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,16 +3,14 @@ repos: rev: v4.4.0 hooks: - id: check-yaml - exclude: doc/ - id: end-of-file-fixer - exclude: doc/ + exclude: notebooks - id: trailing-whitespace - exclude: doc/ + exclude: notebooks - repo: https://github.com/psf/black rev: 23.1.0 hooks: - id: black - exclude: doc/ - repo: https://github.com/pycqa/flake8 rev: 4.0.1 hooks: diff --git a/build_tools/convert-python-script-to-notebook.py b/build_tools/convert-python-script-to-notebook.py index 02a24d358..1e45d1a6e 100644 --- a/build_tools/convert-python-script-to-notebook.py +++ b/build_tools/convert-python-script-to-notebook.py @@ -24,8 +24,9 @@ from bs4 import BeautifulSoup -from myst_parser.main import MdParserConfig, default_parser - +from myst_parser.parsers.mdit import create_md_parser +from myst_parser.config.main import MdParserConfig +from myst_parser.mdit_to_docutils.base import DocutilsRenderer import jupytext @@ -88,20 +89,23 @@ def admonition_html(doc): html_node = convert_to_html(doc, "div.admonition") bootstrap_class = sphinx_name_to_bootstrap[adm_node.tagname] html_node.attrs["class"] += [f"alert alert-{bootstrap_class}"] - html_node.select_one(".admonition-title").attrs["style"] = "font-weight: bold;" + html_node.select_one(".admonition-title").attrs[ + "style" + ] = "font-weight: bold;" return str(html_node) def replace_admonition_in_cell_source(cell_str): - """Returns cell source with admonition replaced by its generated HTML. 
- """ - config = MdParserConfig(renderer="docutils") - parser = default_parser(config) + """Returns cell source with admonition replaced by its generated HTML.""" + config = MdParserConfig() + parser = create_md_parser(config, renderer=DocutilsRenderer) tokens = parser.parse(cell_str) admonition_tokens = [ - t for t in tokens if t.type == "fence" and t.info in all_directive_names + t + for t in tokens + if t.type == "fence" and t.info in all_directive_names ] cell_lines = cell_str.splitlines() @@ -118,8 +122,7 @@ def replace_admonition_in_cell_source(cell_str): def replace_admonitions(nb): - """Replaces all admonitions by its generated HTML in a notebook object. - """ + """Replaces all admonitions by its generated HTML in a notebook object.""" # FIXME this would not work with advanced syntax for admonition with # ::: but we are not using it for now. We could parse all the markdowns # cell, a bit wasteful, but probably good enough @@ -137,7 +140,7 @@ def replace_admonitions(nb): def replace_escaped_dollars(nb): - """Replace escaped dollar to make Jupyter notebook interfaces happy. + r"""Replace escaped dollar to make Jupyter notebook interfaces happy. Jupyter interfaces wants \\$, JupyterBook wants \$. See https://github.com/jupyterlab/jupyterlab/issues/8645 for more details. diff --git a/notebooks/01_tabular_data_exploration.ipynb b/notebooks/01_tabular_data_exploration.ipynb index 140e9b8d0..2f47aadde 100644 --- a/notebooks/01_tabular_data_exploration.ipynb +++ b/notebooks/01_tabular_data_exploration.ipynb @@ -13,8 +13,8 @@ "* looking at the variables in the dataset, in particular, differentiate\n", " between numerical and categorical variables, which need different\n", " preprocessing in most machine learning workflows;\n", - "* visualizing the distribution of the variables to gain some insights into\n", - " the dataset." + "* visualizing the distribution of the variables to gain some insights into the\n", + " dataset." ] }, { @@ -67,9 +67,9 @@ "source": [ "## The variables (columns) in the dataset\n", "\n", - "The data are stored in a `pandas` dataframe. A dataframe is a type of structured\n", - "data composed of 2 dimensions. This type of data is also referred as tabular\n", - "data.\n", + "The data are stored in a `pandas` dataframe. A dataframe is a type of\n", + "structured data composed of 2 dimensions. This type of data is also referred\n", + "as tabular data.\n", "\n", "Each row represents a \"sample\". In the field of machine learning or\n", "descriptive statistics, commonly used equivalent terms are \"record\",\n", @@ -102,12 +102,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The column named **class** is our target variable (i.e., the variable which\n", - "we want to predict). The two possible classes are `<=50K` (low-revenue) and\n", - "`>50K` (high-revenue). The resulting prediction problem is therefore a\n", - "binary classification problem as `class` has only two possible values.\n", - "We will use the left-over columns (any column other than `class`) as input\n", - "variables for our model." + "The column named **class** is our target variable (i.e., the variable which we\n", + "want to predict). The two possible classes are `<=50K` (low-revenue) and\n", + "`>50K` (high-revenue). The resulting prediction problem is therefore a binary\n", + "classification problem as `class` has only two possible values. We will use\n", + "the left-over columns (any column other than `class`) as input variables for\n", + "our model." 
] }, { @@ -126,13 +126,13 @@ "source": [ "

" ] }, @@ -301,9 +301,9 @@ "real life setting.\n", "\n", "We recommend our readers to refer to [fairlearn.org](https://fairlearn.org)\n", - "for resources on how to quantify and potentially mitigate fairness\n", - "issues related to the deployment of automated decision making\n", - "systems that rely on machine learning components.\n", + "for resources on how to quantify and potentially mitigate fairness issues\n", + "related to the deployment of automated decision making systems that rely on\n", + "machine learning components.\n", "\n", "Studying why the data collection process of this dataset lead to such an\n", "unexpected gender imbalance is beyond the scope of this MOOC but we should\n", @@ -327,9 +327,9 @@ "lines_to_next_cell": 0 }, "source": [ - "As noted above, `\"education-num\"` distribution has two clear peaks around 10 and\n", - "13. It would be reasonable to expect that `\"education-num\"` is the number of\n", - "years of education.\n", + "As noted above, `\"education-num\"` distribution has two clear peaks around 10\n", + "and 13. It would be reasonable to expect that `\"education-num\"` is the number\n", + "of years of education.\n", "\n", "Let's look at the relationship between `\"education\"` and `\"education-num\"`." ] @@ -340,7 +340,9 @@ "metadata": {}, "outputs": [], "source": [ - "pd.crosstab(index=adult_census[\"education\"], columns=adult_census[\"education-num\"])" + "pd.crosstab(\n", + " index=adult_census[\"education\"], columns=adult_census[\"education-num\"]\n", + ")" ] }, { @@ -348,11 +350,12 @@ "metadata": {}, "source": [ "For every entry in `\\\"education\\\"`, there is only one single corresponding\n", - "value in `\\\"education-num\\\"`. This shows that `\"education\"` and `\"education-num\"`\n", - "give you the same information. For example, `\"education-num\"=2` is equivalent to\n", - "`\"education\"=\"1st-4th\"`. In practice that means we can remove\n", - "`\"education-num\"` without losing information. Note that having redundant (or\n", - "highly correlated) columns can be a problem for machine learning algorithms." + "value in `\\\"education-num\\\"`. This shows that `\"education\"` and\n", + "`\"education-num\"` give you the same information. For example,\n", + "`\"education-num\"=2` is equivalent to `\"education\"=\"1st-4th\"`. In practice that\n", + "means we can remove `\"education-num\"` without losing information. Note that\n", + "having redundant (or highly correlated) columns can be a problem for machine\n", + "learning algorithms." ] }, { @@ -461,7 +464,9 @@ "plt.axvline(x=age_limit, ymin=0, ymax=1, color=\"black\", linestyle=\"--\")\n", "\n", "hours_per_week_limit = 40\n", - "plt.axhline(y=hours_per_week_limit, xmin=0.18, xmax=1, color=\"black\", linestyle=\"--\")\n", + "plt.axhline(\n", + " y=hours_per_week_limit, xmin=0.18, xmax=1, color=\"black\", linestyle=\"--\"\n", + ")\n", "\n", "plt.annotate(\"<=50K\", (17, 25), rotation=90, fontsize=35)\n", "plt.annotate(\"<=50K\", (35, 20), fontsize=35)\n", @@ -488,10 +493,10 @@ "will choose the \"best\" splits based on data without human intervention or\n", "inspection. Decision trees will be covered more in detail in a future module.\n", "\n", - "Note that machine learning is often used when creating rules by hand\n", - "is not straightforward. 
For example because we are in high dimension (many\n", - "features in a table) or because there are no simple and obvious rules that\n", - "separate the two classes as in the top-right region of the previous plot.\n", + "Note that machine learning is often used when creating rules by hand is not\n", + "straightforward. For example because we are in high dimension (many features\n", + "in a table) or because there are no simple and obvious rules that separate the\n", + "two classes as in the top-right region of the previous plot.\n", "\n", "To sum up, the important thing to remember is that in a machine-learning\n", "setting, a model automatically creates the \"rules\" from the existing data in\n", diff --git a/notebooks/01_tabular_data_exploration_ex_01.ipynb b/notebooks/01_tabular_data_exploration_ex_01.ipynb index 4b06715e0..040c50c82 100644 --- a/notebooks/01_tabular_data_exploration_ex_01.ipynb +++ b/notebooks/01_tabular_data_exploration_ex_01.ipynb @@ -116,7 +116,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/02_numerical_pipeline_cross_validation.ipynb b/notebooks/02_numerical_pipeline_cross_validation.ipynb index dcf3c357b..c7422f698 100644 --- a/notebooks/02_numerical_pipeline_cross_validation.ipynb +++ b/notebooks/02_numerical_pipeline_cross_validation.ipynb @@ -118,10 +118,11 @@ "
<div class=\"admonition note alert alert-info\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
    "<p class=\"last\">
This figure shows the particular case of K-fold cross-validation strategy.\n", - "For each cross-validation split, the procedure trains a clone of model on all the red\n", - "samples and evaluate the score of the model on the blue samples.\n", - "As mentioned earlier, there is a variety of different cross-validation\n", - "strategies. Some of these aspects will be covered in more detail in future notebooks.
</p>\n",
    "</div>
\n", "\n", "Cross-validation is therefore computationally intensive because it requires\n", @@ -153,8 +154,10 @@ "source": [ "The output of `cross_validate` is a Python dictionary, which by default\n", "contains three entries:\n", - "- (i) the time to train the model on the training data for each fold, `fit_time`\n", - "- (ii) the time to predict with the model on the testing data for each fold, `score_time`\n", + "- (i) the time to train the model on the training data for each fold,\n", + " `fit_time`\n", + "- (ii) the time to predict with the model on the testing data for each fold,\n", + " `score_time`\n", "- (iii) the default score on the testing data for each fold, `test_score`.\n", "\n", "Setting `cv=5` created 5 distinct splits to get 5 variations for the training\n", @@ -203,8 +206,8 @@ "we can estimate the uncertainty of our model generalization performance. This\n", "is the main advantage of cross-validation and can be crucial in practice, for\n", "example when comparing different models to figure out whether one is better\n", - "than the other or whether our measures of the generalization performance of each\n", - "model are within the error bars of one-another.\n", + "than the other or whether our measures of the generalization performance of\n", + "each model are within the error bars of one-another.\n", "\n", "In this particular case, only the first 2 decimals seem to be trustworthy. If\n", "you go up in this notebook, you can check that the performance we get with\n", diff --git a/notebooks/02_numerical_pipeline_ex_00.ipynb b/notebooks/02_numerical_pipeline_ex_00.ipynb index e9632c86e..ef7d6b923 100644 --- a/notebooks/02_numerical_pipeline_ex_00.ipynb +++ b/notebooks/02_numerical_pipeline_ex_00.ipynb @@ -25,6 +25,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "adult_census = pd.read_csv(\"../datasets/adult-census-numeric.csv\")\n", "data = adult_census.drop(columns=\"class\")\n", "target = adult_census[\"class\"]" @@ -36,12 +37,12 @@ "source": [ "In the previous notebook we used `model = KNeighborsClassifier()`. All\n", "scikit-learn models can be created without arguments. This is convenient\n", - "because it means that you don't need to understand the full details of a\n", - "model before starting to use it.\n", + "because it means that you don't need to understand the full details of a model\n", + "before starting to use it.\n", "\n", - "One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls\n", - "the number of neighbors we are going to use to make a prediction for a new\n", - "data point.\n", + "One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the\n", + "number of neighbors we are going to use to make a prediction for a new data\n", + "point.\n", "\n", "What is the default value of the `n_neighbors` parameter? Hint: Look at the\n", "documentation on the [scikit-learn\n", @@ -146,7 +147,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/02_numerical_pipeline_ex_01.ipynb b/notebooks/02_numerical_pipeline_ex_01.ipynb index 0f8f4da1e..688f435e6 100644 --- a/notebooks/02_numerical_pipeline_ex_01.ipynb +++ b/notebooks/02_numerical_pipeline_ex_01.ipynb @@ -7,19 +7,19 @@ "# \ud83d\udcdd Exercise M1.03\n", "\n", "The goal of this exercise is to compare the performance of our classifier in\n", - "the previous notebook (roughly 81% accuracy with `LogisticRegression`) to\n", - "some simple baseline classifiers. 
The simplest baseline classifier is one\n", - "that always predicts the same class, irrespective of the input data.\n", + "the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some\n", + "simple baseline classifiers. The simplest baseline classifier is one that\n", + "always predicts the same class, irrespective of the input data.\n", "\n", "- What would be the score of a model that always predicts `' >50K'`?\n", "- What would be the score of a model that always predicts `' <=50K'`?\n", "- Is 81% or 82% accuracy a good score for this problem?\n", "\n", - "Use a `DummyClassifier` and do a train-test split to evaluate\n", - "its accuracy on the test set. This\n", + "Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on\n", + "the test set. This\n", "[link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators)\n", - "shows a few examples of how to evaluate the generalization performance of these\n", - "baseline models." + "shows a few examples of how to evaluate the generalization performance of\n", + "these baseline models." ] }, { @@ -66,8 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "numerical_columns = [\n", - " \"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", + "numerical_columns = [\"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", "\n", "data_numeric = data[numerical_columns]" ] @@ -94,12 +93,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use a `DummyClassifier` such that the resulting classifier will always\n", - "predict the class `' >50K'`. What is the accuracy score on the test set?\n", - "Repeat the experiment by always predicting the class `' <=50K'`.\n", + "Use a `DummyClassifier` such that the resulting classifier will always predict\n", + "the class `' >50K'`. What is the accuracy score on the test set? Repeat the\n", + "experiment by always predicting the class `' <=50K'`.\n", "\n", - "Hint: you can set the `strategy` parameter of the `DummyClassifier` to\n", - "achieve the desired behavior." + "Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve\n", + "the desired behavior." ] }, { @@ -116,7 +115,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/02_numerical_pipeline_hands_on.ipynb b/notebooks/02_numerical_pipeline_hands_on.ipynb index f2048fe3d..326bc0aad 100644 --- a/notebooks/02_numerical_pipeline_hands_on.ipynb +++ b/notebooks/02_numerical_pipeline_hands_on.ipynb @@ -6,12 +6,11 @@ "source": [ "# Working with numerical data\n", "\n", - "In the previous notebook, we trained a k-nearest neighbors model on\n", - "some data.\n", + "In the previous notebook, we trained a k-nearest neighbors model on some data.\n", "\n", "However, we oversimplified the procedure by loading a dataset that contained\n", - "exclusively numerical data. Besides, we used datasets which were already\n", - "split into train-test sets.\n", + "exclusively numerical data. Besides, we used datasets which were already split\n", + "into train-test sets.\n", "\n", "In this notebook, we aim at:\n", "\n", @@ -25,8 +24,8 @@ "\n", "## Loading the entire dataset\n", "\n", - "As in the previous notebook, we rely on pandas to open the CSV file into\n", - "a pandas dataframe." + "As in the previous notebook, we rely on pandas to open the CSV file into a\n", + "pandas dataframe." 
] }, { @@ -94,18 +93,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "At this point, we can focus on the data we want to use to train our\n", - "predictive model.\n", + "At this point, we can focus on the data we want to use to train our predictive\n", + "model.\n", "\n", "## Identify numerical data\n", "\n", "Numerical data are represented with numbers. They are linked to measurable\n", - "(quantitative) data, such as age or the number of hours a person works a\n", - "week.\n", + "(quantitative) data, such as age or the number of hours a person works a week.\n", "\n", - "Predictive models are natively designed to work with numerical data.\n", - "Moreover, numerical data usually requires very little work before getting\n", - "started with training.\n", + "Predictive models are natively designed to work with numerical data. Moreover,\n", + "numerical data usually requires very little work before getting started with\n", + "training.\n", "\n", "The first task here will be to identify numerical data in our dataset.\n", "\n", @@ -132,8 +130,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We seem to have only two data types: `int64` and `object`. We can make\n", - "sure by checking for unique data types." + "We seem to have only two data types: `int64` and `object`. We can make sure by\n", + "checking for unique data types." ] }, { @@ -149,9 +147,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Indeed, the only two types in the dataset are integer `int64` and `object`.\n", - "We can look at the first few lines of the dataframe to understand the\n", - "meaning of the `object` data type." + "Indeed, the only two types in the dataset are integer `int64` and `object`. We\n", + "can look at the first few lines of the dataframe to understand the meaning of\n", + "the `object` data type." ] }, { @@ -187,9 +185,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we limited the dataset to numerical columns only,\n", - "we can analyse these numbers to figure out what they represent. We can\n", - "identify two types of usage.\n", + "Now that we limited the dataset to numerical columns only, we can analyse\n", + "these numbers to figure out what they represent. We can identify two types of\n", + "usage.\n", "\n", "The first column, `\"age\"`, is self-explanatory. We can note that the values\n", "are continuous, meaning they can take up any number in a given range. Let's\n", @@ -252,7 +250,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data_numeric, target, random_state=42, test_size=0.25)" + " data_numeric, target, random_state=42, test_size=0.25\n", + ")" ] }, { @@ -274,8 +273,8 @@ "source": [ "When calling the function `train_test_split`, we specified that we would like\n", "to have 25% of samples in the testing set while the remaining samples (75%)\n", - "will be available in the training set. We can check quickly if we got\n", - "what we expected." + "will be available in the training set. We can check quickly if we got what we\n", + "expected." 
] }, { @@ -284,9 +283,11 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Number of samples in testing: {data_test.shape[0]} => \"\n", - " f\"{data_test.shape[0] / data_numeric.shape[0] * 100:.1f}% of the\"\n", - " f\" original set\")" + "print(\n", + " f\"Number of samples in testing: {data_test.shape[0]} => \"\n", + " f\"{data_test.shape[0] / data_numeric.shape[0] * 100:.1f}% of the\"\n", + " \" original set\"\n", + ")" ] }, { @@ -295,9 +296,11 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Number of samples in training: {data_train.shape[0]} => \"\n", - " f\"{data_train.shape[0] / data_numeric.shape[0] * 100:.1f}% of the\"\n", - " f\" original set\")" + "print(\n", + " f\"Number of samples in training: {data_train.shape[0]} => \"\n", + " f\"{data_train.shape[0] / data_numeric.shape[0] * 100:.1f}% of the\"\n", + " \" original set\"\n", + ")" ] }, { @@ -342,10 +345,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that the model has been created, you can use it exactly the same way as\n", - "we used the k-nearest neighbors model in the previous notebook. In\n", - "particular, we can use the `fit` method to train the model using the training\n", - "data and labels:" + "Now that the model has been created, you can use it exactly the same way as we\n", + "used the k-nearest neighbors model in the previous notebook. In particular, we\n", + "can use the `fit` method to train the model using the training data and\n", + "labels:" ] }, { @@ -361,8 +364,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also use the `score` method to check the model generalization performance\n", - "on the test set." + "We can also use the `score` method to check the model generalization\n", + "performance on the test set." ] }, { @@ -381,18 +384,18 @@ "source": [ "## Notebook recap\n", "\n", - "In scikit-learn, the `score` method of a classification model returns the accuracy,\n", - "i.e. the fraction of correctly classified samples. In this case, around\n", - "8 / 10 of the times the logistic regression predicts the right income of a\n", - "person. Now the real question is: is this generalization performance relevant\n", - "of a good predictive model? Find out by solving the next exercise!\n", + "In scikit-learn, the `score` method of a classification model returns the\n", + "accuracy, i.e. the fraction of correctly classified samples. In this case,\n", + "around 8 / 10 of the times the logistic regression predicts the right income\n", + "of a person. Now the real question is: is this generalization performance\n", + "relevant of a good predictive model? Find out by solving the next exercise!\n", "\n", "In this notebook, we learned to:\n", "\n", "* identify numerical data in a heterogeneous dataset;\n", "* select the subset of columns corresponding to numerical data;\n", - "* use the scikit-learn `train_test_split` function to separate data into\n", - " a train and a test set;\n", + "* use the scikit-learn `train_test_split` function to separate data into a\n", + " train and a test set;\n", "* train and evaluate a logistic regression model." 
] } diff --git a/notebooks/02_numerical_pipeline_introduction.ipynb b/notebooks/02_numerical_pipeline_introduction.ipynb index 5f96530cb..0e39aa5b1 100644 --- a/notebooks/02_numerical_pipeline_introduction.ipynb +++ b/notebooks/02_numerical_pipeline_introduction.ipynb @@ -22,8 +22,8 @@ ".\n", "\n", "Numerical data is the most natural type of data used in machine learning and\n", - "can (almost) directly be fed into predictive models. We will load a\n", - "subset of the original data with only the numerical columns." + "can (almost) directly be fed into predictive models. We will load a subset of\n", + "the original data with only the numerical columns." ] }, { @@ -58,10 +58,9 @@ "metadata": {}, "source": [ "We see that this CSV file contains all information: the target that we would\n", - "like to predict (i.e. `\"class\"`) and the data that we want to use to train\n", - "our predictive model (i.e. the remaining columns). The first step is to\n", - "separate columns to get on one side the target and on the other side the\n", - "data.\n", + "like to predict (i.e. `\"class\"`) and the data that we want to use to train our\n", + "predictive model (i.e. the remaining columns). The first step is to separate\n", + "columns to get on one side the target and on the other side the data.\n", "\n", "## Separate the data and the target" ] @@ -83,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = adult_census.drop(columns=[target_name, ])\n", + "data = adult_census.drop(columns=[target_name])\n", "data.head()" ] }, @@ -111,8 +110,10 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"The dataset contains {data.shape[0]} samples and \"\n", - " f\"{data.shape[1]} features\")" + "print(\n", + " f\"The dataset contains {data.shape[0]} samples and \"\n", + " f\"{data.shape[1]} features\"\n", + ")" ] }, { @@ -121,10 +122,10 @@ "source": [ "## Fit a model and make predictions\n", "\n", - "We will build a classification model using the \"K-nearest neighbors\"\n", - "strategy. To predict the target of a new sample, a k-nearest neighbors takes\n", - "into account its `k` closest samples in the training set and predicts the\n", - "majority target of these samples.\n", + "We will build a classification model using the \"K-nearest neighbors\" strategy.\n", + "To predict the target of a new sample, a k-nearest neighbors takes into\n", + "account its `k` closest samples in the training set and predicts the majority\n", + "target of these samples.\n", "\n", "
<div class=\"admonition caution alert alert-warning\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Caution!</p>
\n", @@ -157,11 +158,11 @@ "\n", "![Predictor fit diagram](../figures/api_diagram-predictor.fit.svg)\n", "\n", - "The method `fit` is composed of two elements: (i) a **learning algorithm**\n", - "and (ii) some **model states**. The learning algorithm takes the training\n", - "data and training target as input and sets the model states. These model\n", - "states will be used later to either predict (for classifiers and regressors)\n", - "or transform data (for transformers).\n", + "The method `fit` is composed of two elements: (i) a **learning algorithm** and\n", + "(ii) some **model states**. The learning algorithm takes the training data and\n", + "training target as input and sets the model states. These model states will be\n", + "used later to either predict (for classifiers and regressors) or transform\n", + "data (for transformers).\n", "\n", "Both the learning algorithm and the type of model states are specific to each\n", "type of model." @@ -212,8 +213,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's now have a look at the computed predictions. For the sake of\n", - "simplicity, we will look at the five first predicted targets." + "Let's now have a look at the computed predictions. For the sake of simplicity,\n", + "we will look at the five first predicted targets." ] }, { @@ -263,8 +264,10 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Number of correct prediction: \"\n", - " f\"{(target[:5] == target_predicted[:5]).sum()} / 5\")" + "print(\n", + " \"Number of correct prediction: \"\n", + " f\"{(target[:5] == target_predicted[:5]).sum()} / 5\"\n", + ")" ] }, { @@ -290,26 +293,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This result means that the model makes a correct prediction for\n", - "approximately 82 samples out of 100. Note that we used the same data\n", - "to train and evaluate our model. Can this evaluation be trusted or is\n", - "it too good to be true?\n", + "This result means that the model makes a correct prediction for approximately\n", + "82 samples out of 100. Note that we used the same data to train and evaluate\n", + "our model. Can this evaluation be trusted or is it too good to be true?\n", "\n", "## Train-test data split\n", "\n", "When building a machine learning model, it is important to evaluate the\n", "trained model on data that was not used to fit it, as **generalization** is\n", "more than memorization (meaning we want a rule that generalizes to new data,\n", - "without comparing to data we memorized).\n", - "It is harder to conclude on never-seen instances than on already seen ones.\n", + "without comparing to data we memorized). It is harder to conclude on\n", + "never-seen instances than on already seen ones.\n", "\n", "Correct evaluation is easily done by leaving out a subset of the data when\n", - "training the model and using it afterwards for model evaluation.\n", - "The data used to fit a model is called training data while the data used to\n", - "assess a model is called testing data.\n", + "training the model and using it afterwards for model evaluation. The data used\n", + "to fit a model is called training data while the data used to assess a model\n", + "is called testing data.\n", "\n", - "We can load more data, which was actually left-out from the original data\n", - "set." + "We can load more data, which was actually left-out from the original data set." 
] }, { @@ -318,7 +319,7 @@ "metadata": {}, "outputs": [], "source": [ - "adult_census_test = pd.read_csv('../datasets/adult-census-numeric-test.csv')" + "adult_census_test = pd.read_csv(\"../datasets/adult-census-numeric-test.csv\")" ] }, { @@ -336,7 +337,7 @@ "outputs": [], "source": [ "target_test = adult_census_test[target_name]\n", - "data_test = adult_census_test.drop(columns=[target_name, ])" + "data_test = adult_census_test.drop(columns=[target_name])" ] }, { @@ -352,18 +353,19 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"The testing dataset contains {data_test.shape[0]} samples and \"\n", - " f\"{data_test.shape[1]} features\")" + "print(\n", + " f\"The testing dataset contains {data_test.shape[0]} samples and \"\n", + " f\"{data_test.shape[1]} features\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Instead of computing the prediction and manually computing the average\n", - "success rate, we can use the method `score`. When dealing with classifiers\n", - "this method returns their performance metric." + "Instead of computing the prediction and manually computing the average success\n", + "rate, we can use the method `score`. When dealing with classifiers this method\n", + "returns their performance metric." ] }, { @@ -375,8 +377,7 @@ "accuracy = model.score(data_test, target_test)\n", "model_name = model.__class__.__name__\n", "\n", - "print(f\"The test accuracy using a {model_name} is \"\n", - " f\"{accuracy:.3f}\")" + "print(f\"The test accuracy using a {model_name} is {accuracy:.3f}\")" ] }, { @@ -387,18 +388,18 @@ "\n", "![Predictor score diagram](../figures/api_diagram-predictor.score.svg)\n", "\n", - "To compute the score, the predictor first computes the predictions (using\n", - "the `predict` method) and then uses a scoring function to compare the\n", - "true target `y` and the predictions. Finally, the score is returned." + "To compute the score, the predictor first computes the predictions (using the\n", + "`predict` method) and then uses a scoring function to compare the true target\n", + "`y` and the predictions. Finally, the score is returned." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If we compare with the accuracy obtained by wrongly evaluating the model\n", - "on the training set, we find that this evaluation was indeed optimistic\n", - "compared to the score obtained on a held-out test set.\n", + "If we compare with the accuracy obtained by wrongly evaluating the model on\n", + "the training set, we find that this evaluation was indeed optimistic compared\n", + "to the score obtained on a held-out test set.\n", "\n", "It shows the importance to always testing the generalization performance of\n", "predictive models on a different set than the one used to train these models.\n", @@ -433,8 +434,8 @@ "* fitted a **k-nearest neighbors** model on a training dataset;\n", "* evaluated its generalization performance on the testing data;\n", "* introduced the scikit-learn API `.fit(X, y)` (to train a model),\n", - " `.predict(X)` (to make predictions) and `.score(X, y)`\n", - " (to evaluate a model)." + " `.predict(X)` (to make predictions) and `.score(X, y)` (to evaluate a\n", + " model)." 
] } ], diff --git a/notebooks/02_numerical_pipeline_scaling.ipynb b/notebooks/02_numerical_pipeline_scaling.ipynb index 00fd4a929..d695c632b 100644 --- a/notebooks/02_numerical_pipeline_scaling.ipynb +++ b/notebooks/02_numerical_pipeline_scaling.ipynb @@ -11,8 +11,7 @@ "We will introduce these new aspects:\n", "\n", "* an example of preprocessing, namely **scaling numerical variables**;\n", - "* using a scikit-learn **pipeline** to chain preprocessing and model\n", - " training.\n", + "* using a scikit-learn **pipeline** to chain preprocessing and model training.\n", "\n", "## Data preparation\n", "\n", @@ -34,8 +33,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will now drop the target from the data we will use to train our\n", - "predictive model." + "We will now drop the target from the data we will use to train our predictive\n", + "model." ] }, { @@ -53,8 +52,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we select only the numerical columns, as seen in the previous\n", - "notebook." + "Then, we select only the numerical columns, as seen in the previous notebook." ] }, { @@ -63,8 +61,7 @@ "metadata": {}, "outputs": [], "source": [ - "numerical_columns = [\n", - " \"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", + "numerical_columns = [\"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", "\n", "data_numeric = data[numerical_columns]" ] @@ -85,7 +82,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data_numeric, target, random_state=42)" + " data_numeric, target, random_state=42\n", + ")" ] }, { @@ -94,10 +92,10 @@ "source": [ "## Model fitting with preprocessing\n", "\n", - "A range of preprocessing algorithms in scikit-learn allow us to transform\n", - "the input data before training a model. In our case, we will standardize the\n", - "data and then train a new logistic regression model on that new version of\n", - "the dataset.\n", + "A range of preprocessing algorithms in scikit-learn allow us to transform the\n", + "input data before training a model. In our case, we will standardize the data\n", + "and then train a new logistic regression model on that new version of the\n", + "dataset.\n", "\n", "Let's start by printing some statistics about the training data." ] @@ -134,8 +132,8 @@ "\n", "Whether or not a machine learning model requires scaling the features depends\n", "on the model family. Linear models such as logistic regression generally\n", - "benefit from scaling the features while other models such as decision trees\n", - "do not need such preprocessing (but will not suffer from it).\n", + "benefit from scaling the features while other models such as decision trees do\n", + "not need such preprocessing (but will not suffer from it).\n", "\n", "We show how to apply such normalization using a scikit-learn transformer\n", "called `StandardScaler`. This transformer shifts and scales each feature\n", @@ -172,8 +170,8 @@ "![Transformer fit diagram](../figures/api_diagram-transformer.fit.svg)\n", "\n", "In this case, the algorithm needs to compute the mean and standard deviation\n", - "for each feature and store them into some NumPy arrays. Here, these\n", - "statistics are the model states.\n", + "for each feature and store them into some NumPy arrays. Here, these statistics\n", + "are the model states.\n", "\n", "
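A small sketch of the transformation itself (with a toy array, not the census data): the model states stored by `fit` are the column-wise mean and standard deviation, and `transform` applies `(X - mean) / std` feature by feature.

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])

scaler = StandardScaler().fit(X)
print(scaler.mean_, scaler.scale_)  # the model states

# transform shifts by the mean and scales by the standard deviation
np.testing.assert_allclose(
    scaler.transform(X), (X - X.mean(axis=0)) / X.std(axis=0)
)
```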
\n", "

Note

\n", @@ -245,10 +243,11 @@ "Let's illustrate the internal mechanism of the `transform` method and put it\n", "to perspective with what we already saw with predictors.\n", "\n", - "![Transformer transform diagram](../figures/api_diagram-transformer.transform.svg)\n", + "![Transformer transform\n", + "diagram](../figures/api_diagram-transformer.transform.svg)\n", "\n", - "The `transform` method for transformers is similar to the `predict` method\n", - "for predictors. It uses a predefined function, called a **transformation\n", + "The `transform` method for transformers is similar to the `predict` method for\n", + "predictors. It uses a predefined function, called a **transformation\n", "function**, and uses the model states and the input data. However, instead of\n", "outputting predictions, the job of the `transform` method is to output a\n", "transformed version of the input data." @@ -258,8 +257,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, the method `fit_transform` is a shorthand method to call\n", - "successively `fit` and then `transform`.\n", + "Finally, the method `fit_transform` is a shorthand method to call successively\n", + "`fit` and then `transform`.\n", "\n", "![Transformer fit_transform diagram](../figures/api_diagram-transformer.fit_transform.svg)" ] @@ -280,8 +279,7 @@ "metadata": {}, "outputs": [], "source": [ - "data_train_scaled = pd.DataFrame(data_train_scaled,\n", - " columns=data_train.columns)\n", + "data_train_scaled = pd.DataFrame(data_train_scaled, columns=data_train.columns)\n", "data_train_scaled.describe()" ] }, @@ -289,12 +287,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice that the mean of all the columns is close to 0 and the standard deviation\n", - "in all cases is close to 1.\n", - "We can also visualize the effect of `StandardScaler` using a jointplot to show\n", - "both the histograms of the distributions and a scatterplot of any pair of numerical\n", - "features at the same time. We can observe that `StandardScaler` does not change\n", - "the structure of the data itself but the axes get shifted and scaled." + "Notice that the mean of all the columns is close to 0 and the standard\n", + "deviation in all cases is close to 1. We can also visualize the effect of\n", + "`StandardScaler` using a jointplot to show both the histograms of the\n", + "distributions and a scatterplot of any pair of numerical features at the same\n", + "time. We can observe that `StandardScaler` does not change the structure of\n", + "the data itself but the axes get shifted and scaled." 
] }, { @@ -303,30 +301,42 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", + "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# number of points to visualize to have a clearer plot\n", "num_points_to_plot = 300\n", "\n", - "sns.jointplot(data=data_train[:num_points_to_plot], x=\"age\",\n", - " y=\"hours-per-week\", marginal_kws=dict(bins=15))\n", - "plt.suptitle(\"Jointplot of 'age' vs 'hours-per-week' \\nbefore StandardScaler\", y=1.1)\n", + "sns.jointplot(\n", + " data=data_train[:num_points_to_plot],\n", + " x=\"age\",\n", + " y=\"hours-per-week\",\n", + " marginal_kws=dict(bins=15),\n", + ")\n", + "plt.suptitle(\n", + " \"Jointplot of 'age' vs 'hours-per-week' \\nbefore StandardScaler\", y=1.1\n", + ")\n", "\n", - "sns.jointplot(data=data_train_scaled[:num_points_to_plot], x=\"age\",\n", - " y=\"hours-per-week\", marginal_kws=dict(bins=15))\n", - "_ = plt.suptitle(\"Jointplot of 'age' vs 'hours-per-week' \\nafter StandardScaler\", y=1.1)" + "sns.jointplot(\n", + " data=data_train_scaled[:num_points_to_plot],\n", + " x=\"age\",\n", + " y=\"hours-per-week\",\n", + " marginal_kws=dict(bins=15),\n", + ")\n", + "_ = plt.suptitle(\n", + " \"Jointplot of 'age' vs 'hours-per-week' \\nafter StandardScaler\", y=1.1\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can easily combine sequential operations with a scikit-learn\n", - "`Pipeline`, which chains together operations and is used as any other\n", - "classifier or regressor. The helper function `make_pipeline` will create a\n", - "`Pipeline`: it takes as arguments the successive transformations to perform,\n", - "followed by the classifier or regressor model." + "We can easily combine sequential operations with a scikit-learn `Pipeline`,\n", + "which chains together operations and is used as any other classifier or\n", + "regressor. The helper function `make_pipeline` will create a `Pipeline`: it\n", + "takes as arguments the successive transformations to perform, followed by the\n", + "classifier or regressor model." ] }, { @@ -367,8 +377,8 @@ "metadata": {}, "source": [ "This predictive pipeline exposes the same methods as the final predictor:\n", - "`fit` and `predict` (and additionally `predict_proba`, `decision_function`,\n", - "or `score`)." + "`fit` and `predict` (and additionally `predict_proba`, `decision_function`, or\n", + "`score`)." ] }, { @@ -386,8 +396,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can represent the internal mechanism of a pipeline when calling `fit`\n", - "by the following diagram:\n", + "We can represent the internal mechanism of a pipeline when calling `fit` by\n", + "the following diagram:\n", "\n", "![pipeline fit diagram](../figures/api_diagram-pipeline.fit.svg)\n", "\n", @@ -426,8 +436,8 @@ "the predictor that will output the predicted target by calling its method\n", "`predict`.\n", "\n", - "As a shorthand, we can check the score of the full predictive pipeline\n", - "calling the method `model.score`. Thus, let's check the computational and\n", + "As a shorthand, we can check the score of the full predictive pipeline calling\n", + "the method `model.score`. Thus, let's check the computational and\n", "generalization performance of such a predictive pipeline." 
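A minimal sketch of the pipeline mechanism illustrated by the diagrams above (synthetic data; a fitted `Pipeline` can be indexed like a list to reach its steps): `predict` on the pipeline is the composition of the scaler's `transform` and the final predictor's `predict`.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))
y = (X[:, 0] > 0).astype(int)

model = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y)

# chain the steps by hand and compare with the pipeline's own predict
manual = model[-1].predict(model[0].transform(X))
np.testing.assert_array_equal(model.predict(X), manual)
```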
] }, @@ -439,17 +449,19 @@ "source": [ "model_name = model.__class__.__name__\n", "score = model.score(data_test, target_test)\n", - "print(f\"The accuracy using a {model_name} is {score:.3f} \"\n", - " f\"with a fitting time of {elapsed_time:.3f} seconds \"\n", - " f\"in {model[-1].n_iter_[0]} iterations\")" + "print(\n", + " f\"The accuracy using a {model_name} is {score:.3f} \"\n", + " f\"with a fitting time of {elapsed_time:.3f} seconds \"\n", + " f\"in {model[-1].n_iter_[0]} iterations\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We could compare this predictive model with the predictive model used in\n", - "the previous notebook which did not scale features." + "We could compare this predictive model with the predictive model used in the\n", + "previous notebook which did not scale features." ] }, { @@ -472,9 +484,11 @@ "source": [ "model_name = model.__class__.__name__\n", "score = model.score(data_test, target_test)\n", - "print(f\"The accuracy using a {model_name} is {score:.3f} \"\n", - " f\"with a fitting time of {elapsed_time:.3f} seconds \"\n", - " f\"in {model.n_iter_[0]} iterations\")" + "print(\n", + " f\"The accuracy using a {model_name} is {score:.3f} \"\n", + " f\"with a fitting time of {elapsed_time:.3f} seconds \"\n", + " f\"in {model.n_iter_[0]} iterations\"\n", + ")" ] }, { diff --git a/notebooks/02_numerical_pipeline_sol_00.ipynb b/notebooks/02_numerical_pipeline_sol_00.ipynb index 7fdcc8dba..ff144d5c0 100644 --- a/notebooks/02_numerical_pipeline_sol_00.ipynb +++ b/notebooks/02_numerical_pipeline_sol_00.ipynb @@ -25,6 +25,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "adult_census = pd.read_csv(\"../datasets/adult-census-numeric.csv\")\n", "data = adult_census.drop(columns=\"class\")\n", "target = adult_census[\"class\"]" @@ -36,12 +37,12 @@ "source": [ "In the previous notebook we used `model = KNeighborsClassifier()`. All\n", "scikit-learn models can be created without arguments. This is convenient\n", - "because it means that you don't need to understand the full details of a\n", - "model before starting to use it.\n", + "because it means that you don't need to understand the full details of a model\n", + "before starting to use it.\n", "\n", - "One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls\n", - "the number of neighbors we are going to use to make a prediction for a new\n", - "data point.\n", + "One of the `KNeighborsClassifier` parameters is `n_neighbors`. It controls the\n", + "number of neighbors we are going to use to make a prediction for a new data\n", + "point.\n", "\n", "What is the default value of the `n_neighbors` parameter? 
Hint: Look at the\n", "documentation on the [scikit-learn\n", @@ -151,11 +152,13 @@ "outputs": [], "source": [ "number_of_correct_predictions = (\n", - " first_predictions == first_target_values).sum()\n", + " first_predictions == first_target_values\n", + ").sum()\n", "number_of_predictions = len(first_predictions)\n", "print(\n", " f\"{number_of_correct_predictions}/{number_of_predictions} \"\n", - " \"of predictions are correct\")" + " \"of predictions are correct\"\n", + ")" ] }, { diff --git a/notebooks/02_numerical_pipeline_sol_01.ipynb b/notebooks/02_numerical_pipeline_sol_01.ipynb index 5008d3454..2198c76b8 100644 --- a/notebooks/02_numerical_pipeline_sol_01.ipynb +++ b/notebooks/02_numerical_pipeline_sol_01.ipynb @@ -7,19 +7,19 @@ "# \ud83d\udcc3 Solution for Exercise M1.03\n", "\n", "The goal of this exercise is to compare the performance of our classifier in\n", - "the previous notebook (roughly 81% accuracy with `LogisticRegression`) to\n", - "some simple baseline classifiers. The simplest baseline classifier is one\n", - "that always predicts the same class, irrespective of the input data.\n", + "the previous notebook (roughly 81% accuracy with `LogisticRegression`) to some\n", + "simple baseline classifiers. The simplest baseline classifier is one that\n", + "always predicts the same class, irrespective of the input data.\n", "\n", "- What would be the score of a model that always predicts `' >50K'`?\n", "- What would be the score of a model that always predicts `' <=50K'`?\n", "- Is 81% or 82% accuracy a good score for this problem?\n", "\n", - "Use a `DummyClassifier` and do a train-test split to evaluate\n", - "its accuracy on the test set. This\n", + "Use a `DummyClassifier` and do a train-test split to evaluate its accuracy on\n", + "the test set. This\n", "[link](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators)\n", - "shows a few examples of how to evaluate the generalization performance of these\n", - "baseline models." + "shows a few examples of how to evaluate the generalization performance of\n", + "these baseline models." ] }, { @@ -66,8 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "numerical_columns = [\n", - " \"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", + "numerical_columns = [\"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", "\n", "data_numeric = data[numerical_columns]" ] @@ -88,20 +87,21 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# solution\n", - "data_numeric_train, data_numeric_test, target_train, target_test = \\\n", - " train_test_split(data_numeric, target, random_state=42)" + "data_numeric_train, data_numeric_test, target_train, target_test = (\n", + " train_test_split(data_numeric, target, random_state=42)\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Use a `DummyClassifier` such that the resulting classifier will always\n", - "predict the class `' >50K'`. What is the accuracy score on the test set?\n", - "Repeat the experiment by always predicting the class `' <=50K'`.\n", + "Use a `DummyClassifier` such that the resulting classifier will always predict\n", + "the class `' >50K'`. What is the accuracy score on the test set? Repeat the\n", + "experiment by always predicting the class `' <=50K'`.\n", "\n", - "Hint: you can set the `strategy` parameter of the `DummyClassifier` to\n", - "achieve the desired behavior." 
+ "Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve\n", + "the desired behavior." ] }, { @@ -114,8 +114,9 @@ "\n", "# solution\n", "class_to_predict = \" >50K\"\n", - "high_revenue_clf = DummyClassifier(strategy=\"constant\",\n", - " constant=class_to_predict)\n", + "high_revenue_clf = DummyClassifier(\n", + " strategy=\"constant\", constant=class_to_predict\n", + ")\n", "high_revenue_clf.fit(data_numeric_train, target_train)\n", "score = high_revenue_clf.score(data_numeric_test, target_test)\n", "print(f\"Accuracy of a model predicting only high revenue: {score:.3f}\")" @@ -129,8 +130,8 @@ ] }, "source": [ - "We clearly see that the score is below 0.5 which might be surprising at\n", - "first. We will now check the generalization performance of a model which always\n", + "We clearly see that the score is below 0.5 which might be surprising at first.\n", + "We will now check the generalization performance of a model which always\n", "predict the low revenue class, i.e. `\" <=50K\"`." ] }, @@ -145,8 +146,9 @@ "outputs": [], "source": [ "class_to_predict = \" <=50K\"\n", - "low_revenue_clf = DummyClassifier(strategy=\"constant\",\n", - " constant=class_to_predict)\n", + "low_revenue_clf = DummyClassifier(\n", + " strategy=\"constant\", constant=class_to_predict\n", + ")\n", "low_revenue_clf.fit(data_numeric_train, target_train)\n", "score = low_revenue_clf.score(data_numeric_test, target_test)\n", "print(f\"Accuracy of a model predicting only low revenue: {score:.3f}\")" @@ -160,8 +162,8 @@ ] }, "source": [ - "We observe that this model has an accuracy higher than 0.5. This is due to\n", - "the fact that we have 3/4 of the target belonging to low-revenue class." + "We observe that this model has an accuracy higher than 0.5. This is due to the\n", + "fact that we have 3/4 of the target belonging to low-revenue class." ] }, { @@ -210,8 +212,8 @@ ] }, "source": [ - "In practice, we could have the strategy `\"most_frequent\"` to predict the\n", - "class that appears the most in the training target." + "In practice, we could have the strategy `\"most_frequent\"` to predict the class\n", + "that appears the most in the training target." ] }, { diff --git a/notebooks/03_categorical_pipeline.ipynb b/notebooks/03_categorical_pipeline.ipynb index ac940c898..5f91c713b 100644 --- a/notebooks/03_categorical_pipeline.ipynb +++ b/notebooks/03_categorical_pipeline.ipynb @@ -6,9 +6,9 @@ "source": [ "# Encoding of categorical variables\n", "\n", - "In this notebook, we will present typical ways of dealing with\n", - "**categorical variables** by encoding them, namely **ordinal encoding** and\n", - "**one-hot encoding**." + "In this notebook, we will present typical ways of dealing with **categorical\n", + "variables** by encoding them, namely **ordinal encoding** and **one-hot\n", + "encoding**." ] }, { @@ -44,18 +44,16 @@ "\n", "## Identify categorical variables\n", "\n", - "As we saw in the previous section, a numerical variable is a\n", - "quantity represented by a real or integer number. These variables can be\n", - "naturally handled by machine learning algorithms that are typically composed\n", - "of a sequence of arithmetic instructions such as additions and\n", - "multiplications.\n", + "As we saw in the previous section, a numerical variable is a quantity\n", + "represented by a real or integer number. 
These variables can be naturally\n", + "handled by machine learning algorithms that are typically composed of a\n", + "sequence of arithmetic instructions such as additions and multiplications.\n", "\n", - "In contrast, categorical variables have discrete values, typically\n", - "represented by string labels (but not only) taken from a finite list of\n", - "possible choices. For instance, the variable `native-country` in our dataset\n", - "is a categorical variable because it encodes the data using a finite list of\n", - "possible countries (along with the `?` symbol when this information is\n", - "missing):" + "In contrast, categorical variables have discrete values, typically represented\n", + "by string labels (but not only) taken from a finite list of possible choices.\n", + "For instance, the variable `native-country` in our dataset is a categorical\n", + "variable because it encodes the data using a finite list of possible countries\n", + "(along with the `?` symbol when this information is missing):" ] }, { @@ -71,8 +69,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "How can we easily recognize categorical columns among the dataset? Part of\n", - "the answer lies in the columns' data type:" + "How can we easily recognize categorical columns among the dataset? Part of the\n", + "answer lies in the columns' data type:" ] }, { @@ -95,8 +93,8 @@ "\n", "In the previous notebook, we manually defined the numerical columns. We could\n", "do a similar approach. Instead, we will use the scikit-learn helper function\n", - "`make_column_selector`, which allows us to select columns based on\n", - "their data type. We will illustrate how to use this helper." + "`make_column_selector`, which allows us to select columns based on their data\n", + "type. We will illustrate how to use this helper." ] }, { @@ -159,9 +157,8 @@ "### Encoding ordinal categories\n", "\n", "The most intuitive strategy is to encode each category with a different\n", - "number. The `OrdinalEncoder` will transform the data in such manner.\n", - "We will start by encoding a single column to understand how the encoding\n", - "works." + "number. The `OrdinalEncoder` will transform the data in such manner. We will\n", + "start by encoding a single column to understand how the encoding works." ] }, { @@ -220,8 +217,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\n", - " f\"The dataset encoded contains {data_encoded.shape[1]} features\")" + "print(f\"The dataset encoded contains {data_encoded.shape[1]} features\")" ] }, { @@ -232,37 +228,37 @@ "independently. We also note that the number of features before and after the\n", "encoding is the same.\n", "\n", - "However, be careful when applying this encoding strategy:\n", - "using this integer representation leads downstream predictive models\n", - "to assume that the values are ordered (0 < 1 < 2 < 3... for instance).\n", + "However, be careful when applying this encoding strategy: using this integer\n", + "representation leads downstream predictive models to assume that the values\n", + "are ordered (0 < 1 < 2 < 3... for instance).\n", "\n", "By default, `OrdinalEncoder` uses a lexicographical strategy to map string\n", - "category labels to integers. This strategy is arbitrary and often\n", - "meaningless. For instance, suppose the dataset has a categorical variable\n", - "named `\"size\"` with categories such as \"S\", \"M\", \"L\", \"XL\". 
We would like the\n", - "integer representation to respect the meaning of the sizes by mapping them to\n", - "increasing integers such as `0, 1, 2, 3`.\n", - "However, the lexicographical strategy used by default would map the labels\n", - "\"S\", \"M\", \"L\", \"XL\" to 2, 1, 0, 3, by following the alphabetical order.\n", - "\n", - "The `OrdinalEncoder` class accepts a `categories` constructor argument to\n", - "pass categories in the expected ordering explicitly. You can find more\n", - "information in the\n", - "[scikit-learn documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)\n", + "category labels to integers. This strategy is arbitrary and often meaningless.\n", + "For instance, suppose the dataset has a categorical variable named `\"size\"`\n", + "with categories such as \"S\", \"M\", \"L\", \"XL\". We would like the integer\n", + "representation to respect the meaning of the sizes by mapping them to\n", + "increasing integers such as `0, 1, 2, 3`. However, the lexicographical\n", + "strategy used by default would map the labels \"S\", \"M\", \"L\", \"XL\" to 2, 1, 0,\n", + "3, by following the alphabetical order.\n", + "\n", + "The `OrdinalEncoder` class accepts a `categories` constructor argument to pass\n", + "categories in the expected ordering explicitly. You can find more information\n", + "in the [scikit-learn\n", + "documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)\n", "if needed.\n", "\n", - "If a categorical variable does not carry any meaningful order information\n", - "then this encoding might be misleading to downstream statistical models and\n", - "you might consider using one-hot encoding instead (see below).\n", + "If a categorical variable does not carry any meaningful order information then\n", + "this encoding might be misleading to downstream statistical models and you\n", + "might consider using one-hot encoding instead (see below).\n", "\n", "### Encoding nominal categories (without assuming any order)\n", "\n", - "`OneHotEncoder` is an alternative encoder that prevents the downstream\n", - "models to make a false assumption about the ordering of categories. For a\n", - "given feature, it will create as many new columns as there are possible\n", - "categories. For a given sample, the value of the column corresponding to the\n", - "category will be set to `1` while all the columns of the other categories\n", - "will be set to `0`.\n", + "`OneHotEncoder` is an alternative encoder that prevents the downstream models\n", + "to make a false assumption about the ordering of categories. For a given\n", + "feature, it will create as many new columns as there are possible categories.\n", + "For a given sample, the value of the column corresponding to the category will\n", + "be set to `1` while all the columns of the other categories will be set to\n", + "`0`.\n", "\n", "We will start by encoding a single feature (e.g. `\"education\"`) to illustrate\n", "how the encoding works." @@ -287,11 +283,11 @@ "source": [ "
" ] @@ -301,8 +297,8 @@ "metadata": {}, "source": [ "We see that encoding a single feature will give a NumPy array full of zeros\n", - "and ones. We can get a better understanding using the associated feature\n", - "names resulting from the transformation." + "and ones. We can get a better understanding using the associated feature names\n", + "resulting from the transformation." ] }, { @@ -332,8 +328,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\n", - " f\"The dataset is composed of {data_categorical.shape[1]} features\")\n", + "print(f\"The dataset is composed of {data_categorical.shape[1]} features\")\n", "data_categorical.head()" ] }, @@ -353,8 +348,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(\n", - " f\"The encoded dataset contains {data_encoded.shape[1]} features\")" + "print(f\"The encoded dataset contains {data_encoded.shape[1]} features\")" ] }, { @@ -379,11 +373,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Look at how the `\"workclass\"` variable of the 3 first records has been\n", - "encoded and compare this to the original string representation.\n", + "Look at how the `\"workclass\"` variable of the 3 first records has been encoded\n", + "and compare this to the original string representation.\n", "\n", - "The number of features after the encoding is more than 10 times larger than\n", - "in the original data because some variables such as `occupation` and\n", + "The number of features after the encoding is more than 10 times larger than in\n", + "the original data because some variables such as `occupation` and\n", "`native-country` have many possible categories." ] }, @@ -403,9 +397,9 @@ "source": [ "
\n", "

Note

\n", - "

In general OneHotEncoder is the encoding strategy used when the\n", - "downstream models are linear models while OrdinalEncoder is often a\n", - "good strategy with tree-based models.

\n", + "

In general OneHotEncoder is the encoding strategy used when the downstream\n", + "models are linear models while OrdinalEncoder is often a good strategy\n", + "with tree-based models.

\n", "
" ] }, @@ -413,12 +407,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Using an `OrdinalEncoder` will output ordinal categories. This means\n", - "that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The\n", - "impact of violating this ordering assumption is really dependent on the\n", - "downstream models. Linear models will be impacted by misordered categories\n", - "while tree-based models will not.\n", + "Using an `OrdinalEncoder` will output ordinal categories. This means that\n", + "there is an order in the resulting categories (e.g. `0 < 1 < 2`). The impact\n", + "of violating this ordering assumption is really dependent on the downstream\n", + "models. Linear models will be impacted by misordered categories while\n", + "tree-based models will not.\n", "\n", "You can still use an `OrdinalEncoder` with linear models but you need to be\n", "sure that:\n", @@ -429,10 +422,11 @@ "The **next exercise** shows what can happen when using an `OrdinalEncoder`\n", "with a liner model and the conditions above are not met.\n", "\n", - "One-hot encoding categorical variables with high cardinality can cause \n", - "computational inefficiency in tree-based models. Because of this, it is not recommended\n", - "to use `OneHotEncoder` in such cases even if the original categories do not \n", - "have a given order. We will show this in the **final exercise** of this sequence." + "One-hot encoding categorical variables with high cardinality can cause\n", + "computational inefficiency in tree-based models. Because of this, it is not\n", + "recommended to use `OneHotEncoder` in such cases even if the original\n", + "categories do not have a given order. We will show this in the **final\n", + "exercise** of this sequence." ] }, { @@ -443,8 +437,8 @@ "\n", "We can now integrate this encoder inside a machine learning pipeline like we\n", "did with numerical data: let's train a linear classifier on the encoded data\n", - "and check the generalization performance of this machine learning pipeline using\n", - "cross-validation.\n", + "and check the generalization performance of this machine learning pipeline\n", + "using cross-validation.\n", "\n", "Before we create the pipeline, we have to linger on the `native-country`.\n", "Let's recall some statistics regarding this column." @@ -472,9 +466,9 @@ "\n", "* list all the possible categories and provide it to the encoder via the\n", " keyword argument `categories`;\n", - "* use the parameter `handle_unknown`, i.e. if an unknown category is encountered\n", - " during transform, the resulting one-hot encoded columns for this feature will\n", - " be all zeros. \n", + "* use the parameter `handle_unknown`, i.e. if an unknown category is\n", + " encountered during transform, the resulting one-hot encoded columns for this\n", + " feature will be all zeros.\n", "\n", "Here, we will use the latter solution for simplicity." ] @@ -485,14 +479,12 @@ "source": [ "
\n", "

Tip

\n", - "

Be aware the OrdinalEncoder exposes as well a parameter\n", - "handle_unknown. It can be set to use_encoded_value. If that option is chosen,\n", - "you can define a fixed value to which all unknowns will be set to during\n", - "transform. For example,\n", - "OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=42)\n", - "will set all values encountered during transform to 42 which are not part of\n", - "the data encountered during the fit call.\n", - "You are going to use these parameters in the next exercise.

\n", + "

Be aware the OrdinalEncoder exposes as well a parameter handle_unknown. It\n", + "can be set to use_encoded_value. If that option is chosen, you can define a\n", + "fixed value to which all unknowns will be set to during transform. For\n", + "example, OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=42) will set all values encountered during transform to 42\n", + "which are not part of the data encountered during the fit call. You are\n", + "going to use these parameters in the next exercise.

\n", "
" ] }, @@ -524,10 +516,10 @@ "
\n", "

Note

\n", "

Here, we need to increase the maximum number of iterations to obtain a fully\n", - "converged LogisticRegression and silence a ConvergenceWarning. Contrary\n", - "to the numerical features, the one-hot encoded categorical features are all\n", - "on the same scale (values are 0 or 1), so they would not benefit from\n", - "scaling. In this case, increasing max_iter is the right thing to do.

\n", + "converged LogisticRegression and silence a ConvergenceWarning. Contrary to\n", + "the numerical features, the one-hot encoded categorical features are all on\n", + "the same scale (values are 0 or 1), so they would not benefit from scaling. In\n", + "this case, increasing max_iter is the right thing to do.

\n", "
" ] }, @@ -546,6 +538,7 @@ "outputs": [], "source": [ "from sklearn.model_selection import cross_validate\n", + "\n", "cv_results = cross_validate(model, data_categorical, target)\n", "cv_results" ] @@ -564,9 +557,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you can see, this representation of the categorical variables is\n", - "slightly more predictive of the revenue than the numerical variables\n", - "that we used previously." + "As you can see, this representation of the categorical variables is slightly\n", + "more predictive of the revenue than the numerical variables that we used\n", + "previously." ] }, { diff --git a/notebooks/03_categorical_pipeline_column_transformer.ipynb b/notebooks/03_categorical_pipeline_column_transformer.ipynb index 59aced6f1..aca827f4c 100644 --- a/notebooks/03_categorical_pipeline_column_transformer.ipynb +++ b/notebooks/03_categorical_pipeline_column_transformer.ipynb @@ -91,15 +91,14 @@ "\n", "We first define the columns depending on their data type:\n", "\n", - "* **one-hot encoding** will be applied to categorical columns. Besides, we\n", - " use `handle_unknown=\"ignore\"` to solve the potential issues due to rare\n", + "* **one-hot encoding** will be applied to categorical columns. Besides, we use\n", + " `handle_unknown=\"ignore\"` to solve the potential issues due to rare\n", " categories.\n", "* **numerical scaling** numerical features which will be standardized.\n", "\n", - "Now, we create our `ColumnTransfomer` by specifying three values:\n", - "the preprocessor name, the transformer, and the columns.\n", - "First, let's create the preprocessors for the numerical and categorical\n", - "parts." + "Now, we create our `ColumnTransfomer` by specifying three values: the\n", + "preprocessor name, the transformer, and the columns. First, let's create the\n", + "preprocessors for the numerical and categorical parts." ] }, { @@ -118,8 +117,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we create the transformer and associate each of these preprocessors\n", - "with their respective columns." + "Now, we create the transformer and associate each of these preprocessors with\n", + "their respective columns." ] }, { @@ -130,9 +129,12 @@ "source": [ "from sklearn.compose import ColumnTransformer\n", "\n", - "preprocessor = ColumnTransformer([\n", - " ('one-hot-encoder', categorical_preprocessor, categorical_columns),\n", - " ('standard_scaler', numerical_preprocessor, numerical_columns)])" + "preprocessor = ColumnTransformer(\n", + " [\n", + " (\"one-hot-encoder\", categorical_preprocessor, categorical_columns),\n", + " (\"standard_scaler\", numerical_preprocessor, numerical_columns),\n", + " ]\n", + ")" ] }, { @@ -149,14 +151,14 @@ "* It **splits the columns** of the original dataset based on the column names\n", " or indices provided. We will obtain as many subsets as the number of\n", " transformers passed into the `ColumnTransformer`.\n", - "* It **transforms each subsets**. A specific transformer is applied to\n", - " each subset: it will internally call `fit_transform` or `transform`. The\n", - " output of this step is a set of transformed datasets.\n", + "* It **transforms each subsets**. A specific transformer is applied to each\n", + " subset: it will internally call `fit_transform` or `transform`. 
The output\n", + " of this step is a set of transformed datasets.\n", "* It then **concatenates the transformed datasets** into a single dataset.\n", "\n", - "The important thing is that `ColumnTransformer` is like any other\n", - "scikit-learn transformer. In particular it can be combined with a classifier\n", - "in a `Pipeline`:" + "The important thing is that `ColumnTransformer` is like any other scikit-learn\n", + "transformer. In particular it can be combined with a classifier in a\n", + "`Pipeline`:" ] }, { @@ -176,8 +178,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The final model is more complex than the previous models but still follows\n", - "the same API (the same set of methods that can be called by the user):\n", + "The final model is more complex than the previous models but still follows the\n", + "same API (the same set of methods that can be called by the user):\n", "\n", "- the `fit` method is called to preprocess the data and then train the\n", " classifier of the preprocessed data;\n", @@ -197,7 +199,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=42)" + " data, target, random_state=42\n", + ")" ] }, { @@ -209,9 +212,8 @@ "

Caution!

\n", "

Be aware that we use train_test_split here for didactic purposes, to show\n", "the scikit-learn API. In a real setting one might prefer to use\n", - "cross-validation to also be able to evaluate the uncertainty of\n", - "our estimation of the generalization performance of a model,\n", - "as previously demonstrated.

\n", + "cross-validation to also be able to evaluate the uncertainty of our estimation\n", + "of the generalization performance of a model, as previously demonstrated.

\n", "
\n", "\n", "Now, we can train the model on the train set." @@ -233,8 +235,7 @@ "Then, we can send the raw dataset straight to the pipeline. Indeed, we do not\n", "need to make any manual preprocessing (calling the `transform` or\n", "`fit_transform` methods) as it will be handled when calling the `predict`\n", - "method. As an example, we predict on the five first samples from the test\n", - "set." + "method. As an example, we predict on the five first samples from the test set." ] }, { @@ -311,8 +312,10 @@ "outputs": [], "source": [ "scores = cv_results[\"test_score\"]\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -329,15 +332,15 @@ "source": [ "## Fitting a more powerful model\n", "\n", - "**Linear models** are nice because they are usually cheap to train,\n", - "**small** to deploy, **fast** to predict and give a **good baseline**.\n", + "**Linear models** are nice because they are usually cheap to train, **small**\n", + "to deploy, **fast** to predict and give a **good baseline**.\n", "\n", "However, it is often useful to check whether more complex models such as an\n", "ensemble of decision trees can lead to higher predictive performance. In this\n", "section we will use such a model called **gradient-boosting trees** and\n", - "evaluate its generalization performance. More precisely, the scikit-learn model\n", - "we will use is called `HistGradientBoostingClassifier`. Note that boosting\n", - "models will be covered in more detail in a future module.\n", + "evaluate its generalization performance. More precisely, the scikit-learn\n", + "model we will use is called `HistGradientBoostingClassifier`. Note that\n", + "boosting models will be covered in more detail in a future module.\n", "\n", "For tree-based models, the handling of numerical and categorical variables is\n", "simpler than for linear models:\n", @@ -345,8 +348,8 @@ "* using an **ordinal encoding for the categorical variables** is fine even if\n", " the encoding results in an arbitrary ordering\n", "\n", - "Therefore, for `HistGradientBoostingClassifier`, the preprocessing pipeline\n", - "is slightly simpler than the one we saw earlier for the `LogisticRegression`:" + "Therefore, for `HistGradientBoostingClassifier`, the preprocessing pipeline is\n", + "slightly simpler than the one we saw earlier for the `LogisticRegression`:" ] }, { @@ -358,12 +361,14 @@ "from sklearn.ensemble import HistGradientBoostingClassifier\n", "from sklearn.preprocessing import OrdinalEncoder\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)\n", + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")\n", "\n", - "preprocessor = ColumnTransformer([\n", - " ('categorical', categorical_preprocessor, categorical_columns)],\n", - " remainder=\"passthrough\")\n", + "preprocessor = ColumnTransformer(\n", + " [(\"categorical\", categorical_preprocessor, categorical_columns)],\n", + " remainder=\"passthrough\",\n", + ")\n", "\n", "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())" ] @@ -399,12 +404,12 @@ "metadata": {}, "source": [ "We can observe that we get significantly higher accuracies with the Gradient\n", - "Boosting model. 
This is often what we observe whenever the dataset has a\n", - "large number of samples and limited number of informative features (e.g. less\n", - "than 1000) with a mix of numerical and categorical variables.\n", + "Boosting model. This is often what we observe whenever the dataset has a large\n", + "number of samples and limited number of informative features (e.g. less than\n", + "1000) with a mix of numerical and categorical variables.\n", "\n", - "This explains why Gradient Boosted Machines are very popular among\n", - "datascience practitioners who work with tabular data." + "This explains why Gradient Boosted Machines are very popular among datascience\n", + "practitioners who work with tabular data." ] }, { @@ -413,12 +418,11 @@ "source": [ "In this notebook we:\n", "\n", - "* used a `ColumnTransformer` to apply different preprocessing for\n", - " categorical and numerical variables;\n", - "* used a pipeline to chain the `ColumnTransformer` preprocessing and\n", - " logistic regression fitting;\n", - "* saw that **gradient boosting methods** can outperform **linear\n", - " models**." + "* used a `ColumnTransformer` to apply different preprocessing for categorical\n", + " and numerical variables;\n", + "* used a pipeline to chain the `ColumnTransformer` preprocessing and logistic\n", + " regression fitting;\n", + "* saw that **gradient boosting methods** can outperform **linear models**." ] } ], diff --git a/notebooks/03_categorical_pipeline_ex_01.ipynb b/notebooks/03_categorical_pipeline_ex_01.ipynb index c42985c22..1f7ab830e 100644 --- a/notebooks/03_categorical_pipeline_ex_01.ipynb +++ b/notebooks/03_categorical_pipeline_ex_01.ipynb @@ -7,8 +7,8 @@ "# \ud83d\udcdd Exercise M1.04\n", "\n", "The goal of this exercise is to evaluate the impact of using an arbitrary\n", - "integer encoding for categorical variables along with a linear\n", - "classification model such as Logistic Regression.\n", + "integer encoding for categorical variables along with a linear classification\n", + "model such as Logistic Regression.\n", "\n", "To do so, let's try to use `OrdinalEncoder` to preprocess the categorical\n", "variables. This preprocessor is assembled in a pipeline with\n", @@ -74,8 +74,8 @@ "\n", "Because `OrdinalEncoder` can raise errors if it sees an unknown category at\n", "prediction time, you can set the `handle_unknown=\"use_encoded_value\"` and\n", - "`unknown_value` parameters. You can refer to the\n", - "[scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)\n", + "`unknown_value` parameters. You can refer to the [scikit-learn\n", + "documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)\n", "for more details regarding these parameters." ] }, @@ -102,13 +102,12 @@ "
\n", "

Note

\n", "

Be aware that if an error happened during the cross-validation,\n", - "cross_validate will raise a warning and return NaN (Not a Number)\n", - "as scores. To make it raise a standard Python exception with a traceback,\n", - "you can pass the error_score=\"raise\" argument in the call to\n", - "cross_validate. An exception will be raised instead of a warning at the first\n", - "encountered problem and cross_validate will stop right away instead of\n", - "returning NaN values. This is particularly handy when developing\n", - "complex machine learning pipelines.

\n", + "cross_validate will raise a warning and return NaN (Not a Number) as scores.\n", + "To make it raise a standard Python exception with a traceback, you can pass\n", + "the error_score=\"raise\" argument in the call to cross_validate. An\n", + "exception will be raised instead of a warning at the first encountered problem\n", + "and cross_validate will stop right away instead of returning NaN values.\n", + "This is particularly handy when developing complex machine learning pipelines.

\n", "
" ] }, @@ -128,10 +127,10 @@ "metadata": {}, "source": [ "Now, we would like to compare the generalization performance of our previous\n", - "model with a new model where instead of using an `OrdinalEncoder`, we will\n", - "use a `OneHotEncoder`. Repeat the model evaluation using cross-validation.\n", - "Compare the score of both models and conclude on the impact of choosing a\n", - "specific encoding strategy when using a linear model." + "model with a new model where instead of using an `OrdinalEncoder`, we will use\n", + "a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare\n", + "the score of both models and conclude on the impact of choosing a specific\n", + "encoding strategy when using a linear model." ] }, { @@ -148,7 +147,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/03_categorical_pipeline_ex_02.ipynb b/notebooks/03_categorical_pipeline_ex_02.ipynb index b8d0ad094..83fafffcf 100644 --- a/notebooks/03_categorical_pipeline_ex_02.ipynb +++ b/notebooks/03_categorical_pipeline_ex_02.ipynb @@ -43,9 +43,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As in the previous notebooks, we use the utility `make_column_selector`\n", - "to select only columns with a specific data type. Besides, we list in\n", - "advance all categories for the categorical columns." + "As in the previous notebooks, we use the utility `make_column_selector` to\n", + "select only columns with a specific data type. Besides, we list in advance all\n", + "categories for the categorical columns." ] }, { @@ -86,11 +86,13 @@ "from sklearn.preprocessing import OrdinalEncoder\n", "from sklearn.ensemble import HistGradientBoostingClassifier\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)\n", - "preprocessor = ColumnTransformer([\n", - " ('categorical', categorical_preprocessor, categorical_columns)],\n", - " remainder=\"passthrough\")\n", + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")\n", + "preprocessor = ColumnTransformer(\n", + " [(\"categorical\", categorical_preprocessor, categorical_columns)],\n", + " remainder=\"passthrough\",\n", + ")\n", "\n", "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", "\n", @@ -100,9 +102,11 @@ "\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", - " f\"with a fitting time of {elapsed_time:.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", + " f\"with a fitting time of {elapsed_time:.3f}\"\n", + ")" ] }, { @@ -132,15 +136,15 @@ "\n", "We observed that integer coding of categorical variables can be very\n", "detrimental for linear models. However, it does not seem to be the case for\n", - "`HistGradientBoostingClassifier` models, as the cross-validation score\n", - "of the reference pipeline with `OrdinalEncoder` is reasonably good.\n", + "`HistGradientBoostingClassifier` models, as the cross-validation score of the\n", + "reference pipeline with `OrdinalEncoder` is reasonably good.\n", "\n", "Let's see if we can get an even better accuracy with `OneHotEncoder`.\n", "\n", - "Hint: `HistGradientBoostingClassifier` does not yet support sparse input\n", - "data. 
You might want to use\n", - "`OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)` to force the use of a\n", - "dense representation as a workaround." + "Hint: `HistGradientBoostingClassifier` does not yet support sparse input data.\n", + "You might want to use `OneHotEncoder(handle_unknown=\"ignore\",\n", + "sparse_output=False)` to force the use of a dense representation as a\n", + "workaround." ] }, { @@ -155,7 +159,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/03_categorical_pipeline_sol_01.ipynb b/notebooks/03_categorical_pipeline_sol_01.ipynb index 1a913d94b..206a36f4c 100644 --- a/notebooks/03_categorical_pipeline_sol_01.ipynb +++ b/notebooks/03_categorical_pipeline_sol_01.ipynb @@ -7,8 +7,8 @@ "# \ud83d\udcc3 Solution for Exercise M1.04\n", "\n", "The goal of this exercise is to evaluate the impact of using an arbitrary\n", - "integer encoding for categorical variables along with a linear\n", - "classification model such as Logistic Regression.\n", + "integer encoding for categorical variables along with a linear classification\n", + "model such as Logistic Regression.\n", "\n", "To do so, let's try to use `OrdinalEncoder` to preprocess the categorical\n", "variables. This preprocessor is assembled in a pipeline with\n", @@ -74,8 +74,8 @@ "\n", "Because `OrdinalEncoder` can raise errors if it sees an unknown category at\n", "prediction time, you can set the `handle_unknown=\"use_encoded_value\"` and\n", - "`unknown_value` parameters. You can refer to the\n", - "[scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)\n", + "`unknown_value` parameters. You can refer to the [scikit-learn\n", + "documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)\n", "for more details regarding these parameters." ] }, @@ -92,7 +92,8 @@ "# solution\n", "model = make_pipeline(\n", " OrdinalEncoder(handle_unknown=\"use_encoded_value\", unknown_value=-1),\n", - " LogisticRegression(max_iter=500))" + " LogisticRegression(max_iter=500),\n", + ")" ] }, { @@ -105,13 +106,12 @@ "
\n", "

Note

\n", "

Be aware that if an error happened during the cross-validation,\n", - "cross_validate will raise a warning and return NaN (Not a Number)\n", - "as scores. To make it raise a standard Python exception with a traceback,\n", - "you can pass the error_score=\"raise\" argument in the call to\n", - "cross_validate. An exception will be raised instead of a warning at the first\n", - "encountered problem and cross_validate will stop right away instead of\n", - "returning NaN values. This is particularly handy when developing\n", - "complex machine learning pipelines.

\n", + "cross_validate will raise a warning and return NaN (Not a Number) as scores.\n", + "To make it raise a standard Python exception with a traceback, you can pass\n", + "the error_score=\"raise\" argument in the call to cross_validate. An\n", + "exception will be raised instead of a warning at the first encountered problem\n", + "and cross_validate will stop right away instead of returning NaN values.\n", + "This is particularly handy when developing complex machine learning pipelines.

\n", "
" ] }, @@ -127,8 +127,10 @@ "cv_results = cross_validate(model, data_categorical, target)\n", "\n", "scores = cv_results[\"test_score\"]\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -160,11 +162,14 @@ "source": [ "from sklearn.dummy import DummyClassifier\n", "\n", - "cv_results = cross_validate(DummyClassifier(strategy=\"most_frequent\"),\n", - " data_categorical, target)\n", + "cv_results = cross_validate(\n", + " DummyClassifier(strategy=\"most_frequent\"), data_categorical, target\n", + ")\n", "scores = cv_results[\"test_score\"]\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -172,10 +177,10 @@ "metadata": {}, "source": [ "Now, we would like to compare the generalization performance of our previous\n", - "model with a new model where instead of using an `OrdinalEncoder`, we will\n", - "use a `OneHotEncoder`. Repeat the model evaluation using cross-validation.\n", - "Compare the score of both models and conclude on the impact of choosing a\n", - "specific encoding strategy when using a linear model." + "model with a new model where instead of using an `OrdinalEncoder`, we will use\n", + "a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare\n", + "the score of both models and conclude on the impact of choosing a specific\n", + "encoding strategy when using a linear model." ] }, { @@ -188,12 +193,14 @@ "\n", "# solution\n", "model = make_pipeline(\n", - " OneHotEncoder(handle_unknown=\"ignore\"),\n", - " LogisticRegression(max_iter=500))\n", + " OneHotEncoder(handle_unknown=\"ignore\"), LogisticRegression(max_iter=500)\n", + ")\n", "cv_results = cross_validate(model, data_categorical, target)\n", "scores = cv_results[\"test_score\"]\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -204,8 +211,8 @@ ] }, "source": [ - "With the linear classifier chosen, using an encoding that does not assume\n", - "any ordering lead to much better result.\n", + "With the linear classifier chosen, using an encoding that does not assume any\n", + "ordering lead to much better result.\n", "\n", "The important message here is: linear model and `OrdinalEncoder` are used\n", "together only for ordinal categorical features, i.e. features that have a\n", diff --git a/notebooks/03_categorical_pipeline_sol_02.ipynb b/notebooks/03_categorical_pipeline_sol_02.ipynb index e43625325..725a86cdd 100644 --- a/notebooks/03_categorical_pipeline_sol_02.ipynb +++ b/notebooks/03_categorical_pipeline_sol_02.ipynb @@ -43,9 +43,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As in the previous notebooks, we use the utility `make_column_selector`\n", - "to select only columns with a specific data type. Besides, we list in\n", - "advance all categories for the categorical columns." + "As in the previous notebooks, we use the utility `make_column_selector` to\n", + "select only columns with a specific data type. 
Besides, we list in advance all\n", + "categories for the categorical columns." ] }, { @@ -86,11 +86,13 @@ "from sklearn.preprocessing import OrdinalEncoder\n", "from sklearn.ensemble import HistGradientBoostingClassifier\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)\n", - "preprocessor = ColumnTransformer([\n", - " ('categorical', categorical_preprocessor, categorical_columns)],\n", - " remainder=\"passthrough\")\n", + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")\n", + "preprocessor = ColumnTransformer(\n", + " [(\"categorical\", categorical_preprocessor, categorical_columns)],\n", + " remainder=\"passthrough\",\n", + ")\n", "\n", "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", "\n", @@ -100,9 +102,11 @@ "\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", - " f\"with a fitting time of {elapsed_time:.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", + " f\"with a fitting time of {elapsed_time:.3f}\"\n", + ")" ] }, { @@ -126,11 +130,18 @@ "\n", "from sklearn.preprocessing import StandardScaler\n", "\n", - "preprocessor = ColumnTransformer([\n", - " ('numerical', StandardScaler(), numerical_columns),\n", - " ('categorical', OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1),\n", - " categorical_columns)])\n", + "preprocessor = ColumnTransformer(\n", + " [\n", + " (\"numerical\", StandardScaler(), numerical_columns),\n", + " (\n", + " \"categorical\",\n", + " OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + " ),\n", + " categorical_columns,\n", + " ),\n", + " ]\n", + ")\n", "\n", "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", "\n", @@ -140,9 +151,11 @@ "\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", - " f\"with a fitting time of {elapsed_time:.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", + " f\"with a fitting time of {elapsed_time:.3f}\"\n", + ")" ] }, { @@ -171,15 +184,15 @@ "\n", "We observed that integer coding of categorical variables can be very\n", "detrimental for linear models. However, it does not seem to be the case for\n", - "`HistGradientBoostingClassifier` models, as the cross-validation score\n", - "of the reference pipeline with `OrdinalEncoder` is reasonably good.\n", + "`HistGradientBoostingClassifier` models, as the cross-validation score of the\n", + "reference pipeline with `OrdinalEncoder` is reasonably good.\n", "\n", "Let's see if we can get an even better accuracy with `OneHotEncoder`.\n", "\n", - "Hint: `HistGradientBoostingClassifier` does not yet support sparse input\n", - "data. You might want to use\n", - "`OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)` to force the use of a\n", - "dense representation as a workaround." + "Hint: `HistGradientBoostingClassifier` does not yet support sparse input data.\n", + "You might want to use `OneHotEncoder(handle_unknown=\"ignore\",\n", + "sparse_output=False)` to force the use of a dense representation as a\n", + "workaround." 
] }, { @@ -193,10 +206,13 @@ "\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "categorical_preprocessor = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False)\n", - "preprocessor = ColumnTransformer([\n", - " ('one-hot-encoder', categorical_preprocessor, categorical_columns)],\n", - " remainder=\"passthrough\")\n", + "categorical_preprocessor = OneHotEncoder(\n", + " handle_unknown=\"ignore\", sparse_output=False\n", + ")\n", + "preprocessor = ColumnTransformer(\n", + " [(\"one-hot-encoder\", categorical_preprocessor, categorical_columns)],\n", + " remainder=\"passthrough\",\n", + ")\n", "\n", "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", "\n", @@ -206,9 +222,11 @@ "\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", - " f\"with a fitting time of {elapsed_time:.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f} \"\n", + " f\"with a fitting time of {elapsed_time:.3f}\"\n", + ")" ] }, { @@ -221,22 +239,22 @@ "source": [ "### Analysis\n", "\n", - "From an accuracy point of view, the result is almost exactly the same.\n", - "The reason is that `HistGradientBoostingClassifier` is expressive\n", - "and robust enough to deal with misleading ordering of integer coded\n", - "categories (which was not the case for linear models).\n", + "From an accuracy point of view, the result is almost exactly the same. The\n", + "reason is that `HistGradientBoostingClassifier` is expressive and robust\n", + "enough to deal with misleading ordering of integer coded categories (which was\n", + "not the case for linear models).\n", "\n", - "However from a computation point of view, the training time is\n", - "much longer: this is caused by the fact that `OneHotEncoder`\n", - "generates approximately 10 times more features than `OrdinalEncoder`.\n", + "However from a computation point of view, the training time is much longer:\n", + "this is caused by the fact that `OneHotEncoder` generates approximately 10\n", + "times more features than `OrdinalEncoder`.\n", "\n", - "Note that the current implementation `HistGradientBoostingClassifier`\n", - "is still incomplete, and once sparse representation are handled\n", - "correctly, training time might improve with such kinds of encodings.\n", + "Note that the current implementation `HistGradientBoostingClassifier` is still\n", + "incomplete, and once sparse representation are handled correctly, training\n", + "time might improve with such kinds of encodings.\n", "\n", - "The main take away message is that arbitrary integer coding of\n", - "categories is perfectly fine for `HistGradientBoostingClassifier`\n", - "and yields fast training times." + "The main take away message is that arbitrary integer coding of categories is\n", + "perfectly fine for `HistGradientBoostingClassifier` and yields fast training\n", + "times." ] }, { @@ -269,8 +287,8 @@ "\n", "\n", "
    \n", - "
  • OneHotEncoder: will always do something meaningful, but can be\n", - "unnecessary slow with trees.
  • \n", + "
  • OneHotEncoder: will always do something meaningful, but can be unnecessarily\n",
"slow with trees.
  • \n", "
  • OrdinalEncoder: can be detrimental for linear models unless your category\n", "has a meaningful order and you make sure that OrdinalEncoder respects this\n", "order. Trees can deal with OrdinalEncoder fine as long as they are deep\n", diff --git a/notebooks/03_categorical_pipeline_visualization.ipynb b/notebooks/03_categorical_pipeline_visualization.ipynb index 29e29e213..dd16ea0b3 100644 --- a/notebooks/03_categorical_pipeline_visualization.ipynb +++ b/notebooks/03_categorical_pipeline_visualization.ipynb @@ -29,7 +29,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We need to define our data and target. In this case we will build a classification model" + "We need to define our data and target. In this case we will build a\n", + "classification model" ] }, { @@ -40,10 +41,13 @@ "source": [ "import pandas as pd\n", "\n", - "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values='?')\n", + "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values=\"?\")\n", "\n", "target_name = \"SalePrice\"\n", - "data, target = ames_housing.drop(columns=target_name), ames_housing[target_name]\n", + "data, target = (\n", + " ames_housing.drop(columns=target_name),\n", + " ames_housing[target_name],\n", + ")\n", "target = (target > 200_000).astype(int)" ] }, @@ -77,8 +81,8 @@ "metadata": {}, "outputs": [], "source": [ - "numeric_features = ['LotArea', 'FullBath', 'HalfBath']\n", - "categorical_features = ['Neighborhood', 'HouseStyle']\n", + "numeric_features = [\"LotArea\", \"FullBath\", \"HalfBath\"]\n", + "categorical_features = [\"Neighborhood\", \"HouseStyle\"]\n", "data = data[numeric_features + categorical_features]" ] }, @@ -106,12 +110,17 @@ "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "\n", - "numeric_transformer = Pipeline(steps=[\n", - " ('imputer', SimpleImputer(strategy='median')),\n", - " ('scaler', StandardScaler(),\n", - ")])\n", + "numeric_transformer = Pipeline(\n", + " steps=[\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\n", + " \"scaler\",\n", + " StandardScaler(),\n", + " ),\n", + " ]\n", + ")\n", "\n", - "categorical_transformer = OneHotEncoder(handle_unknown='ignore')" + "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\")" ] }, { @@ -129,10 +138,12 @@ "source": [ "from sklearn.compose import ColumnTransformer\n", "\n", - "preprocessor = ColumnTransformer(transformers=[\n", - " ('num', numeric_transformer, numeric_features),\n", - " ('cat', categorical_transformer, categorical_features),\n", - "])" + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, numeric_features),\n", + " (\"cat\", categorical_transformer, categorical_features),\n", + " ]\n", + ")" ] }, { @@ -150,10 +161,12 @@ "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", - "model = Pipeline(steps=[\n", - " ('preprocessor', preprocessor),\n", - " ('classifier', LogisticRegression()),\n", - "])" + "model = Pipeline(\n", + " steps=[\n", + " (\"preprocessor\", preprocessor),\n", + " (\"classifier\", LogisticRegression()),\n", + " ]\n", + ")" ] }, { @@ -189,8 +202,10 @@ "\n", "cv_results = cross_validate(model, data, target, cv=5)\n", "scores = cv_results[\"test_score\"]\n", - "print(\"The mean cross-validation accuracy is: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"The mean cross-validation accuracy is: \"\n", + " f\"{scores.mean():.3f} \u00b1 
{scores.std():.3f}\"\n", + ")" ] }, { @@ -200,14 +215,14 @@ "
    \n", "

    Note

    \n", "

    In this case, around 86% of the time the pipeline correctly predicts whether\n",
"the price of a house is above or below the 200_000 dollars threshold. But be\n",
"aware that this score was obtained by picking some features by hand, which is\n",
"not necessarily the best thing we can do for this classification task. In this\n",
"example we can hope that fitting a complex machine learning pipeline on a\n",
"richer set of features can improve upon this performance level.

    \n", "

    Reducing a price estimation problem to a binary classification problem with a\n",
"single threshold at 200_000 dollars is probably too coarse to be useful in\n",
"practice. Treating this problem as a regression problem is probably a better\n",
"idea. We will see later in this MOOC how to train and evaluate the performance\n",
"of various regression models.

    \n", "
    " diff --git a/notebooks/cross_validation_baseline.ipynb b/notebooks/cross_validation_baseline.ipynb index dab8ba88c..4c5f1de27 100644 --- a/notebooks/cross_validation_baseline.ipynb +++ b/notebooks/cross_validation_baseline.ipynb @@ -42,8 +42,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Across all evaluations, we will use a `ShuffleSplit` cross-validation\n", - "splitter with 20% of the data held on the validation side of the split." + "Across all evaluations, we will use a `ShuffleSplit` cross-validation splitter\n", + "with 20% of the data held on the validation side of the split." ] }, { @@ -100,9 +100,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.dummy import DummyRegressor\n", @@ -159,13 +157,13 @@ "metadata": {}, "source": [ "We see that the generalization performance of our decision tree is far from\n", - "being perfect: the price predictions are off by more than 45,000 US dollars\n", - "on average. However it is much better than the mean price baseline. So this\n", - "confirms that it is possible to predict the housing price much better by\n", - "using a model that takes into account the values of the input features\n", - "(housing location, size, neighborhood income...). Such a model makes more\n", - "informed predictions and approximately divides the error rate by a factor of 2\n", - "compared to the baseline that ignores the input features.\n", + "being perfect: the price predictions are off by more than 45,000 US dollars on\n", + "average. However it is much better than the mean price baseline. So this\n", + "confirms that it is possible to predict the housing price much better by using\n", + "a model that takes into account the values of the input features (housing\n", + "location, size, neighborhood income...). Such a model makes more informed\n", + "predictions and approximately divides the error rate by a factor of 2 compared\n", + "to the baseline that ignores the input features.\n", "\n", "Note that here we used the mean price as the baseline prediction. We could\n", "have used the median instead. See the online documentation of the\n", diff --git a/notebooks/cross_validation_ex_01.ipynb b/notebooks/cross_validation_ex_01.ipynb index 22ca64ba0..695981a3b 100644 --- a/notebooks/cross_validation_ex_01.ipynb +++ b/notebooks/cross_validation_ex_01.ipynb @@ -166,7 +166,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/cross_validation_ex_02.ipynb b/notebooks/cross_validation_ex_02.ipynb index ed7a795bd..9c2518fc2 100644 --- a/notebooks/cross_validation_ex_02.ipynb +++ b/notebooks/cross_validation_ex_02.ipynb @@ -6,12 +6,12 @@ "source": [ "# \ud83d\udcdd Exercise M7.01\n", "\n", - "In this exercise we will define dummy classification baselines and use them\n", - "as reference to assess the relative predictive performance of a given model\n", - "of interest.\n", + "In this exercise we will define dummy classification baselines and use them as\n", + "reference to assess the relative predictive performance of a given model of\n", + "interest.\n", "\n", - "We illustrate those baselines with the help of the Adult Census dataset,\n", - "using only the numerical features for the sake of simplicity." + "We illustrate those baselines with the help of the Adult Census dataset, using\n", + "only the numerical features for the sake of simplicity." 
] }, { @@ -64,8 +64,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Compute the cross-validation (test) scores for the classifier on this\n", - "dataset. Store the results pandas Series as we did in the previous notebook." + "Compute the cross-validation (test) scores for the classifier on this dataset.\n", + "Store the results pandas Series as we did in the previous notebook." ] }, { @@ -81,9 +81,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, compute the cross-validation scores of a dummy classifier that\n", - "constantly predicts the most frequent class observed the training set. Please\n", - "refer to the online documentation for the [sklearn.dummy.DummyClassifier\n", + "Now, compute the cross-validation scores of a dummy classifier that constantly\n", + "predicts the most frequent class observed the training set. Please refer to\n", + "the online documentation for the [sklearn.dummy.DummyClassifier\n", "](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)\n", "class.\n", "\n", @@ -103,8 +103,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we collected the results from the baseline and the model,\n", - "concatenate the test scores as columns a single pandas dataframe." + "Now that we collected the results from the baseline and the model, concatenate\n", + "the test scores as columns a single pandas dataframe." ] }, { @@ -121,8 +121,8 @@ "metadata": {}, "source": [ "\n", - "Next, plot the histogram of the cross-validation test scores for both\n", - "models with the help of [pandas built-in plotting\n", + "Next, plot the histogram of the cross-validation test scores for both models\n", + "with the help of [pandas built-in plotting\n", "function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms).\n", "\n", "What conclusions do you draw from the results?" @@ -166,7 +166,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/cross_validation_grouping.ipynb b/notebooks/cross_validation_grouping.ipynb index f2c6b72fc..b7aa2032e 100644 --- a/notebooks/cross_validation_grouping.ipynb +++ b/notebooks/cross_validation_grouping.ipynb @@ -26,8 +26,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will recreate the same model used in the previous notebook:\n", - "a logistic regression classifier with a preprocessor to scale the data." + "We will recreate the same model used in the previous notebook: a logistic\n", + "regression classifier with a preprocessor to scale the data." 
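
Since the diff omits the unchanged cell that defines this model, here is a sketch of what it looks like; the choice of `MinMaxScaler` is an assumption based on the MOOC's other digits notebooks:

```python
# Sketch of the pipeline referred to above; the scaler choice is an assumption.
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

data, target = load_digits(return_X_y=True)
model = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1_000))
```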
] }, { @@ -60,11 +60,12 @@ "from sklearn.model_selection import cross_val_score, KFold\n", "\n", "cv = KFold(shuffle=False)\n", - "test_score_no_shuffling = cross_val_score(model, data, target, cv=cv,\n", - " n_jobs=2)\n", - "print(f\"The average accuracy is \"\n", - " f\"{test_score_no_shuffling.mean():.3f} \u00b1 \"\n", - " f\"{test_score_no_shuffling.std():.3f}\")" + "test_score_no_shuffling = cross_val_score(model, data, target, cv=cv, n_jobs=2)\n", + "print(\n", + " \"The average accuracy is \"\n", + " f\"{test_score_no_shuffling.mean():.3f} \u00b1 \"\n", + " f\"{test_score_no_shuffling.std():.3f}\"\n", + ")" ] }, { @@ -82,20 +83,23 @@ "outputs": [], "source": [ "cv = KFold(shuffle=True)\n", - "test_score_with_shuffling = cross_val_score(model, data, target, cv=cv,\n", - " n_jobs=2)\n", - "print(f\"The average accuracy is \"\n", - " f\"{test_score_with_shuffling.mean():.3f} \u00b1 \"\n", - " f\"{test_score_with_shuffling.std():.3f}\")" + "test_score_with_shuffling = cross_val_score(\n", + " model, data, target, cv=cv, n_jobs=2\n", + ")\n", + "print(\n", + " \"The average accuracy is \"\n", + " f\"{test_score_with_shuffling.mean():.3f} \u00b1 \"\n", + " f\"{test_score_with_shuffling.std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We observe that shuffling the data improves the mean accuracy.\n", - "We could go a little further and plot the distribution of the testing\n", - "score. We can first concatenate the test scores." + "We observe that shuffling the data improves the mean accuracy. We could go a\n", + "little further and plot the distribution of the testing score. We can first\n", + "concatenate the test scores." ] }, { @@ -230,8 +234,23 @@ "\n", "# defines the lower and upper bounds of sample indices\n", "# for each writer\n", - "writer_boundaries = [0, 130, 256, 386, 516, 646, 776, 915, 1029,\n", - " 1157, 1287, 1415, 1545, 1667, 1797]\n", + "writer_boundaries = [\n", + " 0,\n", + " 130,\n", + " 256,\n", + " 386,\n", + " 516,\n", + " 646,\n", + " 776,\n", + " 915,\n", + " 1029,\n", + " 1157,\n", + " 1287,\n", + " 1415,\n", + " 1545,\n", + " 1667,\n", + " 1797,\n", + "]\n", "groups = np.zeros_like(target)\n", "lower_bounds = writer_boundaries[:-1]\n", "upper_bounds = writer_boundaries[1:]\n", @@ -278,21 +297,22 @@ "from sklearn.model_selection import GroupKFold\n", "\n", "cv = GroupKFold()\n", - "test_score = cross_val_score(model, data, target, groups=groups, cv=cv,\n", - " n_jobs=2)\n", - "print(f\"The average accuracy is \"\n", - " f\"{test_score.mean():.3f} \u00b1 \"\n", - " f\"{test_score.std():.3f}\")" + "test_score = cross_val_score(\n", + " model, data, target, groups=groups, cv=cv, n_jobs=2\n", + ")\n", + "print(\n", + " f\"The average accuracy is {test_score.mean():.3f} \u00b1 {test_score.std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We see that this strategy is less optimistic regarding the model generalization\n", - "performance. However, this is the most reliable if our goal is to make\n", - "handwritten digits recognition writers independent. Besides, we can as well\n", - "see that the standard deviation was reduced." + "We see that this strategy is less optimistic regarding the model\n", + "generalization performance. However, this is the most reliable if our goal is\n", + "to make handwritten digits recognition writers independent. Besides, we can as\n", + "well see that the standard deviation was reduced." 
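
To make the grouping mechanics concrete, this toy sketch (illustrative data, unrelated to the digits dataset) shows that `GroupKFold` never splits a group across the train and test sides:

```python
# Each test fold contains whole groups only, never a fraction of a group.
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])
for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups):
    print("test indices:", test_idx, "-> groups:", np.unique(groups[test_idx]))
```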
] }, { @@ -303,8 +323,11 @@ "source": [ "all_scores = pd.DataFrame(\n", " [test_score_no_shuffling, test_score_with_shuffling, test_score],\n", - " index=[\"KFold without shuffling\", \"KFold with shuffling\",\n", - " \"KFold with groups\"],\n", + " index=[\n", + " \"KFold without shuffling\",\n", + " \"KFold with shuffling\",\n", + " \"KFold with groups\",\n", + " ],\n", ").T" ] }, diff --git a/notebooks/cross_validation_learning_curve.ipynb b/notebooks/cross_validation_learning_curve.ipynb index b75f275db..44f04e7d2 100644 --- a/notebooks/cross_validation_learning_curve.ipynb +++ b/notebooks/cross_validation_learning_curve.ipynb @@ -11,8 +11,8 @@ "generalizing. Besides these aspects, it is also important to understand how\n", "the different errors are influenced by the number of samples available.\n", "\n", - "In this notebook, we will show this aspect by looking a the variability of\n", - "the different errors.\n", + "In this notebook, we will show this aspect by looking a the variability of the\n", + "different errors.\n", "\n", "Let's first load the data and create the same model as in the previous\n", "notebook." @@ -69,11 +69,11 @@ "the validation curve, but instead of varying a hyperparameter, we vary the\n", "number of training samples. This curve is called the **learning curve**.\n", "\n", - "It gives information regarding the benefit of adding new training samples\n", - "to improve a model's generalization performance.\n", + "It gives information regarding the benefit of adding new training samples to\n", + "improve a model's generalization performance.\n", "\n", - "Let's compute the learning curve for a decision tree and vary the\n", - "proportion of the training set from 10% to 100%." + "Let's compute the learning curve for a decision tree and vary the proportion\n", + "of the training set from 10% to 100%." ] }, { @@ -83,6 +83,7 @@ "outputs": [], "source": [ "import numpy as np\n", + "\n", "train_sizes = np.linspace(0.1, 1.0, num=5, endpoint=True)\n", "train_sizes" ] @@ -121,8 +122,14 @@ "from sklearn.model_selection import learning_curve\n", "\n", "results = learning_curve(\n", - " regressor, data, target, train_sizes=train_sizes, cv=cv,\n", - " scoring=\"neg_mean_absolute_error\", n_jobs=2)\n", + " regressor,\n", + " data,\n", + " target,\n", + " train_sizes=train_sizes,\n", + " cv=cv,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_jobs=2,\n", + ")\n", "train_size, train_scores, test_scores = results[:3]\n", "# Convert the scores into errors\n", "train_errors, test_errors = -train_scores, -test_scores" @@ -143,10 +150,18 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", - "plt.errorbar(train_size, train_errors.mean(axis=1),\n", - " yerr=train_errors.std(axis=1), label=\"Training error\")\n", - "plt.errorbar(train_size, test_errors.mean(axis=1),\n", - " yerr=test_errors.std(axis=1), label=\"Testing error\")\n", + "plt.errorbar(\n", + " train_size,\n", + " train_errors.mean(axis=1),\n", + " yerr=train_errors.std(axis=1),\n", + " label=\"Training error\",\n", + ")\n", + "plt.errorbar(\n", + " train_size,\n", + " test_errors.mean(axis=1),\n", + " yerr=test_errors.std(axis=1),\n", + " label=\"Testing error\",\n", + ")\n", "plt.legend()\n", "\n", "plt.xscale(\"log\")\n", @@ -163,11 +178,11 @@ "means that the trained model (i.e. decision tree) is clearly overfitting the\n", "training data.\n", "\n", - "Looking at the testing error alone, we observe that the more samples are\n", - "added into the training set, the lower the testing error becomes. 
Also, we\n", - "are searching for the plateau of the testing error for which there is no\n", - "benefit to adding samples anymore or assessing the potential gain of adding\n", - "more samples into the training set.\n", + "Looking at the testing error alone, we observe that the more samples are added\n", + "into the training set, the lower the testing error becomes. Also, we are\n", + "searching for the plateau of the testing error for which there is no benefit\n", + "to adding samples anymore or assessing the potential gain of adding more\n", + "samples into the training set.\n", "\n", "If we achieve a plateau and adding new samples in the training set does not\n", "reduce the testing error, we might have reached the Bayes error rate using the\n", diff --git a/notebooks/cross_validation_nested.ipynb b/notebooks/cross_validation_nested.ipynb index c72fbed5b..635e0b36e 100644 --- a/notebooks/cross_validation_nested.ipynb +++ b/notebooks/cross_validation_nested.ipynb @@ -6,16 +6,14 @@ "source": [ "# Nested cross-validation\n", "\n", - "Cross-validation can be used both for hyperparameter tuning and for\n", - "estimating the generalization performance of a model. However, using\n", - "it for both purposes at the same time is problematic, as the resulting\n", - "evaluation can underestimate some overfitting that results from\n", - "the hyperparameter tuning procedure itself.\n", + "Cross-validation can be used both for hyperparameter tuning and for estimating\n", + "the generalization performance of a model. However, using it for both purposes\n", + "at the same time is problematic, as the resulting evaluation can underestimate\n", + "some overfitting that results from the hyperparameter tuning procedure itself.\n", "\n", - "Philosophically, hyperparameter tuning is a form of machine learning\n", - "itself and therefore, we need another outer loop of cross-validation to\n", - "properly evaluate the generalization performance of the full modeling\n", - "procedure.\n", + "Philosophically, hyperparameter tuning is a form of machine learning itself\n", + "and therefore, we need another outer loop of cross-validation to properly\n", + "evaluate the generalization performance of the full modeling procedure.\n", "\n", "This notebook highlights nested cross-validation and its impact on the\n", "estimated generalization performance compared to naively using a single level\n", @@ -53,12 +51,10 @@ "from sklearn.model_selection import GridSearchCV\n", "from sklearn.svm import SVC\n", "\n", - "param_grid = {\"C\": [0.1, 1, 10], \"gamma\": [.01, .1]}\n", + "param_grid = {\"C\": [0.1, 1, 10], \"gamma\": [0.01, 0.1]}\n", "model_to_tune = SVC()\n", "\n", - "search = GridSearchCV(\n", - " estimator=model_to_tune, param_grid=param_grid, n_jobs=2\n", - ")\n", + "search = GridSearchCV(estimator=model_to_tune, param_grid=param_grid, n_jobs=2)\n", "search.fit(data, target)" ] }, @@ -68,12 +64,12 @@ "source": [ "We recall that, internally, `GridSearchCV` trains several models for each on\n", "sub-sampled training sets and evaluate each of them on the matching testing\n", - "sets using cross-validation. This evaluation procedure is controlled via\n", - "using the `cv` parameter. The procedure is then repeated for all possible\n", + "sets using cross-validation. This evaluation procedure is controlled via using\n", + "the `cv` parameter. 
The procedure is then repeated for all possible\n", "combinations of parameters given in `param_grid`.\n", "\n", - "The attribute `best_params_` gives us the best set of parameters that\n", - "maximize the mean score on the internal test sets." + "The attribute `best_params_` gives us the best set of parameters that maximize\n", + "the mean score on the internal test sets." ] }, { @@ -107,9 +103,8 @@ "source": [ "At this stage, one should be extremely careful using this score. The\n", "misinterpretation would be the following: since this mean score was computed\n", - "using cross-validation test sets, we could use it to assess the\n", - "generalization performance of the model trained with the best\n", - "hyper-parameters.\n", + "using cross-validation test sets, we could use it to assess the generalization\n", + "performance of the model trained with the best hyper-parameters.\n", "\n", "However, we should not forget that we used this score to pick-up the best\n", "model. It means that we used knowledge from the test sets (i.e. test scores)\n", @@ -127,8 +122,8 @@ "dedicated to estimate the testing error of our tuned model.\n", "\n", "In this case, our inner cross-validation always gets the training set of the\n", - "outer cross-validation, making it possible to always compute the final\n", - "testing scores on completely independent sets of samples.\n", + "outer cross-validation, making it possible to always compute the final testing\n", + "scores on completely independent sets of samples.\n", "\n", "Let us do this in one go as follows:" ] @@ -152,8 +147,10 @@ "\n", "# Outer cross-validation to compute the testing score\n", "test_score = cross_val_score(model, data, target, cv=outer_cv, n_jobs=2)\n", - "print(f\"The mean score using nested cross-validation is: \"\n", - " f\"{test_score.mean():.3f} \u00b1 {test_score.std():.3f}\")" + "print(\n", + " \"The mean score using nested cross-validation is: \"\n", + " f\"{test_score.mean():.3f} \u00b1 {test_score.std():.3f}\"\n", + ")" ] }, { @@ -166,9 +163,9 @@ "\n", "We would like to better assess the difference between the nested and\n", "non-nested cross-validation scores to show that the latter can be too\n", - "optimistic in practice. To do this, we repeat the experiment several times\n", - "and shuffle the data differently to ensure that our conclusion does not\n", - "depend on a particular resampling of the data." + "optimistic in practice. To do this, we repeat the experiment several times and\n", + "shuffle the data differently to ensure that our conclusion does not depend on\n", + "a particular resampling of the data." 
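
The repetition below relies on the fact that each `random_state` value produces a different shuffling of the folds; a minimal sketch of that behavior on a toy array:

```python
# Different seeds yield different shuffled splits, giving independent repeats.
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(6)
for seed in range(2):
    cv = KFold(n_splits=3, shuffle=True, random_state=seed)
    print(f"seed {seed}:", [test.tolist() for _, test in cv.split(X)])
```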
] }, { @@ -189,8 +186,9 @@ " outer_cv = KFold(n_splits=3, shuffle=True, random_state=i)\n", "\n", " # Non_nested parameter search and scoring\n", - " model = GridSearchCV(estimator=model_to_tune, param_grid=param_grid,\n", - " cv=inner_cv, n_jobs=2)\n", + " model = GridSearchCV(\n", + " estimator=model_to_tune, param_grid=param_grid, cv=inner_cv, n_jobs=2\n", + " )\n", " model.fit(data, target)\n", " test_score_not_nested.append(model.best_score_)\n", "\n", @@ -232,8 +230,10 @@ "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", "all_scores.plot.box(color=color, vert=False)\n", "plt.xlabel(\"Accuracy\")\n", - "_ = plt.title(\"Comparison of mean accuracy obtained on the test sets with\\n\"\n", - " \"and without nested cross-validation\")" + "_ = plt.title(\n", + " \"Comparison of mean accuracy obtained on the test sets with\\n\"\n", + " \"and without nested cross-validation\"\n", + ")" ] }, { @@ -241,12 +241,12 @@ "metadata": {}, "source": [ "We observe that the generalization performance estimated without using nested\n", - "CV is higher than what we obtain with nested CV. The reason is that the\n", - "tuning procedure itself selects the model with the highest inner CV score. If\n", - "there are many hyper-parameter combinations and if the inner CV scores have\n", - "comparatively large standard deviations, taking the maximum value can lure\n", - "the naive data scientist into over-estimating the true generalization\n", - "performance of the result of the full learning procedure. By using an outer\n", + "CV is higher than what we obtain with nested CV. The reason is that the tuning\n", + "procedure itself selects the model with the highest inner CV score. If there\n", + "are many hyper-parameter combinations and if the inner CV scores have\n", + "comparatively large standard deviations, taking the maximum value can lure the\n", + "naive data scientist into over-estimating the true generalization performance\n", + "of the result of the full learning procedure. 
By using an outer\n", "cross-validation procedure one gets a more trustworthy estimate of the\n", "generalization performance of the full learning procedure, including the\n", "effect of tuning the hyperparameters.\n", diff --git a/notebooks/cross_validation_sol_01.ipynb b/notebooks/cross_validation_sol_01.ipynb index c42ca4f26..04780c59d 100644 --- a/notebooks/cross_validation_sol_01.ipynb +++ b/notebooks/cross_validation_sol_01.ipynb @@ -120,7 +120,7 @@ "outputs": [], "source": [ "print(\n", - " f\"Accuracy score of our model:\\n\"\n", + " \"Accuracy score of our model:\\n\"\n", " f\"{cv_results['test_score'].mean():.3f} \u00b1 \"\n", " f\"{cv_results['test_score'].std():.3f}\"\n", ")" @@ -159,8 +159,14 @@ "gammas = np.logspace(-3, 2, num=30)\n", "param_name = \"svc__gamma\"\n", "train_scores, test_scores = validation_curve(\n", - " model, data, target, param_name=param_name, param_range=gammas, cv=cv,\n", - " n_jobs=2)" + " model,\n", + " data,\n", + " target,\n", + " param_name=param_name,\n", + " param_range=gammas,\n", + " cv=cv,\n", + " n_jobs=2,\n", + ")" ] }, { @@ -237,7 +243,8 @@ "\n", "train_sizes = np.linspace(0.1, 1, num=10)\n", "results = learning_curve(\n", - " model, data, target, train_sizes=train_sizes, cv=cv, n_jobs=2)\n", + " model, data, target, train_sizes=train_sizes, cv=cv, n_jobs=2\n", + ")\n", "train_size, train_scores, test_scores = results[:3]" ] }, diff --git a/notebooks/cross_validation_sol_02.ipynb b/notebooks/cross_validation_sol_02.ipynb index 25b14a574..4c6428669 100644 --- a/notebooks/cross_validation_sol_02.ipynb +++ b/notebooks/cross_validation_sol_02.ipynb @@ -6,12 +6,12 @@ "source": [ "# \ud83d\udcc3 Solution for Exercise M7.01\n", "\n", - "In this exercise we will define dummy classification baselines and use them\n", - "as reference to assess the relative predictive performance of a given model\n", - "of interest.\n", + "In this exercise we will define dummy classification baselines and use them as\n", + "reference to assess the relative predictive performance of a given model of\n", + "interest.\n", "\n", - "We illustrate those baselines with the help of the Adult Census dataset,\n", - "using only the numerical features for the sake of simplicity." + "We illustrate those baselines with the help of the Adult Census dataset, using\n", + "only the numerical features for the sake of simplicity." ] }, { @@ -72,8 +72,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Compute the cross-validation (test) scores for the classifier on this\n", - "dataset. Store the results pandas Series as we did in the previous notebook." + "Compute the cross-validation (test) scores for the classifier on this dataset.\n", + "Store the results pandas Series as we did in the previous notebook." ] }, { @@ -99,9 +99,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, compute the cross-validation scores of a dummy classifier that\n", - "constantly predicts the most frequent class observed the training set. Please\n", - "refer to the online documentation for the [sklearn.dummy.DummyClassifier\n", + "Now, compute the cross-validation scores of a dummy classifier that constantly\n", + "predicts the most frequent class observed the training set. 
Please refer to\n", + "the online documentation for the [sklearn.dummy.DummyClassifier\n", "](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)\n", "class.\n", "\n", @@ -122,7 +122,8 @@ " most_frequent_classifier, data, target, cv=cv, n_jobs=2\n", ")\n", "test_score_most_frequent = pd.Series(\n", - " cv_results_most_frequent[\"test_score\"], name=\"Most frequent class predictor\"\n", + " cv_results_most_frequent[\"test_score\"],\n", + " name=\"Most frequent class predictor\",\n", ")\n", "test_score_most_frequent" ] @@ -131,8 +132,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we collected the results from the baseline and the model,\n", - "concatenate the test scores as columns a single pandas dataframe." + "Now that we collected the results from the baseline and the model, concatenate\n", + "the test scores as columns a single pandas dataframe." ] }, { @@ -144,7 +145,7 @@ "# solution\n", "all_test_scores = pd.concat(\n", " [test_score_logistic_regression, test_score_most_frequent],\n", - " axis='columns',\n", + " axis=\"columns\",\n", ")\n", "all_test_scores" ] @@ -154,8 +155,8 @@ "metadata": {}, "source": [ "\n", - "Next, plot the histogram of the cross-validation test scores for both\n", - "models with the help of [pandas built-in plotting\n", + "Next, plot the histogram of the cross-validation test scores for both models\n", + "with the help of [pandas built-in plotting\n", "function](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#histograms).\n", "\n", "What conclusions do you draw from the results?" @@ -187,10 +188,10 @@ }, "source": [ "We observe that the two histograms are well separated. Therefore the dummy\n", - "classifier with the strategy `most_frequent` has a much lower accuracy\n", - "than the logistic regression classifier. We conclude that the logistic\n", - "regression model can successfully find predictive information in the input\n", - "features to improve upon the baseline." + "classifier with the strategy `most_frequent` has a much lower accuracy than\n", + "the logistic regression classifier. We conclude that the logistic regression\n", + "model can successfully find predictive information in the input features to\n", + "improve upon the baseline." ] }, { @@ -262,7 +263,7 @@ " test_score_dummy_stratified,\n", " test_score_dummy_uniform,\n", " ],\n", - " axis='columns',\n", + " axis=\"columns\",\n", ")" ] }, @@ -292,16 +293,16 @@ "source": [ "We see that using `strategy=\"stratified\"`, the results are much worse than\n", "with the `most_frequent` strategy. Since the classes are imbalanced,\n", - "predicting the most frequent involves that we will be right for the\n", - "proportion of this class (~75% of the samples). However, the `\"stratified\"`\n", - "strategy will randomly generate predictions by respecting the training\n", - "set's class distribution, resulting in some wrong predictions even for\n", - "the most frequent class, hence we obtain a lower accuracy.\n", + "predicting the most frequent involves that we will be right for the proportion\n", + "of this class (~75% of the samples). However, the `\"stratified\"` strategy will\n", + "randomly generate predictions by respecting the training set's class\n", + "distribution, resulting in some wrong predictions even for the most frequent\n", + "class, hence we obtain a lower accuracy.\n", "\n", - "This is even more so for the `strategy=\"uniform\"`: this strategy assigns\n", - "class labels uniformly at random. 
Therefore, on a binary classification\n", - "problem, the cross-validation accuracy is 50% on average, which is the\n", - "weakest of the three dummy baselines." + "This is even more so for the `strategy=\"uniform\"`: this strategy assigns class\n", + "labels uniformly at random. Therefore, on a binary classification problem, the\n", + "cross-validation accuracy is 50% on average, which is the weakest of the three\n", + "dummy baselines." ] }, { @@ -326,16 +327,16 @@ "of interest. When training on such randomly permuted labels, many machine\n", "learning estimators would end up approximately behaving much like the\n", "`DummyClassifier(strategy=\"most_frequent\")` by always predicting the majority\n", - "class, irrespective of the input features. As a result, this\n", - "`\"most_frequent\"` baseline is sometimes called the \"chance level\" for\n", - "imbalanced classification problems, even though its predictions are\n", - "completely deterministic and do not involve much \"chance\" anymore.\n", + "class, irrespective of the input features. As a result, this `\"most_frequent\"`\n", + "baseline is sometimes called the \"chance level\" for imbalanced classification\n", + "problems, even though its predictions are completely deterministic and do not\n", + "involve much \"chance\" anymore.\n", "\n", "Defining the chance level using `permutation_test_score` is quite\n", "computation-intensive because it requires fitting many non-dummy models on\n", - "random permutations of the data. Using dummy classifiers as baselines is\n", - "often enough for practical purposes. For imbalanced classification problems,\n", - "the `\"most_frequent\"` strategy is the strongest of the three baselines and\n", + "random permutations of the data. Using dummy classifiers as baselines is often\n", + "enough for practical purposes. For imbalanced classification problems, the\n", + "`\"most_frequent\"` strategy is the strongest of the three baselines and\n", "therefore the one we should use." ] } diff --git a/notebooks/cross_validation_stratification.ipynb b/notebooks/cross_validation_stratification.ipynb index 90ccad658..7e330c3d2 100644 --- a/notebooks/cross_validation_stratification.ipynb +++ b/notebooks/cross_validation_stratification.ipynb @@ -10,11 +10,11 @@ "In the previous notebooks, we always used either a default `KFold` or a\n", "`ShuffleSplit` cross-validation strategies to iteratively split our dataset.\n", "However, you should not assume that these approaches are always the best\n", - "option: some other cross-validation strategies might be better adapted to\n", - "your problem.\n", + "option: some other cross-validation strategies might be better adapted to your\n", + "problem.\n", "\n", - "Let's start with the concept of stratification by giving an example where\n", - "we can get into trouble if we are not careful. Let's load the iris dataset." + "Let's start with the concept of stratification by giving an example where we\n", + "can get into trouble if we are not careful. Let's load the iris dataset." ] }, { @@ -55,9 +55,9 @@ "metadata": {}, "source": [ "Once we created our model, we will use the cross-validation framework to\n", - "evaluate it. We will use the `KFold` cross-validation strategy.\n", - "We will define a dataset with nine samples and repeat the cross-validation\n", - "three times (i.e. `n_splits`)." + "evaluate it. We will use the `KFold` cross-validation strategy. We will define\n", + "a dataset with nine samples and repeat the cross-validation three times (i.e.\n", + "`n_splits`)." 
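
As a concrete preview of what `KFold` does in the next cells, the following sketch (toy data, not the iris dataset) shows that without shuffling each test fold is a contiguous block of three indices:

```python
# KFold without shuffling slices ordered data into contiguous test blocks.
import numpy as np
from sklearn.model_selection import KFold

X = np.random.randn(9, 1)
for train_idx, test_idx in KFold(n_splits=3).split(X):
    print("test indices:", test_idx)
# prints [0 1 2], then [3 4 5], then [6 7 8]
```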
] }, { @@ -80,11 +80,11 @@ "metadata": {}, "source": [ "By defining three splits, we will use three samples for testing and six for\n", - "training each time. `KFold` does not shuffle by default. It means that it\n", - "will select the three first samples for the testing set at the first split,\n", - "then the three next three samples for the second split, and the three next\n", - "for the last split. In the end, all samples have been used in testing at\n", - "least once among the different splits.\n", + "training each time. `KFold` does not shuffle by default. It means that it will\n", + "select the three first samples for the testing set at the first split, then\n", + "the three next three samples for the second split, and the three next for the\n", + "last split. In the end, all samples have been used in testing at least once\n", + "among the different splits.\n", "\n", "Now, let's apply this strategy to check the generalization performance of our\n", "model." @@ -101,8 +101,9 @@ "cv = KFold(n_splits=3)\n", "results = cross_validate(model, data, target, cv=cv)\n", "test_score = results[\"test_score\"]\n", - "print(f\"The average accuracy is \"\n", - " f\"{test_score.mean():.3f} \u00b1 {test_score.std():.3f}\")" + "print(\n", + " f\"The average accuracy is {test_score.mean():.3f} \u00b1 {test_score.std():.3f}\"\n", + ")" ] }, { @@ -110,8 +111,8 @@ "metadata": {}, "source": [ "It is a real surprise that our model cannot correctly classify any sample in\n", - "any cross-validation split. We will now check our target's value to\n", - "understand the issue." + "any cross-validation split. We will now check our target's value to understand\n", + "the issue." ] }, { @@ -134,8 +135,8 @@ "metadata": {}, "source": [ "We see that the target vector `target` is ordered. It will have some\n", - "unexpected consequences when using the `KFold` cross-validation. To\n", - "illustrate the consequences, we will show the class count in each fold of the\n", + "unexpected consequences when using the `KFold` cross-validation. To illustrate\n", + "the consequences, we will show the class count in each fold of the\n", "cross-validation in the train and test set.\n", "\n", "Let's compute the class counts for both the training and testing sets using\n", @@ -181,8 +182,9 @@ "metadata": {}, "outputs": [], "source": [ - "train_cv_counts = pd.concat(train_cv_counts, axis=1,\n", - " keys=[f\"Fold #{idx}\" for idx in range(n_splits)])\n", + "train_cv_counts = pd.concat(\n", + " train_cv_counts, axis=1, keys=[f\"Fold #{idx}\" for idx in range(n_splits)]\n", + ")\n", "train_cv_counts.index.name = \"Class label\"\n", "train_cv_counts" ] @@ -193,8 +195,9 @@ "metadata": {}, "outputs": [], "source": [ - "test_cv_counts = pd.concat(test_cv_counts, axis=1,\n", - " keys=[f\"Fold #{idx}\" for idx in range(n_splits)])\n", + "test_cv_counts = pd.concat(\n", + " test_cv_counts, axis=1, keys=[f\"Fold #{idx}\" for idx in range(n_splits)]\n", + ")\n", "test_cv_counts.index.name = \"Class label\"\n", "test_cv_counts" ] @@ -234,13 +237,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can confirm that in each fold, only two of the three classes are present\n", - "in the training set and all samples of the remaining class is used as a test\n", - "set. 
So our model is unable to predict this class that was unseen during the\n", + "We can confirm that in each fold, only two of the three classes are present in\n", + "the training set and all samples of the remaining class is used as a test set.\n", + "So our model is unable to predict this class that was unseen during the\n", "training stage.\n", "\n", - "One possibility to solve the issue is to shuffle the data before splitting\n", - "the data into three groups." + "One possibility to solve the issue is to shuffle the data before splitting the\n", + "data into three groups." ] }, { @@ -252,8 +255,9 @@ "cv = KFold(n_splits=3, shuffle=True, random_state=0)\n", "results = cross_validate(model, data, target, cv=cv)\n", "test_score = results[\"test_score\"]\n", - "print(f\"The average accuracy is \"\n", - " f\"{test_score.mean():.3f} \u00b1 {test_score.std():.3f}\")" + "print(\n", + " f\"The average accuracy is {test_score.mean():.3f} \u00b1 {test_score.std():.3f}\"\n", + ")" ] }, { @@ -280,10 +284,12 @@ "\n", " train_cv_counts.append(target_train.value_counts())\n", " test_cv_counts.append(target_test.value_counts())\n", - "train_cv_counts = pd.concat(train_cv_counts, axis=1,\n", - " keys=[f\"Fold #{idx}\" for idx in range(n_splits)])\n", - "test_cv_counts = pd.concat(test_cv_counts, axis=1,\n", - " keys=[f\"Fold #{idx}\" for idx in range(n_splits)])\n", + "train_cv_counts = pd.concat(\n", + " train_cv_counts, axis=1, keys=[f\"Fold #{idx}\" for idx in range(n_splits)]\n", + ")\n", + "test_cv_counts = pd.concat(\n", + " test_cv_counts, axis=1, keys=[f\"Fold #{idx}\" for idx in range(n_splits)]\n", + ")\n", "train_cv_counts.index.name = \"Class label\"\n", "test_cv_counts.index.name = \"Class label\"" ] @@ -345,8 +351,9 @@ "source": [ "results = cross_validate(model, data, target, cv=cv)\n", "test_score = results[\"test_score\"]\n", - "print(f\"The average accuracy is \"\n", - " f\"{test_score.mean():.3f} \u00b1 {test_score.std():.3f}\")" + "print(\n", + " f\"The average accuracy is {test_score.mean():.3f} \u00b1 {test_score.std():.3f}\"\n", + ")" ] }, { @@ -362,10 +369,12 @@ "\n", " train_cv_counts.append(target_train.value_counts())\n", " test_cv_counts.append(target_test.value_counts())\n", - "train_cv_counts = pd.concat(train_cv_counts, axis=1,\n", - " keys=[f\"Fold #{idx}\" for idx in range(n_splits)])\n", - "test_cv_counts = pd.concat(test_cv_counts, axis=1,\n", - " keys=[f\"Fold #{idx}\" for idx in range(n_splits)])\n", + "train_cv_counts = pd.concat(\n", + " train_cv_counts, axis=1, keys=[f\"Fold #{idx}\" for idx in range(n_splits)]\n", + ")\n", + "test_cv_counts = pd.concat(\n", + " test_cv_counts, axis=1, keys=[f\"Fold #{idx}\" for idx in range(n_splits)]\n", + ")\n", "train_cv_counts.index.name = \"Class label\"\n", "test_cv_counts.index.name = \"Class label\"" ] diff --git a/notebooks/cross_validation_time.ipynb b/notebooks/cross_validation_time.ipynb index 2758f513e..b4814bd7b 100644 --- a/notebooks/cross_validation_time.ipynb +++ b/notebooks/cross_validation_time.ipynb @@ -32,9 +32,14 @@ "source": [ "import pandas as pd\n", "\n", - "symbols = {\"TOT\": \"Total\", \"XOM\": \"Exxon\", \"CVX\": \"Chevron\",\n", - " \"COP\": \"ConocoPhillips\", \"VLO\": \"Valero Energy\"}\n", - "template_name = (\"../datasets/financial-data/{}.csv\")\n", + "symbols = {\n", + " \"TOT\": \"Total\",\n", + " \"XOM\": \"Exxon\",\n", + " \"CVX\": \"Chevron\",\n", + " \"COP\": \"ConocoPhillips\",\n", + " \"VLO\": \"Valero Energy\",\n", + "}\n", + "template_name = \"../datasets/financial-data/{}.csv\"\n", 
"\n", "quotes = {}\n", "for symbol in symbols:\n", @@ -85,7 +90,8 @@ "\n", "data, target = quotes.drop(columns=[\"Chevron\"]), quotes[\"Chevron\"]\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, shuffle=True, random_state=0)" + " data, target, shuffle=True, random_state=0\n", + ")" ] }, { @@ -143,10 +149,10 @@ "source": [ "from sklearn.model_selection import cross_val_score\n", "\n", - "test_score = cross_val_score(regressor, data_train, target_train, cv=cv,\n", - " n_jobs=2)\n", - "print(f\"The mean R2 is: \"\n", - " f\"{test_score.mean():.2f} \u00b1 {test_score.std():.2f}\")" + "test_score = cross_val_score(\n", + " regressor, data_train, target_train, cv=cv, n_jobs=2\n", + ")\n", + "print(f\"The mean R2 is: {test_score.mean():.2f} \u00b1 {test_score.std():.2f}\")" ] }, { @@ -196,8 +202,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Similarly, we obtain good results in terms of $R^2$.\n", - "We will plot the training, testing and prediction samples." + "Similarly, we obtain good results in terms of $R^2$. We will plot the\n", + "training, testing and prediction samples." ] }, { @@ -240,7 +246,10 @@ "outputs": [], "source": [ "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, shuffle=False, random_state=0,\n", + " data,\n", + " target,\n", + " shuffle=False,\n", + " random_state=0,\n", ")\n", "regressor.fit(data_train, target_train)\n", "target_predicted = regressor.predict(data_test)\n", @@ -305,10 +314,10 @@ "\n", "groups = quotes.index.to_period(\"Q\")\n", "cv = LeaveOneGroupOut()\n", - "test_score = cross_val_score(regressor, data, target,\n", - " cv=cv, groups=groups, n_jobs=2)\n", - "print(f\"The mean R2 is: \"\n", - " f\"{test_score.mean():.2f} \u00b1 {test_score.std():.2f}\")" + "test_score = cross_val_score(\n", + " regressor, data, target, cv=cv, groups=groups, n_jobs=2\n", + ")\n", + "print(f\"The mean R2 is: {test_score.mean():.2f} \u00b1 {test_score.std():.2f}\")" ] }, { @@ -334,10 +343,10 @@ "from sklearn.model_selection import TimeSeriesSplit\n", "\n", "cv = TimeSeriesSplit(n_splits=groups.nunique())\n", - "test_score = cross_val_score(regressor, data, target,\n", - " cv=cv, groups=groups, n_jobs=2)\n", - "print(f\"The mean R2 is: \"\n", - " f\"{test_score.mean():.2f} \u00b1 {test_score.std():.2f}\")" + "test_score = cross_val_score(\n", + " regressor, data, target, cv=cv, groups=groups, n_jobs=2\n", + ")\n", + "print(f\"The mean R2 is: {test_score.mean():.2f} \u00b1 {test_score.std():.2f}\")" ] }, { diff --git a/notebooks/cross_validation_train_test.ipynb b/notebooks/cross_validation_train_test.ipynb index 6a0e7ecd7..8bcba9aa9 100644 --- a/notebooks/cross_validation_train_test.ipynb +++ b/notebooks/cross_validation_train_test.ipynb @@ -6,9 +6,9 @@ "source": [ "# Cross-validation framework\n", "\n", - "In the previous notebooks, we introduce some concepts regarding the\n", - "evaluation of predictive models. While this section could be slightly\n", - "redundant, we intend to go into details into the cross-validation framework.\n", + "In the previous notebooks, we introduce some concepts regarding the evaluation\n", + "of predictive models. While this section could be slightly redundant, we\n", + "intend to go into details into the cross-validation framework.\n", "\n", "Before we dive in, let's linger on the reasons for always having training and\n", "testing sets. 
Let's first look at the limitation of using a dataset without\n", @@ -68,8 +68,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To simplify future visualization, let's transform the prices from the\n", - "100 (k\\\\$) range to the thousand dollars (k\\\\$) range." + "To simplify future visualization, let's transform the prices from the 100\n", + "(k\\\\$) range to the thousand dollars (k\\\\$) range." ] }, { @@ -118,9 +118,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After training the regressor, we would like to know its potential generalization\n", - "performance once deployed in production. For this purpose, we use the mean\n", - "absolute error, which gives us an error in the native unit, i.e. k\\\\$." + "After training the regressor, we would like to know its potential\n", + "generalization performance once deployed in production. For this purpose, we\n", + "use the mean absolute error, which gives us an error in the native unit, i.e.\n", + "k\\\\$." ] }, { @@ -158,11 +159,11 @@ "

    In this MOOC, we will consistently use the term \"training error\".

    \n", "
\n", "\n", - "We trained a predictive model to minimize the training error but our aim is\n", - "to minimize the error on data that has not been seen during training.\n", + "We trained a predictive model to minimize the training error but our aim is to\n", + "minimize the error on data that has not been seen during training.\n", "\n", - "This error is also called the **generalization error** or the \"true\"\n", - "**testing error**.\n", + "This error is also called the **generalization error** or the \"true\" **testing\n", + "error**.\n", "\n", "
\n", "

Note

\n", @@ -188,7 +189,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0)" + " data, target, random_state=0\n", + ")" ] }, { @@ -249,8 +251,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This testing error is actually about what we would expect from our model if\n", - "it was used in a production environment." + "This testing error is actually about what we would expect from our model if it\n", + "was used in a production environment." ] }, { @@ -320,7 +322,8 @@ "\n", "cv = ShuffleSplit(n_splits=40, test_size=0.3, random_state=0)\n", "cv_results = cross_validate(\n", - " regressor, data, target, cv=cv, scoring=\"neg_mean_absolute_error\")" + " regressor, data, target, cv=cv, scoring=\"neg_mean_absolute_error\"\n", + ")" ] }, { @@ -444,8 +447,10 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"The mean cross-validated testing error is: \"\n", - " f\"{cv_results['test_error'].mean():.2f} k$\")" + "print(\n", + " \"The mean cross-validated testing error is: \"\n", + " f\"{cv_results['test_error'].mean():.2f} k$\"\n", + ")" ] }, { @@ -454,8 +459,10 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"The standard deviation of the testing error is: \"\n", - " f\"{cv_results['test_error'].std():.2f} k$\")" + "print(\n", + " \"The standard deviation of the testing error is: \"\n", + " f\"{cv_results['test_error'].std():.2f} k$\"\n", + ")" ] }, { @@ -527,8 +534,8 @@ "## More detail regarding `cross_validate`\n", "\n", "During cross-validation, many models are trained and evaluated. Indeed, the\n", - "number of elements in each array of the output of `cross_validate` is a\n", - "result from one of these `fit`/`score` procedures. To make it explicit, it is\n", + "number of elements in each array of the output of `cross_validate` is a result\n", + "from one of these `fit`/`score` procedures. To make it explicit, it is\n", "possible to retrieve these fitted models for each of the splits/folds by\n", "passing the option `return_estimator=True` in `cross_validate`." ] diff --git a/notebooks/cross_validation_validation_curve.ipynb b/notebooks/cross_validation_validation_curve.ipynb index bcba7c53c..c2d64c039 100644 --- a/notebooks/cross_validation_validation_curve.ipynb +++ b/notebooks/cross_validation_validation_curve.ipynb @@ -7,8 +7,8 @@ "# Overfit-generalization-underfit\n", "\n", "In the previous notebook, we presented the general cross-validation framework\n", - "and how it helps us quantify the training and testing errors as well\n", - "as their fluctuations.\n", + "and how it helps us quantify the training and testing errors as well as their\n", + "fluctuations.\n", "\n", "In this notebook, we will put these two errors into perspective and show how\n", "they can help us know if our model generalizes, overfits, or underfits.\n", @@ -58,10 +58,10 @@ "source": [ "## Overfitting vs. underfitting\n", "\n", - "To better understand the generalization performance of our model and maybe find\n", - "insights on how to improve it, we will compare the testing error with the\n", - "training error. Thus, we need to compute the error on the training set,\n", - "which is possible using the `cross_validate` function." + "To better understand the generalization performance of our model and maybe\n", + "find insights on how to improve it, we will compare the testing error with the\n", + "training error. 
Thus, we need to compute the error on the training set, which\n", + "is possible using the `cross_validate` function." ] }, { @@ -74,9 +74,15 @@ "from sklearn.model_selection import cross_validate, ShuffleSplit\n", "\n", "cv = ShuffleSplit(n_splits=30, test_size=0.2)\n", - "cv_results = cross_validate(regressor, data, target,\n", - " cv=cv, scoring=\"neg_mean_absolute_error\",\n", - " return_train_score=True, n_jobs=2)\n", + "cv_results = cross_validate(\n", + " regressor,\n", + " data,\n", + " target,\n", + " cv=cv,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " return_train_score=True,\n", + " n_jobs=2,\n", + ")\n", "cv_results = pd.DataFrame(cv_results)" ] }, @@ -84,8 +90,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cross-validation used the negative mean absolute error. We transform\n", - "the negative mean absolute error into a positive mean absolute error." + "The cross-validation used the negative mean absolute error. We transform the\n", + "negative mean absolute error into a positive mean absolute error." ] }, { @@ -96,7 +102,8 @@ "source": [ "scores = pd.DataFrame()\n", "scores[[\"train error\", \"test error\"]] = -cv_results[\n", - " [\"train_score\", \"test_score\"]]" + " [\"train_score\", \"test_score\"]\n", + "]" ] }, { @@ -116,26 +123,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By plotting the distribution of the training and testing errors, we\n", - "get information about whether our model is over-fitting, under-fitting (or\n", - "both at the same time).\n", + "By plotting the distribution of the training and testing errors, we get\n", + "information about whether our model is over-fitting, under-fitting (or both at\n", + "the same time).\n", "\n", - "Here, we observe a **small training error** (actually zero), meaning that\n", - "the model is **not under-fitting**: it is flexible enough to capture any\n", + "Here, we observe a **small training error** (actually zero), meaning that the\n", + "model is **not under-fitting**: it is flexible enough to capture any\n", "variations present in the training set.\n", "\n", - "However the **significantly larger testing error** tells us that the\n", - "model is **over-fitting**: the model has memorized many variations of the\n", - "training set that could be considered \"noisy\" because they do not generalize\n", - "to help us make good prediction on the test set.\n", + "However the **significantly larger testing error** tells us that the model is\n", + "**over-fitting**: the model has memorized many variations of the training set\n", + "that could be considered \"noisy\" because they do not generalize to help us\n", + "make good prediction on the test set.\n", "\n", "## Validation curve\n", "\n", "Some model hyperparameters are usually the key to go from a model that\n", "underfits to a model that overfits, hopefully going through a region were we\n", - "can get a good balance between the two. We can acquire knowledge by plotting\n", - "a curve called the validation curve. This curve can also be applied to the\n", - "above experiment and varies the value of a hyperparameter.\n", + "can get a good balance between the two. We can acquire knowledge by plotting a\n", + "curve called the validation curve. This curve can also be applied to the above\n", + "experiment and varies the value of a hyperparameter.\n", "\n", "For the decision tree, the `max_depth` parameter is used to control the\n", "tradeoff between under-fitting and over-fitting." 
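
Before running the full validation curve below, the effect of `max_depth` on model capacity can be felt on a toy regression task; this sketch is illustrative only and not part of the notebook:

```python
# A depth-1 tree underfits; an unbounded tree memorizes the training data.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 1))
y = np.sin(4 * X.ravel()) + rng.normal(scale=0.3, size=200)
for depth in (1, 10, None):
    tree = DecisionTreeRegressor(max_depth=depth).fit(X, y)
    print(f"max_depth={depth}: training R^2 = {tree.score(X, y):.2f}")
```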
@@ -152,8 +159,15 @@
    "\n",
    "max_depth = [1, 5, 10, 15, 20, 25]\n",
    "train_scores, test_scores = validation_curve(\n",
-    "    regressor, data, target, param_name=\"max_depth\", param_range=max_depth,\n",
-    "    cv=cv, scoring=\"neg_mean_absolute_error\", n_jobs=2)\n",
+    "    regressor,\n",
+    "    data,\n",
+    "    target,\n",
+    "    param_name=\"max_depth\",\n",
+    "    param_range=max_depth,\n",
+    "    cv=cv,\n",
+    "    scoring=\"neg_mean_absolute_error\",\n",
+    "    n_jobs=2,\n",
+    ")\n",
    "train_errors, test_errors = -train_scores, -test_scores"
   ]
  },
@@ -187,32 +201,31 @@
    "The validation curve can be divided into three areas:\n",
    "\n",
    "- For `max_depth < 10`, the decision tree underfits. The training error and\n",
-    "  therefore the testing error are both high. The model is too\n",
-    "  constrained and cannot capture much of the variability of the target\n",
-    "  variable.\n",
+    "  therefore the testing error are both high. The model is too constrained and\n",
+    "  cannot capture much of the variability of the target variable.\n",
    "\n",
    "- The region around `max_depth = 10` corresponds to the parameter for which\n",
    "  the decision tree generalizes the best. It is flexible enough to capture a\n",
    "  fraction of the variability of the target that generalizes, while not\n",
    "  memorizing all of the noise in the target.\n",
    "\n",
-    "- For `max_depth > 10`, the decision tree overfits. The training error\n",
-    "  becomes very small, while the testing error increases. In this\n",
-    "  region, the models create decisions specifically for noisy samples harming\n",
-    "  its ability to generalize to test data.\n",
+    "- For `max_depth > 10`, the decision tree overfits. The training error becomes\n",
+    "  very small, while the testing error increases. In this region, the model\n",
+    "  creates decisions specifically for noisy samples, harming its ability to\n",
+    "  generalize to test data.\n",
    "\n",
    "Note that for `max_depth = 10`, the model overfits a bit as there is a gap\n",
-    "between the training error and the testing error. It can also\n",
-    "potentially underfit also a bit at the same time, because the training error\n",
-    "is still far from zero (more than 30 k\\\\$), meaning that the model might\n",
-    "still be too constrained to model interesting parts of the data. However, the\n",
-    "testing error is minimal, and this is what really matters. This is the\n",
-    "best compromise we could reach by just tuning this parameter.\n",
+    "between the training error and the testing error. It can also potentially\n",
+    "underfit a bit at the same time, because the training error is still far\n",
+    "from zero (more than 30 k\\\\$), meaning that the model might still be too\n",
+    "constrained to model interesting parts of the data. However, the testing error\n",
+    "is minimal, and this is what really matters. This is the best compromise we\n",
+    "could reach by just tuning this parameter.\n",
    "\n",
    "Be aware that looking at the mean errors is quite limiting. We should also\n",
-    "look at the standard deviation to assess the dispersion of the score. We\n",
-    "can repeat the same plot as before but this time, we will add some\n",
-    "information to show the standard deviation of the errors as well."
+    "look at the standard deviation to assess the dispersion of the score. We can\n",
+    "repeat the same plot as before but this time, we will add some information to\n",
+    "show the standard deviation of the errors as well."
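As a side note, once `train_errors` and `test_errors` are computed by the cell above, the best depth can be read off programmatically; a hedged sketch, assuming those arrays and the `max_depth` list are in scope:

```python
# Sketch (not part of the patch): locate the depth minimizing the mean
# cross-validated testing error computed in the cell above.
import numpy as np

mean_test_errors = test_errors.mean(axis=1)
best_idx = np.argmin(mean_test_errors)
print(
    f"Lowest mean testing error: {mean_test_errors[best_idx]:.2f} k$ "
    f"at max_depth={max_depth[best_idx]}"
)
```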
] }, { @@ -221,10 +234,18 @@ "metadata": {}, "outputs": [], "source": [ - "plt.errorbar(max_depth, train_errors.mean(axis=1),\n", - " yerr=train_errors.std(axis=1), label='Training error')\n", - "plt.errorbar(max_depth, test_errors.mean(axis=1),\n", - " yerr=test_errors.std(axis=1), label='Testing error')\n", + "plt.errorbar(\n", + " max_depth,\n", + " train_errors.mean(axis=1),\n", + " yerr=train_errors.std(axis=1),\n", + " label=\"Training error\",\n", + ")\n", + "plt.errorbar(\n", + " max_depth,\n", + " test_errors.mean(axis=1),\n", + " yerr=test_errors.std(axis=1),\n", + " label=\"Testing error\",\n", + ")\n", "plt.legend()\n", "\n", "plt.xlabel(\"Maximum depth of decision tree\")\n", @@ -251,8 +272,7 @@ "\n", "* how to identify whether a model is generalizing, overfitting, or\n", " underfitting;\n", - "* how to check influence of a hyperparameter on the tradeoff\n", - " underfit/overfit." + "* how to check influence of a hyperparameter on the tradeoff underfit/overfit." ] } ], diff --git a/notebooks/datasets_ames_housing.ipynb b/notebooks/datasets_ames_housing.ipynb index a8d261112..4fca639f2 100644 --- a/notebooks/datasets_ames_housing.ipynb +++ b/notebooks/datasets_ames_housing.ipynb @@ -7,9 +7,9 @@ "# The Ames housing dataset\n", "\n", "In this notebook, we will quickly present the \"Ames housing\" dataset. We will\n", - "see that this dataset is similar to the \"California housing\" dataset.\n", - "However, it is more complex to handle: it contains missing data and both\n", - "numerical and categorical features.\n", + "see that this dataset is similar to the \"California housing\" dataset. However,\n", + "it is more complex to handle: it contains missing data and both numerical and\n", + "categorical features.\n", "\n", "This dataset is located in the `datasets` directory. It is stored in a comma\n", "separated value (CSV) file. As previously mentioned, we are aware that the\n", @@ -28,7 +28,7 @@ "source": [ "import pandas as pd\n", "\n", - "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values='?')\n", + "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values=\"?\")\n", "ames_housing = ames_housing.drop(columns=\"Id\")" ] }, @@ -64,7 +64,10 @@ "outputs": [], "source": [ "target_name = \"SalePrice\"\n", - "data, target = ames_housing.drop(columns=target_name), ames_housing[target_name]" + "data, target = (\n", + " ames_housing.drop(columns=target_name),\n", + " ames_housing[target_name],\n", + ")" ] }, { @@ -98,6 +101,7 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", + "\n", "target.plot.hist(bins=20, edgecolor=\"black\")\n", "plt.xlabel(\"House price in $\")\n", "_ = plt.title(\"Distribution of the house price \\nin Ames\")" @@ -151,8 +155,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the data are mainly represented with integer number. Let's have\n", - "a look at the histogram for all these features." + "We see that the data are mainly represented with integer number. Let's have a\n", + "look at the histogram for all these features." ] }, { @@ -161,8 +165,9 @@ "metadata": {}, "outputs": [], "source": [ - "numerical_data.hist(bins=20, figsize=(12, 22), edgecolor=\"black\",\n", - " layout=(9, 4))\n", + "numerical_data.hist(\n", + " bins=20, figsize=(12, 22), edgecolor=\"black\", layout=(9, 4)\n", + ")\n", "plt.subplots_adjust(hspace=0.8, wspace=0.8)" ] }, @@ -171,8 +176,8 @@ "metadata": {}, "source": [ "We see that some features have high picks for 0. 
It could be linked that this\n", - "value was assigned when the criterion did not apply, for instance the\n", - "area of the swimming pool when no swimming pools are available.\n", + "value was assigned when the criterion did not apply, for instance the area of\n", + "the swimming pool when no swimming pools are available.\n", "\n", "We also have some feature encoding some date (for instance year).\n", "\n", @@ -256,7 +261,9 @@ "metadata": {}, "outputs": [], "source": [ - "ames_housing_no_missing = pd.read_csv(\"../datasets/ames_housing_no_missing.csv\")\n", + "ames_housing_no_missing = pd.read_csv(\n", + " \"../datasets/ames_housing_no_missing.csv\"\n", + ")\n", "ames_housing_no_missing.head()" ] }, @@ -267,7 +274,8 @@ "It contains the same information as the original dataset after using a\n", "[`sklearn.impute.SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)\n", "to replace missing values using the mean along each numerical column\n", - "(including the target), and the most frequent value along each categorical column." + "(including the target), and the most frequent value along each categorical\n", + "column." ] }, { @@ -322,7 +330,9 @@ " columns=categorical_features.tolist() + numerical_features,\n", ")\n", "ames_housing_preprocessed = ames_housing_preprocessed[ames_housing.columns]\n", - "ames_housing_preprocessed = ames_housing_preprocessed.astype(ames_housing.dtypes)\n", + "ames_housing_preprocessed = ames_housing_preprocessed.astype(\n", + " ames_housing.dtypes\n", + ")\n", "(ames_housing_no_missing == ames_housing_preprocessed).all()" ] } diff --git a/notebooks/datasets_bike_rides.ipynb b/notebooks/datasets_bike_rides.ipynb index 5d12eeb9e..c4cb53450 100644 --- a/notebooks/datasets_bike_rides.ipynb +++ b/notebooks/datasets_bike_rides.ipynb @@ -28,10 +28,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The first column `timestamp` contains a specific information regarding the\n", - "the time and date of a record while other columns contain numerical value\n", - "of some specific measurements. Let's check the data type of the columns more\n", - "in details." + "The first column `timestamp` contains a specific information regarding the the\n", + "time and date of a record while other columns contain numerical value of some\n", + "specific measurements. Let's check the data type of the columns more in\n", + "details." ] }, { @@ -47,16 +47,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Indeed, CSV format store data as text. Pandas tries to infer numerical type\n", - "by default. It is the reason why all features but `timestamp` are encoded as\n", + "Indeed, CSV format store data as text. Pandas tries to infer numerical type by\n", + "default. It is the reason why all features but `timestamp` are encoded as\n", "floating point values. However, we see that the `timestamp` is stored as an\n", "`object` column. It means that the data in this column are stored as `str`\n", "rather than a specialized `datetime` data type.\n", "\n", - "In fact, one needs to set an option such that pandas is directed to infer\n", - "such data type when opening the file. In addition, we will want to use\n", - "`timestamp` as an index. Thus, we can reopen the file with some extra\n", - "arguments to help pandas at reading properly our CSV file." + "In fact, one needs to set an option such that pandas is directed to infer such\n", + "data type when opening the file. In addition, we will want to use `timestamp`\n", + "as an index. 
Thus, we can reopen the file with some extra arguments to help\n", + "pandas at reading properly our CSV file." ] }, { @@ -65,8 +65,9 @@ "metadata": {}, "outputs": [], "source": [ - "cycling = pd.read_csv(\"../datasets/bike_rides.csv\", index_col=0,\n", - " parse_dates=True)\n", + "cycling = pd.read_csv(\n", + " \"../datasets/bike_rides.csv\", index_col=0, parse_dates=True\n", + ")\n", "cycling.index.name = \"\"\n", "cycling.head()" ] @@ -84,40 +85,40 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By specifying to pandas to parse the date, we obtain a `DatetimeIndex` that\n", - "is really handy when filtering data based on date.\n", + "By specifying to pandas to parse the date, we obtain a `DatetimeIndex` that is\n", + "really handy when filtering data based on date.\n", "\n", - "We can now have a look at the data stored in our dataframe. It will help us\n", - "to frame the data science problem that we try to solve.\n", + "We can now have a look at the data stored in our dataframe. It will help us to\n", + "frame the data science problem that we try to solve.\n", "\n", - "The records correspond at information derived from GPS recordings of a\n", - "cyclist (`speed`, `acceleration`, `slope`) and some extra information\n", - "acquired from other sensors: `heart-rate` that corresponds to the number of\n", - "beats per minute of the cyclist heart, `cadence` that is the rate at which a\n", - "cyclist is turning the pedals, and `power` that corresponds to the work\n", - "required by the cyclist to go forward.\n", + "The records correspond at information derived from GPS recordings of a cyclist\n", + "(`speed`, `acceleration`, `slope`) and some extra information acquired from\n", + "other sensors: `heart-rate` that corresponds to the number of beats per minute\n", + "of the cyclist heart, `cadence` that is the rate at which a cyclist is turning\n", + "the pedals, and `power` that corresponds to the work required by the cyclist\n", + "to go forward.\n", "\n", "The power might be slightly an abstract quantity so let's give a more\n", "intuitive explanation.\n", "\n", - "Let's take the example of a soup blender that one uses to blend vegetable.\n", - "The engine of this blender develop an instantaneous power of ~300 Watts to\n", - "blend the vegetable. Here, our cyclist is just the engine of the blender (at\n", - "the difference that an average cyclist will develop an instantaneous power\n", - "around ~150 Watts) and blending the vegetable corresponds to move the\n", - "cyclist's bike forward.\n", + "Let's take the example of a soup blender that one uses to blend vegetable. The\n", + "engine of this blender develop an instantaneous power of ~300 Watts to blend\n", + "the vegetable. Here, our cyclist is just the engine of the blender (at the\n", + "difference that an average cyclist will develop an instantaneous power around\n", + "~150 Watts) and blending the vegetable corresponds to move the cyclist's bike\n", + "forward.\n", "\n", "Professional cyclists are using power to calibrate their training and track\n", "the energy spent during a ride. For instance, riding at a higher power\n", "requires more energy and thus, you need to provide resources to create this\n", - "energy. With human, this resource is food. For our soup blender, this\n", - "resource can be uranium, petrol, natural gas, coal, etc. Our body serves as a\n", - "power plant to transform the resources into energy.\n", + "energy. With human, this resource is food. 
For our soup blender, this resource\n", + "can be uranium, petrol, natural gas, coal, etc. Our body serves as a power\n", + "plant to transform the resources into energy.\n", "\n", "The issue with measuring power is linked to the cost of the sensor: a cycling\n", - "power meter. The cost of such sensor vary from $400 to $1000. Thus, our\n", - "data science problem is quite easy: can we predict instantaneous cyclist\n", - "power from other (cheaper) sensors." + "power meter. The cost of such sensor vary from $400 to $1000. Thus, our data\n", + "science problem is quite easy: can we predict instantaneous cyclist power from\n", + "other (cheaper) sensors." ] }, { @@ -209,10 +210,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The starting date is the August 18, 2020 and the ending date is\n", - "September 13, 2020. However, it is obvious that our cyclist did not ride\n", - "every seconds between these dates. Indeed, only a couple of date should be\n", - "present in the dataframe, corresponding to the number of cycling rides." + "The starting date is the August 18, 2020 and the ending date is September 13,\n", + "2020. However, it is obvious that our cyclist did not ride every seconds\n", + "between these dates. Indeed, only a couple of date should be present in the\n", + "dataframe, corresponding to the number of cycling rides." ] }, { @@ -250,7 +251,7 @@ "outputs": [], "source": [ "data_ride.plot()\n", - "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "_ = plt.title(\"Sensor values for different cyclist measurements\")" ] }, @@ -260,8 +261,8 @@ "source": [ "Since the unit and range of each measurement (feature) is different, it is\n", "rather difficult to interpret the plot. Also, the high temporal resolution\n", - "make it difficult to make any observation. We could resample the data to get\n", - "a smoother visualization." + "make it difficult to make any observation. We could resample the data to get a\n", + "smoother visualization." ] }, { @@ -271,7 +272,7 @@ "outputs": [], "source": [ "data_ride.resample(\"60S\").mean().plot()\n", - "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "_ = plt.title(\"Sensor values for different cyclist measurements\")" ] }, @@ -290,8 +291,13 @@ "source": [ "axs = data_ride.hist(figsize=(10, 12), bins=50, edgecolor=\"black\", grid=False)\n", "# add the units to the plots\n", - "units = [\"beats per minute\", \"rotations per minute\", \"meters per second\",\n", - " \"meters per second squared\", \"%\"]\n", + "units = [\n", + " \"beats per minute\",\n", + " \"rotations per minute\",\n", + " \"meters per second\",\n", + " \"meters per second squared\",\n", + " \"%\",\n", + "]\n", "for unit, ax in zip(units, axs.ravel()):\n", " ax.set_xlabel(unit)\n", "plt.subplots_adjust(hspace=0.6)" @@ -302,8 +308,8 @@ "metadata": {}, "source": [ "From these plots, we can see some interesting information: a cyclist is\n", - "spending some time without pedaling. This samples should be associated with\n", - "a null power. We also see that the slope have large extremum.\n", + "spending some time without pedaling. This samples should be associated with a\n", + "null power. We also see that the slope have large extremum.\n", "\n", "Let's make a pair plot on a subset of data samples to see if we can confirm\n", "some of these intuitions." 
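The cell that actually draws this pair plot is not shown in the diff; a minimal version could look like the following sketch, where the subset size and transparency are illustrative choices rather than the notebook's own:

```python
# Hedged sketch of the pair plot described above; assumes `data_ride` from
# the preceding cells. The sample size and alpha are illustrative.
import seaborn as sns

subset = data_ride.sample(n=500, random_state=0)
_ = sns.pairplot(data=subset, plot_kws={"alpha": 0.3})
```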
@@ -348,12 +354,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Indeed, we see that low cadence is associated with low power. We can also\n",
-    "the a link between higher slope / high heart-rate and higher power: a cyclist\n",
-    "need to develop more energy to go uphill enforcing a stronger physiological\n",
-    "stimuli on the body. We can confirm this intuition by looking at the\n",
-    "interaction between the slope and the speed: a lower speed with a higher\n",
-    "slope is usually associated with higher power."
+    "Indeed, we see that low cadence is associated with low power. We can also see\n",
+    "a link between higher slope / high heart-rate and higher power: a cyclist\n",
+    "needs to develop more energy to go uphill, enforcing a stronger physiological\n",
+    "stimulus on the body. We can confirm this intuition by looking at the\n",
+    "interaction between the slope and the speed: a lower speed with a higher\n",
+    "slope is usually associated with higher power."
   ]
  }
 ],
diff --git a/notebooks/datasets_blood_transfusion.ipynb b/notebooks/datasets_blood_transfusion.ipynb
index 1b369d9df..7d6d89006 100644
--- a/notebooks/datasets_blood_transfusion.ipynb
+++ b/notebooks/datasets_blood_transfusion.ipynb
@@ -7,8 +7,8 @@
    "# The blood transfusion dataset\n",
    "\n",
    "In this notebook, we will present the \"blood transfusion\" dataset. This\n",
-    "dataset is locally available in the directory `datasets` and it is stored as\n",
-    "a comma separated value (CSV) file. We start by loading the entire dataset."
+    "dataset is locally available in the directory `datasets` and it is stored as a\n",
+    "comma separated value (CSV) file. We start by loading the entire dataset."
   ]
  },
 {
@@ -42,10 +42,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In this dataframe, we can see that the last column correspond to the target\n",
-    "to be predicted called `\"Class\"`. We will create two variables, `data` and\n",
-    "`target` to separate the data from which we could learn a predictive model\n",
-    "and the `target` that should be predicted."
+    "In this dataframe, we can see that the last column corresponds to the target\n",
+    "to be predicted, called `\"Class\"`. We will create two variables, `data` and\n",
+    "`target`, to separate the data from which we could learn a predictive model\n",
+    "and the `target` that should be predicted."
   ]
  },
 {
@@ -78,19 +78,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We observe four columns. Each record corresponds to a person that intended\n",
-    "to give blood. The information stored in each column are:\n",
+    "We observe four columns. Each record corresponds to a person who intended to\n",
+    "give blood. The information stored in each column is:\n",
    "\n",
-    "* `Recency`: the time in months since the last time a person intended to\n",
-    "  give blood;\n",
-    "* `Frequency`: the number of time a person intended to give blood in the\n",
-    "  past;\n",
+    "* `Recency`: the time in months since the last time a person intended to give\n",
+    "  blood;\n",
+    "* `Frequency`: the number of times a person intended to give blood in the past;\n",
    "* `Monetary`: the amount of blood given in the past (in c.c.);\n",
    "* `Time`: the time in months since the first time a person intended to give\n",
    "  blood.\n",
    "\n",
-    "Now, let's have a look regarding the type of data that we are dealing in\n",
-    "these columns and if any missing values are present in our dataset."
+ "Now, let's have a look regarding the type of data that we are dealing in these\n", + "columns and if any missing values are present in our dataset." ] }, { @@ -181,8 +180,8 @@ "important: a classifier that would predict always this `\"not donated\"` class\n", "would achieve an accuracy of 76% of good classification without using any\n", "information from the data itself. This issue is known as class imbalance. One\n", - "should take care about the generalization performance metric used to evaluate a\n", - "model as well as the predictive model chosen itself.\n", + "should take care about the generalization performance metric used to evaluate\n", + "a model as well as the predictive model chosen itself.\n", "\n", "Now, let's have a naive analysis to see if there is a link between features\n", "and the target using a pair plot representation." @@ -205,14 +204,14 @@ "source": [ "Looking at the diagonal plots, we don't see any feature that individually\n", "could help at separating the two classes. When looking at a pair of feature,\n", - "we don't see any striking combinations as well. However, we can note that\n", - "the `\"Monetary\"` and `\"Frequency\"` features are perfectly correlated: all the\n", - "data points are aligned on a diagonal.\n", + "we don't see any striking combinations as well. However, we can note that the\n", + "`\"Monetary\"` and `\"Frequency\"` features are perfectly correlated: all the data\n", + "points are aligned on a diagonal.\n", "\n", "As a conclusion, this dataset would be a challenging dataset: it suffer from\n", "class imbalance, correlated features and thus very few features will be\n", - "available to learn a model, and none of the feature combinations were found\n", - "to help at predicting." + "available to learn a model, and none of the feature combinations were found to\n", + "help at predicting." ] } ], diff --git a/notebooks/datasets_california_housing.ipynb b/notebooks/datasets_california_housing.ipynb index 8cc957f5f..88dbf5b72 100644 --- a/notebooks/datasets_california_housing.ipynb +++ b/notebooks/datasets_california_housing.ipynb @@ -6,8 +6,8 @@ "source": [ "# The California housing dataset\n", "\n", - "In this notebook, we will quickly present the dataset known as the\n", - "\"California housing dataset\". This dataset can be fetched from internet using\n", + "In this notebook, we will quickly present the dataset known as the \"California\n", + "housing dataset\". This dataset can be fetched from internet using\n", "scikit-learn." ] }, @@ -79,8 +79,8 @@ "In this dataset, we have information regarding the demography (income,\n", "population, house occupancy) in the districts, the location of the districts\n", "(latitude, longitude), and general information regarding the house in the\n", - "districts (number of rooms, number of bedrooms, age of the house). Since\n", - "these statistics are at the granularity of the district, they corresponds to\n", + "districts (number of rooms, number of bedrooms, age of the house). Since these\n", + "statistics are at the granularity of the district, they corresponds to\n", "averages or medians.\n", "\n", "Now, let's have a look to the target to be predicted." @@ -150,9 +150,9 @@ "We can first focus on features for which their distributions would be more or\n", "less expected.\n", "\n", - "The median income is a distribution with a long tail. 
It means that the\n", - "salary of people is more or less normally distributed but there is some\n", - "people getting a high salary.\n", + "The median income is a distribution with a long tail. It means that the salary\n", + "of people is more or less normally distributed but there is some people\n", + "getting a high salary.\n", "\n", "Regarding the average house age, the distribution is more or less uniform.\n", "\n", @@ -181,16 +181,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For each of these features, comparing the `max` and `75%` values, we can see\n", - "a huge difference. It confirms the intuitions that there are a couple of\n", - "extreme values.\n", + "For each of these features, comparing the `max` and `75%` values, we can see a\n", + "huge difference. It confirms the intuitions that there are a couple of extreme\n", + "values.\n", "\n", "Up to know, we discarded the longitude and latitude that carry geographical\n", - "information. In short, the combination of this feature could help us to\n", - "decide if there are locations associated with high-valued houses. Indeed,\n", - "we could make a scatter plot where the x- and y-axis would be the latitude\n", - "and longitude and the circle size and color would be linked with the house\n", - "value in the district." + "information. In short, the combination of this feature could help us to decide\n", + "if there are locations associated with high-valued houses. Indeed, we could\n", + "make a scatter plot where the x- and y-axis would be the latitude and\n", + "longitude and the circle size and color would be linked with the house value\n", + "in the district." ] }, { @@ -201,11 +201,16 @@ "source": [ "import seaborn as sns\n", "\n", - "sns.scatterplot(data=california_housing.frame, x=\"Longitude\", y=\"Latitude\",\n", - " size=\"MedHouseVal\", hue=\"MedHouseVal\",\n", - " palette=\"viridis\", alpha=0.5)\n", - "plt.legend(title=\"MedHouseVal\", bbox_to_anchor=(1.05, 0.95),\n", - " loc=\"upper left\")\n", + "sns.scatterplot(\n", + " data=california_housing.frame,\n", + " x=\"Longitude\",\n", + " y=\"Latitude\",\n", + " size=\"MedHouseVal\",\n", + " hue=\"MedHouseVal\",\n", + " palette=\"viridis\",\n", + " alpha=0.5,\n", + ")\n", + "plt.legend(title=\"MedHouseVal\", bbox_to_anchor=(1.05, 0.95), loc=\"upper left\")\n", "_ = plt.title(\"Median house value depending of\\n their spatial location\")" ] }, @@ -214,13 +219,13 @@ "metadata": {}, "source": [ "If you are not familiar with the state of California, it is interesting to\n", - "notice that all datapoints show a graphical representation of this state.\n", - "We note that the high-valued houses will be located on the coast, where the\n", - "big cities from California are located: San Diego, Los Angeles, San Jose, or\n", - "San Francisco.\n", + "notice that all datapoints show a graphical representation of this state. We\n", + "note that the high-valued houses will be located on the coast, where the big\n", + "cities from California are located: San Diego, Los Angeles, San Jose, or San\n", + "Francisco.\n", "\n", - "We can do a random subsampling to have less data points to plot but that\n", - "could still allow us to see these specificities." + "We can do a random subsampling to have less data points to plot but that could\n", + "still allow us to see these specificities." 
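As an aside, the same subsampling can be written more compactly with pandas; the notebook itself builds an index array with NumPy, as the next hunk shows:

```python
# Illustrative pandas equivalent of the NumPy-based subsampling below:
# 500 rows drawn without replacement, with a fixed seed.
subset = california_housing.frame.sample(n=500, random_state=0)
```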
] }, { @@ -232,8 +237,9 @@ "import numpy as np\n", "\n", "rng = np.random.RandomState(0)\n", - "indices = rng.choice(np.arange(california_housing.frame.shape[0]), size=500,\n", - " replace=False)" + "indices = rng.choice(\n", + " np.arange(california_housing.frame.shape[0]), size=500, replace=False\n", + ")" ] }, { @@ -242,12 +248,16 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(data=california_housing.frame.iloc[indices],\n", - " x=\"Longitude\", y=\"Latitude\",\n", - " size=\"MedHouseVal\", hue=\"MedHouseVal\",\n", - " palette=\"viridis\", alpha=0.5)\n", - "plt.legend(title=\"MedHouseVal\", bbox_to_anchor=(1.05, 1),\n", - " loc=\"upper left\")\n", + "sns.scatterplot(\n", + " data=california_housing.frame.iloc[indices],\n", + " x=\"Longitude\",\n", + " y=\"Latitude\",\n", + " size=\"MedHouseVal\",\n", + " hue=\"MedHouseVal\",\n", + " palette=\"viridis\",\n", + " alpha=0.5,\n", + ")\n", + "plt.legend(title=\"MedHouseVal\", bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "_ = plt.title(\"Median house value depending of\\n their spatial location\")" ] }, @@ -289,8 +299,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While it is always complicated to interpret a pairplot since there is a lot\n", - "of data, here we can get a couple of intuitions. We can confirm that some\n", + "While it is always complicated to interpret a pairplot since there is a lot of\n", + "data, here we can get a couple of intuitions. We can confirm that some\n", "features have extreme values (outliers?). We can as well see that the median\n", "income is helpful to distinguish high-valued from low-valued houses.\n", "\n", @@ -299,7 +309,7 @@ "house values.\n", "\n", "If you are curious, we created a linear predictive model below and show the\n", - "values of the coefficients obtained via cross-validation" + "values of the coefficients obtained via cross-validation." ] }, { @@ -316,8 +326,12 @@ "alphas = np.logspace(-3, 1, num=30)\n", "model = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas))\n", "cv_results = cross_validate(\n", - " model, california_housing.data, california_housing.target,\n", - " return_estimator=True, n_jobs=2)" + " model,\n", + " california_housing.data,\n", + " california_housing.target,\n", + " return_estimator=True,\n", + " n_jobs=2,\n", + ")" ] }, { @@ -340,7 +354,7 @@ "\n", "coefs = pd.DataFrame(\n", " [est[-1].coef_ for est in cv_results[\"estimator\"]],\n", - " columns=california_housing.feature_names\n", + " columns=california_housing.feature_names,\n", ")" ] }, @@ -361,8 +375,8 @@ "metadata": {}, "source": [ "It seems that the three features that we earlier spotted are found important\n", - "by this model. But be careful regarding interpreting these coefficients.\n", - "We let you go into the module \"Interpretation\" to go in depth regarding such\n", + "by this model. But be careful regarding interpreting these coefficients. We\n", + "let you go into the module \"Interpretation\" to go in depth regarding such\n", "experiment." ] } diff --git a/notebooks/dev_features_importance.ipynb b/notebooks/dev_features_importance.ipynb index d036f4a26..5e9b1e41f 100644 --- a/notebooks/dev_features_importance.ipynb +++ b/notebooks/dev_features_importance.ipynb @@ -11,8 +11,8 @@ "\n", "1. interpreting the coefficients in a linear model;\n", "2. the attribute `feature_importances_` in RandomForest;\n", - "3. `permutation feature importance`, which is an inspection technique that\n", - " can be used for any fitted model." + "3. 
`permutation feature importance`, which is an inspection technique that can\n", + " be used for any fitted model." ] }, { @@ -106,8 +106,8 @@ "\n", "# Adding random features\n", "rng = np.random.RandomState(0)\n", - "bin_var = pd.Series(rng.randint(0, 1, X.shape[0]), name='rnd_bin')\n", - "num_var = pd.Series(np.arange(X.shape[0]), name='rnd_num')\n", + "bin_var = pd.Series(rng.randint(0, 1, X.shape[0]), name=\"rnd_bin\")\n", + "num_var = pd.Series(np.arange(X.shape[0]), name=\"rnd_num\")\n", "X_with_rnd_feat = pd.concat((X, bin_var, num_var), axis=1)" ] }, @@ -126,15 +126,17 @@ "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", - "X_train, X_test, y_train, y_test = train_test_split(X_with_rnd_feat, y,\n", - " random_state=29)" + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_with_rnd_feat, y, random_state=29\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's quickly inspect some features and the target" + "Let's quickly inspect some features and the target:" ] }, { @@ -148,8 +150,13 @@ "train_dataset = X_train.copy()\n", "train_dataset.insert(0, \"MedHouseVal\", y_train)\n", "_ = sns.pairplot(\n", - " train_dataset[['MedHouseVal', 'Latitude', 'AveRooms', 'AveBedrms', 'MedInc']],\n", - " kind='reg', diag_kind='kde', plot_kws={'scatter_kws': {'alpha': 0.1}})" + " train_dataset[\n", + " [\"MedHouseVal\", \"Latitude\", \"AveRooms\", \"AveBedrms\", \"MedInc\"]\n", + " ],\n", + " kind=\"reg\",\n", + " diag_kind=\"kde\",\n", + " plot_kws={\"scatter_kws\": {\"alpha\": 0.1}},\n", + ")" ] }, { @@ -159,8 +166,8 @@ "We see in the upper right plot that the median income seems to be positively\n", "correlated to the median house price (the target).\n", "\n", - "We can also see that the average number of rooms `AveRooms` is very\n", - "correlated to the average number of bedrooms `AveBedrms`." + "We can also see that the average number of rooms `AveRooms` is very correlated\n", + "to the average number of bedrooms `AveBedrms`." ] }, { @@ -179,10 +186,9 @@ "\n", "Coefficients represent the relationship between the given feature $X_i$ and\n", "the target $y$, assuming that all the other features remain constant\n", - "(conditional dependence). This is different from plotting $X_i$ versus $y$\n", - "and fitting a linear relationship: in that case all possible values of the\n", - "other features are taken into account in the estimation (marginal\n", - "dependence)." + "(conditional dependence). This is different from plotting $X_i$ versus $y$ and\n", + "fitting a linear relationship: in that case all possible values of the other\n", + "features are taken into account in the estimation (marginal dependence)." ] }, { @@ -197,8 +203,8 @@ "\n", "model.fit(X_train, y_train)\n", "\n", - "print(f'model score on training data: {model.score(X_train, y_train)}')\n", - "print(f'model score on testing data: {model.score(X_test, y_test)}')" + "print(f\"model score on training data: {model.score(X_train, y_train)}\")\n", + "print(f\"model score on testing data: {model.score(X_test, y_test)}\")" ] }, { @@ -207,7 +213,7 @@ "source": [ "Our linear model obtains a $R^2$ score of .60, so it explains a significant\n", "part of the target. Its coefficient should be somehow relevant. 
Let's look at\n", - "the coefficient learnt" + "the coefficient learnt:" ] }, { @@ -221,14 +227,13 @@ "import matplotlib.pyplot as plt\n", "\n", "coefs = pd.DataFrame(\n", - " model.coef_,\n", - " columns=['Coefficients'], index=X_train.columns\n", + " model.coef_, columns=[\"Coefficients\"], index=X_train.columns\n", ")\n", "\n", - "coefs.plot(kind='barh', figsize=(9, 7))\n", - "plt.title('Ridge model')\n", - "plt.axvline(x=0, color='.5')\n", - "plt.subplots_adjust(left=.3)" + "coefs.plot(kind=\"barh\", figsize=(9, 7))\n", + "plt.title(\"Ridge model\")\n", + "plt.axvline(x=0, color=\".5\")\n", + "plt.subplots_adjust(left=0.3)" ] }, { @@ -243,24 +248,24 @@ "price of houses decreases with the number of rooms?\n", "```\n", "\n", - "The coefficients of a linear model are a *conditional* association:\n", - "they quantify the variation of a the output (the price) when the given\n", - "feature is varied, **keeping all other features constant**. We should\n", - "not interpret them as a *marginal* association, characterizing the link\n", - "between the two quantities ignoring all the rest.\n", + "The coefficients of a linear model are a *conditional* association: they\n", + "quantify the variation of a the output (the price) when the given feature is\n", + "varied, **keeping all other features constant**. We should not interpret them\n", + "as a *marginal* association, characterizing the link between the two\n", + "quantities ignoring all the rest.\n", "\n", - "The coefficient associated to `AveRooms` is negative because the number\n", - "of rooms is strongly correlated with the number of bedrooms,\n", - "`AveBedrms`. What we are seeing here is that for districts where the houses\n", - "have the same number of bedrooms on average, when there are more rooms\n", - "(hence non-bedroom rooms), the houses are worth comparatively less.\n", + "The coefficient associated to `AveRooms` is negative because the number of\n", + "rooms is strongly correlated with the number of bedrooms, `AveBedrms`. What we\n", + "are seeing here is that for districts where the houses have the same number of\n", + "bedrooms on average, when there are more rooms (hence non-bedroom rooms), the\n", + "houses are worth comparatively less.\n", "\n", "### Scale of coefficients\n", "\n", "The `AveBedrms` have the higher coefficient. However, we can't compare the\n", "magnitude of these coefficients directly, since they are not scaled. Indeed,\n", - "`Population` is an integer which can be thousands, while `AveBedrms` is\n", - "around 4 and Latitude is in degree.\n", + "`Population` is an integer which can be thousands, while `AveBedrms` is around\n", + "4 and Latitude is in degree.\n", "\n", "So the Population coefficient is expressed in \"$100k\\\\$$ / habitant\" while the\n", "AveBedrms is expressed in \"$100k\\\\$$ / nb of bedrooms\" and the Latitude\n", @@ -268,8 +273,8 @@ "\n", "We see that changing population by one does not change the outcome, while as\n", "we go south (latitude increase) the price becomes cheaper. Also, adding a\n", - "bedroom (keeping all other feature constant) shall rise the price of the\n", - "house by 80k$." + "bedroom (keeping all other feature constant) shall rise the price of the house\n", + "by 80k$." ] }, { @@ -290,9 +295,9 @@ "metadata": {}, "outputs": [], "source": [ - "X_train.std(axis=0).plot(kind='barh', figsize=(9, 7))\n", - "plt.title('Features std. dev.')\n", - "plt.subplots_adjust(left=.3)\n", + "X_train.std(axis=0).plot(kind=\"barh\", figsize=(9, 7))\n", + "plt.title(\"Features std. 
dev.\")\n", + "plt.subplots_adjust(left=0.3)\n", "plt.xlim((0, 100))" ] }, @@ -317,8 +322,8 @@ "\n", "model.fit(X_train, y_train)\n", "\n", - "print(f'model score on training data: {model.score(X_train, y_train)}')\n", - "print(f'model score on testing data: {model.score(X_test, y_test)}')" + "print(f\"model score on training data: {model.score(X_train, y_train)}\")\n", + "print(f\"model score on testing data: {model.score(X_test, y_test)}\")" ] }, { @@ -328,14 +333,13 @@ "outputs": [], "source": [ "coefs = pd.DataFrame(\n", - " model[1].coef_,\n", - " columns=['Coefficients'], index=X_train.columns\n", + " model[1].coef_, columns=[\"Coefficients\"], index=X_train.columns\n", ")\n", "\n", - "coefs.plot(kind='barh', figsize=(9, 7))\n", - "plt.title('Ridge model')\n", - "plt.axvline(x=0, color='.5')\n", - "plt.subplots_adjust(left=.3)" + "coefs.plot(kind=\"barh\", figsize=(9, 7))\n", + "plt.title(\"Ridge model\")\n", + "plt.axvline(x=0, color=\".5\")\n", + "plt.subplots_adjust(left=0.3)" ] }, { @@ -344,15 +348,15 @@ "source": [ "Now that the coefficients have been scaled, we can safely compare them.\n", "\n", - "The median income feature, with longitude and latitude are the three\n", - "variables that most influence the model.\n", + "The median income feature, with longitude and latitude are the three variables\n", + "that most influence the model.\n", "\n", "The plot above tells us about dependencies between a specific feature and the\n", "target when all other features remain constant, i.e., conditional\n", "dependencies. An increase of the `HouseAge` will induce an increase of the\n", - "price when all other features remain constant. On the contrary, an increase\n", - "of the average rooms will induce an decrease of the price when all other\n", - "features remain constant." + "price when all other features remain constant. On the contrary, an increase of\n", + "the average rooms will induce an decrease of the price when all other features\n", + "remain constant." ] }, { @@ -384,20 +388,23 @@ "from sklearn.model_selection import RepeatedKFold\n", "\n", "cv_model = cross_validate(\n", - " model, X_with_rnd_feat, y, cv=RepeatedKFold(n_splits=5, n_repeats=5),\n", - " return_estimator=True, n_jobs=2\n", + " model,\n", + " X_with_rnd_feat,\n", + " y,\n", + " cv=RepeatedKFold(n_splits=5, n_repeats=5),\n", + " return_estimator=True,\n", + " n_jobs=2,\n", ")\n", "coefs = pd.DataFrame(\n", - " [model[1].coef_\n", - " for model in cv_model['estimator']],\n", - " columns=X_with_rnd_feat.columns\n", + " [model[1].coef_ for model in cv_model[\"estimator\"]],\n", + " columns=X_with_rnd_feat.columns,\n", ")\n", "plt.figure(figsize=(9, 7))\n", - "sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)\n", - "plt.axvline(x=0, color='.5')\n", - "plt.xlabel('Coefficient importance')\n", - "plt.title('Coefficient importance and its variability')\n", - "plt.subplots_adjust(left=.3)" + "sns.boxplot(data=coefs, orient=\"h\", color=\"cyan\", saturation=0.5)\n", + "plt.axvline(x=0, color=\".5\")\n", + "plt.xlabel(\"Coefficient importance\")\n", + "plt.title(\"Coefficient importance and its variability\")\n", + "plt.subplots_adjust(left=0.3)" ] }, { @@ -419,10 +426,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In it important to keep in mind that the associations extracted depend\n", - "on the model. To illustrate this point we consider a Lasso model, that\n", - "performs feature selection with a L1 penalty. 
Let us fit a Lasso model\n", - "with a strong regularization parameters `alpha`" + "In it important to keep in mind that the associations extracted depend on the\n", + "model. To illustrate this point we consider a Lasso model, that performs\n", + "feature selection with a L1 penalty. Let us fit a Lasso model with a strong\n", + "regularization parameters `alpha`" ] }, { @@ -433,12 +440,12 @@ "source": [ "from sklearn.linear_model import Lasso\n", "\n", - "model = make_pipeline(StandardScaler(), Lasso(alpha=.015))\n", + "model = make_pipeline(StandardScaler(), Lasso(alpha=0.015))\n", "\n", "model.fit(X_train, y_train)\n", "\n", - "print(f'model score on training data: {model.score(X_train, y_train)}')\n", - "print(f'model score on testing data: {model.score(X_test, y_test)}')" + "print(f\"model score on training data: {model.score(X_train, y_train)}\")\n", + "print(f\"model score on testing data: {model.score(X_test, y_test)}\")" ] }, { @@ -448,14 +455,13 @@ "outputs": [], "source": [ "coefs = pd.DataFrame(\n", - " model[1].coef_,\n", - " columns=['Coefficients'], index=X_train.columns\n", + " model[1].coef_, columns=[\"Coefficients\"], index=X_train.columns\n", ")\n", "\n", - "coefs.plot(kind='barh', figsize=(9, 7))\n", - "plt.title('Lasso model, strong regularization')\n", - "plt.axvline(x=0, color='.5')\n", - "plt.subplots_adjust(left=.3)" + "coefs.plot(kind=\"barh\", figsize=(9, 7))\n", + "plt.title(\"Lasso model, strong regularization\")\n", + "plt.axvline(x=0, color=\".5\")\n", + "plt.subplots_adjust(left=0.3)" ] }, { @@ -466,11 +472,10 @@ "However, it has zeroed out 3 coefficients, selecting a small number of\n", "variables to make its prediction.\n", "\n", - "We can see that out of the two correlated features `AveRooms` and\n", - "`AveBedrms`, the model has selected one. Note that this choice is\n", - "partly arbitrary: choosing one does not mean that the other is not\n", - "important for prediction. **Avoid over-interpreting models, as they are\n", - "imperfect**.\n", + "We can see that out of the two correlated features `AveRooms` and `AveBedrms`,\n", + "the model has selected one. 
Note that this choice is partly arbitrary:\n", + "choosing one does not mean that the other is not important for prediction.\n", + "**Avoid over-interpreting models, as they are imperfect**.\n", "\n", "As above, we can look at the variability of the coefficients:" ] @@ -482,31 +487,33 @@ "outputs": [], "source": [ "cv_model = cross_validate(\n", - " model, X_with_rnd_feat, y, cv=RepeatedKFold(n_splits=5, n_repeats=5),\n", - " return_estimator=True, n_jobs=2\n", + " model,\n", + " X_with_rnd_feat,\n", + " y,\n", + " cv=RepeatedKFold(n_splits=5, n_repeats=5),\n", + " return_estimator=True,\n", + " n_jobs=2,\n", ")\n", "coefs = pd.DataFrame(\n", - " [model[1].coef_\n", - " for model in cv_model['estimator']],\n", - " columns=X_with_rnd_feat.columns\n", + " [model[1].coef_ for model in cv_model[\"estimator\"]],\n", + " columns=X_with_rnd_feat.columns,\n", ")\n", "plt.figure(figsize=(9, 7))\n", - "sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)\n", - "plt.axvline(x=0, color='.5')\n", - "plt.xlabel('Coefficient importance')\n", - "plt.title('Coefficient importance and its variability')\n", - "plt.subplots_adjust(left=.3)" + "sns.boxplot(data=coefs, orient=\"h\", color=\"cyan\", saturation=0.5)\n", + "plt.axvline(x=0, color=\".5\")\n", + "plt.xlabel(\"Coefficient importance\")\n", + "plt.title(\"Coefficient importance and its variability\")\n", + "plt.subplots_adjust(left=0.3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that both the coefficients associated to `AveRooms` and\n", - "`AveBedrms` have a strong variability and that they can both be non\n", - "zero. Given that they are strongly correlated, the model can pick one\n", - "or the other to predict well. This choice is a bit arbitrary, and must\n", - "not be over-interpreted." + "We can see that both the coefficients associated to `AveRooms` and `AveBedrms`\n", + "have a strong variability and that they can both be non zero. Given that they\n", + "are strongly correlated, the model can pick one or the other to predict well.\n", + "This choice is a bit arbitrary, and must not be over-interpreted." ] }, { @@ -534,9 +541,9 @@ "source": [ "## 2. RandomForest `feature_importances_`\n", "\n", - "On some algorithms, there are some feature importance methods,\n", - "inherently built within the model. It is the case in RandomForest models.\n", - "Let's investigate the built-in `feature_importances_` attribute." + "On some algorithms, there are some feature importance methods, inherently\n", + "built within the model. It is the case in RandomForest models. Let's\n", + "investigate the built-in `feature_importances_` attribute." ] }, { @@ -551,8 +558,8 @@ "\n", "model.fit(X_train, y_train)\n", "\n", - "print(f'model score on training data: {model.score(X_train, y_train)}')\n", - "print(f'model score on testing data: {model.score(X_test, y_test)}')" + "print(f\"model score on training data: {model.score(X_train, y_train)}\")\n", + "print(f\"model score on testing data: {model.score(X_test, y_test)}\")" ] }, { @@ -614,8 +621,8 @@ "\n", "We introduce here a new technique to evaluate the feature importance of any\n", "given fitted model. It basically shuffles a feature and sees how the model\n", - "changes its prediction. Thus, the change in prediction will correspond to\n", - "the feature importance." + "changes its prediction. Thus, the change in prediction will correspond to the\n", + "feature importance." 
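Before the hand-rolled helper that follows, note that scikit-learn ships this inspection technique directly (the notebook itself points to `sklearn.inspection.permutation_importance` further down); a minimal sketch, assuming a fitted `model` and the `X_train`/`y_train` split from the earlier cells:

```python
# Sketch of the built-in equivalent of the helper implemented by hand below.
from sklearn.inspection import permutation_importance

result = permutation_importance(
    model, X_train, y_train, n_repeats=10, random_state=0, n_jobs=2
)
for name, mean, std in zip(
    X_train.columns, result.importances_mean, result.importances_std
):
    print(f"{name}: {mean:.3f} +/- {std:.3f}")
```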
] }, { @@ -634,22 +641,24 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "model.fit(X_train, y_train)\n", "\n", - "print(f'model score on training data: {model.score(X_train, y_train)}')\n", - "print(f'model score on testing data: {model.score(X_test, y_test)}')" + "print(f\"model score on training data: {model.score(X_train, y_train)}\")\n", + "print(f\"model score on testing data: {model.score(X_test, y_test)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As the model gives a good prediction, it has captured well the link\n", - "between X and y. Hence, it is reasonable to interpret what it has\n", - "captured from the data." + "As the model gives a good prediction, it has captured well the link between X\n", + "and y. Hence, it is reasonable to interpret what it has captured from the\n", + "data." ] }, { @@ -661,7 +670,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "Lets compute the feature importance for a given feature, say the `MedInc`\n", "feature.\n", @@ -688,20 +699,21 @@ "outputs": [], "source": [ "def get_score_after_permutation(model, X, y, curr_feat):\n", - " \"\"\" return the score of model when curr_feat is permuted \"\"\"\n", + " \"\"\"return the score of model when curr_feat is permuted\"\"\"\n", "\n", " X_permuted = X.copy()\n", " col_idx = list(X.columns).index(curr_feat)\n", " # permute one column\n", " X_permuted.iloc[:, col_idx] = np.random.permutation(\n", - " X_permuted[curr_feat].values)\n", + " X_permuted[curr_feat].values\n", + " )\n", "\n", " permuted_score = model.score(X_permuted, y)\n", " return permuted_score\n", "\n", "\n", "def get_feature_importance(model, X, y, curr_feat):\n", - " \"\"\" compare the score when curr_feat is permuted \"\"\"\n", + " \"\"\"compare the score when curr_feat is permuted\"\"\"\n", "\n", " baseline_score_train = model.score(X, y)\n", " permuted_score_train = get_score_after_permutation(model, X, y, curr_feat)\n", @@ -711,11 +723,13 @@ " return feature_importance\n", "\n", "\n", - "curr_feat = 'MedInc'\n", + "curr_feat = \"MedInc\"\n", "\n", "feature_importance = get_feature_importance(model, X_train, y_train, curr_feat)\n", - "print(f'feature importance of \"{curr_feat}\" on train set is '\n", - " f'{feature_importance:.3}')" + "print(\n", + " f'feature importance of \"{curr_feat}\" on train set is '\n", + " f\"{feature_importance:.3}\"\n", + ")" ] }, { @@ -729,7 +743,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "n_repeats = 10\n", @@ -737,27 +753,33 @@ "list_feature_importance = []\n", "for n_round in range(n_repeats):\n", " list_feature_importance.append(\n", - " get_feature_importance(model, X_train, y_train, curr_feat))\n", + " get_feature_importance(model, X_train, y_train, curr_feat)\n", + " )\n", "\n", "print(\n", " f'feature importance of \"{curr_feat}\" on train set is '\n", - " f'{np.mean(list_feature_importance):.3} '\n", - " f'\u00b1 {np.std(list_feature_importance):.3}')" + " f\"{np.mean(list_feature_importance):.3} \"\n", + " f\"\u00b1 {np.std(list_feature_importance):.3}\"\n", + ")" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ - "0.67 over 0.98 is very relevant (note the $R^2$ score could go below 0). 
So\n", - "we can imagine our model relies heavily on this feature to predict the class.\n", - "We can now compute the feature permutation importance for all the features." + "0.67 over 0.98 is very relevant (note the $R^2$ score could go below 0). So we\n", + "can imagine our model relies heavily on this feature to predict the class. We\n", + "can now compute the feature permutation importance for all the features." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "def permutation_importance(model, X, y, n_repeats=10):\n", @@ -768,13 +790,17 @@ " list_feature_importance = []\n", " for n_round in range(n_repeats):\n", " list_feature_importance.append(\n", - " get_feature_importance(model, X, y, curr_feat))\n", + " get_feature_importance(model, X, y, curr_feat)\n", + " )\n", "\n", " importances.append(list_feature_importance)\n", "\n", - " return {'importances_mean': np.mean(importances, axis=1),\n", - " 'importances_std': np.std(importances, axis=1),\n", - " 'importances': importances}\n", + " return {\n", + " \"importances_mean\": np.mean(importances, axis=1),\n", + " \"importances_std\": np.std(importances, axis=1),\n", + " \"importances\": importances,\n", + " }\n", + "\n", "\n", "# This function could directly be access from sklearn\n", "# from sklearn.inspection import permutation_importance" @@ -787,14 +813,16 @@ "outputs": [], "source": [ "def plot_feature_importances(perm_importance_result, feat_name):\n", - " \"\"\" bar plot the feature importance \"\"\"\n", + " \"\"\"bar plot the feature importance\"\"\"\n", "\n", " fig, ax = plt.subplots()\n", "\n", - " indices = perm_importance_result['importances_mean'].argsort()\n", - " plt.barh(range(len(indices)),\n", - " perm_importance_result['importances_mean'][indices],\n", - " xerr=perm_importance_result['importances_std'][indices])\n", + " indices = perm_importance_result[\"importances_mean\"].argsort()\n", + " plt.barh(\n", + " range(len(indices)),\n", + " perm_importance_result[\"importances_mean\"][indices],\n", + " xerr=perm_importance_result[\"importances_std\"][indices],\n", + " )\n", "\n", " ax.set_yticks(range(len(indices)))\n", " _ = ax.set_yticklabels(feat_name[indices])" @@ -814,7 +842,8 @@ "outputs": [], "source": [ "perm_importance_result_train = permutation_importance(\n", - " model, X_train, y_train, n_repeats=10)\n", + " model, X_train, y_train, n_repeats=10\n", + ")\n", "\n", "plot_feature_importances(perm_importance_result_train, X_train.columns)" ] @@ -827,8 +856,8 @@ "important for the model.\n", "\n", "We note that our random variable `rnd_num` is now very less important than\n", - "latitude. Indeed, the feature importance built-in in RandomForest has bias\n", - "for continuous data, such as `AveOccup` and `rnd_num`.\n", + "latitude. 
Indeed, the feature importance built-in in RandomForest has bias for\n", + "continuous data, such as `AveOccup` and `rnd_num`.\n", "\n", "However, the model still uses these `rnd_num` feature to compute the output.\n", "It is in line with the overfitting we had noticed between the train and test\n", @@ -855,15 +884,9 @@ "metadata": {}, "source": [ "# Take Away\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* One could directly interpret the coefficient in linear model (if the\n", - " feature have been scaled first)\n", + "\n", + "* One could directly interpret the coefficient in linear model (if the feature\n", + " have been scaled first)\n", "* Model like RandomForest have built-in feature importance\n", "* `permutation_importance` gives feature importance by permutation for any\n", " fitted model" diff --git a/notebooks/ensemble_adaboost.ipynb b/notebooks/ensemble_adaboost.ipynb index 4fae9cb45..90617c972 100644 --- a/notebooks/ensemble_adaboost.ipynb +++ b/notebooks/ensemble_adaboost.ipynb @@ -44,8 +44,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will purposefully train a shallow decision tree. Since it is shallow,\n", - "it is unlikely to overfit and some of the training examples will even be\n", + "We will purposefully train a shallow decision tree. Since it is shallow, it is\n", + "unlikely to overfit and some of the training examples will even be\n", "misclassified." ] }, @@ -98,16 +98,28 @@ ")\n", "\n", "# plot the original dataset\n", - "sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=palette)\n", + "sns.scatterplot(\n", + " data=penguins,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=palette,\n", + ")\n", "# plot the misclassified samples\n", - "sns.scatterplot(data=data_misclassified, x=culmen_columns[0],\n", - " y=culmen_columns[1], label=\"Misclassified samples\",\n", - " marker=\"+\", s=150, color=\"k\")\n", + "sns.scatterplot(\n", + " data=data_misclassified,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " label=\"Misclassified samples\",\n", + " marker=\"+\",\n", + " s=150,\n", + " color=\"k\",\n", + ")\n", "\n", "plt.legend(bbox_to_anchor=(1.04, 0.5), loc=\"center left\")\n", - "_ = plt.title(\"Decision tree predictions \\nwith misclassified samples \"\n", - " \"highlighted\")" + "_ = plt.title(\n", + " \"Decision tree predictions \\nwith misclassified samples highlighted\"\n", + ")" ] }, { @@ -117,16 +129,15 @@ "We observe that several samples have been misclassified by the classifier.\n", "\n", "We mentioned that boosting relies on creating a new classifier which tries to\n", - "correct these misclassifications. In scikit-learn, learners have a\n", - "parameter `sample_weight` which forces it to pay more attention to\n", - "samples with higher weights during the training.\n", + "correct these misclassifications. In scikit-learn, learners have a parameter\n", + "`sample_weight` which forces it to pay more attention to samples with higher\n", + "weights during the training.\n", "\n", - "This parameter is set when calling\n", - "`classifier.fit(X, y, sample_weight=weights)`.\n", - "We will use this trick to create a new classifier by 'discarding' all\n", - "correctly classified samples and only considering the misclassified samples.\n", - "Thus, misclassified samples will be assigned a weight of 1 and well\n", - "classified samples will be assigned a weight of 0." 
+ "This parameter is set when calling `classifier.fit(X, y,\n", + "sample_weight=weights)`. We will use this trick to create a new classifier by\n", + "'discarding' all correctly classified samples and only considering the\n", + "misclassified samples. Thus, misclassified samples will be assigned a weight\n", + "of 1 and well classified samples will be assigned a weight of 0." ] }, { @@ -151,12 +162,22 @@ "DecisionBoundaryDisplay.from_estimator(\n", " tree, data, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=palette)\n", - "sns.scatterplot(data=data_misclassified, x=culmen_columns[0],\n", - " y=culmen_columns[1],\n", - " label=\"Previously misclassified samples\",\n", - " marker=\"+\", s=150, color=\"k\")\n", + "sns.scatterplot(\n", + " data=penguins,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=palette,\n", + ")\n", + "sns.scatterplot(\n", + " data=data_misclassified,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " label=\"Previously misclassified samples\",\n", + " marker=\"+\",\n", + " s=150,\n", + " color=\"k\",\n", + ")\n", "\n", "plt.legend(bbox_to_anchor=(1.04, 0.5), loc=\"center left\")\n", "_ = plt.title(\"Decision tree by changing sample weights\")" @@ -182,8 +203,10 @@ " misclassified_samples_idx, newly_misclassified_samples_idx\n", ")\n", "\n", - "print(f\"Number of samples previously misclassified and \"\n", - " f\"still misclassified: {len(remaining_misclassified_samples_idx)}\")" + "print(\n", + " \"Number of samples previously misclassified and \"\n", + " f\"still misclassified: {len(remaining_misclassified_samples_idx)}\"\n", + ")" ] }, { @@ -192,8 +215,8 @@ "source": [ "However, we are making mistakes on previously well classified samples. Thus,\n", "we get the intuition that we should weight the predictions of each classifier\n", - "differently, most probably by using the number of mistakes each classifier\n", - "is making.\n", + "differently, most probably by using the number of mistakes each classifier is\n", + "making.\n", "\n", "So we could use the classification error to combine both trees." ] @@ -220,23 +243,22 @@ "slightly more than the second one. We could use these accuracy values to\n", "weight the predictions of each learner.\n", "\n", - "To summarize, boosting learns several classifiers, each of which will\n", - "focus more or less on specific samples of the dataset. Boosting is thus\n", - "different from bagging: here we never resample our dataset, we just assign\n", - "different weights to the original dataset.\n", + "To summarize, boosting learns several classifiers, each of which will focus\n", + "more or less on specific samples of the dataset. Boosting is thus different\n", + "from bagging: here we never resample our dataset, we just assign different\n", + "weights to the original dataset.\n", "\n", "Boosting requires some strategy to combine the learners together:\n", "\n", - "* one needs to define a way to compute the weights to be assigned\n", - " to samples;\n", + "* one needs to define a way to compute the weights to be assigned to samples;\n", "* one needs to assign a weight to each learner when making predictions.\n", "\n", - "Indeed, we defined a really simple scheme to assign sample weights and\n", - "learner weights. 
However, there are statistical theories (like in AdaBoost)\n", - "for how these sample and learner weights can be optimally calculated.\n", + "Indeed, we defined a really simple scheme to assign sample weights and learner\n", + "weights. However, there are statistical theories (like in AdaBoost) for how\n", + "these sample and learner weights can be optimally calculated.\n", "\n", - "We will use the AdaBoost classifier implemented in scikit-learn and\n", - "look at the underlying decision tree classifiers trained." + "We will use the AdaBoost classifier implemented in scikit-learn and look at\n", + "the underlying decision tree classifiers trained." ] }, { @@ -248,9 +270,9 @@ "from sklearn.ensemble import AdaBoostClassifier\n", "\n", "estimator = DecisionTreeClassifier(max_depth=3, random_state=0)\n", - "adaboost = AdaBoostClassifier(estimator=estimator,\n", - " n_estimators=3, algorithm=\"SAMME\",\n", - " random_state=0)\n", + "adaboost = AdaBoostClassifier(\n", + " estimator=estimator, n_estimators=3, algorithm=\"SAMME\", random_state=0\n", + ")\n", "adaboost.fit(data, target)" ] }, @@ -264,11 +286,19 @@ " plt.figure()\n", " # we convert `data` into a NumPy array to avoid a warning raised in scikit-learn\n", " DecisionBoundaryDisplay.from_estimator(\n", - " tree, data.to_numpy(), response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", + " tree,\n", + " data.to_numpy(),\n", + " response_method=\"predict\",\n", + " cmap=\"RdBu\",\n", + " alpha=0.5,\n", + " )\n", + " sns.scatterplot(\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " data=penguins,\n", + " palette=palette,\n", " )\n", - " sns.scatterplot(x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, data=penguins,\n", - " palette=palette)\n", " plt.legend(bbox_to_anchor=(1.04, 0.5), loc=\"center left\")\n", " _ = plt.title(f\"Decision tree trained at round {boosting_round}\")" ] @@ -302,9 +332,9 @@ "classifier also has the highest classification generalization performance.\n", "\n", "While AdaBoost is a nice algorithm to demonstrate the internal machinery of\n", - "boosting algorithms, it is not the most efficient.\n", - "This title is handed to the gradient-boosting decision tree (GBDT) algorithm,\n", - "which we will discuss in the next unit." + "boosting algorithms, it is not the most efficient. This title is handed to the\n", + "gradient-boosting decision tree (GBDT) algorithm, which we will discuss in the\n", + "next unit." ] } ], diff --git a/notebooks/ensemble_bagging.ipynb b/notebooks/ensemble_bagging.ipynb index bd43a80cb..3eebca268 100644 --- a/notebooks/ensemble_bagging.ipynb +++ b/notebooks/ensemble_bagging.ipynb @@ -6,8 +6,8 @@ "source": [ "# Bagging\n", "\n", - "This notebook introduces a very natural strategy to build ensembles of\n", - "machine learning models named \"bagging\".\n", + "This notebook introduces a very natural strategy to build ensembles of machine\n", + "learning models named \"bagging\".\n", "\n", "\"Bagging\" stands for Bootstrap AGGregatING. 
It uses bootstrap resampling\n", "(random sampling with replacement) to learn several models on random\n", @@ -37,12 +37,13 @@ " x_min, x_max = -3, 3\n", " x = rng.uniform(x_min, x_max, size=n_samples)\n", " noise = 4.0 * rng.randn(n_samples)\n", - " y = x ** 3 - 0.5 * (x + 1) ** 2 + noise\n", + " y = x**3 - 0.5 * (x + 1) ** 2 + noise\n", " y /= y.std()\n", "\n", " data_train = pd.DataFrame(x, columns=[\"Feature\"])\n", " data_test = pd.DataFrame(\n", - " np.linspace(x_max, x_min, num=300), columns=[\"Feature\"])\n", + " np.linspace(x_max, x_min, num=300), columns=[\"Feature\"]\n", + " )\n", " target_train = pd.Series(y, name=\"Target\")\n", "\n", " return data_train, data_test, target_train" @@ -58,8 +59,9 @@ "import seaborn as sns\n", "\n", "data_train, data_test, target_train = generate_data(n_samples=30)\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "_ = plt.title(\"Synthetic regression dataset\")" ] }, @@ -91,8 +93,8 @@ "metadata": {}, "source": [ "Remember that the term \"test\" here refers to data that was not used for\n", - "training and computing an evaluation metric on such a synthetic test set\n", - "would be meaningless." + "training and computing an evaluation metric on such a synthetic test set would\n", + "be meaningless." ] }, { @@ -101,8 +103,9 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "plt.plot(data_test[\"Feature\"], y_pred, label=\"Fitted tree\")\n", "plt.legend()\n", "_ = plt.title(\"Predictions by a single decision tree\")" @@ -121,9 +124,9 @@ "Given a dataset with `n` data points, bootstrapping corresponds to resampling\n", "with replacement `n` out of such `n` data points uniformly at random.\n", "\n", - "As a result, the output of the bootstrap sampling procedure is another\n", - "dataset with also n data points, but likely with duplicates. As a consequence,\n", - "there are also data points from the original dataset that are never selected to\n", + "As a result, the output of the bootstrap sampling procedure is another dataset\n", + "with also n data points, but likely with duplicates. As a consequence, there\n", + "are also data points from the original dataset that are never selected to\n", "appear in a bootstrap sample (by chance). 
Those data points that are left away\n", "are often referred to as the out-of-bag sample.\n", "\n", @@ -141,7 +144,9 @@ " # Indices corresponding to a sampling with replacement of the same sample\n", " # size than the original data\n", " bootstrap_indices = rng.choice(\n", - " np.arange(target.shape[0]), size=target.shape[0], replace=True,\n", + " np.arange(target.shape[0]),\n", + " size=target.shape[0],\n", + " replace=True,\n", " )\n", " # In pandas, we need to use `.iloc` to extract rows using an integer\n", " # position index:\n", @@ -169,15 +174,28 @@ "for bootstrap_idx in range(n_bootstraps):\n", " # draw a bootstrap from the original data\n", " data_bootstrap, target_bootstrap = bootstrap_sample(\n", - " data_train, target_train,\n", + " data_train,\n", + " target_train,\n", " )\n", " plt.figure()\n", - " plt.scatter(data_bootstrap[\"Feature\"], target_bootstrap,\n", - " color=\"tab:blue\", facecolors=\"none\",\n", - " alpha=0.5, label=\"Resampled data\", s=180, linewidth=5)\n", - " plt.scatter(data_train[\"Feature\"], target_train,\n", - " color=\"black\", s=60,\n", - " alpha=1, label=\"Original data\")\n", + " plt.scatter(\n", + " data_bootstrap[\"Feature\"],\n", + " target_bootstrap,\n", + " color=\"tab:blue\",\n", + " facecolors=\"none\",\n", + " alpha=0.5,\n", + " label=\"Resampled data\",\n", + " s=180,\n", + " linewidth=5,\n", + " )\n", + " plt.scatter(\n", + " data_train[\"Feature\"],\n", + " target_train,\n", + " color=\"black\",\n", + " s=60,\n", + " alpha=1,\n", + " label=\"Original data\",\n", + " )\n", " plt.title(f\"Resampled data #{bootstrap_idx}\")\n", " plt.legend()" ] @@ -188,8 +206,8 @@ "source": [ "\n", "Observe that the 3 variations all share common points with the original\n", - "dataset. Some of the points are randomly resampled several times and appear\n", - "as darker blue circles.\n", + "dataset. Some of the points are randomly resampled several times and appear as\n", + "darker blue circles.\n", "\n", "The 3 generated bootstrap samples are all different from the original dataset\n", "and from each other. To confirm this intuition, we can check the number of\n", @@ -203,14 +221,17 @@ "outputs": [], "source": [ "data_train_huge, data_test_huge, target_train_huge = generate_data(\n", - " n_samples=100_000)\n", + " n_samples=100_000\n", + ")\n", "data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample(\n", - " data_train_huge, target_train_huge)\n", + " data_train_huge, target_train_huge\n", + ")\n", "\n", - "ratio_unique_sample = (np.unique(data_bootstrap_sample).size /\n", - " data_bootstrap_sample.size)\n", + "ratio_unique_sample = (\n", + " np.unique(data_bootstrap_sample).size / data_bootstrap_sample.size\n", + ")\n", "print(\n", - " f\"Percentage of samples present in the original dataset: \"\n", + " \"Percentage of samples present in the original dataset: \"\n", " f\"{ratio_unique_sample * 100:.1f}%\"\n", ")" ] @@ -225,9 +246,9 @@ "the same size as the original dataset, there will be many samples that are in\n", "the bootstrap sample multiple times.\n", "\n", - "Using bootstrap we are able to generate many datasets, all slightly\n", - "different. We can fit a decision tree for each of these datasets and they all\n", - "shall be slightly different as well." + "Using bootstrap we are able to generate many datasets, all slightly different.\n", + "We can fit a decision tree for each of these datasets and they all shall be\n", + "slightly different as well." 
] }, { @@ -241,7 +262,8 @@ " tree = DecisionTreeRegressor(max_depth=3, random_state=0)\n", "\n", " data_bootstrap_sample, target_bootstrap_sample = bootstrap_sample(\n", - " data_train, target_train)\n", + " data_train, target_train\n", + " )\n", " tree.fit(data_bootstrap_sample, target_bootstrap_sample)\n", " bag_of_trees.append(tree)" ] @@ -262,12 +284,18 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "for tree_idx, tree in enumerate(bag_of_trees):\n", " tree_predictions = tree.predict(data_test)\n", - " plt.plot(data_test[\"Feature\"], tree_predictions, linestyle=\"--\", alpha=0.8,\n", - " label=f\"Tree #{tree_idx} predictions\")\n", + " plt.plot(\n", + " data_test[\"Feature\"],\n", + " tree_predictions,\n", + " linestyle=\"--\",\n", + " alpha=0.8,\n", + " label=f\"Tree #{tree_idx} predictions\",\n", + " )\n", "\n", "plt.legend()\n", "_ = plt.title(\"Predictions of trees trained on different bootstraps\")" @@ -279,13 +307,12 @@ "source": [ "## Aggregating\n", "\n", - "Once our trees are fitted, we are able to get predictions for each of\n", - "them. In regression, the most straightforward way to combine those\n", - "predictions is just to average them: for a given test data point, we feed the\n", - "input feature values to each of the `n` trained models in the ensemble and as\n", - "a result compute `n` predicted values for the target variable. The final\n", - "prediction of the ensemble for the test data point is the average of those\n", - "`n` values.\n", + "Once our trees are fitted, we are able to get predictions for each of them. In\n", + "regression, the most straightforward way to combine those predictions is just\n", + "to average them: for a given test data point, we feed the input feature values\n", + "to each of the `n` trained models in the ensemble and as a result compute `n`\n", + "predicted values for the target variable. The final prediction of the ensemble\n", + "for the test data point is the average of those `n` values.\n", "\n", "We can plot the averaged predictions from the previous example." 
] @@ -296,19 +323,29 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "\n", "bag_predictions = []\n", "for tree_idx, tree in enumerate(bag_of_trees):\n", " tree_predictions = tree.predict(data_test)\n", - " plt.plot(data_test[\"Feature\"], tree_predictions, linestyle=\"--\", alpha=0.8,\n", - " label=f\"Tree #{tree_idx} predictions\")\n", + " plt.plot(\n", + " data_test[\"Feature\"],\n", + " tree_predictions,\n", + " linestyle=\"--\",\n", + " alpha=0.8,\n", + " label=f\"Tree #{tree_idx} predictions\",\n", + " )\n", " bag_predictions.append(tree_predictions)\n", "\n", "bag_predictions = np.mean(bag_predictions, axis=0)\n", - "plt.plot(data_test[\"Feature\"], bag_predictions, label=\"Averaged predictions\",\n", - " linestyle=\"-\")\n", + "plt.plot(\n", + " data_test[\"Feature\"],\n", + " bag_predictions,\n", + " label=\"Averaged predictions\",\n", + " linestyle=\"-\",\n", + ")\n", "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", "_ = plt.title(\"Predictions of bagged trees\")" ] @@ -318,11 +355,11 @@ "metadata": {}, "source": [ "\n", - "The unbroken red line shows the averaged predictions, which would be the\n", - "final predictions given by our 'bag' of decision tree regressors. Note that\n", - "the predictions of the ensemble is more stable because of the averaging\n", - "operation. As a result, the bag of trees as a whole is less likely to overfit\n", - "than the individual trees.\n", + "The unbroken red line shows the averaged predictions, which would be the final\n", + "predictions given by our 'bag' of decision tree regressors. 
Note that the\n", + "predictions of the ensemble is more stable because of the averaging operation.\n", + "As a result, the bag of trees as a whole is less likely to overfit than the\n", + "individual trees.\n", "\n", "## Bagging in scikit-learn\n", "\n", @@ -352,11 +389,8 @@ }, { "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 0 - }, + "metadata": {}, "source": [ - "\n", "Let us visualize the predictions of the ensemble on the same interval of data:" ] }, @@ -366,8 +400,9 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "\n", "bagged_trees_predictions = bagged_trees.predict(data_test)\n", "plt.plot(data_test[\"Feature\"], bagged_trees_predictions)\n", @@ -379,7 +414,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Because we use 100 trees in the ensemble, the average prediction is indeed\n", "slightly smoother but very similar to our previous average plot.\n", "\n", @@ -399,15 +433,26 @@ " label = \"Predictions of individual trees\" if tree_idx == 0 else None\n", " # we convert `data_test` into a NumPy array to avoid a warning raised in scikit-learn\n", " tree_predictions = tree.predict(data_test.to_numpy())\n", - " plt.plot(data_test[\"Feature\"], tree_predictions, linestyle=\"--\", alpha=0.1,\n", - " color=\"tab:blue\", label=label)\n", + " plt.plot(\n", + " data_test[\"Feature\"],\n", + " tree_predictions,\n", + " linestyle=\"--\",\n", + " alpha=0.1,\n", + " color=\"tab:blue\",\n", + " label=label,\n", + " )\n", "\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "\n", "bagged_trees_predictions = bagged_trees.predict(data_test)\n", - "plt.plot(data_test[\"Feature\"], bagged_trees_predictions,\n", - " color=\"tab:orange\", label=\"Predictions of ensemble\")\n", + "plt.plot(\n", + " data_test[\"Feature\"],\n", + " bagged_trees_predictions,\n", + " color=\"tab:orange\",\n", + " label=\"Predictions of ensemble\",\n", + ")\n", "_ = plt.legend()" ] }, @@ -415,7 +460,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "We used a low value of the opacity parameter `alpha` to better appreciate the\n", "overlap in the prediction functions of the individual trees.\n", "\n", @@ -455,11 +499,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "This pipeline first scales the data to the 0-1 range with `MinMaxScaler`.\n", - "Then it extracts degree-4 polynomial features. The resulting features will\n", - "all stay in the 0-1 range by construction: if `x` lies in the 0-1 range then\n", - "`x ** n` also lies in the 0-1 range for any value of `n`.\n", + "This pipeline first scales the data to the 0-1 range with `MinMaxScaler`. Then\n", + "it extracts degree-4 polynomial features. 
The resulting features will all stay\n", + "in the 0-1 range by construction: if `x` lies in the 0-1 range then `x ** n`\n", + "also lies in the 0-1 range for any value of `n`.\n", "\n", "Then the pipeline feeds the resulting non-linear features to a regularized\n", "linear regression model for the final prediction of the target variable.\n", @@ -498,16 +541,24 @@ " # we convert `data_test` into a NumPy array to avoid a warning raised in scikit-learn\n", " regressor_predictions = regressor.predict(data_test.to_numpy())\n", " base_model_line = plt.plot(\n", - " data_test[\"Feature\"], regressor_predictions, linestyle=\"--\", alpha=0.2,\n", + " data_test[\"Feature\"],\n", + " regressor_predictions,\n", + " linestyle=\"--\",\n", + " alpha=0.2,\n", " label=\"Predictions of base models\" if i == 0 else None,\n", - " color=\"tab:blue\"\n", + " color=\"tab:blue\",\n", " )\n", "\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "bagging_predictions = bagging.predict(data_test)\n", - "plt.plot(data_test[\"Feature\"], bagging_predictions,\n", - " color=\"tab:orange\", label=\"Predictions of ensemble\")\n", + "plt.plot(\n", + " data_test[\"Feature\"],\n", + " bagging_predictions,\n", + " color=\"tab:orange\",\n", + " label=\"Predictions of ensemble\",\n", + ")\n", "plt.ylim(target_train.min(), target_train.max())\n", "plt.legend()\n", "_ = plt.title(\"Bagged polynomial regression\")" @@ -517,11 +568,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "The predictions of this bagged polynomial regression model looks\n", - "qualitatively better than the bagged trees. This is somewhat expected since\n", - "the base model better reflects our knowledge of the true data generating\n", - "process.\n", + "The predictions of this bagged polynomial regression model looks qualitatively\n", + "better than the bagged trees. This is somewhat expected since the base model\n", + "better reflects our knowledge of the true data generating process.\n", "\n", "Again the different shades induced by the overlapping blue lines let us\n", "appreciate the uncertainty in the prediction of the bagged ensemble.\n", diff --git a/notebooks/ensemble_ex_01.ipynb b/notebooks/ensemble_ex_01.ipynb index aac64f1ac..d94557746 100644 --- a/notebooks/ensemble_ex_01.ipynb +++ b/notebooks/ensemble_ex_01.ipynb @@ -9,8 +9,8 @@ "The aim of this notebook is to investigate if we can tune the hyperparameters\n", "of a bagging regressor and evaluate the gain obtained.\n", "\n", - "We will load the California housing dataset and split it into a training and\n", - "a testing set." + "We will load the California housing dataset and split it into a training and a\n", + "testing set." ] }, { @@ -25,7 +25,8 @@ "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", "target *= 100 # rescale the target in k$\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0, test_size=0.5)" + " data, target, random_state=0, test_size=0.5\n", + ")" ] }, { @@ -43,9 +44,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor`\n", - "to its parameter `estimator`. Train the regressor and evaluate its\n", - "generalization performance on the testing set using the mean absolute error." 
+ "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its\n", + "parameter `estimator`. Train the regressor and evaluate its generalization\n", + "performance on the testing set using the mean absolute error." ] }, { @@ -61,16 +62,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, create a `RandomizedSearchCV` instance using the previous model and\n", - "tune the important parameters of the bagging regressor. Find the best\n", - "parameters and check if you are able to find a set of parameters that\n", - "improve the default regressor still using the mean absolute error as a\n", - "metric.\n", + "Now, create a `RandomizedSearchCV` instance using the previous model and tune\n", + "the important parameters of the bagging regressor. Find the best parameters\n", + "and check if you are able to find a set of parameters that improve the default\n", + "regressor still using the mean absolute error as a metric.\n", "\n", "
<div class=\"admonition tip alert alert-warning\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
-    "<p class=\"last\">You can list the bagging regressor's parameters using the get_params\n",
-    "method.</p>\n",
+    "<p class=\"last\">You can list the bagging regressor's parameters using the get_params method.</p>\n",
    "</div>
" ] }, @@ -86,7 +85,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/ensemble_ex_02.ipynb b/notebooks/ensemble_ex_02.ipynb index 3da011b8c..1e066245e 100644 --- a/notebooks/ensemble_ex_02.ipynb +++ b/notebooks/ensemble_ex_02.ipynb @@ -26,7 +26,8 @@ "target_name = \"Body Mass (g)\"\n", "data, target = penguins[[feature_name]], penguins[target_name]\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0)" + " data, target, random_state=0\n", + ")" ] }, { @@ -44,9 +45,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a random forest containing three trees. Train the forest and\n", - "check the generalization performance on the testing set in terms of mean\n", - "absolute error." + "Create a random forest containing three trees. Train the forest and check the\n", + "generalization performance on the testing set in terms of mean absolute error." ] }, { @@ -118,7 +118,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/ensemble_ex_03.ipynb b/notebooks/ensemble_ex_03.ipynb index 5d1f61f75..1e2b9f9c3 100644 --- a/notebooks/ensemble_ex_03.ipynb +++ b/notebooks/ensemble_ex_03.ipynb @@ -8,10 +8,10 @@ "\n", "The aim of this exercise is to:\n", "\n", - "* verifying if a random forest or a gradient-boosting decision tree overfit\n", - " if the number of estimators is not properly chosen;\n", - "* use the early-stopping strategy to avoid adding unnecessary trees, to\n", - " get the best generalization performances.\n", + "* verifying if a random forest or a gradient-boosting decision tree overfit if\n", + " the number of estimators is not properly chosen;\n", + "* use the early-stopping strategy to avoid adding unnecessary trees, to get\n", + " the best generalization performances.\n", "\n", "We will use the California housing dataset to conduct our experiments." ] @@ -28,7 +28,8 @@ "data, target = fetch_california_housing(return_X_y=True, as_frame=True)\n", "target *= 100 # rescale the target in k$\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0, test_size=0.5)" + " data, target, random_state=0, test_size=0.5\n", + ")" ] }, { @@ -111,8 +112,8 @@ "improving for several iterations, it will stop adding trees.\n", "\n", "Now, create a gradient-boosting model with `n_estimators=1_000`. This number\n", - "of trees will be too large. Change the parameter `n_iter_no_change` such\n", - "that the gradient boosting fitting will stop after adding 5 trees that do not\n", + "of trees will be too large. Change the parameter `n_iter_no_change` such that\n", + "the gradient boosting fitting will stop after adding 5 trees that do not\n", "improve the overall generalization performance." ] }, @@ -129,11 +130,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Estimate the generalization performance of this model again using\n", - "the `sklearn.metrics.mean_absolute_error` metric but this time using\n", - "the test set that we held out at the beginning of the notebook.\n", - "Compare the resulting value with the values observed in the validation\n", - "curve." + "Estimate the generalization performance of this model again using the\n", + "`sklearn.metrics.mean_absolute_error` metric but this time using the test set\n", + "that we held out at the beginning of the notebook. 
Compare the resulting value\n", + "with the values observed in the validation curve." ] }, { @@ -148,7 +148,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/ensemble_ex_04.ipynb b/notebooks/ensemble_ex_04.ipynb index d8bfe0f4e..1f7d258db 100644 --- a/notebooks/ensemble_ex_04.ipynb +++ b/notebooks/ensemble_ex_04.ipynb @@ -7,9 +7,9 @@ "# \ud83d\udcdd Exercise M6.04\n", "\n", "The aim of the exercise is to get familiar with the histogram\n", - "gradient-boosting in scikit-learn. Besides, we will use this model within\n", - "a cross-validation framework in order to inspect internal parameters found\n", - "via grid-search.\n", + "gradient-boosting in scikit-learn. Besides, we will use this model within a\n", + "cross-validation framework in order to inspect internal parameters found via\n", + "grid-search.\n", "\n", "We will use the California housing dataset." ] @@ -30,8 +30,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, create a histogram gradient boosting regressor. You can set the\n", - "trees number to be large, and configure the model to use early-stopping." + "First, create a histogram gradient boosting regressor. You can set the trees\n", + "number to be large, and configure the model to use early-stopping." ] }, { @@ -47,15 +47,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will use a grid-search to find some optimal parameter for this model.\n", - "In this grid-search, you should search for the following parameters:\n", + "We will use a grid-search to find some optimal parameter for this model. In\n", + "this grid-search, you should search for the following parameters:\n", "\n", "* `max_depth: [3, 8]`;\n", "* `max_leaf_nodes: [15, 31]`;\n", "* `learning_rate: [0.1, 1]`.\n", "\n", - "Feel free to explore the space with additional values. Create the\n", - "grid-search providing the previous gradient boosting instance as the model." + "Feel free to explore the space with additional values. Create the grid-search\n", + "providing the previous gradient boosting instance as the model." ] }, { @@ -73,9 +73,9 @@ "source": [ "Finally, we will run our experiment through cross-validation. In this regard,\n", "define a 5-fold cross-validation. Besides, be sure to shuffle the data.\n", - "Subsequently, use the function `sklearn.model_selection.cross_validate`\n", - "to run the cross-validation. You should also set `return_estimator=True`,\n", - "so that we can investigate the inner model trained via cross-validation." + "Subsequently, use the function `sklearn.model_selection.cross_validate` to run\n", + "the cross-validation. You should also set `return_estimator=True`, so that we\n", + "can investigate the inner model trained via cross-validation." ] }, { @@ -91,8 +91,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we got the cross-validation results, print out the mean and\n", - "standard deviation score." + "Now that we got the cross-validation results, print out the mean and standard\n", + "deviation score." 
] }, { @@ -142,7 +142,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/ensemble_gradient_boosting.ipynb b/notebooks/ensemble_gradient_boosting.ipynb index 4a19ac34e..6bbb6d517 100644 --- a/notebooks/ensemble_gradient_boosting.ipynb +++ b/notebooks/ensemble_gradient_boosting.ipynb @@ -10,10 +10,10 @@ "algorithm and contrast it with AdaBoost.\n", "\n", "Gradient-boosting differs from AdaBoost due to the following reason: instead\n", - "of assigning weights to specific samples, GBDT will fit a decision tree on\n", - "the residuals error (hence the name \"gradient\") of the previous tree.\n", - "Therefore, each new tree in the ensemble predicts the error made by the\n", - "previous learner instead of predicting the target directly.\n", + "of assigning weights to specific samples, GBDT will fit a decision tree on the\n", + "residuals error (hence the name \"gradient\") of the previous tree. Therefore,\n", + "each new tree in the ensemble predicts the error made by the previous learner\n", + "instead of predicting the target directly.\n", "\n", "In this section, we will provide some intuition about the way learners are\n", "combined to give the final prediction. In this regard, let's go back to our\n", @@ -41,11 +41,12 @@ " len_x = x_max - x_min\n", " x = rng.rand(n_samples) * len_x - len_x / 2\n", " noise = rng.randn(n_samples) * 0.3\n", - " y = x ** 3 - 0.5 * x ** 2 + noise\n", + " y = x**3 - 0.5 * x**2 + noise\n", "\n", " data_train = pd.DataFrame(x, columns=[\"Feature\"])\n", - " data_test = pd.DataFrame(np.linspace(x_max, x_min, num=300),\n", - " columns=[\"Feature\"])\n", + " data_test = pd.DataFrame(\n", + " np.linspace(x_max, x_min, num=300), columns=[\"Feature\"]\n", + " )\n", " target_train = pd.Series(y, name=\"Target\")\n", "\n", " return data_train, data_test, target_train\n", @@ -63,8 +64,9 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "_ = plt.title(\"Synthetic regression dataset\")" ] }, @@ -72,9 +74,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we previously discussed, boosting will be based on assembling a sequence\n", - "of learners. We will start by creating a decision tree regressor. We will set\n", - "the depth of the tree so that the resulting learner will underfit the data." + "As we previously discussed, boosting will be based on assembling a sequence of\n", + "learners. We will start by creating a decision tree regressor. We will set the\n", + "depth of the tree so that the resulting learner will underfit the data." ] }, { @@ -96,9 +98,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using the term \"test\" here refers to data that was not used for training.\n", - "It should not be confused with data coming from a train-test split, as it\n", - "was generated in equally-spaced intervals for the visual evaluation of the\n", + "Using the term \"test\" here refers to data that was not used for training. It\n", + "should not be confused with data coming from a train-test split, as it was\n", + "generated in equally-spaced intervals for the visual evaluation of the\n", "predictions." 
] }, @@ -109,19 +111,21 @@ "outputs": [], "source": [ "# plot the data\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "# plot the predictions\n", "line_predictions = plt.plot(data_test[\"Feature\"], target_test_predicted, \"--\")\n", "\n", "# plot the residuals\n", - "for value, true, predicted in zip(data_train[\"Feature\"],\n", - " target_train,\n", - " target_train_predicted):\n", + "for value, true, predicted in zip(\n", + " data_train[\"Feature\"], target_train, target_train_predicted\n", + "):\n", " lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n", "\n", - "plt.legend([line_predictions[0], lines_residuals[0]],\n", - " [\"Fitted tree\", \"Residuals\"])\n", + "plt.legend(\n", + " [line_predictions[0], lines_residuals[0]], [\"Fitted tree\", \"Residuals\"]\n", + ")\n", "_ = plt.title(\"Prediction function together \\nwith errors on the training set\")" ] }, @@ -139,11 +143,11 @@ "between the predictions and the ground-truth data. We represent these errors,\n", "called \"Residuals\", by unbroken red lines.\n", "\n", - "Indeed, our initial tree was not expressive enough to handle the complexity\n", - "of the data, as shown by the residuals. In a gradient-boosting algorithm, the\n", - "idea is to create a second tree which, given the same data `data`, will try\n", - "to predict the residuals instead of the vector `target`. We would therefore\n", - "have a tree that is able to predict the errors made by the initial tree.\n", + "Indeed, our initial tree was not expressive enough to handle the complexity of\n", + "the data, as shown by the residuals. In a gradient-boosting algorithm, the\n", + "idea is to create a second tree which, given the same data `data`, will try to\n", + "predict the residuals instead of the vector `target`. We would therefore have\n", + "a tree that is able to predict the errors made by the initial tree.\n", "\n", "Let's train such a tree." ] @@ -171,17 +175,21 @@ "source": [ "sns.scatterplot(x=data_train[\"Feature\"], y=residuals, color=\"black\", alpha=0.5)\n", "line_predictions = plt.plot(\n", - " data_test[\"Feature\"], target_test_predicted_residuals, \"--\")\n", + " data_test[\"Feature\"], target_test_predicted_residuals, \"--\"\n", + ")\n", "\n", "# plot the residuals of the predicted residuals\n", - "for value, true, predicted in zip(data_train[\"Feature\"],\n", - " residuals,\n", - " target_train_predicted_residuals):\n", + "for value, true, predicted in zip(\n", + " data_train[\"Feature\"], residuals, target_train_predicted_residuals\n", + "):\n", " lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n", "\n", - "plt.legend([line_predictions[0], lines_residuals[0]],\n", - " [\"Fitted tree\", \"Residuals\"], bbox_to_anchor=(1.05, 0.8),\n", - " loc=\"upper left\")\n", + "plt.legend(\n", + " [line_predictions[0], lines_residuals[0]],\n", + " [\"Fitted tree\", \"Residuals\"],\n", + " bbox_to_anchor=(1.05, 0.8),\n", + " loc=\"upper left\",\n", + ")\n", "_ = plt.title(\"Prediction of the previous residuals\")" ] }, @@ -190,10 +198,10 @@ "metadata": {}, "source": [ "We see that this new tree only manages to fit some of the residuals. We will\n", - "focus on a specific sample from the training set (i.e. we know that the\n", - "sample will be well predicted using two successive trees). 
We will use this\n", - "sample to explain how the predictions of both trees are combined. Let's first\n", - "select this sample in `data_train`." + "focus on a specific sample from the training set (i.e. we know that the sample\n", + "will be well predicted using two successive trees). We will use this sample to\n", + "explain how the predictions of both trees are combined. Let's first select\n", + "this sample in `data_train`." ] }, { @@ -203,7 +211,7 @@ "outputs": [], "source": [ "sample = data_train.iloc[[-2]]\n", - "x_sample = sample['Feature'].iloc[0]\n", + "x_sample = sample[\"Feature\"].iloc[0]\n", "target_true = target_train.iloc[-2]\n", "target_true_residual = residuals.iloc[-2]" ] @@ -228,17 +236,19 @@ "# * the predictions\n", "# * the residuals\n", "\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=target_train, color=\"black\",\n", - " alpha=0.5)\n", + "sns.scatterplot(\n", + " x=data_train[\"Feature\"], y=target_train, color=\"black\", alpha=0.5\n", + ")\n", "plt.plot(data_test[\"Feature\"], target_test_predicted, \"--\")\n", - "for value, true, predicted in zip(data_train[\"Feature\"],\n", - " target_train,\n", - " target_train_predicted):\n", + "for value, true, predicted in zip(\n", + " data_train[\"Feature\"], target_train, target_train_predicted\n", + "):\n", " lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n", "\n", "# Highlight the sample of interest\n", - "plt.scatter(sample, target_true, label=\"Sample of interest\",\n", - " color=\"tab:orange\", s=200)\n", + "plt.scatter(\n", + " sample, target_true, label=\"Sample of interest\", color=\"tab:orange\", s=200\n", + ")\n", "plt.xlim([-1, 0])\n", "plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", "_ = plt.title(\"Tree predictions\")" @@ -248,8 +258,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's plot the residuals information. We will plot the residuals\n", - "computed from the first decision tree and show the residual predictions." + "Now, let's plot the residuals information. We will plot the residuals computed\n", + "from the first decision tree and show the residual predictions." 
] }, { @@ -263,17 +273,21 @@ "# * the residual predictions\n", "# * the residuals of the residual predictions\n", "\n", - "sns.scatterplot(x=data_train[\"Feature\"], y=residuals,\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(x=data_train[\"Feature\"], y=residuals, color=\"black\", alpha=0.5)\n", "plt.plot(data_test[\"Feature\"], target_test_predicted_residuals, \"--\")\n", - "for value, true, predicted in zip(data_train[\"Feature\"],\n", - " residuals,\n", - " target_train_predicted_residuals):\n", + "for value, true, predicted in zip(\n", + " data_train[\"Feature\"], residuals, target_train_predicted_residuals\n", + "):\n", " lines_residuals = plt.plot([value, value], [true, predicted], color=\"red\")\n", "\n", "# Highlight the sample of interest\n", - "plt.scatter(sample, target_true_residual, label=\"Sample of interest\",\n", - " color=\"tab:orange\", s=200)\n", + "plt.scatter(\n", + " sample,\n", + " target_true_residual,\n", + " label=\"Sample of interest\",\n", + " color=\"tab:orange\",\n", + " s=200,\n", + ")\n", "plt.xlim([-1, 0])\n", "plt.legend()\n", "_ = plt.title(\"Prediction of the residuals\")" @@ -296,12 +310,13 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"True value to predict for \"\n", - " f\"f(x={x_sample:.3f}) = {target_true:.3f}\")\n", + "print(f\"True value to predict for f(x={x_sample:.3f}) = {target_true:.3f}\")\n", "\n", "y_pred_first_tree = tree.predict(sample)[0]\n", - "print(f\"Prediction of the first decision tree for x={x_sample:.3f}: \"\n", - " f\"y={y_pred_first_tree:.3f}\")\n", + "print(\n", + " f\"Prediction of the first decision tree for x={x_sample:.3f}: \"\n", + " f\"y={y_pred_first_tree:.3f}\"\n", + ")\n", "print(f\"Error of the tree: {target_true - y_pred_first_tree:.3f}\")" ] }, @@ -319,8 +334,10 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Prediction of the residual for x={x_sample:.3f}: \"\n", - " f\"{tree_residuals.predict(sample)[0]:.3f}\")" + "print(\n", + " f\"Prediction of the residual for x={x_sample:.3f}: \"\n", + " f\"{tree_residuals.predict(sample)[0]:.3f}\"\n", + ")" ] }, { @@ -341,8 +358,10 @@ "y_pred_first_and_second_tree = (\n", " y_pred_first_tree + tree_residuals.predict(sample)[0]\n", ")\n", - "print(f\"Prediction of the first and second decision trees combined for \"\n", - " f\"x={x_sample:.3f}: y={y_pred_first_and_second_tree:.3f}\")\n", + "print(\n", + " \"Prediction of the first and second decision trees combined for \"\n", + " f\"x={x_sample:.3f}: y={y_pred_first_and_second_tree:.3f}\"\n", + ")\n", "print(f\"Error of the tree: {target_true - y_pred_first_and_second_tree:.3f}\")" ] }, @@ -352,10 +371,10 @@ "source": [ "We chose a sample for which only two trees were enough to make the perfect\n", "prediction. However, we saw in the previous plot that two trees were not\n", - "enough to correct the residuals of all samples. Therefore, one needs to\n", - "add several trees to the ensemble to successfully correct the error\n", - "(i.e. the second tree corrects the first tree's error, while the third tree\n", - "corrects the second tree's error and so on).\n", + "enough to correct the residuals of all samples. Therefore, one needs to add\n", + "several trees to the ensemble to successfully correct the error (i.e. the\n", + "second tree corrects the first tree's error, while the third tree corrects the\n", + "second tree's error and so on).\n", "\n", "We will compare the generalization performance of random-forest and gradient\n", "boosting on the California housing dataset." 
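To make the fit-on-residuals mechanism above concrete beyond two trees, here is a minimal sketch (not part of the notebook; `fit_residual_boosting` and `predict_residual_boosting` are hypothetical helper names, and `data_train`, `target_train` and `data_test` are assumed to be the synthetic datasets defined earlier):

```python
import numpy as np

from sklearn.tree import DecisionTreeRegressor


def fit_residual_boosting(data, target, n_trees=5, max_depth=3):
    """Fit trees sequentially; each new tree predicts the residuals
    (errors) left by the trees fitted before it."""
    trees = []
    residuals = target.copy()
    for _ in range(n_trees):
        tree = DecisionTreeRegressor(max_depth=max_depth, random_state=0)
        tree.fit(data, residuals)
        # What remains to be explained becomes the next tree's target.
        residuals = residuals - tree.predict(data)
        trees.append(tree)
    return trees


def predict_residual_boosting(trees, data):
    """The ensemble prediction is the sum of all the trees' corrections."""
    return np.sum([tree.predict(data) for tree in trees], axis=0)


trees = fit_residual_boosting(data_train, target_train)
ensemble_predictions = predict_residual_boosting(trees, data_test)
```

A real gradient-boosting implementation additionally shrinks each tree's correction by a learning rate and starts from a constant baseline prediction, but the chaining principle is the same.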
@@ -384,7 +403,10 @@ "\n", "gradient_boosting = GradientBoostingRegressor(n_estimators=200)\n", "cv_results_gbdt = cross_validate(\n", - " gradient_boosting, data, target, scoring=\"neg_mean_absolute_error\",\n", + " gradient_boosting,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", " n_jobs=2,\n", ")" ] @@ -396,13 +418,15 @@ "outputs": [], "source": [ "print(\"Gradient Boosting Decision Tree\")\n", - "print(f\"Mean absolute error via cross-validation: \"\n", - " f\"{-cv_results_gbdt['test_score'].mean():.3f} \u00b1 \"\n", - " f\"{cv_results_gbdt['test_score'].std():.3f} k$\")\n", - "print(f\"Average fit time: \"\n", - " f\"{cv_results_gbdt['fit_time'].mean():.3f} seconds\")\n", - "print(f\"Average score time: \"\n", - " f\"{cv_results_gbdt['score_time'].mean():.3f} seconds\")" + "print(\n", + " \"Mean absolute error via cross-validation: \"\n", + " f\"{-cv_results_gbdt['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results_gbdt['test_score'].std():.3f} k$\"\n", + ")\n", + "print(f\"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds\")\n", + "print(\n", + " f\"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds\"\n", + ")" ] }, { @@ -415,7 +439,10 @@ "\n", "random_forest = RandomForestRegressor(n_estimators=200, n_jobs=2)\n", "cv_results_rf = cross_validate(\n", - " random_forest, data, target, scoring=\"neg_mean_absolute_error\",\n", + " random_forest,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", " n_jobs=2,\n", ")" ] @@ -427,13 +454,13 @@ "outputs": [], "source": [ "print(\"Random Forest\")\n", - "print(f\"Mean absolute error via cross-validation: \"\n", - " f\"{-cv_results_rf['test_score'].mean():.3f} \u00b1 \"\n", - " f\"{cv_results_rf['test_score'].std():.3f} k$\")\n", - "print(f\"Average fit time: \"\n", - " f\"{cv_results_rf['fit_time'].mean():.3f} seconds\")\n", - "print(f\"Average score time: \"\n", - " f\"{cv_results_rf['score_time'].mean():.3f} seconds\")" + "print(\n", + " \"Mean absolute error via cross-validation: \"\n", + " f\"{-cv_results_rf['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results_rf['test_score'].std():.3f} k$\"\n", + ")\n", + "print(f\"Average fit time: {cv_results_rf['fit_time'].mean():.3f} seconds\")\n", + "print(f\"Average score time: {cv_results_rf['score_time'].mean():.3f} seconds\")" ] }, { @@ -441,13 +468,13 @@ "metadata": {}, "source": [ "In term of computation performance, the forest can be parallelized and will\n", - "benefit from using multiple cores of the CPU. In terms of scoring\n", - "performance, both algorithms lead to very close results.\n", + "benefit from using multiple cores of the CPU. In terms of scoring performance,\n", + "both algorithms lead to very close results.\n", "\n", - "However, we see that the gradient boosting is a very fast algorithm to\n", - "predict compared to random forest. This is due to the fact that gradient\n", - "boosting uses shallow trees. We will go into details in the next notebook\n", - "about the hyperparameters to consider when optimizing ensemble methods." + "However, we see that the gradient boosting is a very fast algorithm to predict\n", + "compared to random forest. This is due to the fact that gradient boosting uses\n", + "shallow trees. We will go into details in the next notebook about the\n", + "hyperparameters to consider when optimizing ensemble methods." 
] } ], diff --git a/notebooks/ensemble_hist_gradient_boosting.ipynb b/notebooks/ensemble_hist_gradient_boosting.ipynb index 0fcf52d8a..2814d44a2 100644 --- a/notebooks/ensemble_hist_gradient_boosting.ipynb +++ b/notebooks/ensemble_hist_gradient_boosting.ipynb @@ -5,6 +5,7 @@ "metadata": {}, "source": [ "# Speeding-up gradient-boosting\n", + "\n", "In this notebook, we present a modified version of gradient boosting which\n", "uses a reduced number of splits when building the different trees. This\n", "algorithm is called \"histogram gradient boosting\" in scikit-learn.\n", @@ -14,18 +15,18 @@ "Therefore, the algorithm scales efficiently with both the number of cores and\n", "the number of samples.\n", "\n", - "In gradient-boosting, the algorithm is a sequential algorithm. It requires\n", - "the `N-1` trees to have been fit to be able to fit the tree at stage `N`.\n", + "In gradient-boosting, the algorithm is a sequential algorithm. It requires the\n", + "`N-1` trees to have been fit to be able to fit the tree at stage `N`.\n", "Therefore, the algorithm is quite computationally expensive. The most\n", - "expensive part in this algorithm is the search for the best split in the\n", - "tree which is a brute-force approach: all possible split are evaluated and\n", - "the best one is picked. We explained this process in the notebook \"tree in\n", - "depth\", which you can refer to.\n", + "expensive part in this algorithm is the search for the best split in the tree\n", + "which is a brute-force approach: all possible split are evaluated and the best\n", + "one is picked. We explained this process in the notebook \"tree in depth\",\n", + "which you can refer to.\n", "\n", "To accelerate the gradient-boosting algorithm, one could reduce the number of\n", - "splits to be evaluated. As a consequence, the generalization performance of such\n", - "a tree would be reduced. However, since we are combining several trees in a\n", - "gradient-boosting, we can add more estimators to overcome this issue.\n", + "splits to be evaluated. As a consequence, the generalization performance of\n", + "such a tree would be reduced. However, since we are combining several trees in\n", + "a gradient-boosting, we can add more estimators to overcome this issue.\n", "\n", "We will make a naive implementation of such algorithm using building blocks\n", "from scikit-learn. First, we will load the California housing dataset." 
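As a quick illustration of why binning speeds up the split search, the following standalone sketch (illustrative only, on made-up random data rather than the notebook's dataset) counts the distinct values a tree would have to consider for one continuous feature before and after discretization:

```python
import numpy as np

from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.RandomState(0)
# a continuous feature: almost every value is unique
feature = rng.randn(10_000, 1)
print(f"Distinct values before binning: {np.unique(feature).size}")

discretizer = KBinsDiscretizer(
    n_bins=256, encode="ordinal", strategy="quantile"
)
feature_binned = discretizer.fit_transform(feature)
print(f"Distinct values after binning: {np.unique(feature_binned).size}")
```

With at most 256 distinct values left per feature, a tree only has to evaluate a few hundred candidate thresholds instead of roughly one per sample.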
@@ -72,8 +73,11 @@ "\n", "gradient_boosting = GradientBoostingRegressor(n_estimators=200)\n", "cv_results_gbdt = cross_validate(\n", - " gradient_boosting, data, target, scoring=\"neg_mean_absolute_error\",\n", - " n_jobs=2\n", + " gradient_boosting,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_jobs=2,\n", ")" ] }, @@ -84,13 +88,15 @@ "outputs": [], "source": [ "print(\"Gradient Boosting Decision Tree\")\n", - "print(f\"Mean absolute error via cross-validation: \"\n", - " f\"{-cv_results_gbdt['test_score'].mean():.3f} \u00b1 \"\n", - " f\"{cv_results_gbdt['test_score'].std():.3f} k$\")\n", - "print(f\"Average fit time: \"\n", - " f\"{cv_results_gbdt['fit_time'].mean():.3f} seconds\")\n", - "print(f\"Average score time: \"\n", - " f\"{cv_results_gbdt['score_time'].mean():.3f} seconds\")" + "print(\n", + " \"Mean absolute error via cross-validation: \"\n", + " f\"{-cv_results_gbdt['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results_gbdt['test_score'].std():.3f} k$\"\n", + ")\n", + "print(f\"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds\")\n", + "print(\n", + " f\"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds\"\n", + ")" ] }, { @@ -100,8 +106,8 @@ "We recall that a way of accelerating the gradient boosting is to reduce the\n", "number of split considered within the tree building. One way is to bin the\n", "data before to give them into the gradient boosting. A transformer called\n", - "`KBinsDiscretizer` is doing such transformation. Thus, we can pipeline\n", - "this preprocessing with the gradient boosting.\n", + "`KBinsDiscretizer` is doing such transformation. Thus, we can pipeline this\n", + "preprocessing with the gradient boosting.\n", "\n", "We can first demonstrate the transformation done by the `KBinsDiscretizer`." ] @@ -116,7 +122,8 @@ "from sklearn.preprocessing import KBinsDiscretizer\n", "\n", "discretizer = KBinsDiscretizer(\n", - " n_bins=256, encode=\"ordinal\", strategy=\"quantile\")\n", + " n_bins=256, encode=\"ordinal\", strategy=\"quantile\"\n", + ")\n", "data_trans = discretizer.fit_transform(data)\n", "data_trans" ] @@ -131,10 +138,10 @@ "the features, we requested too much bins in regard of the data dispersion\n", "for those features. The smallest bins will be removed.
</p>\n",
    "</div>\n",
    "
\n", - "We see that the discretizer transforms the original data into integral\n", - "values (even though they are encoded using a floating-point representation).\n", - "Each value represents the bin index when the distribution by quantile is\n", - "performed. We can check the number of bins per feature." + "We see that the discretizer transforms the original data into integral values\n", + "(even though they are encoded using a floating-point representation). Each\n", + "value represents the bin index when the distribution by quantile is performed.\n", + "We can check the number of bins per feature." ] }, { @@ -151,8 +158,8 @@ "metadata": {}, "source": [ "After this transformation, we see that we have at most 256 unique values per\n", - "features. Now, we will use this transformer to discretize data before\n", - "training the gradient boosting regressor." + "features. Now, we will use this transformer to discretize data before training\n", + "the gradient boosting regressor." ] }, { @@ -164,9 +171,13 @@ "from sklearn.pipeline import make_pipeline\n", "\n", "gradient_boosting = make_pipeline(\n", - " discretizer, GradientBoostingRegressor(n_estimators=200))\n", + " discretizer, GradientBoostingRegressor(n_estimators=200)\n", + ")\n", "cv_results_gbdt = cross_validate(\n", - " gradient_boosting, data, target, scoring=\"neg_mean_absolute_error\",\n", + " gradient_boosting,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", " n_jobs=2,\n", ")" ] @@ -178,27 +189,29 @@ "outputs": [], "source": [ "print(\"Gradient Boosting Decision Tree with KBinsDiscretizer\")\n", - "print(f\"Mean absolute error via cross-validation: \"\n", - " f\"{-cv_results_gbdt['test_score'].mean():.3f} \u00b1 \"\n", - " f\"{cv_results_gbdt['test_score'].std():.3f} k$\")\n", - "print(f\"Average fit time: \"\n", - " f\"{cv_results_gbdt['fit_time'].mean():.3f} seconds\")\n", - "print(f\"Average score time: \"\n", - " f\"{cv_results_gbdt['score_time'].mean():.3f} seconds\")" + "print(\n", + " \"Mean absolute error via cross-validation: \"\n", + " f\"{-cv_results_gbdt['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results_gbdt['test_score'].std():.3f} k$\"\n", + ")\n", + "print(f\"Average fit time: {cv_results_gbdt['fit_time'].mean():.3f} seconds\")\n", + "print(\n", + " f\"Average score time: {cv_results_gbdt['score_time'].mean():.3f} seconds\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here, we see that the fit time has been reduced but that the\n", - "generalization performance of the model is identical. Scikit-learn provides\n", - "specific classes which are even more optimized for large dataset, called\n", + "Here, we see that the fit time has been reduced but that the generalization\n", + "performance of the model is identical. Scikit-learn provides specific classes\n", + "which are even more optimized for large dataset, called\n", "`HistGradientBoostingClassifier` and `HistGradientBoostingRegressor`. Each\n", "feature in the dataset `data` is first binned by computing histograms, which\n", "are later used to evaluate the potential splits. The number of splits to\n", - "evaluate is then much smaller. This algorithm becomes much more efficient\n", - "than gradient boosting when the dataset has over 10,000 samples.\n", + "evaluate is then much smaller. 
This algorithm becomes much more efficient than\n", + "gradient boosting when the dataset has over 10,000 samples.\n", "\n", "Below we will give an example for a large dataset and we will compare\n", "computation times with the experiment of the previous section." @@ -213,10 +226,14 @@ "from sklearn.ensemble import HistGradientBoostingRegressor\n", "\n", "histogram_gradient_boosting = HistGradientBoostingRegressor(\n", - " max_iter=200, random_state=0)\n", + " max_iter=200, random_state=0\n", + ")\n", "cv_results_hgbdt = cross_validate(\n", - " histogram_gradient_boosting, data, target,\n", - " scoring=\"neg_mean_absolute_error\", n_jobs=2,\n", + " histogram_gradient_boosting,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_jobs=2,\n", ")" ] }, @@ -227,21 +244,23 @@ "outputs": [], "source": [ "print(\"Histogram Gradient Boosting Decision Tree\")\n", - "print(f\"Mean absolute error via cross-validation: \"\n", - " f\"{-cv_results_hgbdt['test_score'].mean():.3f} \u00b1 \"\n", - " f\"{cv_results_hgbdt['test_score'].std():.3f} k$\")\n", - "print(f\"Average fit time: \"\n", - " f\"{cv_results_hgbdt['fit_time'].mean():.3f} seconds\")\n", - "print(f\"Average score time: \"\n", - " f\"{cv_results_hgbdt['score_time'].mean():.3f} seconds\")" + "print(\n", + " \"Mean absolute error via cross-validation: \"\n", + " f\"{-cv_results_hgbdt['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results_hgbdt['test_score'].std():.3f} k$\"\n", + ")\n", + "print(f\"Average fit time: {cv_results_hgbdt['fit_time'].mean():.3f} seconds\")\n", + "print(\n", + " f\"Average score time: {cv_results_hgbdt['score_time'].mean():.3f} seconds\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The histogram gradient-boosting is the best algorithm in terms of score.\n", - "It will also scale when the number of samples increases, while the normal\n", + "The histogram gradient-boosting is the best algorithm in terms of score. It\n", + "will also scale when the number of samples increases, while the normal\n", "gradient-boosting will not." 
] } diff --git a/notebooks/ensemble_hyperparameters.ipynb b/notebooks/ensemble_hyperparameters.ipynb index 998c0bd02..fb872005b 100644 --- a/notebooks/ensemble_hyperparameters.ipynb +++ b/notebooks/ensemble_hyperparameters.ipynb @@ -37,7 +37,8 @@ "data, target = fetch_california_housing(return_X_y=True, as_frame=True)\n", "target *= 100 # rescale the target in k$\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0)" + " data, target, random_state=0\n", + ")" ] }, { @@ -121,8 +122,12 @@ " \"min_samples_leaf\": [1, 2, 5, 10, 20, 50, 100],\n", "}\n", "search_cv = RandomizedSearchCV(\n", - " RandomForestRegressor(n_jobs=2), param_distributions=param_distributions,\n", - " scoring=\"neg_mean_absolute_error\", n_iter=10, random_state=0, n_jobs=2,\n", + " RandomForestRegressor(n_jobs=2),\n", + " param_distributions=param_distributions,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_iter=10,\n", + " random_state=0,\n", + " n_jobs=2,\n", ")\n", "search_cv.fit(data_train, target_train)\n", "\n", @@ -162,7 +167,9 @@ "outputs": [], "source": [ "error = -search_cv.score(data_test, target_test)\n", - "print(f\"On average, our random forest regressor makes an error of {error:.2f} k$\")" + "print(\n", + " f\"On average, our random forest regressor makes an error of {error:.2f} k$\"\n", + ")" ] }, { @@ -176,8 +183,8 @@ "`learning_rate`, and `max_depth` or `max_leaf_nodes` (as previously discussed\n", "random forest).\n", "\n", - "Let's first discuss the `max_depth` (or `max_leaf_nodes`) parameter. We saw\n", - "in the section on gradient-boosting that the algorithm fits the error of the\n", + "Let's first discuss the `max_depth` (or `max_leaf_nodes`) parameter. We saw in\n", + "the section on gradient-boosting that the algorithm fits the error of the\n", "previous tree in the ensemble. Thus, fitting fully grown trees would be\n", "detrimental. Indeed, the first tree of the ensemble would perfectly fit\n", "(overfit) the data and thus no subsequent tree would be required, since there\n", @@ -216,8 +223,12 @@ " \"learning_rate\": loguniform(0.01, 1),\n", "}\n", "search_cv = RandomizedSearchCV(\n", - " GradientBoostingRegressor(), param_distributions=param_distributions,\n", - " scoring=\"neg_mean_absolute_error\", n_iter=20, random_state=0, n_jobs=2\n", + " GradientBoostingRegressor(),\n", + " param_distributions=param_distributions,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_iter=20,\n", + " random_state=0,\n", + " n_jobs=2,\n", ")\n", "search_cv.fit(data_train, target_train)\n", "\n", @@ -242,10 +253,10 @@ "\n", "In this search, we see that the `learning_rate` is required to be large\n", "enough, i.e. > 0.1. We also observe that for the best ranked models, having a\n", - "smaller `learning_rate`, will require more trees or a larger number of\n", - "leaves for each tree. However, it is particularly difficult to draw\n", - "more detailed conclusions since the best value of an hyperparameter depends\n", - "on the other hyperparameter values." + "smaller `learning_rate`, will require more trees or a larger number of leaves\n", + "for each tree. However, it is particularly difficult to draw more detailed\n", + "conclusions since the best value of an hyperparameter depends on the other\n", + "hyperparameter values." 
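One way to see these interactions is to inspect the search results directly. A hedged sketch (assuming `search_cv` is the fitted `RandomizedSearchCV` from the cell above and that the search sampled `n_estimators` and `max_leaf_nodes` alongside `learning_rate`; the column names would need adjusting otherwise):

```python
# Rank the sampled candidates by mean test score (a negated MAE, so values
# closer to zero are better) to eyeball how learning_rate trades off against
# the number of trees and the number of leaves.
import pandas as pd

results = pd.DataFrame(search_cv.cv_results_)
columns = [
    "param_learning_rate",
    "param_n_estimators",
    "param_max_leaf_nodes",
    "mean_test_score",
]
print(
    results[columns].sort_values("mean_test_score", ascending=False).head(10)
)
```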
] }, { diff --git a/notebooks/ensemble_introduction.ipynb b/notebooks/ensemble_introduction.ipynb index 2abcee2e3..56d604643 100644 --- a/notebooks/ensemble_introduction.ipynb +++ b/notebooks/ensemble_introduction.ipynb @@ -11,8 +11,8 @@ "models result in more powerful and robust models with less hassle.\n", "\n", "We will start by loading the california housing dataset. We recall that the\n", - "goal in this dataset is to predict the median house value in some district\n", - "in California based on demographic and geographic data." + "goal in this dataset is to predict the median house value in some district in\n", + "California based on demographic and geographic data." ] }, { @@ -59,8 +59,10 @@ "cv_results = cross_validate(tree, data, target, n_jobs=2)\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(f\"R2 score obtained by cross-validation: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"R2 score obtained by cross-validation: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -96,25 +98,33 @@ "param_grid = {\n", " \"max_depth\": [5, 8, None],\n", " \"min_samples_split\": [2, 10, 30, 50],\n", - " \"min_samples_leaf\": [0.01, 0.05, 0.1, 1]}\n", + " \"min_samples_leaf\": [0.01, 0.05, 0.1, 1],\n", + "}\n", "cv = 3\n", "\n", - "tree = GridSearchCV(DecisionTreeRegressor(random_state=0),\n", - " param_grid=param_grid, cv=cv, n_jobs=2)\n", - "cv_results = cross_validate(tree, data, target, n_jobs=2,\n", - " return_estimator=True)\n", + "tree = GridSearchCV(\n", + " DecisionTreeRegressor(random_state=0),\n", + " param_grid=param_grid,\n", + " cv=cv,\n", + " n_jobs=2,\n", + ")\n", + "cv_results = cross_validate(\n", + " tree, data, target, n_jobs=2, return_estimator=True\n", + ")\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(f\"R2 score obtained by cross-validation: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"R2 score obtained by cross-validation: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We see that optimizing the hyperparameters will have a positive effect\n", - "on the generalization performance. However, it comes with a higher computational\n", + "We see that optimizing the hyperparameters will have a positive effect on the\n", + "generalization performance. However, it comes with a higher computational\n", "cost." ] }, @@ -126,14 +136,14 @@ "the tuning of the parameters and investigate the results.\n", "\n", "Now we will use an ensemble method called bagging. More details about this\n", - "method will be discussed in the next section. In short, this method will use\n", - "a base regressor (i.e. decision tree regressors) and will train several of\n", - "them on a slightly modified version of the training set. Then, the\n", - "predictions of all these base regressors will be combined by averaging.\n", + "method will be discussed in the next section. In short, this method will use a\n", + "base regressor (i.e. decision tree regressors) and will train several of them\n", + "on a slightly modified version of the training set. Then, the predictions of\n", + "all these base regressors will be combined by averaging.\n", "\n", "Here, we will use 20 decision trees and check the fitting time as well as the\n", - "generalization performance on the left-out testing data. It is important to note\n", - "that we are not going to tune any parameter of the decision tree." 
+ "generalization performance on the left-out testing data. It is important to\n", + "note that we are not going to tune any parameter of the decision tree." ] }, { @@ -147,13 +157,16 @@ "\n", "estimator = DecisionTreeRegressor(random_state=0)\n", "bagging_regressor = BaggingRegressor(\n", - " estimator=estimator, n_estimators=20, random_state=0)\n", + " estimator=estimator, n_estimators=20, random_state=0\n", + ")\n", "\n", "cv_results = cross_validate(bagging_regressor, data, target, n_jobs=2)\n", "scores = cv_results[\"test_score\"]\n", "\n", - "print(f\"R2 score obtained by cross-validation: \"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"R2 score obtained by cross-validation: \"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -161,16 +174,15 @@ "metadata": {}, "source": [ "Without searching for optimal hyperparameters, the overall generalization\n", - "performance of the bagging regressor is better than a single decision tree.\n", - "In addition, the computational cost is reduced in comparison of seeking\n", - "for the optimal hyperparameters.\n", + "performance of the bagging regressor is better than a single decision tree. In\n", + "addition, the computational cost is reduced in comparison of seeking for the\n", + "optimal hyperparameters.\n", "\n", "This shows the motivation behind the use of an ensemble learner: it gives a\n", "relatively good baseline with decent generalization performance without any\n", "parameter tuning.\n", "\n", - "Now, we will discuss in detail two ensemble families: bagging and\n", - "boosting:\n", + "Now, we will discuss in detail two ensemble families: bagging and boosting:\n", "\n", "* ensemble using bootstrap (e.g. bagging and random-forest);\n", "* ensemble using boosting (e.g. adaptive boosting and gradient-boosting\n", diff --git a/notebooks/ensemble_random_forest.ipynb b/notebooks/ensemble_random_forest.ipynb index 4da0667c2..4e6578193 100644 --- a/notebooks/ensemble_random_forest.ipynb +++ b/notebooks/ensemble_random_forest.ipynb @@ -10,9 +10,9 @@ "differences with the bagging ensembles.\n", "\n", "Random forests are a popular model in machine learning. They are a\n", - "modification of the bagging algorithm. In bagging, any classifier or\n", - "regressor can be used. In random forests, the base classifier or regressor\n", - "is always a decision tree.\n", + "modification of the bagging algorithm. In bagging, any classifier or regressor\n", + "can be used. In random forests, the base classifier or regressor is always a\n", + "decision tree.\n", "\n", "Random forests have another particularity: when training a tree, the search\n", "for the best split is done only on a subset of the original features taken at\n", @@ -28,8 +28,8 @@ "\n", "## A look at random forests\n", "\n", - "We will illustrate the usage of a random forest classifier on the adult\n", - "census dataset." + "We will illustrate the usage of a random forest classifier on the adult census\n", + "dataset." ] }, { @@ -66,12 +66,12 @@ "features using an `OrdinalEncoder` since tree-based models can work very\n", "efficiently with such a naive representation of categorical variables.\n", "\n", - "Since there are rare categories in this dataset we need to specifically\n", - "encode unknown categories at prediction time in order to be able to use\n", + "Since there are rare categories in this dataset we need to specifically encode\n", + "unknown categories at prediction time in order to be able to use\n", "cross-validation. 
Otherwise some rare categories could only be present on the\n", "validation side of the cross-validation split and the `OrdinalEncoder` would\n", - "raise an error when calling its `transform` method with the data points\n", - "of the validation set." + "raise an error when calling its `transform` method with the data points of the\n", + "validation set." ] }, { @@ -88,7 +88,7 @@ ")\n", "preprocessor = make_column_transformer(\n", " (categorical_encoder, make_column_selector(dtype_include=object)),\n", - " remainder=\"passthrough\"\n", + " remainder=\"passthrough\",\n", ")" ] }, @@ -97,8 +97,8 @@ "metadata": {}, "source": [ "\n", - "We will first give a simple example where we will train a single decision\n", - "tree classifier and check its generalization performance via cross-validation." + "We will first give a simple example where we will train a single decision tree\n", + "classifier and check its generalization performance via cross-validation." ] }, { @@ -123,8 +123,10 @@ "\n", "scores_tree = cross_val_score(tree, data, target)\n", "\n", - "print(f\"Decision tree classifier: \"\n", - " f\"{scores_tree.mean():.3f} \u00b1 {scores_tree.std():.3f}\")" + "print(\n", + " \"Decision tree classifier: \"\n", + " f\"{scores_tree.mean():.3f} \u00b1 {scores_tree.std():.3f}\"\n", + ")" ] }, { @@ -150,8 +152,10 @@ " preprocessor,\n", " BaggingClassifier(\n", " estimator=DecisionTreeClassifier(random_state=0),\n", - " n_estimators=50, n_jobs=2, random_state=0,\n", - " )\n", + " n_estimators=50,\n", + " n_jobs=2,\n", + " random_state=0,\n", + " ),\n", ")" ] }, @@ -163,21 +167,22 @@ "source": [ "scores_bagged_trees = cross_val_score(bagged_trees, data, target)\n", "\n", - "print(f\"Bagged decision tree classifier: \"\n", - " f\"{scores_bagged_trees.mean():.3f} \u00b1 {scores_bagged_trees.std():.3f}\")" + "print(\n", + " \"Bagged decision tree classifier: \"\n", + " f\"{scores_bagged_trees.mean():.3f} \u00b1 {scores_bagged_trees.std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Note that the generalization performance of the bagged trees is already much\n", "better than the performance of a single tree.\n", "\n", "Now, we will use a random forest. You will observe that we do not need to\n", - "specify any `estimator` because the estimator is forced to be a decision\n", - "tree. Thus, we just specify the desired number of trees in the forest." + "specify any `estimator` because the estimator is forced to be a decision tree.\n", + "Thus, we just specify the desired number of trees in the forest." 
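Coming back for a moment to the encoding step above: the `handle_unknown` behaviour is easy to check in isolation. A toy sketch with made-up categories (not the census data):

```python
# A category seen only at transform time is mapped to the sentinel value -1
# instead of raising an error, which is what makes cross-validation safe.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train = pd.DataFrame({"workclass": ["Private", "State-gov", "Private"]})
test = pd.DataFrame({"workclass": ["Never-worked"]})  # unseen at fit time

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(train)
print(encoder.transform(test))  # [[-1.]]
```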
] }, { @@ -190,7 +195,7 @@ "\n", "random_forest = make_pipeline(\n", " preprocessor,\n", - " RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=0)\n", + " RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=0),\n", ")" ] }, @@ -202,16 +207,17 @@ "source": [ "scores_random_forest = cross_val_score(random_forest, data, target)\n", "\n", - "print(f\"Random forest classifier: \"\n", - " f\"{scores_random_forest.mean():.3f} \u00b1 \"\n", - " f\"{scores_random_forest.std():.3f}\")" + "print(\n", + " \"Random forest classifier: \"\n", + " f\"{scores_random_forest.mean():.3f} \u00b1 \"\n", + " f\"{scores_random_forest.std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "It seems that the random forest is performing slightly better than the bagged\n", "trees possibly due to the randomized selection of the features which\n", "decorrelates the prediction errors of individual trees and as a consequence\n", @@ -233,17 +239,17 @@ "\n", "However, `max_features` is one of the hyperparameters to consider when tuning\n", "a random forest:\n", - "- too much randomness in the trees can lead to underfitted base models and\n", - " can be detrimental for the ensemble as a whole,\n", + "- too much randomness in the trees can lead to underfitted base models and can\n", + " be detrimental for the ensemble as a whole,\n", "- too few randomness in the trees leads to more correlation of the prediction\n", - " errors and as a result reduce the benefits of the averaging step in terms\n", - " of overfitting control.\n", + " errors and as a result reduce the benefits of the averaging step in terms of\n", + " overfitting control.\n", "\n", "In scikit-learn, the bagging classes also expose a `max_features` parameter.\n", "However, `BaggingClassifier` and `BaggingRegressor` are agnostic with respect\n", "to their base model and therefore random feature subsampling can only happen\n", - "once before fitting each base model instead of several times per base model\n", - "as is the case when adding splits to a given tree.\n", + "once before fitting each base model instead of several times per base model as\n", + "is the case when adding splits to a given tree.\n", "\n", "We summarize these details in the following table:\n", "\n", diff --git a/notebooks/ensemble_sol_01.ipynb b/notebooks/ensemble_sol_01.ipynb index 91f547842..c1dbb6ed7 100644 --- a/notebooks/ensemble_sol_01.ipynb +++ b/notebooks/ensemble_sol_01.ipynb @@ -9,8 +9,8 @@ "The aim of this notebook is to investigate if we can tune the hyperparameters\n", "of a bagging regressor and evaluate the gain obtained.\n", "\n", - "We will load the California housing dataset and split it into a training and\n", - "a testing set." + "We will load the California housing dataset and split it into a training and a\n", + "testing set." ] }, { @@ -25,7 +25,8 @@ "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", "target *= 100 # rescale the target in k$\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0, test_size=0.5)" + " data, target, random_state=0, test_size=0.5\n", + ")" ] }, { @@ -43,9 +44,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor`\n", - "to its parameter `estimator`. Train the regressor and evaluate its\n", - "generalization performance on the testing set using the mean absolute error." 
+ "Create a `BaggingRegressor` and provide a `DecisionTreeRegressor` to its\n", + "parameter `estimator`. Train the regressor and evaluate its generalization\n", + "performance on the testing set using the mean absolute error." ] }, { @@ -63,24 +64,24 @@ "bagging = BaggingRegressor(estimator=tree, n_jobs=2)\n", "bagging.fit(data_train, target_train)\n", "target_predicted = bagging.predict(data_test)\n", - "print(f\"Basic mean absolute error of the bagging regressor:\\n\"\n", - " f\"{mean_absolute_error(target_test, target_predicted):.2f} k$\")" + "print(\n", + " \"Basic mean absolute error of the bagging regressor:\\n\"\n", + " f\"{mean_absolute_error(target_test, target_predicted):.2f} k$\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now, create a `RandomizedSearchCV` instance using the previous model and\n", - "tune the important parameters of the bagging regressor. Find the best\n", - "parameters and check if you are able to find a set of parameters that\n", - "improve the default regressor still using the mean absolute error as a\n", - "metric.\n", + "Now, create a `RandomizedSearchCV` instance using the previous model and tune\n", + "the important parameters of the bagging regressor. Find the best parameters\n", + "and check if you are able to find a set of parameters that improve the default\n", + "regressor still using the mean absolute error as a metric.\n", "\n", "
<div class=\"admonition tip alert alert-warning\">\n",
 "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
- "<p class=\"last\">You can list the bagging regressor's parameters using the <tt class=\"docutils literal\">get_params</tt>\n",
- "method.</p>\n",
+ "<p class=\"last\">You can list the bagging regressor's parameters using the <tt class=\"docutils literal\">get_params</tt> method.</p>\n",
 "</div>
" ] }, @@ -151,8 +152,10 @@ "outputs": [], "source": [ "target_predicted = search.predict(data_test)\n", - "print(f\"Mean absolute error after tuning of the bagging regressor:\\n\"\n", - " f\"{mean_absolute_error(target_test, target_predicted):.2f} k$\")" + "print(\n", + " \"Mean absolute error after tuning of the bagging regressor:\\n\"\n", + " f\"{mean_absolute_error(target_test, target_predicted):.2f} k$\"\n", + ")" ] }, { @@ -163,8 +166,8 @@ ] }, "source": [ - "We see that the predictor provided by the bagging regressor does not need\n", - "much hyperparameter tuning compared to a single decision tree." + "We see that the predictor provided by the bagging regressor does not need much\n", + "hyperparameter tuning compared to a single decision tree." ] } ], diff --git a/notebooks/ensemble_sol_02.ipynb b/notebooks/ensemble_sol_02.ipynb index e34215010..81e1feeec 100644 --- a/notebooks/ensemble_sol_02.ipynb +++ b/notebooks/ensemble_sol_02.ipynb @@ -26,7 +26,8 @@ "target_name = \"Body Mass (g)\"\n", "data, target = penguins[[feature_name]], penguins[target_name]\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0)" + " data, target, random_state=0\n", + ")" ] }, { @@ -44,9 +45,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a random forest containing three trees. Train the forest and\n", - "check the generalization performance on the testing set in terms of mean\n", - "absolute error." + "Create a random forest containing three trees. Train the forest and check the\n", + "generalization performance on the testing set in terms of mean absolute error." ] }, { @@ -62,8 +62,10 @@ "forest = RandomForestRegressor(n_estimators=3)\n", "forest.fit(data_train, target_train)\n", "target_predicted = forest.predict(data_test)\n", - "print(f\"Mean absolute error: \"\n", - " f\"{mean_absolute_error(target_test, target_predicted):.3f} grams\")" + "print(\n", + " \"Mean absolute error: \"\n", + " f\"{mean_absolute_error(target_test, target_predicted):.3f} grams\"\n", + ")" ] }, { @@ -85,8 +87,7 @@ "# solution\n", "import numpy as np\n", "\n", - "data_range = pd.DataFrame(np.linspace(170, 235, num=300),\n", - " columns=data.columns)" + "data_range = pd.DataFrame(np.linspace(170, 235, num=300), columns=data.columns)" ] }, { @@ -135,15 +136,21 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "\n", "# plot tree predictions\n", "for tree_idx, predictions in enumerate(tree_predictions):\n", - " plt.plot(data_range[feature_name], predictions, label=f\"Tree #{tree_idx}\",\n", - " linestyle=\"--\", alpha=0.8)\n", + " plt.plot(\n", + " data_range[feature_name],\n", + " predictions,\n", + " label=f\"Tree #{tree_idx}\",\n", + " linestyle=\"--\",\n", + " alpha=0.8,\n", + " )\n", "\n", - "plt.plot(data_range[feature_name], forest_predictions, label=f\"Random forest\")\n", + "plt.plot(data_range[feature_name], forest_predictions, label=\"Random forest\")\n", "_ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")" ] } diff --git a/notebooks/ensemble_sol_03.ipynb b/notebooks/ensemble_sol_03.ipynb index 4ce1ad8d7..c4213984b 100644 --- a/notebooks/ensemble_sol_03.ipynb +++ b/notebooks/ensemble_sol_03.ipynb @@ -8,10 +8,10 @@ "\n", "The aim of this exercise is to:\n", "\n", - "* verifying if a random 
forest or a gradient-boosting decision tree overfit\n", - " if the number of estimators is not properly chosen;\n", - "* use the early-stopping strategy to avoid adding unnecessary trees, to\n", - " get the best generalization performances.\n", + "* verifying if a random forest or a gradient-boosting decision tree overfit if\n", + " the number of estimators is not properly chosen;\n", + "* use the early-stopping strategy to avoid adding unnecessary trees, to get\n", + " the best generalization performances.\n", "\n", "We will use the California housing dataset to conduct our experiments." ] @@ -28,7 +28,8 @@ "data, target = fetch_california_housing(return_X_y=True, as_frame=True)\n", "target *= 100 # rescale the target in k$\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0, test_size=0.5)" + " data, target, random_state=0, test_size=0.5\n", + ")" ] }, { @@ -112,7 +113,10 @@ " scoring=\"neg_mean_absolute_error\",\n", " n_jobs=2,\n", ")\n", - "gbdt_train_errors, gbdt_validation_errors = -gbdt_train_scores, -gbdt_validation_scores\n", + "gbdt_train_errors, gbdt_validation_errors = (\n", + " -gbdt_train_scores,\n", + " -gbdt_validation_scores,\n", + ")\n", "\n", "forest_train_scores, forest_validation_scores = validation_curve(\n", " forest,\n", @@ -191,8 +195,8 @@ "improving for several iterations, it will stop adding trees.\n", "\n", "Now, create a gradient-boosting model with `n_estimators=1_000`. This number\n", - "of trees will be too large. Change the parameter `n_iter_no_change` such\n", - "that the gradient boosting fitting will stop after adding 5 trees that do not\n", + "of trees will be too large. Change the parameter `n_iter_no_change` such that\n", + "the gradient boosting fitting will stop after adding 5 trees that do not\n", "improve the overall generalization performance." ] }, @@ -217,19 +221,18 @@ }, "source": [ "We see that the number of trees used is far below 1000 with the current\n", - "dataset. Training the gradient boosting model with the entire 1000 trees\n", - "would have been useless." + "dataset. Training the gradient boosting model with the entire 1000 trees would\n", + "have been useless." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Estimate the generalization performance of this model again using\n", - "the `sklearn.metrics.mean_absolute_error` metric but this time using\n", - "the test set that we held out at the beginning of the notebook.\n", - "Compare the resulting value with the values observed in the validation\n", - "curve." + "Estimate the generalization performance of this model again using the\n", + "`sklearn.metrics.mean_absolute_error` metric but this time using the test set\n", + "that we held out at the beginning of the notebook. Compare the resulting value\n", + "with the values observed in the validation curve." ] }, { @@ -240,6 +243,7 @@ "source": [ "# solution\n", "from sklearn.metrics import mean_absolute_error\n", + "\n", "error = mean_absolute_error(target_test, gbdt.predict(data_test))\n", "print(f\"On average, our GBDT regressor makes an error of {error:.2f} k$\")" ] diff --git a/notebooks/ensemble_sol_04.ipynb b/notebooks/ensemble_sol_04.ipynb index f91a775ef..01c8aac7a 100644 --- a/notebooks/ensemble_sol_04.ipynb +++ b/notebooks/ensemble_sol_04.ipynb @@ -7,9 +7,9 @@ "# \ud83d\udcc3 Solution for Exercise M6.04\n", "\n", "The aim of the exercise is to get familiar with the histogram\n", - "gradient-boosting in scikit-learn. 
Besides, we will use this model within\n", - "a cross-validation framework in order to inspect internal parameters found\n", - "via grid-search.\n", + "gradient-boosting in scikit-learn. Besides, we will use this model within a\n", + "cross-validation framework in order to inspect internal parameters found via\n", + "grid-search.\n", "\n", "We will use the California housing dataset." ] @@ -30,8 +30,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, create a histogram gradient boosting regressor. You can set the\n", - "trees number to be large, and configure the model to use early-stopping." + "First, create a histogram gradient boosting regressor. You can set the trees\n", + "number to be large, and configure the model to use early-stopping." ] }, { @@ -44,22 +44,23 @@ "from sklearn.ensemble import HistGradientBoostingRegressor\n", "\n", "hist_gbdt = HistGradientBoostingRegressor(\n", - " max_iter=1000, early_stopping=True, random_state=0)" + " max_iter=1000, early_stopping=True, random_state=0\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We will use a grid-search to find some optimal parameter for this model.\n", - "In this grid-search, you should search for the following parameters:\n", + "We will use a grid-search to find some optimal parameter for this model. In\n", + "this grid-search, you should search for the following parameters:\n", "\n", "* `max_depth: [3, 8]`;\n", "* `max_leaf_nodes: [15, 31]`;\n", "* `learning_rate: [0.1, 1]`.\n", "\n", - "Feel free to explore the space with additional values. Create the\n", - "grid-search providing the previous gradient boosting instance as the model." + "Feel free to explore the space with additional values. Create the grid-search\n", + "providing the previous gradient boosting instance as the model." ] }, { @@ -86,9 +87,9 @@ "source": [ "Finally, we will run our experiment through cross-validation. In this regard,\n", "define a 5-fold cross-validation. Besides, be sure to shuffle the data.\n", - "Subsequently, use the function `sklearn.model_selection.cross_validate`\n", - "to run the cross-validation. You should also set `return_estimator=True`,\n", - "so that we can investigate the inner model trained via cross-validation." + "Subsequently, use the function `sklearn.model_selection.cross_validate` to run\n", + "the cross-validation. You should also set `return_estimator=True`, so that we\n", + "can investigate the inner model trained via cross-validation." ] }, { @@ -103,15 +104,16 @@ "\n", "cv = KFold(n_splits=5, shuffle=True, random_state=0)\n", "results = cross_validate(\n", - " search, data, target, cv=cv, return_estimator=True, n_jobs=2)" + " search, data, target, cv=cv, return_estimator=True, n_jobs=2\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we got the cross-validation results, print out the mean and\n", - "standard deviation score." + "Now that we got the cross-validation results, print out the mean and standard\n", + "deviation score." 
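As a quick first look before the formal solution below (a sketch, assuming `results` is the output of the `cross_validate` call above): each estimator returned with `return_estimator=True` is itself a fitted `GridSearchCV`, so the parameters selected on each outer fold can be printed directly.

```python
# best_params_ holds the hyperparameters selected on each outer fold's
# training data by the inner grid-search.
for fold_idx, estimator in enumerate(results["estimator"]):
    print(f"Fold #{fold_idx}: {estimator.best_params_}")
```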
] }, { @@ -121,9 +123,11 @@ "outputs": [], "source": [ "# solution\n", - "print(f\"R2 score with cross-validation:\\n\"\n", - " f\"{results['test_score'].mean():.3f} \u00b1 \"\n", - " f\"{results['test_score'].std():.3f}\")" + "print(\n", + " \"R2 score with cross-validation:\\n\"\n", + " f\"{results['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{results['test_score'].std():.3f}\"\n", + ")" ] }, { @@ -184,7 +188,8 @@ " search_cv_results = pd.DataFrame(estimator.cv_results_)\n", " search_cv_results = search_cv_results[columns].set_index(index_columns)\n", " search_cv_results = search_cv_results.rename(\n", - " columns={\"mean_test_score\": f\"CV {cv_idx}\"})\n", + " columns={\"mean_test_score\": f\"CV {cv_idx}\"}\n", + " )\n", " inner_cv_results.append(search_cv_results)\n", "inner_cv_results = pd.concat(inner_cv_results, axis=1).T" ] @@ -205,8 +210,10 @@ "inner_cv_results.plot.box(vert=False, color=color)\n", "plt.xlabel(\"R2 score\")\n", "plt.ylabel(\"Parameters\")\n", - "_ = plt.title(\"Inner CV results with parameters\\n\"\n", - " \"(max_depth, max_leaf_nodes, learning_rate)\")" + "_ = plt.title(\n", + " \"Inner CV results with parameters\\n\"\n", + " \"(max_depth, max_leaf_nodes, learning_rate)\"\n", + ")" ] }, { @@ -217,10 +224,9 @@ ] }, "source": [ - "We see that the first 4 ranked set of parameters are very close.\n", - "We could select any of these 4 combinations.\n", - "It coincides with the results we observe when inspecting the\n", - "best parameters of the outer CV." + "We see that the first 4 ranked set of parameters are very close. We could\n", + "select any of these 4 combinations. It coincides with the results we observe\n", + "when inspecting the best parameters of the outer CV." ] } ], diff --git a/notebooks/feature_selection_ex_01.ipynb b/notebooks/feature_selection_ex_01.ipynb index 205217a7b..992aef924 100644 --- a/notebooks/feature_selection_ex_01.ipynb +++ b/notebooks/feature_selection_ex_01.ipynb @@ -7,17 +7,17 @@ "# \ud83d\udcdd Exercise 01\n", "\n", "The aim of this exercise is to highlight caveats to have in mind when using\n", - "feature selection. You have to be extremely careful regarding the set of\n", - "data on which you will compute the statistic that helps your feature\n", - "selection algorithm to decide which feature to select.\n", + "feature selection. You have to be extremely careful regarding the set of data\n", + "on which you will compute the statistic that helps your feature selection\n", + "algorithm to decide which feature to select.\n", "\n", "On purpose, we will make you program the wrong way of doing feature selection\n", "to gain insights.\n", "\n", "First, you will create a completely random dataset using NumPy. Using the\n", "function `np.random.randn`, generate a matrix `data` containing 100 samples\n", - "and 100,000 features. Then, using the function `np.random.randint`, generate\n", - "a vector `target` with 100 samples containing either 0 or 1.\n", + "and 100,000 features. Then, using the function `np.random.randint`, generate a\n", + "vector `target` with 100 samples containing either 0 or 1.\n", "\n", "This type of dimensionality is typical in bioinformatics when dealing with\n", "RNA-seq. However, we will use completely randomized features such that we\n", @@ -41,8 +41,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, create a logistic regression model and use cross-validation to check\n", - "the score of such a model. 
It will allow use to confirm that our model cannot\n", + "Now, create a logistic regression model and use cross-validation to check the\n", + "score of such a model. It will allow use to confirm that our model cannot\n", "predict anything meaningful from random data." ] }, @@ -62,8 +62,8 @@ "Now, we will ask you to program the **wrong** pattern to select feature.\n", "Select the feature by using the entire dataset. We will choose ten features\n", "with the highest ANOVA F-score computed on the full dataset. Subsequently,\n", - "subsample the dataset `data` by selecting the features' subset. Finally,\n", - "train and test a logistic regression model.\n", + "subsample the dataset `data` by selecting the features' subset. Finally, train\n", + "and test a logistic regression model.\n", "\n", "You should get some surprising results." ] @@ -74,6 +74,8 @@ "metadata": {}, "outputs": [], "source": [ + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "\n", "# Write your code here." ] }, @@ -93,6 +95,8 @@ "metadata": {}, "outputs": [], "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", "# Write your code here." ] }, @@ -100,11 +104,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "However, the previous case is not perfect. For instance, if we were asking\n", - "to perform cross-validation, the manual `fit`/`transform` of the datasets\n", - "will make our life hard. Indeed, the solution here is to use a scikit-learn\n", - "pipeline in which the feature selection will be a pre processing stage\n", - "before to train the model.\n", + "However, the previous case is not perfect. For instance, if we were asking to\n", + "perform cross-validation, the manual `fit`/`transform` of the datasets will\n", + "make our life hard. Indeed, the solution here is to use a scikit-learn\n", + "pipeline in which the feature selection will be a pre processing stage before\n", + "to train the model.\n", "\n", "Thus, start by creating a pipeline with the feature selector and the logistic\n", "regression. Then, use cross-validation to get an estimate of the uncertainty\n", @@ -117,13 +121,14 @@ "metadata": {}, "outputs": [], "source": [ + "from sklearn.pipeline import make_pipeline\n", + "\n", "# Write your code here." ] } ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/feature_selection_introduction.ipynb b/notebooks/feature_selection_introduction.ipynb index d99591575..63c036914 100644 --- a/notebooks/feature_selection_introduction.ipynb +++ b/notebooks/feature_selection_introduction.ipynb @@ -6,12 +6,12 @@ "source": [ "# Benefits of using feature selection\n", "\n", - "In this notebook, we aim at introducing the main benefits that can be\n", - "gained when using feature selection.\n", + "In this notebook, we aim at introducing the main benefits that can be gained\n", + "when using feature selection.\n", "\n", "Indeed, the principal advantage of selecting features within a machine\n", - "learning pipeline is to reduce the time to train this pipeline and its time\n", - "to predict. We will give an example to highlights these advantages. First, we\n", + "learning pipeline is to reduce the time to train this pipeline and its time to\n", + "predict. We will give an example to highlights these advantages. First, we\n", "generate a synthetic dataset to control the number of features that will be\n", "informative, redundant, repeated, and random." 
] @@ -44,12 +44,12 @@ "\n", "We will create two machine learning pipelines. The former will be a random\n", "forest that will use all available features. The latter will also be a random\n", - "forest, but we will add a feature selection step to train this classifier.\n", - "The feature selection is based on a univariate test (ANOVA F-value) between\n", - "each feature and the target that we want to predict. The features with the\n", - "two most significant scores are selected.\n", + "forest, but we will add a feature selection step to train this classifier. The\n", + "feature selection is based on a univariate test (ANOVA F-value) between each\n", + "feature and the target that we want to predict. The features with the two most\n", + "significant scores are selected.\n", "\n", - "Let's create the model without any feature selection" + "Let's create the model without any feature selection:" ] }, { @@ -93,8 +93,8 @@ "metadata": {}, "source": [ "We will measure the average time spent to train each pipeline and make it\n", - "predict. Besides, we will compute the testing score of the model. We\n", - "will collect these results via cross-validation.\n", + "predict. Besides, we will compute the testing score of the model. We will\n", + "collect these results via cross-validation.\n", "\n", "Let's start with the random forest without feature selection. We will store\n", "the results into a dataframe." @@ -109,8 +109,9 @@ "import pandas as pd\n", "from sklearn.model_selection import cross_validate\n", "\n", - "cv_results_without_selection = cross_validate(model_without_selection, data,\n", - " target)\n", + "cv_results_without_selection = cross_validate(\n", + " model_without_selection, data, target\n", + ")\n", "cv_results_without_selection = pd.DataFrame(cv_results_without_selection)" ] }, @@ -129,7 +130,8 @@ "outputs": [], "source": [ "cv_results_with_selection = cross_validate(\n", - " model_with_selection, data, target, return_estimator=True)\n", + " model_with_selection, data, target, return_estimator=True\n", + ")\n", "cv_results_with_selection = pd.DataFrame(cv_results_with_selection)" ] }, @@ -137,8 +139,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To analyze the results, we will merge the results from the two pipeline in\n", - "a single pandas dataframe." + "To analyze the results, we will merge the results from the two pipeline in a\n", + "single pandas dataframe." ] }, { @@ -195,8 +197,8 @@ "We can draw the same conclusions for both training and scoring elapsed time:\n", "selecting the most informative features speed-up our pipeline.\n", "\n", - "Of course, such speed-up is beneficial only if the generalization performance in\n", - "terms of metrics remain the same. Let's check the testing score." + "Of course, such speed-up is beneficial only if the generalization performance\n", + "in terms of metrics remain the same. Let's check the testing score." ] }, { @@ -214,8 +216,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can observe that the model's generalization performance selecting a subset of\n", - "features decreases compared with the model using all available features.\n", + "We can observe that the model's generalization performance selecting a subset\n", + "of features decreases compared with the model using all available features.\n", "Since we generated the dataset, we can infer that the decrease is because of\n", "the selection. 
The feature selection algorithm did not choose the two\n", "informative features.\n", @@ -243,8 +245,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the feature `53` is always selected while the other feature\n", - "varies depending on the cross-validation fold.\n", + "We see that the feature `53` is always selected while the other feature varies\n", + "depending on the cross-validation fold.\n", "\n", "If we would like to keep our score with similar generalization performance, we\n", "could choose another metric to perform the test or select more features. For\n", @@ -255,12 +257,12 @@ "harder.\n", "\n", "Therefore, we could come with a much more complicated procedure that could\n", - "tune (via cross-validation) the number of selected features and change\n", - "the way feature is selected (e.g. using a machine-learning model). However,\n", - "going towards these solutions alienates the feature selection's primary\n", - "purpose to get a significant train/test speed-up. Also, if the primary goal\n", - "was to get a more performant model, performant models exclude non-informative\n", - "features natively." + "tune (via cross-validation) the number of selected features and change the way\n", + "feature is selected (e.g. using a machine-learning model). However, going\n", + "towards these solutions alienates the feature selection's primary purpose to\n", + "get a significant train/test speed-up. Also, if the primary goal was to get a\n", + "more performant model, performant models exclude non-informative features\n", + "natively." ] } ], diff --git a/notebooks/feature_selection_limitation_model.ipynb b/notebooks/feature_selection_limitation_model.ipynb index 8604c05b5..a8e25c700 100644 --- a/notebooks/feature_selection_limitation_model.ipynb +++ b/notebooks/feature_selection_limitation_model.ipynb @@ -60,8 +60,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will evaluate this model by a k-fold cross validation and store the\n", - "results in a pandas dataframe." + "We will evaluate this model by a k-fold cross validation and store the results\n", + "in a pandas dataframe." ] }, { @@ -74,7 +74,8 @@ "from sklearn.model_selection import cross_validate\n", "\n", "cv_results_without_selection = cross_validate(\n", - " model_without_selection, data, target, cv=5)\n", + " model_without_selection, data, target, cv=5\n", + ")\n", "cv_results_without_selection = pd.DataFrame(cv_results_without_selection)" ] }, @@ -82,8 +83,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we will build another model which will include a feature selection\n", - "step based on a random forest and evaluate it as well with cross-validation." + "Then, we will build another model which will include a feature selection step\n", + "based on a random forest and evaluate it as well with cross-validation." 
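As background for the next cell, an illustrative aside on synthetic data (not the dataset used in this notebook): `SelectFromModel` fits the estimator it is given and keeps the features whose importances exceed a threshold, the mean importance by default.

```python
# With a forest as the importance estimator, the features above the mean
# importance survive, typically the informative ones in this toy setup.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(
    n_samples=200, n_features=10, n_informative=2, n_redundant=0, random_state=0
)
selector = SelectFromModel(RandomForestClassifier(random_state=0)).fit(X, y)
print("Selected feature indices:", selector.get_support(indices=True))
```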
] }, { @@ -97,7 +98,8 @@ "\n", "feature_selector = SelectFromModel(RandomForestClassifier())\n", "model_with_selection = make_pipeline(\n", - " feature_selector, RandomForestClassifier())" + " feature_selector, RandomForestClassifier()\n", + ")" ] }, { @@ -106,8 +108,9 @@ "metadata": {}, "outputs": [], "source": [ - "cv_results_with_selection = cross_validate(model_with_selection, data, target,\n", - " cv=5)\n", + "cv_results_with_selection = cross_validate(\n", + " model_with_selection, data, target, cv=5\n", + ")\n", "cv_results_with_selection = pd.DataFrame(cv_results_with_selection)" ] }, @@ -115,8 +118,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can compare the testing score of the two models. For this matter,\n", - "we are combining results in a single dataframe." + "We can compare the testing score of the two models. For this matter, we are\n", + "combining results in a single dataframe." ] }, { @@ -157,8 +160,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The model that selected a subset of feature is less performant than a\n", - "random forest fitted on the full dataset.\n", + "The model that selected a subset of feature is less performant than a random\n", + "forest fitted on the full dataset.\n", "\n", "We can rely on some aspects tackled in the notebook presenting the model\n", "inspection to explain this behaviour. The decision tree's relative feature\n", diff --git a/notebooks/feature_selection_sol_01.ipynb b/notebooks/feature_selection_sol_01.ipynb index 9c8c28a38..28dd237d4 100644 --- a/notebooks/feature_selection_sol_01.ipynb +++ b/notebooks/feature_selection_sol_01.ipynb @@ -7,17 +7,17 @@ "# \ud83d\udcc3 Solution for Exercise 01\n", "\n", "The aim of this exercise is to highlight caveats to have in mind when using\n", - "feature selection. You have to be extremely careful regarding the set of\n", - "data on which you will compute the statistic that helps your feature\n", - "selection algorithm to decide which feature to select.\n", + "feature selection. You have to be extremely careful regarding the set of data\n", + "on which you will compute the statistic that helps your feature selection\n", + "algorithm to decide which feature to select.\n", "\n", "On purpose, we will make you program the wrong way of doing feature selection\n", "to gain insights.\n", "\n", "First, you will create a completely random dataset using NumPy. Using the\n", "function `np.random.randn`, generate a matrix `data` containing 100 samples\n", - "and 100,000 features. Then, using the function `np.random.randint`, generate\n", - "a vector `target` with 100 samples containing either 0 or 1.\n", + "and 100,000 features. Then, using the function `np.random.randint`, generate a\n", + "vector `target` with 100 samples containing either 0 or 1.\n", "\n", "This type of dimensionality is typical in bioinformatics when dealing with\n", "RNA-seq. However, we will use completely randomized features such that we\n", @@ -43,8 +43,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, create a logistic regression model and use cross-validation to check\n", - "the score of such a model. It will allow use to confirm that our model cannot\n", + "Now, create a logistic regression model and use cross-validation to check the\n", + "score of such a model. It will allow use to confirm that our model cannot\n", "predict anything meaningful from random data." 
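A useful complement (a sketch, assuming `data` and `target` are the random arrays generated earlier in this notebook) is a dummy baseline: it defines the chance level that any correctly evaluated model should match on pure noise.

```python
# A classifier that ignores the inputs entirely gives the chance-level
# accuracy to compare the logistic regression against.
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

dummy = DummyClassifier(strategy="most_frequent")
scores = cross_val_score(dummy, data, target)
print(f"Dummy accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
```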
] }, @@ -83,8 +83,8 @@ "Now, we will ask you to program the **wrong** pattern to select feature.\n", "Select the feature by using the entire dataset. We will choose ten features\n", "with the highest ANOVA F-score computed on the full dataset. Subsequently,\n", - "subsample the dataset `data` by selecting the features' subset. Finally,\n", - "train and test a logistic regression model.\n", + "subsample the dataset `data` by selecting the features' subset. Finally, train\n", + "and test a logistic regression model.\n", "\n", "You should get some surprising results." ] @@ -95,7 +95,6 @@ "metadata": {}, "outputs": [], "source": [ - "# solution\n", "from sklearn.feature_selection import SelectKBest, f_classif\n", "\n", "# solution\n", @@ -113,9 +112,9 @@ ] }, "source": [ - "Surprisingly, the logistic regression succeeded in having a fantastic\n", - "accuracy using data that did not have any link with the target in the first\n", - "place. We therefore know that these results are not legit.\n", + "Surprisingly, the logistic regression succeeded in having a fantastic accuracy\n", + "using data that did not have any link with the target in the first place. We\n", + "therefore know that these results are not legit.\n", "\n", "The reasons for obtaining these results are two folds: the pool of available\n", "features is large compared to the number of samples. It is possible to find a\n", @@ -140,12 +139,12 @@ "metadata": {}, "outputs": [], "source": [ - "# solution\n", "from sklearn.model_selection import train_test_split\n", "\n", "# solution\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0)\n", + " data, target, random_state=0\n", + ")\n", "feature_selector.fit(data_train, target_train)\n", "data_train_subset = feature_selector.transform(data_train)\n", "data_test_subset = feature_selector.transform(data_test)\n", @@ -166,20 +165,20 @@ "features only on the training set will not help when testing our model. In\n", "this case, we obtained the expected results.\n", "\n", - "Therefore, as with hyperparameters optimization or model selection, tuning\n", - "the feature space should be done solely on the training set, keeping a part\n", - "of the data left-out.\n" + "Therefore, as with hyperparameters optimization or model selection, tuning the\n", + "feature space should be done solely on the training set, keeping a part of the\n", + "data left-out.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "However, the previous case is not perfect. For instance, if we were asking\n", - "to perform cross-validation, the manual `fit`/`transform` of the datasets\n", - "will make our life hard. Indeed, the solution here is to use a scikit-learn\n", - "pipeline in which the feature selection will be a pre processing stage\n", - "before to train the model.\n", + "However, the previous case is not perfect. For instance, if we were asking to\n", + "perform cross-validation, the manual `fit`/`transform` of the datasets will\n", + "make our life hard. Indeed, the solution here is to use a scikit-learn\n", + "pipeline in which the feature selection will be a pre processing stage before\n", + "to train the model.\n", "\n", "Thus, start by creating a pipeline with the feature selector and the logistic\n", "regression. 
Then, use cross-validation to get an estimate of the uncertainty\n", @@ -192,7 +191,6 @@ "metadata": {}, "outputs": [], "source": [ - "# solution\n", "from sklearn.pipeline import make_pipeline\n", "\n", "# solution\n", @@ -209,9 +207,9 @@ ] }, "source": [ - "We see that using a scikit-learn pipeline removes a lot of boilerplate\n", - "code and helps avoid mistakes when calling `fit` and `transform` on the\n", - "different set of data." + "We see that using a scikit-learn pipeline removes a lot of boilerplate code\n", + "and helps avoid mistakes when calling `fit` and `transform` on the different\n", + "set of data." ] } ], diff --git a/notebooks/linear_models_ex_01.ipynb b/notebooks/linear_models_ex_01.ipynb index 0b06d7f95..3d85324d3 100644 --- a/notebooks/linear_models_ex_01.ipynb +++ b/notebooks/linear_models_ex_01.ipynb @@ -75,12 +75,12 @@ "source": [ "## Main exercise\n", "\n", - "Define a vector `weights = [...]` and a vector `intercepts = [...]` of\n", - "the same length. Each pair of entries `(weights[i], intercepts[i])` tags a\n", + "Define a vector `weights = [...]` and a vector `intercepts = [...]` of the\n", + "same length. Each pair of entries `(weights[i], intercepts[i])` tags a\n", "different model. Use these vectors along with the vector\n", - "`flipper_length_range` to plot several linear models that could possibly\n", - "fit our data. Use the above helper function to visualize both the models and\n", - "the real samples." + "`flipper_length_range` to plot several linear models that could possibly fit\n", + "our data. Use the above helper function to visualize both the models and the\n", + "real samples." ] }, { @@ -109,9 +109,9 @@ "lines_to_next_cell": 2 }, "source": [ - "In the previous question, you were asked to create several linear models.\n", - "The visualization allowed you to qualitatively assess if a model was better\n", - "than another.\n", + "In the previous question, you were asked to create several linear models. 
The\n", + "visualization allowed you to qualitatively assess if a model was better than\n", + "another.\n", "\n", "Now, you should come up with a quantitative measure which indicates the\n", "goodness of fit of each linear model and allows you to select the best model.\n", @@ -133,8 +133,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can now copy and paste the code below to show the goodness of fit for\n", - "each model.\n", + "You can now copy and paste the code below to show the goodness of fit for each\n", + "model.\n", "\n", "```python\n", "for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)):\n", @@ -157,7 +157,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/linear_models_ex_02.ipynb b/notebooks/linear_models_ex_02.ipynb index 8ce970908..c9c0aad96 100644 --- a/notebooks/linear_models_ex_02.ipynb +++ b/notebooks/linear_models_ex_02.ipynb @@ -26,16 +26,17 @@ "outputs": [], "source": [ "import numpy as np\n", + "\n", "# Set the seed for reproduction\n", "rng = np.random.RandomState(0)\n", "\n", "# Generate data\n", "n_sample = 100\n", "data_max, data_min = 1.4, -1.4\n", - "len_data = (data_max - data_min)\n", + "len_data = data_max - data_min\n", "data = rng.rand(n_sample) * len_data - len_data / 2\n", - "noise = rng.randn(n_sample) * .3\n", - "target = data ** 3 - 0.5 * data ** 2 + noise" + "noise = rng.randn(n_sample) * 0.3\n", + "target = data**3 - 0.5 * data**2 + noise" ] }, { @@ -56,31 +57,33 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "full_data = pd.DataFrame({\"data\": data, \"target\": target})" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "\n", - "_ = sns.scatterplot(data=full_data, x=\"data\", y=\"target\", color=\"black\",\n", - " alpha=0.5)" + "_ = sns.scatterplot(\n", + " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", + ")" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "We observe that the link between the data `data` and vector `target` is\n", - "non-linear. For instance, `data` could represent the years of\n", - "experience (normalized) and `target` the salary (normalized). Therefore, the\n", - "problem here would be to infer the salary given the years of experience.\n", + "non-linear. For instance, `data` could represent the years of experience\n", + "(normalized) and `target` the salary (normalized). Therefore, the problem here\n", + "would be to infer the salary given the years of experience.\n", "\n", "Using the function `f` defined below, find both the `weight` and the\n", "`intercept` that you think will lead to a good linear model. 
Plot both the\n", @@ -90,9 +93,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "def f(data, weight=0, intercept=0):\n", @@ -188,7 +189,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/linear_models_ex_03.ipynb b/notebooks/linear_models_ex_03.ipynb index 80c5d87d8..4cf750e81 100644 --- a/notebooks/linear_models_ex_03.ipynb +++ b/notebooks/linear_models_ex_03.ipynb @@ -13,11 +13,10 @@ "The aim of this notebook is to train a linear regression algorithm on a\n", "dataset with more than a single feature.\n", "\n", - "We will load a dataset about house prices in California.\n", - "The dataset consists of 8 features regarding the demography and geography of\n", - "districts in California and the aim is to predict the median house price of\n", - "each district. We will use all 8 features to predict the target, the median\n", - "house price." + "We will load a dataset about house prices in California. The dataset consists\n", + "of 8 features regarding the demography and geography of districts in\n", + "California and the aim is to predict the median house price of each district.\n", + "We will use all 8 features to predict the target, the median house price." ] }, { @@ -48,8 +47,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now it is your turn to train a linear regression model on this dataset.\n", - "First, create a linear regression model." + "Now it is your turn to train a linear regression model on this dataset. First,\n", + "create a linear regression model." ] }, { @@ -65,8 +64,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Execute a cross-validation with 10 folds and use the mean absolute error\n", - "(MAE) as metric. Be sure to *return* the fitted *estimators*." + "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", + "as metric. Be sure to *return* the fitted *estimators*." ] }, { @@ -101,8 +100,8 @@ }, "source": [ "Inspect the fitted model using a box plot to show the distribution of values\n", - "for the coefficients returned from the cross-validation. Hint:\n", - "use the function\n", + "for the coefficients returned from the cross-validation. Hint: use the\n", + "function\n", "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", "to create a box plot." ] @@ -119,7 +118,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/linear_models_ex_04.ipynb b/notebooks/linear_models_ex_04.ipynb index 9d7107377..77086778b 100644 --- a/notebooks/linear_models_ex_04.ipynb +++ b/notebooks/linear_models_ex_04.ipynb @@ -6,8 +6,8 @@ "source": [ "# \ud83d\udcdd Exercise M4.04\n", "\n", - "In the previous notebook, we saw the effect of applying some regularization\n", - "on the coefficient of a linear model.\n", + "In the previous notebook, we saw the effect of applying some regularization on\n", + "the coefficient of a linear model.\n", "\n", "In this exercise, we will study the advantage of using some regularization\n", "when dealing with correlated features.\n", @@ -39,8 +39,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When creating the dataset, `make_regression` returns the true coefficient\n", - "used to generate the dataset. Let's plot this information." 
+ "When creating the dataset, `make_regression` returns the true coefficient used\n", + "to generate the dataset. Let's plot this information." ] }, { @@ -67,9 +67,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a `LinearRegression` regressor and fit on the entire dataset and\n", - "check the value of the coefficients. Are the coefficients of the linear\n", - "regressor close to the coefficients used to generate the dataset?" + "Create a `LinearRegression` regressor and fit on the entire dataset and check\n", + "the value of the coefficients. Are the coefficients of the linear regressor\n", + "close to the coefficients used to generate the dataset?" ] }, { @@ -103,8 +103,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Fit again the linear regressor on this new dataset and check the\n", - "coefficients. What do you observe?" + "Fit again the linear regressor on this new dataset and check the coefficients.\n", + "What do you observe?" ] }, { @@ -153,7 +153,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/linear_models_ex_05.ipynb b/notebooks/linear_models_ex_05.ipynb index 71c8c8567..866d52086 100644 --- a/notebooks/linear_models_ex_05.ipynb +++ b/notebooks/linear_models_ex_05.ipynb @@ -5,13 +5,14 @@ "metadata": {}, "source": [ "# \ud83d\udcdd Exercise M4.05\n", + "\n", "In the previous notebook we set `penalty=\"none\"` to disable regularization\n", - "entirely. This parameter can also control the **type** of regularization to use,\n", - "whereas the regularization **strength** is set using the parameter `C`.\n", - "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`.\n", - "In this exercise, we ask you to train a logistic regression classifier using the\n", - "`penalty=\"l2\"` regularization (which happens to be the default in scikit-learn)\n", - "to find by yourself the effect of the parameter `C`.\n", + "entirely. This parameter can also control the **type** of regularization to\n", + "use, whereas the regularization **strength** is set using the parameter `C`.\n", + "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n", + "this exercise, we ask you to train a logistic regression classifier using the\n", + "`penalty=\"l2\"` regularization (which happens to be the default in\n", + "scikit-learn) to find by yourself the effect of the parameter `C`.\n", "\n", "We will start by loading the dataset." 
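Before loading the data, the claim that `penalty="none"` behaves like an infinitely large `C` can be verified on synthetic data. A standalone sketch (note that recent scikit-learn versions spell this `penalty=None`, and that the labels are deliberately made noisy so the unpenalized problem stays well posed):

```python
# With a huge C the L2 penalty becomes negligible, so the coefficients should
# closely match the unpenalized fit.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    flip_y=0.2,  # label noise keeps the classes non-separable
    random_state=0,
)

no_penalty = LogisticRegression(penalty="none", max_iter=1_000).fit(X, y)
weak_penalty = LogisticRegression(C=1e8, max_iter=1_000).fit(X, y)
print(no_penalty.coef_)
print(weak_penalty.coef_)
```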
] @@ -37,8 +38,9 @@ "\n", "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", "# only keep the Adelie and Chinstrap classes\n", - "penguins = penguins.set_index(\"Species\").loc[\n", - " [\"Adelie\", \"Chinstrap\"]].reset_index()\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", "\n", "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", "target_column = \"Species\"" @@ -79,7 +81,8 @@ "from sklearn.linear_model import LogisticRegression\n", "\n", "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"l2\"))" + " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", + ")" ] }, { @@ -122,7 +125,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/linear_models_regularization.ipynb b/notebooks/linear_models_regularization.ipynb index 15828c86b..ff52b4849 100644 --- a/notebooks/linear_models_regularization.ipynb +++ b/notebooks/linear_models_regularization.ipynb @@ -9,12 +9,12 @@ "In this notebook, we will see the limitations of linear regression models and\n", "the advantage of using regularized models instead.\n", "\n", - "Besides, we will also present the preprocessing required when dealing\n", - "with regularized models, furthermore when the regularization parameter\n", - "needs to be tuned.\n", + "Besides, we will also present the preprocessing required when dealing with\n", + "regularized models, furthermore when the regularization parameter needs to be\n", + "tuned.\n", "\n", - "We will start by highlighting the over-fitting issue that can arise with\n", - "a simple linear regression model.\n", + "We will start by highlighting the over-fitting issue that can arise with a\n", + "simple linear regression model.\n", "\n", "## Effect of regularization\n", "\n", @@ -55,10 +55,9 @@ "We showed that one can use the `PolynomialFeatures` transformer to create\n", "additional features encoding non-linear interactions between features.\n", "\n", - "Here, we will use this transformer to augment the feature space.\n", - "Subsequently, we will train a linear regression model. We will use the\n", - "out-of-sample test set to evaluate the generalization capabilities of our\n", - "model." + "Here, we will use this transformer to augment the feature space. Subsequently,\n", + "we will train a linear regression model. We will use the out-of-sample test\n", + "set to evaluate the generalization capabilities of our model." 
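
To make "augmenting the feature space" concrete, here is a small sketch of what `PolynomialFeatures(degree=2)` generates for a single two-feature sample (the values and feature names are toy assumptions for illustration):

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2.0, 3.0]])  # one sample with two features
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(X))
# [[1. 2. 3. 4. 6. 9.]] -> bias, x1, x2, x1**2, x1 * x2, x2**2
print(poly.get_feature_names_out(["x1", "x2"]))
```

The number of generated columns grows quickly with the degree and the number of original features, which is precisely what gives the linear model enough flexibility to overfit.
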
] }, { @@ -72,12 +71,18 @@ "from sklearn.preprocessing import PolynomialFeatures\n", "from sklearn.linear_model import LinearRegression\n", "\n", - "linear_regression = make_pipeline(PolynomialFeatures(degree=2),\n", - " LinearRegression())\n", - "cv_results = cross_validate(linear_regression, data, target,\n", - " cv=10, scoring=\"neg_mean_squared_error\",\n", - " return_train_score=True,\n", - " return_estimator=True)" + "linear_regression = make_pipeline(\n", + " PolynomialFeatures(degree=2), LinearRegression()\n", + ")\n", + "cv_results = cross_validate(\n", + " linear_regression,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " scoring=\"neg_mean_squared_error\",\n", + " return_train_score=True,\n", + " return_estimator=True,\n", + ")" ] }, { @@ -95,8 +100,10 @@ "outputs": [], "source": [ "train_error = -cv_results[\"train_score\"]\n", - "print(f\"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the train set:\\n\"\n", + " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + ")" ] }, { @@ -106,8 +113,10 @@ "outputs": [], "source": [ "test_error = -cv_results[\"test_score\"]\n", - "print(f\"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the test set:\\n\"\n", + " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + ")" ] }, { @@ -118,17 +127,16 @@ "gap between the training and testing score is an indication that our model\n", "overfitted our training set.\n", "\n", - "Indeed, this is one of the danger when augmenting the number of features\n", - "with a `PolynomialFeatures` transformer. Our model will focus on some\n", - "specific features. We can check the weights of the model to have a\n", - "confirmation. Let's create a dataframe: the columns will contain the name\n", - "of the feature while the line the coefficients values stored by each model\n", - "during the cross-validation.\n", + "Indeed, this is one of the danger when augmenting the number of features with\n", + "a `PolynomialFeatures` transformer. Our model will focus on some specific\n", + "features. We can check the weights of the model to have a confirmation. Let's\n", + "create a dataframe: the columns will contain the name of the feature while the\n", + "line the coefficients values stored by each model during the cross-validation.\n", "\n", "Since we used a `PolynomialFeatures` to augment the data, we will create\n", - "feature names representative of the feature combination. Scikit-learn\n", - "provides a `get_feature_names_out` method for this purpose. First, let's get\n", - "the first fitted model from the cross-validation." + "feature names representative of the feature combination. Scikit-learn provides\n", + "a `get_feature_names_out` method for this purpose. First, let's get the first\n", + "fitted model from the cross-validation." 
] }, { @@ -145,7 +153,7 @@ "metadata": {}, "source": [ "Now, we can access to the fitted `PolynomialFeatures` to generate the feature\n", - "names" + "names:" ] }, { @@ -155,7 +163,8 @@ "outputs": [], "source": [ "feature_names = model_first_fold[0].get_feature_names_out(\n", - " input_features=data.columns)\n", + " input_features=data.columns\n", + ")\n", "feature_names" ] }, @@ -216,12 +225,16 @@ "source": [ "from sklearn.linear_model import Ridge\n", "\n", - "ridge = make_pipeline(PolynomialFeatures(degree=2),\n", - " Ridge(alpha=100))\n", - "cv_results = cross_validate(ridge, data, target,\n", - " cv=10, scoring=\"neg_mean_squared_error\",\n", - " return_train_score=True,\n", - " return_estimator=True)" + "ridge = make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=100))\n", + "cv_results = cross_validate(\n", + " ridge,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " scoring=\"neg_mean_squared_error\",\n", + " return_train_score=True,\n", + " return_estimator=True,\n", + ")" ] }, { @@ -242,8 +255,10 @@ "outputs": [], "source": [ "train_error = -cv_results[\"train_score\"]\n", - "print(f\"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the train set:\\n\"\n", + " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + ")" ] }, { @@ -253,8 +268,10 @@ "outputs": [], "source": [ "test_error = -cv_results[\"test_score\"]\n", - "print(f\"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the test set:\\n\"\n", + " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + ")" ] }, { @@ -302,30 +319,28 @@ "## Feature scaling and regularization\n", "\n", "On the one hand, weights define the link between feature values and the\n", - "predicted target.\n", - "On the other hand, regularization adds constraints on the weights of the\n", - "model through the `alpha` parameter. Therefore, the effect that feature\n", - "rescaling has on the final weights also interacts with regularization.\n", + "predicted target. On the other hand, regularization adds constraints on the\n", + "weights of the model through the `alpha` parameter. Therefore, the effect that\n", + "feature rescaling has on the final weights also interacts with regularization.\n", "\n", - "Let's consider the case where features live on the same scale/units: if\n", - "two features are found to be equally important by the model, they will be\n", - "affected similarly by regularization strength.\n", + "Let's consider the case where features live on the same scale/units: if two\n", + "features are found to be equally important by the model, they will be affected\n", + "similarly by regularization strength.\n", "\n", - "Now, let's consider the scenario where features have completely different\n", - "data scale (for instance age in years and annual revenue in dollars).\n", - "If two features are as important, our model will boost the weights of\n", - "features with small scale and reduce the weights of features with\n", - "high scale.\n", + "Now, let's consider the scenario where features have completely different data\n", + "scale (for instance age in years and annual revenue in dollars). 
If two\n", + "features are as important, our model will boost the weights of features with\n", + "small scale and reduce the weights of features with high scale.\n", "\n", "We recall that regularization forces weights to be closer. Therefore, we get\n", - "an intuition that if we want to use regularization, dealing with rescaled\n", - "data would make it easier to find an optimal regularization parameter and\n", - "thus an adequate model.\n", + "an intuition that if we want to use regularization, dealing with rescaled data\n", + "would make it easier to find an optimal regularization parameter and thus an\n", + "adequate model.\n", "\n", "As a side note, some solvers based on gradient computation are expecting such\n", "rescaled data. Unscaled data will be detrimental when computing the optimal\n", - "weights. Therefore, when working with a linear model and numerical data, it\n", - "is generally good practice to scale the data.\n", + "weights. Therefore, when working with a linear model and numerical data, it is\n", + "generally good practice to scale the data.\n", "\n", "Thus, we will add a `StandardScaler` in the machine learning pipeline. This\n", "scaler will be placed just before the regressor." @@ -339,12 +354,18 @@ "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", - "ridge = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(),\n", - " Ridge(alpha=0.5))\n", - "cv_results = cross_validate(ridge, data, target,\n", - " cv=10, scoring=\"neg_mean_squared_error\",\n", - " return_train_score=True,\n", - " return_estimator=True)" + "ridge = make_pipeline(\n", + " PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=0.5)\n", + ")\n", + "cv_results = cross_validate(\n", + " ridge,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " scoring=\"neg_mean_squared_error\",\n", + " return_train_score=True,\n", + " return_estimator=True,\n", + ")" ] }, { @@ -354,8 +375,10 @@ "outputs": [], "source": [ "train_error = -cv_results[\"train_score\"]\n", - "print(f\"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the train set:\\n\"\n", + " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + ")" ] }, { @@ -365,8 +388,10 @@ "outputs": [], "source": [ "test_error = -cv_results[\"test_score\"]\n", - "print(f\"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the test set:\\n\"\n", + " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + ")" ] }, { @@ -408,8 +433,8 @@ "Compare to the previous plots, we see that now all weight magnitudes are\n", "closer and that all features are more equally contributing.\n", "\n", - "In the previous example, we fixed `alpha=0.5`. We will now check the impact\n", - "of the value of `alpha` by increasing its value." + "In the previous example, we fixed `alpha=0.5`. We will now check the impact of\n", + "the value of `alpha` by increasing its value." 
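
Before increasing `alpha`, the interaction between feature scale and a single regularization strength can be made concrete with a sketch: two synthetic features built to be equally informative but expressed in very different units (the years/dollars scenario above; all numbers are made up for the illustration).

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
age = rng.uniform(20, 60, size=300)               # years
revenue = rng.uniform(20_000, 200_000, size=300)  # dollars
# Both features contribute equally once expressed in standardized units.
y = (age - age.mean()) / age.std() + (revenue - revenue.mean()) / revenue.std()
X = np.column_stack([age, revenue])

print("raw data:   ", Ridge(alpha=1.0).fit(X, y).coef_)
scaled_ridge = make_pipeline(StandardScaler(), Ridge(alpha=1.0)).fit(X, y)
print("scaled data:", scaled_ridge[-1].coef_)
```

On the raw data the two coefficients differ by several orders of magnitude even though the features carry the same information; after standardization they come out comparable, so a single `alpha` shrinks them evenly.
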
] }, { @@ -418,12 +443,18 @@ "metadata": {}, "outputs": [], "source": [ - "ridge = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(),\n", - " Ridge(alpha=1_000_000))\n", - "cv_results = cross_validate(ridge, data, target,\n", - " cv=10, scoring=\"neg_mean_squared_error\",\n", - " return_train_score=True,\n", - " return_estimator=True)" + "ridge = make_pipeline(\n", + " PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=1_000_000)\n", + ")\n", + "cv_results = cross_validate(\n", + " ridge,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " scoring=\"neg_mean_squared_error\",\n", + " return_train_score=True,\n", + " return_estimator=True,\n", + ")" ] }, { @@ -461,9 +492,9 @@ "OneHotEncoder since the feature values are already on a similar scale.
</p>\n",
    "<p class=\"last\">However, this choice can be questioned since scaling interacts with\n",
    "regularization as well. For instance, scaling categorical features that are\n",
-    "imbalanced (e.g. more occurrences of a specific category) would even out\n",
-    "the impact of regularization to each category. However, scaling such features\n",
-    "in the presence of rare categories could be problematic (i.e. division by a very\n",
+    "imbalanced (e.g. more occurrences of a specific category) would even out the\n",
+    "impact of regularization to each category. However, scaling such features in\n",
+    "the presence of rare categories could be problematic (i.e. division by a very\n",
    "small standard deviation) and it can therefore introduce numerical issues.</p>\n",
    "</div>
\n", "\n", @@ -471,8 +502,8 @@ "an effect on the performance. We chose the parameter beforehand and fixed it\n", "for the analysis.\n", "\n", - "In the next section, we will check the impact of the regularization\n", - "parameter `alpha` and how it should be tuned.\n", + "In the next section, we will check the impact of the regularization parameter\n", + "`alpha` and how it should be tuned.\n", "\n", "## Fine tuning the regularization parameter\n", "\n", @@ -480,9 +511,9 @@ "The default parameter will not lead to the optimal model. Therefore, we need\n", "to tune the `alpha` parameter.\n", "\n", - "Model hyperparameter tuning should be done with care. Indeed, we want to\n", - "find an optimal parameter that maximizes some metrics. Thus, it requires both\n", - "a training set and testing set.\n", + "Model hyperparameter tuning should be done with care. Indeed, we want to find\n", + "an optimal parameter that maximizes some metrics. Thus, it requires both a\n", + "training set and testing set.\n", "\n", "However, this testing set should be different from the out-of-sample testing\n", "set that we used to evaluate our model: if we use the same one, we are using\n", @@ -490,17 +521,16 @@ "out-of-sample rule.\n", "\n", "Therefore, we should include search of the hyperparameter `alpha` within the\n", - "cross-validation. As we saw in previous notebooks, we could use a\n", - "grid-search. However, some predictor in scikit-learn are available with\n", - "an integrated hyperparameter search, more efficient than using a grid-search.\n", - "The name of these predictors finishes by `CV`. In the case of `Ridge`,\n", - "scikit-learn provides a `RidgeCV` regressor.\n", + "cross-validation. As we saw in previous notebooks, we could use a grid-search.\n", + "However, some predictor in scikit-learn are available with an integrated\n", + "hyperparameter search, more efficient than using a grid-search. The name of\n", + "these predictors finishes by `CV`. In the case of `Ridge`, scikit-learn\n", + "provides a `RidgeCV` regressor.\n", "\n", "Therefore, we can use this predictor as the last step of the pipeline.\n", "Including the pipeline a cross-validation allows to make a nested\n", - "cross-validation: the inner cross-validation will search for the best\n", - "alpha, while the outer cross-validation will give an estimate of the\n", - "testing score." + "cross-validation: the inner cross-validation will search for the best alpha,\n", + "while the outer cross-validation will give an estimate of the testing score." 
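
For reference, the grid-search alternative mentioned above yields the same kind of nested cross-validation, only less efficiently than `RidgeCV`; the sketch below uses synthetic data and an arbitrary `alpha` grid purely for illustration.

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_validate

X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=0)

# Inner loop: search for the best alpha; outer loop: estimate the testing score.
inner = GridSearchCV(Ridge(), param_grid={"alpha": np.logspace(-2, 2, 5)}, cv=5)
outer = cross_validate(inner, X, y, cv=5, scoring="neg_mean_squared_error")
print(f"Nested CV test MSE: {-outer['test_score'].mean():.1f}")
```
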
] }, { @@ -513,8 +543,11 @@ "from sklearn.linear_model import RidgeCV\n", "\n", "alphas = np.logspace(-2, 0, num=21)\n", - "ridge = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(),\n", - " RidgeCV(alphas=alphas, store_cv_values=True))" + "ridge = make_pipeline(\n", + " PolynomialFeatures(degree=2),\n", + " StandardScaler(),\n", + " RidgeCV(alphas=alphas, store_cv_values=True),\n", + ")" ] }, { @@ -526,10 +559,16 @@ "from sklearn.model_selection import ShuffleSplit\n", "\n", "cv = ShuffleSplit(n_splits=5, random_state=1)\n", - "cv_results = cross_validate(ridge, data, target,\n", - " cv=cv, scoring=\"neg_mean_squared_error\",\n", - " return_train_score=True,\n", - " return_estimator=True, n_jobs=2)" + "cv_results = cross_validate(\n", + " ridge,\n", + " data,\n", + " target,\n", + " cv=cv,\n", + " scoring=\"neg_mean_squared_error\",\n", + " return_train_score=True,\n", + " return_estimator=True,\n", + " n_jobs=2,\n", + ")" ] }, { @@ -539,8 +578,10 @@ "outputs": [], "source": [ "train_error = -cv_results[\"train_score\"]\n", - "print(f\"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the train set:\\n\"\n", + " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + ")" ] }, { @@ -550,8 +591,10 @@ "outputs": [], "source": [ "test_error = -cv_results[\"test_score\"]\n", - "print(f\"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\")" + "print(\n", + " \"Mean squared error of linear regression model on the test set:\\n\"\n", + " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + ")" ] }, { @@ -564,8 +607,8 @@ "When fitting the ridge regressor, we also requested to store the error found\n", "during cross-validation (by setting the parameter `store_cv_values=True`). We\n", "will plot the mean squared error for the different `alphas` regularization\n", - "strength that we tried. The error bars represent one standard deviation of\n", - "the average mean square error across folds for a given value of `alpha`." + "strength that we tried. The error bars represent one standard deviation of the\n", + "average mean square error across folds for a given value of `alpha`." ] }, { @@ -574,8 +617,9 @@ "metadata": {}, "outputs": [], "source": [ - "mse_alphas = [est[-1].cv_values_.mean(axis=0)\n", - " for est in cv_results[\"estimator\"]]\n", + "mse_alphas = [\n", + " est[-1].cv_values_.mean(axis=0) for est in cv_results[\"estimator\"]\n", + "]\n", "cv_alphas = pd.DataFrame(mse_alphas, columns=alphas)\n", "cv_alphas = cv_alphas.aggregate([\"mean\", \"std\"]).T\n", "cv_alphas" @@ -587,8 +631,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.errorbar(cv_alphas.index, cv_alphas[\"mean\"],\n", - " yerr=cv_alphas[\"std\"])\n", + "plt.errorbar(cv_alphas.index, cv_alphas[\"mean\"], yerr=cv_alphas[\"std\"])\n", "plt.xlim((0.0, 1.0))\n", "plt.ylim((4_500, 11_000))\n", "plt.ylabel(\"Mean squared error\\n (lower is better)\")\n", @@ -601,8 +644,8 @@ "metadata": {}, "source": [ "As we can see, regularization is just like salt in cooking: one must balance\n", - "its amount to get the best generalization performance. We can check if the best\n", - "`alpha` found is stable across the cross-validation fold." + "its amount to get the best generalization performance. 
We can check if the\n", + "best `alpha` found is stable across the cross-validation fold." ] }, { @@ -621,9 +664,9 @@ "source": [ "The optimal regularization strength is not necessarily the same on all\n", "cross-validation iterations. But since we expect each cross-validation\n", - "resampling to stem from the same data distribution, it is common practice\n", - "to choose the best `alpha` to put into production as lying in the range\n", - "defined by:" + "resampling to stem from the same data distribution, it is common practice to\n", + "choose the best `alpha` to put into production as lying in the range defined\n", + "by:" ] }, { @@ -632,8 +675,10 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Min optimal alpha: {np.min(best_alphas):.2f} and \"\n", - " f\"Max optimal alpha: {np.max(best_alphas):.2f}\")" + "print(\n", + " f\"Min optimal alpha: {np.min(best_alphas):.2f} and \"\n", + " f\"Max optimal alpha: {np.max(best_alphas):.2f}\"\n", + ")" ] }, { @@ -643,8 +688,8 @@ "This range can be reduced by decreasing the spacing between the grid of\n", "`alphas`.\n", "\n", - "In this notebook, you learned about the concept of regularization and\n", - "the importance of preprocessing and parameter tuning." + "In this notebook, you learned about the concept of regularization and the\n", + "importance of preprocessing and parameter tuning." ] } ], diff --git a/notebooks/linear_models_sol_01.ipynb b/notebooks/linear_models_sol_01.ipynb index d4282fc71..f6f710ade 100644 --- a/notebooks/linear_models_sol_01.ipynb +++ b/notebooks/linear_models_sol_01.ipynb @@ -75,12 +75,12 @@ "source": [ "## Main exercise\n", "\n", - "Define a vector `weights = [...]` and a vector `intercepts = [...]` of\n", - "the same length. Each pair of entries `(weights[i], intercepts[i])` tags a\n", + "Define a vector `weights = [...]` and a vector `intercepts = [...]` of the\n", + "same length. Each pair of entries `(weights[i], intercepts[i])` tags a\n", "different model. Use these vectors along with the vector\n", - "`flipper_length_range` to plot several linear models that could possibly\n", - "fit our data. Use the above helper function to visualize both the models and\n", - "the real samples." + "`flipper_length_range` to plot several linear models that could possibly fit\n", + "our data. Use the above helper function to visualize both the models and the\n", + "real samples." 
] }, { @@ -107,17 +107,22 @@ "weights = [-40, 45, 90]\n", "intercepts = [15000, -5000, -14000]\n", "\n", - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "\n", "label = \"{0:.2f} (g / mm) * flipper length + {1:.2f} (g)\"\n", "for weight, intercept in zip(weights, intercepts):\n", " predicted_body_mass = linear_model_flipper_mass(\n", - " flipper_length_range, weight, intercept)\n", + " flipper_length_range, weight, intercept\n", + " )\n", "\n", - " ax.plot(flipper_length_range, predicted_body_mass,\n", - " label=label.format(weight, intercept))\n", - "_ = ax.legend(loc='center left', bbox_to_anchor=(-0.25, 1.25), ncol=1)" + " ax.plot(\n", + " flipper_length_range,\n", + " predicted_body_mass,\n", + " label=label.format(weight, intercept),\n", + " )\n", + "_ = ax.legend(loc=\"center left\", bbox_to_anchor=(-0.25, 1.25), ncol=1)" ] }, { @@ -126,9 +131,9 @@ "lines_to_next_cell": 2 }, "source": [ - "In the previous question, you were asked to create several linear models.\n", - "The visualization allowed you to qualitatively assess if a model was better\n", - "than another.\n", + "In the previous question, you were asked to create several linear models. The\n", + "visualization allowed you to qualitatively assess if a model was better than\n", + "another.\n", "\n", "Now, you should come up with a quantitative measure which indicates the\n", "goodness of fit of each linear model and allows you to select the best model.\n", @@ -140,9 +145,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "# solution\n", @@ -164,8 +167,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can now copy and paste the code below to show the goodness of fit for\n", - "each model.\n", + "You can now copy and paste the code below to show the goodness of fit for each\n", + "model.\n", "\n", "```python\n", "for model_idx, (weight, intercept) in enumerate(zip(weights, intercepts)):\n", diff --git a/notebooks/linear_models_sol_02.ipynb b/notebooks/linear_models_sol_02.ipynb index 28f4a2951..d56864c4e 100644 --- a/notebooks/linear_models_sol_02.ipynb +++ b/notebooks/linear_models_sol_02.ipynb @@ -26,16 +26,17 @@ "outputs": [], "source": [ "import numpy as np\n", + "\n", "# Set the seed for reproduction\n", "rng = np.random.RandomState(0)\n", "\n", "# Generate data\n", "n_sample = 100\n", "data_max, data_min = 1.4, -1.4\n", - "len_data = (data_max - data_min)\n", + "len_data = data_max - data_min\n", "data = rng.rand(n_sample) * len_data - len_data / 2\n", - "noise = rng.randn(n_sample) * .3\n", - "target = data ** 3 - 0.5 * data ** 2 + noise" + "noise = rng.randn(n_sample) * 0.3\n", + "target = data**3 - 0.5 * data**2 + noise" ] }, { @@ -56,31 +57,33 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "full_data = pd.DataFrame({\"data\": data, \"target\": target})" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "\n", - "_ = sns.scatterplot(data=full_data, x=\"data\", y=\"target\", color=\"black\",\n", - " alpha=0.5)" + "_ = sns.scatterplot(\n", + " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", + ")" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + 
"lines_to_next_cell": 2 + }, "source": [ "We observe that the link between the data `data` and vector `target` is\n", - "non-linear. For instance, `data` could represent the years of\n", - "experience (normalized) and `target` the salary (normalized). Therefore, the\n", - "problem here would be to infer the salary given the years of experience.\n", + "non-linear. For instance, `data` could represent the years of experience\n", + "(normalized) and `target` the salary (normalized). Therefore, the problem here\n", + "would be to infer the salary given the years of experience.\n", "\n", "Using the function `f` defined below, find both the `weight` and the\n", "`intercept` that you think will lead to a good linear model. Plot both the\n", @@ -90,9 +93,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "def f(data, weight=0, intercept=0):\n", @@ -120,8 +121,9 @@ }, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"data\", y=\"target\", color=\"black\",\n", - " alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "_ = ax.plot(data, predictions)" ] }, @@ -203,8 +205,9 @@ }, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"data\", y=\"target\", color=\"black\",\n", - " alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "_ = ax.plot(data, predictions)" ] }, diff --git a/notebooks/linear_models_sol_03.ipynb b/notebooks/linear_models_sol_03.ipynb index 1a3861b21..634c43171 100644 --- a/notebooks/linear_models_sol_03.ipynb +++ b/notebooks/linear_models_sol_03.ipynb @@ -13,11 +13,10 @@ "The aim of this notebook is to train a linear regression algorithm on a\n", "dataset with more than a single feature.\n", "\n", - "We will load a dataset about house prices in California.\n", - "The dataset consists of 8 features regarding the demography and geography of\n", - "districts in California and the aim is to predict the median house price of\n", - "each district. We will use all 8 features to predict the target, the median\n", - "house price." + "We will load a dataset about house prices in California. The dataset consists\n", + "of 8 features regarding the demography and geography of districts in\n", + "California and the aim is to predict the median house price of each district.\n", + "We will use all 8 features to predict the target, the median house price." ] }, { @@ -48,8 +47,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now it is your turn to train a linear regression model on this dataset.\n", - "First, create a linear regression model." + "Now it is your turn to train a linear regression model on this dataset. First,\n", + "create a linear regression model." ] }, { @@ -68,8 +67,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Execute a cross-validation with 10 folds and use the mean absolute error\n", - "(MAE) as metric. Be sure to *return* the fitted *estimators*." + "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", + "as metric. Be sure to *return* the fitted *estimators*." 
] }, { @@ -81,9 +80,15 @@ "# solution\n", "from sklearn.model_selection import cross_validate\n", "\n", - "cv_results = cross_validate(linear_regression, data, target,\n", - " scoring=\"neg_mean_absolute_error\",\n", - " return_estimator=True, cv=10, n_jobs=2)" + "cv_results = cross_validate(\n", + " linear_regression,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " return_estimator=True,\n", + " cv=10,\n", + " n_jobs=2,\n", + ")" ] }, { @@ -100,9 +105,11 @@ "outputs": [], "source": [ "# solution\n", - "print(f\"Mean absolute error on testing set: \"\n", - " f\"{-cv_results['test_score'].mean():.3f} k$ \u00b1 \"\n", - " f\"{cv_results['test_score'].std():.3f}\")" + "print(\n", + " \"Mean absolute error on testing set: \"\n", + " f\"{-cv_results['test_score'].mean():.3f} k$ \u00b1 \"\n", + " f\"{cv_results['test_score'].std():.3f}\"\n", + ")" ] }, { @@ -112,8 +119,8 @@ }, "source": [ "Inspect the fitted model using a box plot to show the distribution of values\n", - "for the coefficients returned from the cross-validation. Hint:\n", - "use the function\n", + "for the coefficients returned from the cross-validation. Hint: use the\n", + "function\n", "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", "to create a box plot." ] @@ -128,7 +135,8 @@ "import pandas as pd\n", "\n", "weights = pd.DataFrame(\n", - " [est.coef_ for est in cv_results[\"estimator\"]], columns=data.columns)" + " [est.coef_ for est in cv_results[\"estimator\"]], columns=data.columns\n", + ")" ] }, { diff --git a/notebooks/linear_models_sol_04.ipynb b/notebooks/linear_models_sol_04.ipynb index 067b287dc..f49b0c465 100644 --- a/notebooks/linear_models_sol_04.ipynb +++ b/notebooks/linear_models_sol_04.ipynb @@ -6,8 +6,8 @@ "source": [ "# \ud83d\udcc3 Solution for Exercise M4.04\n", "\n", - "In the previous notebook, we saw the effect of applying some regularization\n", - "on the coefficient of a linear model.\n", + "In the previous notebook, we saw the effect of applying some regularization on\n", + "the coefficient of a linear model.\n", "\n", "In this exercise, we will study the advantage of using some regularization\n", "when dealing with correlated features.\n", @@ -39,8 +39,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When creating the dataset, `make_regression` returns the true coefficient\n", - "used to generate the dataset. Let's plot this information." + "When creating the dataset, `make_regression` returns the true coefficient used\n", + "to generate the dataset. Let's plot this information." ] }, { @@ -67,9 +67,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a `LinearRegression` regressor and fit on the entire dataset and\n", - "check the value of the coefficients. Are the coefficients of the linear\n", - "regressor close to the coefficients used to generate the dataset?" + "Create a `LinearRegression` regressor and fit on the entire dataset and check\n", + "the value of the coefficients. Are the coefficients of the linear regressor\n", + "close to the coefficients used to generate the dataset?" ] }, { @@ -145,8 +145,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Fit again the linear regressor on this new dataset and check the\n", - "coefficients. What do you observe?" + "Fit again the linear regressor on this new dataset and check the coefficients.\n", + "What do you observe?" 
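
The observation requested here is explained below in terms of an ill-posed inversion; a minimal NumPy sketch with toy numbers shows why duplicating a column makes `np.dot(data.T, data)` singular, and how the ridge term `alpha * I` restores invertibility.

```python
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(5, 2)
X_dup = np.hstack([X, X[:, [0]]])  # repeat the first column

# The 3x3 Gram matrix only has rank 2, so it cannot be inverted.
print(np.linalg.matrix_rank(X_dup.T @ X_dup))
# Adding alpha * I makes it positive definite, hence invertible.
alpha = 1.0
print(np.linalg.matrix_rank(X_dup.T @ X_dup + alpha * np.eye(3)))
```
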
] }, { @@ -194,12 +194,12 @@ ] }, "source": [ - "We see that the coefficient values are far from what one could expect.\n", - "By repeating the informative features, one would have expected these\n", - "coefficients to be similarly informative.\n", + "We see that the coefficient values are far from what one could expect. By\n", + "repeating the informative features, one would have expected these coefficients\n", + "to be similarly informative.\n", "\n", - "Instead, we see that some coefficients have a huge norm ~1e14. It indeed\n", - "means that we try to solve an mathematical ill-posed problem. Indeed, finding\n", + "Instead, we see that some coefficients have a huge norm ~1e14. It indeed means\n", + "that we try to solve an mathematical ill-posed problem. Indeed, finding\n", "coefficients in a linear regression involves inverting the matrix\n", "`np.dot(data.T, data)` which is not possible (or lead to high numerical\n", "errors)." @@ -251,8 +251,8 @@ "source": [ "We see that the penalty applied on the weights give a better results: the\n", "values of the coefficients do not suffer from numerical issues. Indeed, the\n", - "matrix to be inverted internally is `np.dot(data.T, data) + alpha * I`.\n", - "Adding this penalty `alpha` allow the inversion without numerical issue." + "matrix to be inverted internally is `np.dot(data.T, data) + alpha * I`. Adding\n", + "this penalty `alpha` allow the inversion without numerical issue." ] }, { @@ -281,8 +281,8 @@ ] }, "source": [ - "Repeating three times each informative features induced to divide the\n", - "ridge coefficients by three." + "Repeating three times each informative features induced to divide the ridge\n", + "coefficients by three." ] }, { @@ -324,7 +324,7 @@ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", - "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values='?')\n", + "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values=\"?\")\n", "ames_housing = ames_housing.drop(columns=\"Id\")\n", "\n", "categorical_columns = [\"Street\", \"Foundation\", \"CentralAir\", \"PavedDrive\"]\n", @@ -347,8 +347,8 @@ "\n", "We previously presented that a `OneHotEncoder` creates as many columns as\n", "categories. Therefore, there is always one column (i.e. one encoded category)\n", - "that can be inferred from the others. Thus, `OneHotEncoder` creates\n", - "collinear features.\n", + "that can be inferred from the others. Thus, `OneHotEncoder` creates collinear\n", + "features.\n", "\n", "We illustrate this behaviour by considering the \"CentralAir\" feature that\n", "contains only two categories:" @@ -401,20 +401,19 @@ "Here, we see that the encoded category \"CentralAir_N\" is the opposite of the\n", "encoded category \"CentralAir_Y\". Therefore, we observe that using a\n", "`OneHotEncoder` creates two features having the problematic pattern observed\n", - "earlier in this exercise. Training a linear regression model on such a\n", - "of one-hot encoded binary feature can therefore lead to numerical\n", - "problems, especially without regularization. Furthermore, the two one-hot\n", - "features are redundant as they encode exactly the same information in\n", - "opposite ways.\n", + "earlier in this exercise. Training a linear regression model on such a of\n", + "one-hot encoded binary feature can therefore lead to numerical problems,\n", + "especially without regularization. 
Furthermore, the two one-hot features are\n", + "redundant as they encode exactly the same information in opposite ways.\n", "\n", - "Using regularization helps to overcome the numerical issues that we highlighted\n", - "earlier in this exercise.\n", + "Using regularization helps to overcome the numerical issues that we\n", + "highlighted earlier in this exercise.\n", "\n", "Another strategy is to arbitrarily drop one of the encoded categories.\n", "Scikit-learn provides such an option by setting the parameter `drop` in the\n", - "`OneHotEncoder`. This parameter can be set to `first` to always drop the\n", - "first encoded category or `binary_only` to only drop a column in the case of\n", - "binary categories." + "`OneHotEncoder`. This parameter can be set to `first` to always drop the first\n", + "encoded category or `binary_only` to only drop a column in the case of binary\n", + "categories." ] }, { @@ -446,13 +445,14 @@ "source": [ "\n", "We see that only the second column of the previous encoded data is kept.\n", - "Dropping one of the one-hot encoded column is a common practice,\n", - "especially for binary categorical features. Note however that this breaks\n", - "symmetry between categories and impacts the number of coefficients of the\n", - "model, their values, and thus their meaning, especially when applying\n", - "strong regularization.\n", + "Dropping one of the one-hot encoded column is a common practice, especially\n", + "for binary categorical features. Note however that this breaks symmetry\n", + "between categories and impacts the number of coefficients of the model, their\n", + "values, and thus their meaning, especially when applying strong\n", + "regularization.\n", "\n", - "Let's finally illustrate how to use this option is a machine-learning pipeline:" + "Let's finally illustrate how to use this option is a machine-learning\n", + "pipeline:" ] }, { @@ -470,9 +470,7 @@ "model = make_pipeline(OneHotEncoder(drop=\"first\", dtype=np.int32), Ridge())\n", "model.fit(X_train, y_train)\n", "n_categories = [X_train[col].nunique() for col in X_train.columns]\n", - "print(\n", - " f\"R2 score on the testing set: {model.score(X_test, y_test):.2f}\"\n", - ")\n", + "print(f\"R2 score on the testing set: {model.score(X_test, y_test):.2f}\")\n", "print(\n", " f\"Our model contains {model[-1].coef_.size} features while \"\n", " f\"{sum(n_categories)} categories are originally available.\"\n", diff --git a/notebooks/linear_models_sol_05.ipynb b/notebooks/linear_models_sol_05.ipynb index d0c98a1c7..08bae2e77 100644 --- a/notebooks/linear_models_sol_05.ipynb +++ b/notebooks/linear_models_sol_05.ipynb @@ -5,13 +5,14 @@ "metadata": {}, "source": [ "# \ud83d\udcc3 Solution for Exercise M4.05\n", + "\n", "In the previous notebook we set `penalty=\"none\"` to disable regularization\n", - "entirely. This parameter can also control the **type** of regularization to use,\n", - "whereas the regularization **strength** is set using the parameter `C`.\n", - "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`.\n", - "In this exercise, we ask you to train a logistic regression classifier using the\n", - "`penalty=\"l2\"` regularization (which happens to be the default in scikit-learn)\n", - "to find by yourself the effect of the parameter `C`.\n", + "entirely. 
This parameter can also control the **type** of regularization to\n", + "use, whereas the regularization **strength** is set using the parameter `C`.\n", + "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n", + "this exercise, we ask you to train a logistic regression classifier using the\n", + "`penalty=\"l2\"` regularization (which happens to be the default in\n", + "scikit-learn) to find by yourself the effect of the parameter `C`.\n", "\n", "We will start by loading the dataset." ] @@ -37,8 +38,9 @@ "\n", "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", "# only keep the Adelie and Chinstrap classes\n", - "penguins = penguins.set_index(\"Species\").loc[\n", - " [\"Adelie\", \"Chinstrap\"]].reset_index()\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", "\n", "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", "target_column = \"Species\"" @@ -79,7 +81,8 @@ "from sklearn.linear_model import LogisticRegression\n", "\n", "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"l2\"))" + " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", + ")" ] }, { @@ -118,8 +121,12 @@ " alpha=0.5,\n", " )\n", " sns.scatterplot(\n", - " data=penguins_test, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=[\"tab:red\", \"tab:blue\"])\n", + " data=penguins_test,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + " )\n", " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" ] @@ -156,8 +163,7 @@ }, "outputs": [], "source": [ - "weights_ridge = pd.concat(\n", - " weights_ridge, axis=1, keys=[f\"C: {C}\" for C in Cs])\n", + "weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f\"C: {C}\" for C in Cs])\n", "weights_ridge.plot.barh()\n", "_ = plt.title(\"LogisticRegression weights depending of C\")" ] diff --git a/notebooks/linear_regression_in_sklearn.ipynb b/notebooks/linear_regression_in_sklearn.ipynb index e84cc3f2b..75ea3b566 100644 --- a/notebooks/linear_regression_in_sklearn.ipynb +++ b/notebooks/linear_regression_in_sklearn.ipynb @@ -7,14 +7,14 @@ "# Linear regression using scikit-learn\n", "\n", "In the previous notebook, we presented the parametrization of a linear model.\n", - "During the exercise, you saw that varying parameters will give different models\n", - "that will fit better or worse the data. To evaluate quantitatively this\n", + "During the exercise, you saw that varying parameters will give different\n", + "models that will fit better or worse the data. To evaluate quantitatively this\n", "goodness of fit, you implemented a so-called metric.\n", "\n", "When doing machine learning, you are interested in selecting the model which\n", - "will minimize the error on the data available the most.\n", - "From the previous exercise, we could implement a brute-force approach,\n", - "varying the weights and intercept and select the model with the lowest error.\n", + "will minimize the error on the data available the most. From the previous\n", + "exercise, we could implement a brute-force approach, varying the weights and\n", + "intercept and select the model with the lowest error.\n", "\n", "Hopefully, this problem of finding the best parameters values (i.e. 
that\n", "result in the lowest error) can be solved without the need to check every\n", @@ -66,8 +66,8 @@ "metadata": {}, "source": [ "The instance `linear_regression` will store the parameter values in the\n", - "attributes `coef_` and `intercept_`. We can check what the optimal model\n", - "found is:" + "attributes `coef_` and `intercept_`. We can check what the optimal model found\n", + "is:" ] }, { @@ -108,7 +108,8 @@ "\n", "flipper_length_range = np.linspace(data.min(), data.max(), num=300)\n", "predicted_body_mass = (\n", - " weight_flipper_length * flipper_length_range + intercept_body_mass)" + " weight_flipper_length * flipper_length_range + intercept_body_mass\n", + ")" ] }, { @@ -129,10 +130,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the solution of the previous exercise, we implemented a function to\n", - "compute the goodness of fit of a model. Indeed, we mentioned two metrics: (i)\n", - "the mean squared error and (ii) the mean absolute error. These metrics are\n", - "implemented in scikit-learn and we do not need to use our own implementation.\n", + "In the solution of the previous exercise, we implemented a function to compute\n", + "the goodness of fit of a model. Indeed, we mentioned two metrics: (i) the mean\n", + "squared error and (ii) the mean absolute error. These metrics are implemented\n", + "in scikit-learn and we do not need to use our own implementation.\n", "\n", "We can first compute the mean squared error." ] @@ -161,8 +162,8 @@ "a higher mean squared error on the training set.\n", "\n", "However, the mean squared error is difficult to interpret. The mean absolute\n", - "error is more intuitive since it provides an error in the same unit as the\n", - "one of the target." + "error is more intuitive since it provides an error in the same unit as the one\n", + "of the target." ] }, { @@ -183,8 +184,8 @@ "lines_to_next_cell": 2 }, "source": [ - "A mean absolute error of 313 means that in average, our model make an error\n", - "of \u00b1 313 grams when predicting the body mass of a penguin given its flipper\n", + "A mean absolute error of 313 means that in average, our model make an error of\n", + "\u00b1 313 grams when predicting the body mass of a penguin given its flipper\n", "length." 
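
The unit argument can be checked directly with a couple of made-up predictions: the mean squared error comes out in grams squared, while the mean absolute error stays in grams, which is why it reads naturally as "± so many grams".

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_true = np.array([3500.0, 4200.0, 5100.0])  # body masses in grams
y_pred = np.array([3800.0, 4000.0, 4700.0])

print(mean_squared_error(y_true, y_pred))   # ~96666.7, in grams ** 2
print(mean_absolute_error(y_true, y_pred))  # 300.0, in grams
```
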
] }, diff --git a/notebooks/linear_regression_non_linear_link.ipynb b/notebooks/linear_regression_non_linear_link.ipynb index 6ed1337e2..e2783cfb5 100644 --- a/notebooks/linear_regression_non_linear_link.ipynb +++ b/notebooks/linear_regression_non_linear_link.ipynb @@ -6,12 +6,12 @@ "source": [ "# Linear regression for a non-linear features-target relationship\n", "\n", - "In the previous exercise, you were asked to train a linear regression model\n", - "on a dataset where the matrix `data` and the vector `target` do not have a\n", - "linear link.\n", + "In the previous exercise, you were asked to train a linear regression model on\n", + "a dataset where the matrix `data` and the vector `target` do not have a linear\n", + "link.\n", "\n", - "In this notebook, we show that even if the parametrization of linear models\n", - "is not natively adapted to the problem at hand, it is still possible to make\n", + "In this notebook, we show that even if the parametrization of linear models is\n", + "not natively adapted to the problem at hand, it is still possible to make\n", "linear models more expressive by engineering additional features.\n", "\n", "A machine learning pipeline that combines a non-linear feature engineering\n", @@ -34,11 +34,11 @@ "\n", "n_sample = 100\n", "data_max, data_min = 1.4, -1.4\n", - "len_data = (data_max - data_min)\n", + "len_data = data_max - data_min\n", "# sort the data to make plotting easier later\n", "data = np.sort(rng.rand(n_sample) * len_data - len_data / 2)\n", - "noise = rng.randn(n_sample) * .3\n", - "target = data ** 3 - 0.5 * data ** 2 + noise" + "noise = rng.randn(n_sample) * 0.3\n", + "target = data**3 - 0.5 * data**2 + noise" ] }, { @@ -71,16 +71,17 @@ "source": [ "import seaborn as sns\n", "\n", - "_ = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)" + "_ = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We will highlight the limitations of fitting a linear regression model as\n", - "done in the previous exercise.\n", + "We will highlight the limitations of fitting a linear regression model as done\n", + "in the previous exercise.\n", "\n", "
<div class=\"admonition warning alert alert-danger\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>
\n", @@ -132,8 +133,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -144,8 +146,8 @@ "source": [ "\n", "Here the coefficient and intercept learnt by `LinearRegression` define the\n", - "best \"straight line\" that fits the data. We can inspect the coefficients\n", - "using the attributes of the model learnt as follows:" + "best \"straight line\" that fits the data. We can inspect the coefficients using\n", + "the attributes of the model learnt as follows:" ] }, { @@ -154,18 +156,19 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"weight: {linear_regression.coef_[0]:.2f}, \"\n", - " f\"intercept: {linear_regression.intercept_:.2f}\")" + "print(\n", + " f\"weight: {linear_regression.coef_[0]:.2f}, \"\n", + " f\"intercept: {linear_regression.intercept_:.2f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "It is important to note that the learnt model will not be able to handle the\n", - "non-linear relationship between `data` and `target` since linear models\n", - "assume the relationship between `data` and `target` to be linear.\n", + "non-linear relationship between `data` and `target` since linear models assume\n", + "the relationship between `data` and `target` to be linear.\n", "\n", "Indeed, there are 3 possibilities to solve this issue:\n", "\n", @@ -198,8 +201,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -208,12 +212,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Instead of having a model which can natively deal with non-linearity, we\n", - "could also modify our data: we could create new features, derived from the\n", - "original features, using some expert knowledge. In this example, we know that\n", - "we have a cubic and squared relationship between `data` and `target` (because\n", - "we generated the data).\n", + "Instead of having a model which can natively deal with non-linearity, we could\n", + "also modify our data: we could create new features, derived from the original\n", + "features, using some expert knowledge. In this example, we know that we have a\n", + "cubic and squared relationship between `data` and `target` (because we\n", + "generated the data).\n", "\n", "Indeed, we could create two new features (`data ** 2` and `data ** 3`) using\n", "this information as follows. 
This kind of transformation is called a\n", @@ -237,7 +240,7 @@ }, "outputs": [], "source": [ - "data_expanded = np.concatenate([data, data ** 2, data ** 3], axis=1)\n", + "data_expanded = np.concatenate([data, data**2, data**3], axis=1)\n", "data_expanded.shape" ] }, @@ -258,8 +261,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -268,7 +272,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "We can see that even with a linear model, we can overcome the linearity\n", "limitation of the model by adding the non-linear components in the design of\n", "additional features. Here, we created new features by knowing the way the\n", @@ -304,11 +307,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "In the previous cell we had to set `include_bias=False` as otherwise we would\n", "create a column perfectly correlated to the `intercept_` introduced by the\n", - "`LinearRegression`. We can verify that this procedure is equivalent to creating\n", - "the features by hand up to numerical error by computing the maximum\n", + "`LinearRegression`. We can verify that this procedure is equivalent to\n", + "creating the features by hand up to numerical error by computing the maximum\n", "of the absolute values of the differences between the features generated by\n", "both methods and checking that it is close to zero:" ] @@ -326,7 +328,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Then it should not be surprising that the predictions of the\n", "`PolynomialFeatures` pipeline match the predictions of the linear model fit on\n", "manually engineered features." @@ -338,8 +339,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -348,12 +350,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "The last possibility is to make a linear model more expressive is to use a\n", "\"kernel\". Instead of learning a weight per feature as we previously\n", - "emphasized, a weight will be assigned to each sample. However, not all\n", - "samples will be used. This is the base of the support vector machine\n", - "algorithm.\n", + "emphasized, a weight will be assigned to each sample. However, not all samples\n", + "will be used. This is the base of the support vector machine algorithm.\n", "\n", "The mathematical definition of \"kernels\" and \"support vector machines\" is\n", "beyond the scope of this course. 
We encourage interested readers with a\n", @@ -361,8 +361,8 @@ "SVMs](https://scikit-learn.org/stable/modules/svm.html) for more details.\n", "\n", "For the rest of us, let us just develop some intuitions on the relative\n", - "expressive power of support vector machines with linear and non-linear\n", - "kernels by fitting them on the same dataset.\n", + "expressive power of support vector machines with linear and non-linear kernels\n", + "by fitting them on the same dataset.\n", "\n", "First, consider a support vector machine with a linear kernel:" ] @@ -387,8 +387,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -397,7 +398,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "The predictions of our SVR with a linear kernel are all aligned on a straight\n", "line. `SVR(kernel=\"linear\")` is indeed yet another example of a linear model.\n", "\n", @@ -428,8 +428,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -438,7 +439,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Kernel methods such as SVR are very efficient for small to medium datasets.\n", "\n", "For larger datasets with `n_samples >> 10_000`, it is often computationally\n", @@ -449,9 +449,9 @@ "or\n", "[Nystroem](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html).\n", "\n", - "Here again we refer the interested reader to the documentation to get a\n", - "proper definition of those methods. The following just gives an intuitive\n", - "overview of the predictions we would get using those on our toy dataset:" + "Here again we refer the interested reader to the documentation to get a proper\n", + "definition of those methods. 
The following just gives an intuitive overview of\n", + "the predictions we would get using those on our toy dataset:" ] }, { @@ -463,14 +463,16 @@ "from sklearn.preprocessing import KBinsDiscretizer\n", "\n", "binned_regression = make_pipeline(\n", - " KBinsDiscretizer(n_bins=8), LinearRegression(),\n", + " KBinsDiscretizer(n_bins=8),\n", + " LinearRegression(),\n", ")\n", "binned_regression.fit(data, target)\n", "target_predicted = binned_regression.predict(data)\n", "mse = mean_squared_error(target, target_predicted)\n", "\n", - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] @@ -484,14 +486,16 @@ "from sklearn.kernel_approximation import Nystroem\n", "\n", "nystroem_regression = make_pipeline(\n", - " Nystroem(n_components=5), LinearRegression(),\n", + " Nystroem(n_components=5),\n", + " LinearRegression(),\n", ")\n", "nystroem_regression.fit(data, target)\n", "target_predicted = nystroem_regression.predict(data)\n", "mse = mean_squared_error(target, target_predicted)\n", "\n", - "ax = sns.scatterplot(data=full_data, x=\"input_feature\", y=\"target\",\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] diff --git a/notebooks/linear_regression_without_sklearn.ipynb b/notebooks/linear_regression_without_sklearn.ipynb index 35cb40e91..22707379c 100644 --- a/notebooks/linear_regression_without_sklearn.ipynb +++ b/notebooks/linear_regression_without_sklearn.ipynb @@ -38,8 +38,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will formulate the following problem: using the flipper length of a penguin, \n", - "we would like to infer its mass." + "We will formulate the following problem: using the flipper length of a\n", + "penguin, we would like to infer its mass." ] }, { @@ -54,8 +54,9 @@ "target_name = \"Body Mass (g)\"\n", "data, target = penguins[[feature_name]], penguins[target_name]\n", "\n", - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "ax.set_title(\"Body Mass as a function of the Flipper Length\")" ] }, @@ -65,11 +66,11 @@ "source": [ "
\n", "

Tip

\n", - "

The function scatterplot from seaborn take as input the full dataframe\n", - "and the parameter x and y allows to specify the name of the columns to\n", - "be plotted. Note that this function returns a matplotlib axis\n", - "(named ax in the example above) that can be further used to add elements on\n", - "the same matplotlib axis (such as a title).

\n", + "

The function scatterplot from seaborn takes as input the full dataframe and\n",
"the parameters x and y allow specifying the names of the columns to be\n",
"plotted. Note that this function returns a matplotlib axis (named ax in the\n",
"example above) that can be further used to add elements on the same matplotlib\n",
"axis (such as a title).

\n", "
" ] }, @@ -79,16 +80,15 @@ "lines_to_next_cell": 2 }, "source": [ - "In this problem, penguin mass is our target. It is a continuous\n", - "variable that roughly varies between 2700 g and 6300 g. Thus, this is a\n", - "regression problem (in contrast to classification). We also see that there is\n", - "almost a linear relationship between the body mass of the penguin and its\n", - "flipper length. The longer the flipper, the heavier the penguin.\n", + "In this problem, penguin mass is our target. It is a continuous variable that\n", + "roughly varies between 2700 g and 6300 g. Thus, this is a regression problem\n", + "(in contrast to classification). We also see that there is almost a linear\n", + "relationship between the body mass of the penguin and its flipper length. The\n", + "longer the flipper, the heavier the penguin.\n", "\n", - "Thus, we could come up with a simple formula, where given a flipper length\n", - "we could compute the body mass of a penguin using a linear relationship\n", - "of the form `y = a * x + b` where `a` and `b` are the 2 parameters of our\n", - "model." + "Thus, we could come up with a simple formula, where given a flipper length we\n", + "could compute the body mass of a penguin using a linear relationship of the\n", + "form `y = a * x + b` where `a` and `b` are the 2 parameters of our model." ] }, { @@ -97,8 +97,9 @@ "metadata": {}, "outputs": [], "source": [ - "def linear_model_flipper_mass(flipper_length, weight_flipper_length,\n", - " intercept_body_mass):\n", + "def linear_model_flipper_mass(\n", + " flipper_length, weight_flipper_length, intercept_body_mass\n", + "):\n", " \"\"\"Linear model of the form y = a * x + b\"\"\"\n", " body_mass = weight_flipper_length * flipper_length + intercept_body_mass\n", " return body_mass" @@ -108,9 +109,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using the model we defined above, we can check the body mass values\n", - "predicted for a range of flipper lengths. We will set `weight_flipper_length`\n", - "to be 45 and `intercept_body_mass` to be -5000." + "Using the model we defined above, we can check the body mass values predicted\n", + "for a range of flipper lengths. We will set `weight_flipper_length` to be 45\n", + "and `intercept_body_mass` to be -5000." ] }, { @@ -126,7 +127,8 @@ "\n", "flipper_length_range = np.linspace(data.min(), data.max(), num=300)\n", "predicted_body_mass = linear_model_flipper_mass(\n", - " flipper_length_range, weight_flipper_length, intercept_body_mass)" + " flipper_length_range, weight_flipper_length, intercept_body_mass\n", + ")" ] }, { @@ -144,8 +146,9 @@ "source": [ "label = \"{0:.2f} (g / mm) * flipper length + {1:.2f} (g)\"\n", "\n", - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(flipper_length_range, predicted_body_mass)\n", "_ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass))" ] @@ -158,10 +161,10 @@ "`flipper_length` in order to make the inference. When this coefficient is\n", "positive, it means that penguins with longer flipper lengths will have larger\n", "body masses. If the coefficient is negative, it means that penguins with\n", - "shorter flipper lengths have larger body masses. Graphically, this\n", - "coefficient is represented by the slope of the curve in the plot. 
Below we\n", - "show what the curve would look like when the `weight_flipper_length`\n", - "coefficient is negative." + "shorter flipper lengths have larger body masses. Graphically, this coefficient\n", + "is represented by the slope of the curve in the plot. Below we show what the\n", + "curve would look like when the `weight_flipper_length` coefficient is\n", + "negative." ] }, { @@ -174,7 +177,8 @@ "intercept_body_mass = 13000\n", "\n", "predicted_body_mass = linear_model_flipper_mass(\n", - " flipper_length_range, weight_flipper_length, intercept_body_mass)" + " flipper_length_range, weight_flipper_length, intercept_body_mass\n", + ")" ] }, { @@ -190,8 +194,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(flipper_length_range, predicted_body_mass)\n", "_ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass))" ] @@ -200,10 +205,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In our case, this coefficient has a meaningful unit: g/mm.\n", - "For instance, a coefficient of 40 g/mm, means that for each\n", - "additional millimeter in flipper length, the body weight predicted will\n", - "increase by 40 g." + "In our case, this coefficient has a meaningful unit: g/mm. For instance, a\n", + "coefficient of 40 g/mm, means that for each additional millimeter in flipper\n", + "length, the body weight predicted will increase by 40 g." ] }, { @@ -213,13 +217,17 @@ "outputs": [], "source": [ "body_mass_180 = linear_model_flipper_mass(\n", - " flipper_length=180, weight_flipper_length=40, intercept_body_mass=0)\n", + " flipper_length=180, weight_flipper_length=40, intercept_body_mass=0\n", + ")\n", "body_mass_181 = linear_model_flipper_mass(\n", - " flipper_length=181, weight_flipper_length=40, intercept_body_mass=0)\n", + " flipper_length=181, weight_flipper_length=40, intercept_body_mass=0\n", + ")\n", "\n", - "print(f\"The body mass for a flipper length of 180 mm \"\n", - " f\"is {body_mass_180} g and {body_mass_181} g \"\n", - " f\"for a flipper length of 181 mm\")" + "print(\n", + " \"The body mass for a flipper length of 180 mm \"\n", + " f\"is {body_mass_180} g and {body_mass_181} g \"\n", + " \"for a flipper length of 181 mm\"\n", + ")" ] }, { @@ -228,10 +236,10 @@ "source": [ "We can also see that we have a parameter `intercept_body_mass` in our model.\n", "This parameter corresponds to the value on the y-axis if `flipper_length=0`\n", - "(which in our case is only a mathematical consideration, as in our data,\n", - " the value of `flipper_length` only goes from 170mm to 230mm). This y-value\n", - "when x=0 is called the y-intercept. If `intercept_body_mass` is 0, the curve\n", - "will pass through the origin:" + "(which in our case is only a mathematical consideration, as in our data, the\n", + " value of `flipper_length` only goes from 170mm to 230mm). This y-value when\n", + "x=0 is called the y-intercept. 
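As a quick check of that definition, here is a minimal sketch reusing the `linear_model_flipper_mass` function defined above: at a flipper length of 0, the linear term vanishes and the prediction equals the intercept itself.

```python
# A sketch: with flipper_length=0, only the intercept parameter remains.
body_mass_at_zero = linear_model_flipper_mass(
    flipper_length=0, weight_flipper_length=45, intercept_body_mass=-5000
)
print(body_mass_at_zero)  # -5000
```
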
If `intercept_body_mass` is 0, the curve will\n", + "pass through the origin:" ] }, { @@ -246,7 +254,8 @@ "# redefined the flipper length to start at 0 to plot the intercept value\n", "flipper_length_range = np.linspace(0, data.max(), num=300)\n", "predicted_body_mass = linear_model_flipper_mass(\n", - " flipper_length_range, weight_flipper_length, intercept_body_mass)" + " flipper_length_range, weight_flipper_length, intercept_body_mass\n", + ")" ] }, { @@ -255,8 +264,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(flipper_length_range, predicted_body_mass)\n", "_ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass))" ] @@ -278,7 +288,8 @@ "intercept_body_mass = -5000\n", "\n", "predicted_body_mass = linear_model_flipper_mass(\n", - " flipper_length_range, weight_flipper_length, intercept_body_mass)" + " flipper_length_range, weight_flipper_length, intercept_body_mass\n", + ")" ] }, { @@ -287,8 +298,9 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "ax.plot(flipper_length_range, predicted_body_mass)\n", "_ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass))" ] diff --git a/notebooks/logistic_regression.ipynb b/notebooks/logistic_regression.ipynb index fa30beced..fc41a3402 100644 --- a/notebooks/logistic_regression.ipynb +++ b/notebooks/logistic_regression.ipynb @@ -5,13 +5,14 @@ "metadata": {}, "source": [ "# Linear model for classification\n", + "\n", "In regression, we saw that the target to be predicted was a continuous\n", "variable. In classification, this target will be discrete (e.g. categorical).\n", "\n", "We will go back to our penguin dataset. However, this time we will try to\n", "predict the penguin species using the culmen information. We will also\n", - "simplify our classification problem by selecting only 2 of the penguin\n", - "species to solve a binary classification problem." + "simplify our classification problem by selecting only 2 of the penguin species\n", + "to solve a binary classification problem." ] }, { @@ -36,8 +37,9 @@ "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", "\n", "# only keep the Adelie and Chinstrap classes\n", - "penguins = penguins.set_index(\"Species\").loc[\n", - " [\"Adelie\", \"Chinstrap\"]].reset_index()\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", "target_column = \"Species\"" ] @@ -68,13 +70,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can observe that we have quite a simple problem. When the culmen\n", - "length increases, the probability that the penguin is a Chinstrap is closer\n", - "to 1. However, the culmen depth is not helpful for predicting the penguin\n", - "species.\n", + "We can observe that we have quite a simple problem. 
When the culmen length\n", + "increases, the probability that the penguin is a Chinstrap is closer to 1.\n", + "However, the culmen depth is not helpful for predicting the penguin species.\n", "\n", - "For model fitting, we will separate the target from the data and\n", - "we will create a training and a testing set." + "For model fitting, we will separate the target from the data and we will\n", + "create a training and a testing set." ] }, { @@ -98,11 +99,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "The linear regression that we previously saw will predict a continuous\n", - "output. When the target is a binary outcome, one can use the logistic\n", - "function to model the probability. This model is known as logistic\n", - "regression.\n", + "The linear regression that we previously saw will predict a continuous output.\n", + "When the target is a binary outcome, one can use the logistic function to\n", + "model the probability. This model is known as logistic regression.\n", "\n", "Scikit-learn provides the class `LogisticRegression` which implements this\n", "algorithm." @@ -130,11 +129,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Since we are dealing with a classification problem containing only 2\n", - "features, it is then possible to observe the decision function boundary.\n", - "The boundary is the rule used by our predictive model to affect a class label\n", - "given the feature values of the sample.\n", + "Since we are dealing with a classification problem containing only 2 features,\n", + "it is then possible to observe the decision function boundary. The boundary is\n", + "the rule used by our predictive model to affect a class label given the\n", + "feature values of the sample.\n", "\n", "
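Before plotting, this rule can be made explicit with a minimal sketch, assuming the fitted `logistic_regression` model and the `data_test` split created above: the predicted class is the one whose estimated probability exceeds 0.5, so the boundary lies where both probabilities are equal.

```python
import numpy as np

# Recover the hard predictions by thresholding the estimated probabilities;
# the decision boundary is the set of points where the probability is 0.5.
proba = logistic_regression.predict_proba(data_test)
thresholded = np.where(
    proba[:, 1] > 0.5,
    logistic_regression.classes_[1],
    logistic_regression.classes_[0],
)
print((thresholded == logistic_regression.predict(data_test)).all())
```
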
\n", "

Note

\n", @@ -158,11 +156,19 @@ "from sklearn.inspection import DecisionBoundaryDisplay\n", "\n", "DecisionBoundaryDisplay.from_estimator(\n", - " logistic_regression, data_test, response_method=\"predict\", cmap=\"RdBu_r\", alpha=0.5\n", + " logistic_regression,\n", + " data_test,\n", + " response_method=\"predict\",\n", + " cmap=\"RdBu_r\",\n", + " alpha=0.5,\n", ")\n", "sns.scatterplot(\n", - " data=penguins_test, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=[\"tab:red\", \"tab:blue\"])\n", + " data=penguins_test,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Decision boundary of the trained\\n LogisticRegression\")" ] }, diff --git a/notebooks/logistic_regression_non_linear.ipynb b/notebooks/logistic_regression_non_linear.ipynb index dae0e199b..ccc05be33 100644 --- a/notebooks/logistic_regression_non_linear.ipynb +++ b/notebooks/logistic_regression_non_linear.ipynb @@ -6,16 +6,15 @@ "source": [ "# Beyond linear separation in classification\n", "\n", - "As we saw in the regression section, the linear classification model\n", - "expects the data to be linearly separable. When this assumption does not\n", - "hold, the model is not expressive enough to properly fit the data.\n", - "Therefore, we need to apply the same tricks as in regression: feature\n", - "augmentation (potentially using expert-knowledge) or using a\n", - "kernel-based method.\n", + "As we saw in the regression section, the linear classification model expects\n", + "the data to be linearly separable. When this assumption does not hold, the\n", + "model is not expressive enough to properly fit the data. Therefore, we need to\n", + "apply the same tricks as in regression: feature augmentation (potentially\n", + "using expert-knowledge) or using a kernel-based method.\n", "\n", - "We will provide examples where we will use a kernel support vector machine\n", - "to perform classification on some toy-datasets where it is impossible to\n", - "find a perfect linear separation.\n", + "We will provide examples where we will use a kernel support vector machine to\n", + "perform classification on some toy-datasets where it is impossible to find a\n", + "perfect linear separation.\n", "\n", "We will generate a first dataset where the data are represented as two\n", "interlaced half circles. 
This dataset is generated using the function\n", @@ -38,8 +37,10 @@ "X, y = make_moons(n_samples=100, noise=0.13, random_state=42)\n", "\n", "# We store both the data and target in a dataframe to ease plotting\n", - "moons = pd.DataFrame(np.concatenate([X, y[:, np.newaxis]], axis=1),\n", - " columns=feature_names + [target_name])\n", + "moons = pd.DataFrame(\n", + " np.concatenate([X, y[:, np.newaxis]], axis=1),\n", + " columns=feature_names + [target_name],\n", + ")\n", "data_moons, target_moons = moons[feature_names], moons[target_name]" ] }, @@ -60,8 +61,13 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "sns.scatterplot(data=moons, x=feature_names[0], y=feature_names[1],\n", - " hue=target_moons, palette=[\"tab:red\", \"tab:blue\"])\n", + "sns.scatterplot(\n", + " data=moons,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_moons,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Illustration of the moons dataset\")" ] }, @@ -69,13 +75,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "From the intuitions that we got by studying linear model, it should be\n", - "obvious that a linear classifier will not be able to find a perfect decision\n", - "function to separate the two classes.\n", + "From the intuitions that we got by studying linear model, it should be obvious\n", + "that a linear classifier will not be able to find a perfect decision function\n", + "to separate the two classes.\n", "\n", - "Let's try to see what is the decision boundary of such a linear classifier.\n", - "We will create a predictive model by standardizing the dataset followed by\n", - "a linear support vector machine classifier." + "Let's try to see what is the decision boundary of such a linear classifier. We\n", + "will create a predictive model by standardizing the dataset followed by a\n", + "linear support vector machine classifier." ] }, { @@ -98,11 +104,11 @@ "source": [ "
\n", "

Warning

\n", - "

Be aware that we fit and will check the boundary decision of the classifier\n", - "on the same dataset without splitting the dataset into a training set and a\n", + "

Be aware that we fit and will check the boundary decision of the classifier on\n", + "the same dataset without splitting the dataset into a training set and a\n", "testing set. While this is a bad practice, we use it for the sake of\n", - "simplicity to depict the model behavior. Always use cross-validation when\n", - "you want to assess the generalization performance of a machine-learning model.

\n", + "simplicity to depict the model behavior. Always use cross-validation when you\n", + "want to assess the generalization performance of a machine-learning model.

\n", "
" ] }, @@ -124,8 +130,13 @@ "DecisionBoundaryDisplay.from_estimator(\n", " linear_model, data_moons, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "sns.scatterplot(data=moons, x=feature_names[0], y=feature_names[1],\n", - " hue=target_moons, palette=[\"tab:red\", \"tab:blue\"])\n", + "sns.scatterplot(\n", + " data=moons,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_moons,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Decision boundary of a linear model\")" ] }, @@ -152,9 +163,12 @@ "target_name = \"class\"\n", "\n", "X, y = make_gaussian_quantiles(\n", - " n_samples=100, n_features=2, n_classes=2, random_state=42)\n", - "gauss = pd.DataFrame(np.concatenate([X, y[:, np.newaxis]], axis=1),\n", - " columns=feature_names + [target_name])\n", + " n_samples=100, n_features=2, n_classes=2, random_state=42\n", + ")\n", + "gauss = pd.DataFrame(\n", + " np.concatenate([X, y[:, np.newaxis]], axis=1),\n", + " columns=feature_names + [target_name],\n", + ")\n", "data_gauss, target_gauss = gauss[feature_names], gauss[target_name]" ] }, @@ -164,8 +178,13 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=gauss, x=feature_names[0], y=feature_names[1],\n", - " hue=target_gauss, palette=[\"tab:red\", \"tab:blue\"])\n", + "ax = sns.scatterplot(\n", + " data=gauss,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_gauss,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Illustration of the Gaussian quantiles dataset\")" ] }, @@ -188,8 +207,13 @@ "DecisionBoundaryDisplay.from_estimator(\n", " linear_model, data_gauss, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "sns.scatterplot(data=gauss, x=feature_names[0], y=feature_names[1],\n", - " hue=target_gauss, palette=[\"tab:red\", \"tab:blue\"])\n", + "sns.scatterplot(\n", + " data=gauss,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_gauss,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Decision boundary of a linear model\")" ] }, @@ -198,13 +222,13 @@ "metadata": {}, "source": [ "As expected, a linear separation cannot be used to separate the classes\n", - "properly: the model will under-fit as it will make errors even on\n", - "the training set.\n", + "properly: the model will under-fit as it will make errors even on the training\n", + "set.\n", "\n", "In the section about linear regression, we saw that we could use several\n", - "tricks to make a linear model more flexible by augmenting features or\n", - "using a kernel. Here, we will use the later solution by using a radial basis\n", - "function (RBF) kernel together with a support vector machine classifier.\n", + "tricks to make a linear model more flexible by augmenting features or using a\n", + "kernel. Here, we will use the later solution by using a radial basis function\n", + "(RBF) kernel together with a support vector machine classifier.\n", "\n", "We will repeat the two previous experiments and check the obtained decision\n", "function." 
@@ -229,8 +253,13 @@ "DecisionBoundaryDisplay.from_estimator(\n", " kernel_model, data_moons, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "sns.scatterplot(data=moons, x=feature_names[0], y=feature_names[1],\n", - " hue=target_moons, palette=[\"tab:red\", \"tab:blue\"])\n", + "sns.scatterplot(\n", + " data=moons,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_moons,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Decision boundary with a model using an RBF kernel\")" ] }, @@ -238,8 +267,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the decision boundary is not anymore a straight line. Indeed,\n", - "an area is defined around the red samples and we could imagine that this\n", + "We see that the decision boundary is not anymore a straight line. Indeed, an\n", + "area is defined around the red samples and we could imagine that this\n", "classifier should be able to generalize on unseen data.\n", "\n", "Let's check the decision function on the second dataset." @@ -255,8 +284,13 @@ "DecisionBoundaryDisplay.from_estimator(\n", " kernel_model, data_gauss, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "ax = sns.scatterplot(data=gauss, x=feature_names[0], y=feature_names[1],\n", - " hue=target_gauss, palette=[\"tab:red\", \"tab:blue\"])\n", + "ax = sns.scatterplot(\n", + " data=gauss,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_gauss,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Decision boundary with a model using an RBF kernel\")" ] }, @@ -272,10 +306,10 @@ "\n", "Keep in mind that adding flexibility to a model can also risk increasing\n", "overfitting by making the decision function to be sensitive to individual\n", - "(possibly noisy) data points of the training set. Here we can observe that\n", - "the decision functions remain smooth enough to preserve good generalization.\n", - "If you are curious, you can try to repeat the above experiment with\n", - "`gamma=100` and look at the decision functions." + "(possibly noisy) data points of the training set. Here we can observe that the\n", + "decision functions remain smooth enough to preserve good generalization. If\n", + "you are curious, you can try to repeat the above experiment with `gamma=100`\n", + "and look at the decision functions." ] } ], diff --git a/notebooks/matplotlibrc b/notebooks/matplotlibrc index f05e0c23b..5b66f14e6 100644 --- a/notebooks/matplotlibrc +++ b/notebooks/matplotlibrc @@ -28,4 +28,4 @@ ytick.labelsize: 16.5 ytick.major.size: 9.0 ytick.major.width: 1.875 ytick.minor.size: 6.0 -ytick.minor.width: 1.5 \ No newline at end of file +ytick.minor.width: 1.5 diff --git a/notebooks/metrics_classification.ipynb b/notebooks/metrics_classification.ipynb index 6332bc49e..4cda309ba 100644 --- a/notebooks/metrics_classification.ipynb +++ b/notebooks/metrics_classification.ipynb @@ -10,13 +10,13 @@ "its minimum or maximum. It is important to understand that this objective\n", "function is usually decoupled from the evaluation metric that we want to\n", "optimize in practice. The objective function serves as a proxy for the\n", - "evaluation metric. Therefore, in the upcoming notebooks, we will present\n", - "the different evaluation metrics used in machine learning.\n", + "evaluation metric. 
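To make this decoupling concrete, here is a minimal self-contained sketch with made-up arrays `y_true` and `y_proba`: a classifier can be trained by minimizing the log-loss while being reported on with accuracy.

```python
import numpy as np
from sklearn.metrics import accuracy_score, log_loss

y_true = np.array([0, 0, 1, 1])
y_proba = np.array([0.1, 0.6, 0.8, 0.9])  # hypothetical P(y=1) estimates
y_pred = (y_proba > 0.5).astype(int)

print(log_loss(y_true, y_proba))       # objective minimized during training
print(accuracy_score(y_true, y_pred))  # metric reported in practice
```
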
Therefore, in the upcoming notebooks, we will present the\n", + "different evaluation metrics used in machine learning.\n", "\n", "This notebook aims at giving an overview of the classification metrics that\n", - "can be used to evaluate the predictive model generalization performance. We can\n", - "recall that in a classification setting, the vector `target` is categorical\n", - "rather than continuous.\n", + "can be used to evaluate the predictive model generalization performance. We\n", + "can recall that in a classification setting, the vector `target` is\n", + "categorical rather than continuous.\n", "\n", "We will load the blood transfusion dataset." ] @@ -86,7 +86,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, shuffle=True, random_state=0, test_size=0.5)" + " data, target, shuffle=True, random_state=0, test_size=0.5\n", + ")" ] }, { @@ -115,6 +116,7 @@ "metadata": {}, "source": [ "## Classifier predictions\n", + "\n", "Before we go into details regarding the metrics, we will recall what type of\n", "predictions a classifier can provide.\n", "\n", @@ -160,13 +162,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With this information, our classifier predicts that this synthetic subject\n", - "is more likely to not donate blood again.\n", + "With this information, our classifier predicts that this synthetic subject is\n", + "more likely to not donate blood again.\n", "\n", - "However, we cannot check whether the prediction is correct (we do not know\n", - "the true target value). That's the purpose of the testing set. First, we\n", - "predict whether a subject will give blood with the help of the trained\n", - "classifier." + "However, we cannot check whether the prediction is correct (we do not know the\n", + "true target value). That's the purpose of the testing set. First, we predict\n", + "whether a subject will give blood with the help of the trained classifier." ] }, { @@ -184,6 +185,7 @@ "metadata": {}, "source": [ "## Accuracy as a baseline\n", + "\n", "Now that we have these predictions, we can compare them with the true\n", "predictions (sometimes called ground-truth) which we did not use until now." ] @@ -203,8 +205,8 @@ "source": [ "In the comparison above, a `True` value means that the value predicted by our\n", "classifier is identical to the real value, while a `False` means that our\n", - "classifier made a mistake. One way of getting an overall rate representing\n", - "the generalization performance of our classifier would be to compute how many\n", + "classifier made a mistake. One way of getting an overall rate representing the\n", + "generalization performance of our classifier would be to compute how many\n", "times our classifier was right and divide it by the number of samples in our\n", "set." ] @@ -224,9 +226,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This measure is called the accuracy. Here, our classifier is 78%\n", - "accurate at classifying if a subject will give blood. `scikit-learn` provides\n", - "a function that computes this metric in the module `sklearn.metrics`." + "This measure is called the accuracy. Here, our classifier is 78% accurate at\n", + "classifying if a subject will give blood. `scikit-learn` provides a function\n", + "that computes this metric in the module `sklearn.metrics`." 
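A minimal sketch of that function in action, assuming the `target_test` and `target_predicted` variables defined above:

```python
from sklearn.metrics import accuracy_score

# Same value as the manual average of correct predictions computed earlier.
print(accuracy_score(target_test, target_predicted))
```
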
] }, { @@ -263,6 +265,7 @@ "metadata": {}, "source": [ "## Confusion matrix and derived metrics\n", + "\n", "The comparison that we did above and the accuracy that we calculated did not\n", "take into account the type of error our classifier was making. Accuracy is an\n", "aggregate of the errors made by the classifier. We may be interested in finer\n", @@ -288,20 +291,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The in-diagonal numbers are related to predictions that were correct\n", - "while off-diagonal numbers are related to incorrect predictions\n", + "The in-diagonal numbers are related to predictions that were correct while\n", + "off-diagonal numbers are related to incorrect predictions\n", "(misclassifications). We now know the four types of correct and erroneous\n", "predictions:\n", "\n", - "* the top left corner are true positives (TP) and corresponds to people\n", - " who gave blood and were predicted as such by the classifier;\n", - "* the bottom right corner are true negatives (TN) and correspond to\n", - " people who did not give blood and were predicted as such by the\n", - " classifier;\n", - "* the top right corner are false negatives (FN) and correspond to\n", - " people who gave blood but were predicted to not have given blood;\n", - "* the bottom left corner are false positives (FP) and correspond to\n", - " people who did not give blood but were predicted to have given blood.\n", + "* the top left corner are true positives (TP) and corresponds to people who\n", + " gave blood and were predicted as such by the classifier;\n", + "* the bottom right corner are true negatives (TN) and correspond to people who\n", + " did not give blood and were predicted as such by the classifier;\n", + "* the top right corner are false negatives (FN) and correspond to people who\n", + " gave blood but were predicted to not have given blood;\n", + "* the bottom left corner are false positives (FP) and correspond to people who\n", + " did not give blood but were predicted to have given blood.\n", "\n", "Once we have split this information, we can compute metrics to highlight the\n", "generalization performance of our classifier in a particular setting. For\n", @@ -309,14 +311,13 @@ "blood when the classifier predicted so or the fraction of people predicted to\n", "have given blood out of the total population that actually did so.\n", "\n", - "The former metric, known as the precision, is defined as TP / (TP + FP)\n", - "and represents how likely the person actually gave blood when the classifier\n", - "predicted that they did.\n", - "The latter, known as the recall, defined as TP / (TP + FN) and\n", - "assesses how well the classifier is able to correctly identify people who\n", - "did give blood.\n", - "We could, similarly to accuracy, manually compute these values,\n", - "however scikit-learn provides functions to compute these statistics." + "The former metric, known as the precision, is defined as TP / (TP + FP) and\n", + "represents how likely the person actually gave blood when the classifier\n", + "predicted that they did. The latter, known as the recall, defined as TP / (TP\n", + "+ FN) and assesses how well the classifier is able to correctly identify\n", + "people who did give blood. We could, similarly to accuracy, manually compute\n", + "these values, however scikit-learn provides functions to compute these\n", + "statistics." ] }, { @@ -341,17 +342,17 @@ "These results are in line with what was seen in the confusion matrix. 
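Before reading the matrix in detail, those two definitions can be cross-checked by hand; a sketch assuming `target_test` and `target_predicted` from above, with the label order fixed so that "donated" comes first:

```python
from sklearn.metrics import confusion_matrix

# Rows are true classes and columns are predicted classes, in the given order.
cm = confusion_matrix(
    target_test, target_predicted, labels=["donated", "not donated"]
)
tp, fn, fp, tn = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
print(f"precision = {tp / (tp + fp):.3f}, recall = {tp / (tp + fn):.3f}")
```
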
Looking\n", "at the left column, more than half of the \"donated\" predictions were correct,\n", "leading to a precision above 0.5. However, our classifier mislabeled a lot of\n", - "people who gave blood as \"not donated\", leading to a very low recall of\n", - "around 0.1.\n", + "people who gave blood as \"not donated\", leading to a very low recall of around\n", + "0.1.\n", "\n", "## The issue of class imbalance\n", "At this stage, we could ask ourself a reasonable question. While the accuracy\n", "did not look bad (i.e. 77%), the recall score is relatively low (i.e. 12%).\n", "\n", "As we mentioned, precision and recall only focuses on samples predicted to be\n", - "positive, while accuracy takes both into account. In addition, we did not\n", - "look at the ratio of classes (labels). We could check this ratio in the\n", - "training set." + "positive, while accuracy takes both into account. In addition, we did not look\n", + "at the ratio of classes (labels). We could check this ratio in the training\n", + "set." ] }, { @@ -386,8 +387,10 @@ "\n", "dummy_classifier = DummyClassifier(strategy=\"most_frequent\")\n", "dummy_classifier.fit(data_train, target_train)\n", - "print(f\"Accuracy of the dummy classifier: \"\n", - " f\"{dummy_classifier.score(data_test, target_test):.3f}\")" + "print(\n", + " \"Accuracy of the dummy classifier: \"\n", + " f\"{dummy_classifier.score(data_test, target_test):.3f}\"\n", + ")" ] }, { @@ -430,11 +433,10 @@ "\n", "All statistics that we presented up to now rely on `classifier.predict` which\n", "outputs the most likely label. We haven't made use of the probability\n", - "associated with this prediction, which gives the confidence of the\n", - "classifier in this prediction. By default, the prediction of a classifier\n", - "corresponds to a threshold of 0.5 probability in a binary classification\n", - "problem. We can quickly check this relationship with the classifier that\n", - "we trained." + "associated with this prediction, which gives the confidence of the classifier\n", + "in this prediction. By default, the prediction of a classifier corresponds to\n", + "a threshold of 0.5 probability in a binary classification problem. We can\n", + "quickly check this relationship with the classifier that we trained." ] }, { @@ -443,8 +445,9 @@ "metadata": {}, "outputs": [], "source": [ - "target_proba_predicted = pd.DataFrame(classifier.predict_proba(data_test),\n", - " columns=classifier.classes_)\n", + "target_proba_predicted = pd.DataFrame(\n", + " classifier.predict_proba(data_test), columns=classifier.classes_\n", + ")\n", "target_proba_predicted[:5]" ] }, @@ -462,8 +465,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since probabilities sum to 1 we can get the class with the highest\n", - "probability without using the threshold 0.5." + "Since probabilities sum to 1 we can get the class with the highest probability\n", + "without using the threshold 0.5." ] }, { @@ -473,7 +476,8 @@ "outputs": [], "source": [ "equivalence_pred_proba = (\n", - " target_proba_predicted.idxmax(axis=1).to_numpy() == target_predicted)\n", + " target_proba_predicted.idxmax(axis=1).to_numpy() == target_predicted\n", + ")\n", "np.all(equivalence_pred_proba)" ] }, @@ -482,12 +486,12 @@ "metadata": {}, "source": [ "The default decision threshold (0.5) might not be the best threshold that\n", - "leads to optimal generalization performance of our classifier. 
In this case, one\n", - "can vary the decision threshold, and therefore the underlying prediction, and\n", - "compute the same statistics presented earlier. Usually, the two metrics\n", + "leads to optimal generalization performance of our classifier. In this case,\n", + "one can vary the decision threshold, and therefore the underlying prediction,\n", + "and compute the same statistics presented earlier. Usually, the two metrics\n", "recall and precision are computed and plotted on a graph. Each metric plotted\n", - "on a graph axis and each point on the graph corresponds to a specific\n", - "decision threshold. Let's start by computing the precision-recall curve." + "on a graph axis and each point on the graph corresponds to a specific decision\n", + "threshold. Let's start by computing the precision-recall curve." ] }, { @@ -499,12 +503,17 @@ "from sklearn.metrics import PrecisionRecallDisplay\n", "\n", "disp = PrecisionRecallDisplay.from_estimator(\n", - " classifier, data_test, target_test, pos_label='donated',\n", - " marker=\"+\"\n", + " classifier, data_test, target_test, pos_label=\"donated\", marker=\"+\"\n", ")\n", "disp = PrecisionRecallDisplay.from_estimator(\n", - " dummy_classifier, data_test, target_test, pos_label='donated',\n", - " color=\"tab:orange\", linestyle=\"--\", ax=disp.ax_)\n", + " dummy_classifier,\n", + " data_test,\n", + " target_test,\n", + " pos_label=\"donated\",\n", + " color=\"tab:orange\",\n", + " linestyle=\"--\",\n", + " ax=disp.ax_,\n", + ")\n", "plt.xlabel(\"Recall (also known as TPR or sensitivity)\")\n", "plt.ylabel(\"Precision (also known as PPV)\")\n", "plt.xlim(0, 1)\n", @@ -547,9 +556,7 @@ "metadata": {}, "outputs": [], "source": [ - "prevalence = (\n", - " target_test.value_counts()[1] / target_test.value_counts().sum()\n", - ")\n", + "prevalence = target_test.value_counts()[1] / target_test.value_counts().sum()\n", "print(f\"Prevalence of the class 'donated': {prevalence:.2f}\")" ] }, @@ -562,8 +569,8 @@ "positive class and accurately discriminating the negative classes. The\n", "statistics used for this are sensitivity and specificity. Sensitivity is just\n", "another name for recall. However, specificity measures the proportion of\n", - "correctly classified samples in the negative class defined as: TN / (TN +\n", - "FP). Similar to the precision-recall curve, sensitivity and specificity are\n", + "correctly classified samples in the negative class defined as: TN / (TN + FP).\n", + "Similar to the precision-recall curve, sensitivity and specificity are\n", "generally plotted as a curve called the Receiver Operating Characteristic\n", "(ROC) curve. 
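Since specificity is nothing else than the recall of the negative class, both quantities can be obtained from `recall_score` by switching `pos_label`; a sketch reusing the thresholded predictions from above, with the ROC curve itself following below:

```python
from sklearn.metrics import recall_score

sensitivity = recall_score(target_test, target_predicted, pos_label="donated")
specificity = recall_score(
    target_test, target_predicted, pos_label="not donated"
)
print(f"sensitivity = {sensitivity:.3f}, specificity = {specificity:.3f}")
```
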
Below is such a curve:" ] @@ -577,11 +584,17 @@ "from sklearn.metrics import RocCurveDisplay\n", "\n", "disp = RocCurveDisplay.from_estimator(\n", - " classifier, data_test, target_test, pos_label='donated',\n", - " marker=\"+\")\n", + " classifier, data_test, target_test, pos_label=\"donated\", marker=\"+\"\n", + ")\n", "disp = RocCurveDisplay.from_estimator(\n", - " dummy_classifier, data_test, target_test, pos_label='donated',\n", - " color=\"tab:orange\", linestyle=\"--\", ax=disp.ax_)\n", + " dummy_classifier,\n", + " data_test,\n", + " target_test,\n", + " pos_label=\"donated\",\n", + " color=\"tab:orange\",\n", + " linestyle=\"--\",\n", + " ax=disp.ax_,\n", + ")\n", "plt.xlabel(\"False positive rate\")\n", "plt.ylabel(\"True positive rate\\n(also known as sensitivity or recall)\")\n", "plt.xlim(0, 1)\n", diff --git a/notebooks/metrics_ex_01.ipynb b/notebooks/metrics_ex_01.ipynb index 75ecc2bcf..e4d91520d 100644 --- a/notebooks/metrics_ex_01.ipynb +++ b/notebooks/metrics_ex_01.ipynb @@ -59,9 +59,9 @@ "source": [ "Create a `StratifiedKFold` cross-validation object. Then use it inside the\n", "`cross_val_score` function to evaluate the decision tree. We will first use\n", - "the accuracy as a score function. Explicitly use the `scoring` parameter\n", - "of `cross_val_score` to compute the accuracy (even if this is the default\n", - "score). Check its documentation to learn how to do that." + "the accuracy as a score function. Explicitly use the `scoring` parameter of\n", + "`cross_val_score` to compute the accuracy (even if this is the default score).\n", + "Check its documentation to learn how to do that." ] }, { @@ -93,12 +93,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will now add a bit of complexity. We would like to compute the precision\n", - "of our model. However, during the course we saw that we need to mention the\n", + "We will now add a bit of complexity. We would like to compute the precision of\n", + "our model. However, during the course we saw that we need to mention the\n", "positive label which in our case we consider to be the class `donated`.\n", "\n", - "We will show that computing the precision without providing the positive\n", - "label will not be supported by scikit-learn because it is indeed ambiguous." + "We will show that computing the precision without providing the positive label\n", + "will not be supported by scikit-learn because it is indeed ambiguous." ] }, { @@ -132,9 +132,8 @@ "\n", "So, import `sklearn.metrics.make_scorer` and\n", "`sklearn.metrics.precision_score`. Check their documentations for more\n", - "information.\n", - "Finally, create a scorer by calling `make_scorer` using the score function\n", - "`precision_score` and pass the extra parameter `pos_label=\"donated\"`." + "information. Finally, create a scorer by calling `make_scorer` using the score\n", + "function `precision_score` and pass the extra parameter `pos_label=\"donated\"`." ] }, { @@ -169,8 +168,8 @@ "source": [ "`cross_val_score` will only compute a single score provided to the `scoring`\n", "parameter. The function `cross_validate` allows the computation of multiple\n", - "scores by passing a list of string or scorer to the parameter `scoring`,\n", - "which could be handy.\n", + "scores by passing a list of string or scorer to the parameter `scoring`, which\n", + "could be handy.\n", "\n", "Import `sklearn.model_selection.cross_validate` and compute the accuracy and\n", "balanced accuracy through cross-validation. 
Plot the cross-validation score\n", @@ -189,7 +188,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/metrics_ex_02.ipynb b/notebooks/metrics_ex_02.ipynb index 2f7436095..dc161b983 100644 --- a/notebooks/metrics_ex_02.ipynb +++ b/notebooks/metrics_ex_02.ipynb @@ -114,7 +114,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/metrics_regression.ipynb b/notebooks/metrics_regression.ipynb index 33080cb44..36b27bb14 100644 --- a/notebooks/metrics_regression.ipynb +++ b/notebooks/metrics_regression.ipynb @@ -14,9 +14,9 @@ "it is a continuous variable in regression, while a discrete variable in\n", "classification.\n", "\n", - "We will use the Ames housing dataset. The goal is to predict the price\n", - "of houses in the city of Ames, Iowa. As with classification, we will only use\n", - "a single train-test split to focus solely on the regression metrics." + "We will use the Ames housing dataset. The goal is to predict the price of\n", + "houses in the city of Ames, Iowa. As with classification, we will only use a\n", + "single train-test split to focus solely on the regression metrics." ] }, { @@ -72,10 +72,9 @@ "source": [ "Some machine learning models are designed to be solved as an optimization\n", "problem: minimizing an error (also known as the loss function) using a\n", - "training set.\n", - "A basic loss function used in regression is the mean squared error (MSE).\n", - "Thus, this metric is sometimes used to evaluate the model since it is\n", - "optimized by said model.\n", + "training set. A basic loss function used in regression is the mean squared\n", + "error (MSE). Thus, this metric is sometimes used to evaluate the model since\n", + "it is optimized by said model.\n", "\n", "We will give an example using a linear regression model." ] @@ -93,8 +92,10 @@ "regressor.fit(data_train, target_train)\n", "target_predicted = regressor.predict(data_train)\n", "\n", - "print(f\"Mean squared error on the training set: \"\n", - " f\"{mean_squared_error(target_train, target_predicted):.3f}\")" + "print(\n", + " \"Mean squared error on the training set: \"\n", + " f\"{mean_squared_error(target_train, target_predicted):.3f}\"\n", + ")" ] }, { @@ -102,8 +103,8 @@ "metadata": {}, "source": [ "Our linear regression model is minimizing the mean squared error on the\n", - "training set. It means that there is no other set of coefficients which\n", - "will decrease the error.\n", + "training set. It means that there is no other set of coefficients which will\n", + "decrease the error.\n", "\n", "Then, we can compute the mean squared error on the test set." ] @@ -116,18 +117,20 @@ "source": [ "target_predicted = regressor.predict(data_test)\n", "\n", - "print(f\"Mean squared error on the testing set: \"\n", - " f\"{mean_squared_error(target_test, target_predicted):.3f}\")" + "print(\n", + " \"Mean squared error on the testing set: \"\n", + " f\"{mean_squared_error(target_test, target_predicted):.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The raw MSE can be difficult to interpret. One way is to rescale the MSE\n", - "by the variance of the target. This score is known as the $R^2$ also called\n", - "the coefficient of determination. Indeed, this is the default score used\n", - "in scikit-learn by calling the method `score`." + "The raw MSE can be difficult to interpret. 
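Part of the difficulty is that the MSE is expressed in the square of the target's unit, here (k$)^2. Recomputing it by hand makes this visible; a sketch assuming `target_test` and `target_predicted` from the cell above:

```python
import numpy as np

# The MSE is the average of the squared residuals, hence the squared unit.
manual_mse = np.mean((target_test - target_predicted) ** 2)
print(f"{manual_mse:.3f} (k$)^2")
```
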
One way is to rescale the MSE by\n", + "the variance of the target. This score is known as the $R^2$ also called the\n", + "coefficient of determination. Indeed, this is the default score used in\n", + "scikit-learn by calling the method `score`." ] }, { @@ -159,8 +162,10 @@ "\n", "dummy_regressor = DummyRegressor(strategy=\"mean\")\n", "dummy_regressor.fit(data_train, target_train)\n", - "print(f\"R2 score for a regressor predicting the mean:\"\n", - " f\"{dummy_regressor.score(data_test, target_test):.3f}\")" + "print(\n", + " \"R2 score for a regressor predicting the mean:\"\n", + " f\"{dummy_regressor.score(data_test, target_test):.3f}\"\n", + ")" ] }, { @@ -183,8 +188,10 @@ "from sklearn.metrics import mean_absolute_error\n", "\n", "target_predicted = regressor.predict(data_test)\n", - "print(f\"Mean absolute error: \"\n", - " f\"{mean_absolute_error(target_test, target_predicted):.3f} k$\")" + "print(\n", + " \"Mean absolute error: \"\n", + " f\"{mean_absolute_error(target_test, target_predicted):.3f} k$\"\n", + ")" ] }, { @@ -206,8 +213,10 @@ "source": [ "from sklearn.metrics import median_absolute_error\n", "\n", - "print(f\"Median absolute error: \"\n", - " f\"{median_absolute_error(target_test, target_predicted):.3f} k$\")" + "print(\n", + " \"Median absolute error: \"\n", + " f\"{median_absolute_error(target_test, target_predicted):.3f} k$\"\n", + ")" ] }, { @@ -230,16 +239,18 @@ "source": [ "from sklearn.metrics import mean_absolute_percentage_error\n", "\n", - "print(f\"Mean absolute percentage error: \"\n", - " f\"{mean_absolute_percentage_error(target_test, target_predicted) * 100:.3f} %\")" + "print(\n", + " \"Mean absolute percentage error: \"\n", + " f\"{mean_absolute_percentage_error(target_test, target_predicted) * 100:.3f} %\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In addition of metrics, we can visually represent the results by plotting\n", - "the predicted values versus the true values." + "In addition of metrics, we can visually represent the results by plotting the\n", + "predicted values versus the true values." ] }, { @@ -249,7 +260,9 @@ "outputs": [], "source": [ "predicted_actual = {\n", - " \"True values (k$)\": target_test, \"Predicted values (k$)\": target_predicted}\n", + " \"True values (k$)\": target_test,\n", + " \"Predicted values (k$)\": target_predicted,\n", + "}\n", "predicted_actual = pd.DataFrame(predicted_actual)" ] }, @@ -262,11 +275,15 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "sns.scatterplot(data=predicted_actual,\n", - " x=\"True values (k$)\", y=\"Predicted values (k$)\",\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=predicted_actual,\n", + " x=\"True values (k$)\",\n", + " y=\"Predicted values (k$)\",\n", + " color=\"black\",\n", + " alpha=0.5,\n", + ")\n", "plt.axline((0, 0), slope=1, label=\"Perfect fit\")\n", - "plt.axis('square')\n", + "plt.axis(\"square\")\n", "_ = plt.title(\"Regression using a model without \\ntarget transformation\")" ] }, @@ -275,8 +292,8 @@ "metadata": {}, "source": [ "On this plot, correct predictions would lie on the diagonal line. This plot\n", - "allows us to detect if the model makes errors in a consistent way, i.e.\n", - "has some bias.\n", + "allows us to detect if the model makes errors in a consistent way, i.e. has\n", + "some bias.\n", "\n", "On this plot, we see that for the large True price values, our model tends to\n", "under-estimate the price of the house. 
Typically, this issue arises when the\n", @@ -294,9 +311,11 @@ "from sklearn.compose import TransformedTargetRegressor\n", "\n", "transformer = QuantileTransformer(\n", - " n_quantiles=900, output_distribution=\"normal\")\n", + " n_quantiles=900, output_distribution=\"normal\"\n", + ")\n", "model_transformed_target = TransformedTargetRegressor(\n", - " regressor=regressor, transformer=transformer)\n", + " regressor=regressor, transformer=transformer\n", + ")\n", "model_transformed_target.fit(data_train, target_train)\n", "target_predicted = model_transformed_target.predict(data_test)" ] @@ -308,7 +327,9 @@ "outputs": [], "source": [ "predicted_actual = {\n", - " \"True values (k$)\": target_test, \"Predicted values (k$)\": target_predicted}\n", + " \"True values (k$)\": target_test,\n", + " \"Predicted values (k$)\": target_predicted,\n", + "}\n", "predicted_actual = pd.DataFrame(predicted_actual)" ] }, @@ -318,14 +339,19 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(data=predicted_actual,\n", - " x=\"True values (k$)\", y=\"Predicted values (k$)\",\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=predicted_actual,\n", + " x=\"True values (k$)\",\n", + " y=\"Predicted values (k$)\",\n", + " color=\"black\",\n", + " alpha=0.5,\n", + ")\n", "plt.axline((0, 0), slope=1, label=\"Perfect fit\")\n", - "plt.axis('square')\n", + "plt.axis(\"square\")\n", "plt.legend()\n", - "_ = plt.title(\"Regression using a model that\\n transform the target before \"\n", - " \"fitting\")" + "_ = plt.title(\n", + " \"Regression using a model that\\ntransform the target before fitting\"\n", + ")" ] }, { diff --git a/notebooks/metrics_sol_01.ipynb b/notebooks/metrics_sol_01.ipynb index d83c207a3..ebfdd861b 100644 --- a/notebooks/metrics_sol_01.ipynb +++ b/notebooks/metrics_sol_01.ipynb @@ -62,9 +62,9 @@ "source": [ "Create a `StratifiedKFold` cross-validation object. Then use it inside the\n", "`cross_val_score` function to evaluate the decision tree. We will first use\n", - "the accuracy as a score function. Explicitly use the `scoring` parameter\n", - "of `cross_val_score` to compute the accuracy (even if this is the default\n", - "score). Check its documentation to learn how to do that." + "the accuracy as a score function. Explicitly use the `scoring` parameter of\n", + "`cross_val_score` to compute the accuracy (even if this is the default score).\n", + "Check its documentation to learn how to do that." ] }, { @@ -95,8 +95,9 @@ "outputs": [], "source": [ "# solution\n", - "scores = cross_val_score(tree, data, target, cv=cv,\n", - " scoring=\"balanced_accuracy\")\n", + "scores = cross_val_score(\n", + " tree, data, target, cv=cv, scoring=\"balanced_accuracy\"\n", + ")\n", "print(f\"Balanced accuracy score: {scores.mean():.3f} \u00b1 {scores.std():.3f}\")" ] }, @@ -104,12 +105,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will now add a bit of complexity. We would like to compute the precision\n", - "of our model. However, during the course we saw that we need to mention the\n", + "We will now add a bit of complexity. We would like to compute the precision of\n", + "our model. However, during the course we saw that we need to mention the\n", "positive label which in our case we consider to be the class `donated`.\n", "\n", - "We will show that computing the precision without providing the positive\n", - "label will not be supported by scikit-learn because it is indeed ambiguous." 
+ "We will show that computing the precision without providing the positive label\n", + "will not be supported by scikit-learn because it is indeed ambiguous." ] }, { @@ -143,9 +144,8 @@ "\n", "So, import `sklearn.metrics.make_scorer` and\n", "`sklearn.metrics.precision_score`. Check their documentations for more\n", - "information.\n", - "Finally, create a scorer by calling `make_scorer` using the score function\n", - "`precision_score` and pass the extra parameter `pos_label=\"donated\"`." + "information. Finally, create a scorer by calling `make_scorer` using the score\n", + "function `precision_score` and pass the extra parameter `pos_label=\"donated\"`." ] }, { @@ -185,8 +185,8 @@ "source": [ "`cross_val_score` will only compute a single score provided to the `scoring`\n", "parameter. The function `cross_validate` allows the computation of multiple\n", - "scores by passing a list of string or scorer to the parameter `scoring`,\n", - "which could be handy.\n", + "scores by passing a list of string or scorer to the parameter `scoring`, which\n", + "could be handy.\n", "\n", "Import `sklearn.model_selection.cross_validate` and compute the accuracy and\n", "balanced accuracy through cross-validation. Plot the cross-validation score\n", @@ -201,8 +201,8 @@ "source": [ "# solution\n", "from sklearn.model_selection import cross_validate\n", - "scoring = [\"accuracy\", \"balanced_accuracy\"]\n", "\n", + "scoring = [\"accuracy\", \"balanced_accuracy\"]\n", "scores = cross_validate(tree, data, target, cv=cv, scoring=scoring)\n", "scores" ] @@ -223,7 +223,7 @@ "\n", "metrics = pd.DataFrame(\n", " [scores[\"test_accuracy\"], scores[\"test_balanced_accuracy\"]],\n", - " index=[\"Accuracy\", \"Balanced accuracy\"]\n", + " index=[\"Accuracy\", \"Balanced accuracy\"],\n", ").T" ] }, diff --git a/notebooks/metrics_sol_02.ipynb b/notebooks/metrics_sol_02.ipynb index 4b2a3b094..9dff91ae7 100644 --- a/notebooks/metrics_sol_02.ipynb +++ b/notebooks/metrics_sol_02.ipynb @@ -98,11 +98,11 @@ "outputs": [], "source": [ "# solution\n", - "scores = cross_val_score(model, data, target, cv=10,\n", - " scoring=\"neg_mean_absolute_error\")\n", + "scores = cross_val_score(\n", + " model, data, target, cv=10, scoring=\"neg_mean_absolute_error\"\n", + ")\n", "errors = -scores\n", - "print(f\"Mean absolute error: \"\n", - " f\"{errors.mean():.3f} k$ \u00b1 {errors.std():.3f}\")" + "print(f\"Mean absolute error: {errors.mean():.3f} k$ \u00b1 {errors.std():.3f}\")" ] }, { @@ -153,8 +153,10 @@ "source": [ "import pandas as pd\n", "\n", - "scores = {\"R2\": cv_results[\"test_r2\"],\n", - " \"MAE\": -cv_results[\"test_neg_mean_absolute_error\"]}\n", + "scores = {\n", + " \"R2\": cv_results[\"test_r2\"],\n", + " \"MAE\": -cv_results[\"test_neg_mean_absolute_error\"],\n", + "}\n", "scores = pd.DataFrame(scores)\n", "scores" ] diff --git a/notebooks/parameter_tuning_ex_02.ipynb b/notebooks/parameter_tuning_ex_02.ipynb index b6b18b96a..46345e86b 100644 --- a/notebooks/parameter_tuning_ex_02.ipynb +++ b/notebooks/parameter_tuning_ex_02.ipynb @@ -9,10 +9,9 @@ "The goal is to write an exhaustive search to find the best parameters\n", "combination maximizing the model generalization performance.\n", "\n", - "Here we use a small subset of the Adult Census dataset to make the code\n", - "faster to execute. Once your code works on the small subset, try to\n", - "change `train_size` to a larger value (e.g. 0.8 for 80% instead of\n", - "20%)." 
+ "Here we use a small subset of the Adult Census dataset to make the code faster\n", + "to execute. Once your code works on the small subset, try to change\n", + "`train_size` to a larger value (e.g. 0.8 for 80% instead of 20%)." ] }, { @@ -32,7 +31,8 @@ "data = adult_census.drop(columns=[target_name, \"education-num\"])\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, train_size=0.2, random_state=42)" + " data, target, train_size=0.2, random_state=42\n", + ")" ] }, { @@ -45,27 +45,36 @@ "from sklearn.compose import make_column_selector as selector\n", "from sklearn.preprocessing import OrdinalEncoder\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)\n", + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")\n", "preprocessor = ColumnTransformer(\n", - " [('cat_preprocessor', categorical_preprocessor,\n", - " selector(dtype_include=object))],\n", - " remainder='passthrough', sparse_threshold=0)\n", + " [\n", + " (\n", + " \"cat_preprocessor\",\n", + " categorical_preprocessor,\n", + " selector(dtype_include=object),\n", + " )\n", + " ],\n", + " remainder=\"passthrough\",\n", + " sparse_threshold=0,\n", + ")\n", "\n", "from sklearn.ensemble import HistGradientBoostingClassifier\n", "from sklearn.pipeline import Pipeline\n", "\n", - "model = Pipeline([\n", - " (\"preprocessor\", preprocessor),\n", - " (\"classifier\", HistGradientBoostingClassifier(random_state=42))\n", - "])" + "model = Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\"classifier\", HistGradientBoostingClassifier(random_state=42)),\n", + " ]\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Use the previously defined model (called `model`) and using two nested `for`\n", "loops, make a search of the best combinations of the `learning_rate` and\n", "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n", @@ -75,8 +84,8 @@ "- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls\n", " the ability of a new tree to correct the error of the previous sequence of\n", " trees\n", - "- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the\n", - " depth of each tree." + "- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth\n", + " of each tree." 
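One possible skeleton for this search is sketched below; the variable names are ours and the evaluation strategy is deliberately kept simple, so refining it is left to the exercise. It relies on the `<step>__<parameter>` convention to address the parameters of a pipeline step:

```python
best_score, best_params = -float("inf"), None
for learning_rate in (0.01, 0.1, 1, 10):
    for max_leaf_nodes in (3, 10, 30):
        # Address the classifier's parameters through the pipeline step name.
        model.set_params(
            classifier__learning_rate=learning_rate,
            classifier__max_leaf_nodes=max_leaf_nodes,
        )
        model.fit(data_train, target_train)
        score = model.score(data_test, target_test)
        if score > best_score:
            best_score, best_params = score, (learning_rate, max_leaf_nodes)
print(best_params, best_score)
```
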
] }, { @@ -109,7 +118,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/parameter_tuning_ex_03.ipynb b/notebooks/parameter_tuning_ex_03.ipynb index 02395ae84..ee40ef916 100644 --- a/notebooks/parameter_tuning_ex_03.ipynb +++ b/notebooks/parameter_tuning_ex_03.ipynb @@ -23,15 +23,16 @@ "target *= 100 # rescale the target in k$\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=42)" + " data, target, random_state=42\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this exercise, we will progressively define the regression pipeline\n", - "and later tune its hyperparameters.\n", + "In this exercise, we will progressively define the regression pipeline and\n", + "later tune its hyperparameters.\n", "\n", "Start by defining a pipeline that:\n", "* uses a `StandardScaler` to normalize the numerical data;\n", @@ -58,8 +59,8 @@ " `np.logspace(0, 3, num=10).astype(np.int32)`;\n", "- the parameter `with_mean` of the `StandardScaler` with possible values\n", " `True` or `False`;\n", - "- the parameter `with_std` of the `StandardScaler` with possible values\n", - " `True` or `False`.\n", + "- the parameter `with_std` of the `StandardScaler` with possible values `True`\n", + " or `False`.\n", "\n", "Notice that in the notebook \"Hyperparameter tuning by randomized-search\" we\n", "pass distributions to be sampled by the `RandomizedSearchCV`. In this case we\n", @@ -87,7 +88,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/parameter_tuning_grid_search.ipynb b/notebooks/parameter_tuning_grid_search.ipynb index ec9372599..d26aff083 100644 --- a/notebooks/parameter_tuning_grid_search.ipynb +++ b/notebooks/parameter_tuning_grid_search.ipynb @@ -83,7 +83,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=42)" + " data, target, random_state=42\n", + ")" ] }, { @@ -112,13 +113,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we will use a tree-based model as a classifier\n", - "(i.e. `HistGradientBoostingClassifier`). That means:\n", + "Here we will use a tree-based model as a classifier (i.e.\n", + "`HistGradientBoostingClassifier`). That means:\n", "\n", "* Numerical variables don't need scaling;\n", - "* Categorical variables can be dealt with an `OrdinalEncoder` even if the \n", + "* Categorical variables can be dealt with an `OrdinalEncoder` even if the\n", " coding order is not meaningful;\n", - "* For tree-based models, the `OrdinalEncoder` avoids having high-dimensional \n", + "* For tree-based models, the `OrdinalEncoder` avoids having high-dimensional\n", " representations.\n", "\n", "We now build our `OrdinalEncoder` by passing it the known categories." @@ -132,16 +133,17 @@ "source": [ "from sklearn.preprocessing import OrdinalEncoder\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)" + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We then use a `ColumnTransformer` to select the categorical columns and\n", - "apply the `OrdinalEncoder` to them." 
+ "We then use a `ColumnTransformer` to select the categorical columns and apply\n", + "the `OrdinalEncoder` to them." ] }, { @@ -152,9 +154,11 @@ "source": [ "from sklearn.compose import ColumnTransformer\n", "\n", - "preprocessor = ColumnTransformer([\n", - " ('cat_preprocessor', categorical_preprocessor, categorical_columns)],\n", - " remainder='passthrough', sparse_threshold=0)" + "preprocessor = ColumnTransformer(\n", + " [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n", + " remainder=\"passthrough\",\n", + " sparse_threshold=0,\n", + ")" ] }, { @@ -174,10 +178,15 @@ "from sklearn.ensemble import HistGradientBoostingClassifier\n", "from sklearn.pipeline import Pipeline\n", "\n", - "model = Pipeline([\n", - " (\"preprocessor\", preprocessor),\n", - " (\"classifier\",\n", - " HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4))])\n", + "model = Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\n", + " \"classifier\",\n", + " HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4),\n", + " ),\n", + " ]\n", + ")\n", "model" ] }, @@ -187,12 +196,13 @@ "source": [ "## Tuning using a grid-search\n", "\n", - "In the previous exercise we used one `for` loop for each hyperparameter to find the \n", - "best combination over a fixed grid of values. `GridSearchCV` is a scikit-learn class \n", - "that implements a very similar logic with less repetitive code.\n", + "In the previous exercise we used one `for` loop for each hyperparameter to\n", + "find the best combination over a fixed grid of values. `GridSearchCV` is a\n", + "scikit-learn class that implements a very similar logic with less repetitive\n", + "code.\n", "\n", - "Let's see how to use the `GridSearchCV` estimator for doing such search.\n", - "Since the grid-search will be costly, we will only explore the combination\n", + "Let's see how to use the `GridSearchCV` estimator for doing such search. Since\n", + "the grid-search will be costly, we will only explore the combination\n", "learning-rate and the maximum number of nodes." ] }, @@ -206,10 +216,10 @@ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = {\n", - " 'classifier__learning_rate': (0.01, 0.1, 1, 10),\n", - " 'classifier__max_leaf_nodes': (3, 10, 30)}\n", - "model_grid_search = GridSearchCV(model, param_grid=param_grid,\n", - " n_jobs=2, cv=2)\n", + " \"classifier__learning_rate\": (0.01, 0.1, 1, 10),\n", + " \"classifier__max_leaf_nodes\": (3, 10, 30),\n", + "}\n", + "model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=2)\n", "model_grid_search.fit(data_train, target_train)" ] }, @@ -228,8 +238,7 @@ "source": [ "accuracy = model_grid_search.score(data_test, target_test)\n", "print(\n", - " f\"The test accuracy score of the grid-searched pipeline is: \"\n", - " f\"{accuracy:.2f}\"\n", + " f\"The test accuracy score of the grid-searched pipeline is: {accuracy:.2f}\"\n", ")" ] }, @@ -242,10 +251,9 @@ "

Be aware that the evaluation should normally be performed through\n", "cross-validation by providing model_grid_search as a model to the\n", "cross_validate function.

\n", - "

Here, we used a single train-test split to to evaluate model_grid_search.\n", - "In a future notebook will go into more detail about nested cross-validation,\n", - "when you use cross-validation both for hyperparameter tuning and model\n", - "evaluation.

\n", + "

Here, we used a single train-test split to evaluate model_grid_search. In a\n", "future notebook we will go into more detail about nested cross-validation,\n", "when you use cross-validation both for hyperparameter tuning and model\n", "evaluation.

\n", "
" ] }, @@ -253,14 +261,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `GridSearchCV` estimator takes a `param_grid` parameter which defines\n", - "all hyperparameters and their associated values. The grid-search will be in\n", - "charge of creating all possible combinations and test them.\n", + "The `GridSearchCV` estimator takes a `param_grid` parameter which defines all\n", + "hyperparameters and their associated values. The grid-search will be in charge\n", + "of creating all possible combinations and test them.\n", "\n", - "The number of combinations will be equal to the product of the\n", - "number of values to explore for each parameter (e.g. in our example 4 x 3\n", - "combinations). Thus, adding new parameters with their associated values to be\n", - "explored become rapidly computationally expensive.\n", + "The number of combinations will be equal to the product of the number of\n", + "values to explore for each parameter (e.g. in our example 4 x 3 combinations).\n", + "Thus, adding new parameters with their associated values to be explored become\n", + "rapidly computationally expensive.\n", "\n", "Once the grid-search is fitted, it can be used as any other predictor by\n", "calling `predict` and `predict_proba`. Internally, it will use the model with\n", @@ -293,21 +301,20 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"The best set of parameters is: \"\n", - " f\"{model_grid_search.best_params_}\")" + "print(f\"The best set of parameters is: {model_grid_search.best_params_}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The accuracy and the best parameters of the grid-searched pipeline are\n", - "similar to the ones we found in the previous exercise, where we searched the\n", - "best parameters \"by hand\" through a double for loop.\n", + "The accuracy and the best parameters of the grid-searched pipeline are similar\n", + "to the ones we found in the previous exercise, where we searched the best\n", + "parameters \"by hand\" through a double for loop.\n", "\n", "In addition, we can inspect all results which are stored in the attribute\n", - "`cv_results_` of the grid-search. We will filter some specific columns\n", - "from these results." + "`cv_results_` of the grid-search. We will filter some specific columns from\n", + "these results." 
] }, { @@ -317,7 +324,8 @@ "outputs": [], "source": [ "cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(\n", - " \"mean_test_score\", ascending=False)\n", + " \"mean_test_score\", ascending=False\n", + ")\n", "cv_results.head()" ] }, @@ -325,8 +333,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let us focus on the most interesting columns and shorten the parameter\n", - "names to remove the `\"param_classifier__\"` prefix for readability:" + "Let us focus on the most interesting columns and shorten the parameter names\n", + "to remove the `\"param_classifier__\"` prefix for readability:" ] }, { @@ -337,8 +345,7 @@ "source": [ "# get the parameter names\n", "column_results = [f\"param_{name}\" for name in param_grid.keys()]\n", - "column_results += [\n", - " \"mean_test_score\", \"std_test_score\", \"rank_test_score\"]\n", + "column_results += [\"mean_test_score\", \"std_test_score\", \"rank_test_score\"]\n", "cv_results = cv_results[column_results]" ] }, @@ -377,8 +384,10 @@ "outputs": [], "source": [ "pivoted_cv_results = cv_results.pivot_table(\n", - " values=\"mean_test_score\", index=[\"learning_rate\"],\n", - " columns=[\"max_leaf_nodes\"])\n", + " values=\"mean_test_score\",\n", + " index=[\"learning_rate\"],\n", + " columns=[\"max_leaf_nodes\"],\n", + ")\n", "\n", "pivoted_cv_results" ] @@ -398,8 +407,9 @@ "source": [ "import seaborn as sns\n", "\n", - "ax = sns.heatmap(pivoted_cv_results, annot=True, cmap=\"YlGnBu\", vmin=0.7,\n", - " vmax=0.9)\n", + "ax = sns.heatmap(\n", + " pivoted_cv_results, annot=True, cmap=\"YlGnBu\", vmin=0.7, vmax=0.9\n", + ")\n", "ax.invert_yaxis()" ] }, @@ -409,15 +419,15 @@ "source": [ "The above tables highlights the following things:\n", "\n", - "* for too high values of `learning_rate`, the generalization performance of the\n", - " model is degraded and adjusting the value of `max_leaf_nodes` cannot fix\n", + "* for too high values of `learning_rate`, the generalization performance of\n", + " the model is degraded and adjusting the value of `max_leaf_nodes` cannot fix\n", " that problem;\n", - "* outside of this pathological region, we observe that the optimal choice\n", - " of `max_leaf_nodes` depends on the value of `learning_rate`;\n", - "* in particular, we observe a \"diagonal\" of good models with an accuracy\n", - " close to the maximal of 0.87: when the value of `max_leaf_nodes` is\n", - " increased, one should decrease the value of `learning_rate` accordingly\n", - " to preserve a good accuracy.\n", + "* outside of this pathological region, we observe that the optimal choice of\n", + " `max_leaf_nodes` depends on the value of `learning_rate`;\n", + "* in particular, we observe a \"diagonal\" of good models with an accuracy close\n", + " to the maximal of 0.87: when the value of `max_leaf_nodes` is increased, one\n", + " should decrease the value of `learning_rate` accordingly to preserve a good\n", + " accuracy.\n", "\n", "The precise meaning of those two parameters will be explained later.\n", "\n", @@ -433,8 +443,7 @@ "source": [ "In this notebook we have seen:\n", "\n", - "* how to optimize the hyperparameters of a predictive model via a\n", - " grid-search;\n", + "* how to optimize the hyperparameters of a predictive model via a grid-search;\n", "* that searching for more than two hyperparamters is too costly;\n", "* that a grid-search does not necessarily find an optimal solution." 
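On that last point, one common follow-up (an editorial sketch, not part of the original notebook) is to re-run the search on a finer grid centered on the best combination found so far.

```python
# Hypothetical refinement pass around the previous best values; `model`,
# `data_train` and `target_train` are the objects defined above.
from sklearn.model_selection import GridSearchCV

finer_param_grid = {
    "classifier__learning_rate": (0.05, 0.1, 0.5),
    "classifier__max_leaf_nodes": (20, 30, 40),
}
finer_search = GridSearchCV(model, param_grid=finer_param_grid, n_jobs=2, cv=2)
finer_search.fit(data_train, target_train)
print(finer_search.best_params_)
```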
] diff --git a/notebooks/parameter_tuning_manual.ipynb b/notebooks/parameter_tuning_manual.ipynb index 760c91988..585d8a5fb 100644 --- a/notebooks/parameter_tuning_manual.ipynb +++ b/notebooks/parameter_tuning_manual.ipynb @@ -18,9 +18,9 @@ "interchangeably.

\n", "
\n", "\n", - "This notebook shows how one can get and set the value of a hyperparameter in\n", - "a scikit-learn estimator. We recall that hyperparameters refer to the\n", - "parameter that will control the learning process.\n", + "This notebook shows how one can get and set the value of a hyperparameter in a\n", + "scikit-learn estimator. We recall that hyperparameters refer to the parameter\n", + "that will control the learning process.\n", "\n", "They should not be confused with the fitted parameters, resulting from the\n", "training. These fitted parameters are recognizable in scikit-learn because\n", @@ -41,8 +41,7 @@ "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", "\n", "target_name = \"class\"\n", - "numerical_columns = [\n", - " \"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", + "numerical_columns = [\"age\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", "\n", "target = adult_census[target_name]\n", "data = adult_census[numerical_columns]" @@ -68,12 +67,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's create a simple predictive model made of a scaler followed by a\n", - "logistic regression classifier.\n", + "Let's create a simple predictive model made of a scaler followed by a logistic\n", + "regression classifier.\n", "\n", - "As mentioned in previous notebooks, many models, including linear ones,\n", - "work better if all features have a similar scaling. For this purpose,\n", - "we use a `StandardScaler`, which transforms the data by rescaling features." + "As mentioned in previous notebooks, many models, including linear ones, work\n", + "better if all features have a similar scaling. For this purpose, we use a\n", + "`StandardScaler`, which transforms the data by rescaling features." ] }, { @@ -86,10 +85,12 @@ "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "\n", - "model = Pipeline(steps=[\n", - " (\"preprocessor\", StandardScaler()),\n", - " (\"classifier\", LogisticRegression())\n", - "])" + "model = Pipeline(\n", + " steps=[\n", + " (\"preprocessor\", StandardScaler()),\n", + " (\"classifier\", LogisticRegression()),\n", + " ]\n", + ")" ] }, { @@ -110,17 +111,19 @@ "\n", "cv_results = cross_validate(model, data, target)\n", "scores = cv_results[\"test_score\"]\n", - "print(f\"Accuracy score via cross-validation:\\n\"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"Accuracy score via cross-validation:\\n\"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We created a model with the default `C` value that is equal to 1. If we\n", - "wanted to use a different `C` parameter we could have done so when we created\n", - "the `LogisticRegression` object with something like `LogisticRegression(C=1e-3)`.\n", + "We created a model with the default `C` value that is equal to 1. If we wanted\n", + "to use a different `C` parameter we could have done so when we created the\n", + "`LogisticRegression` object with something like `LogisticRegression(C=1e-3)`.\n", "\n", "
\n", "

Note

\n", @@ -129,9 +132,9 @@ "Be aware that we will focus on linear models in an upcoming module.

\n", "
\n", "\n", - "We can also change the parameter of a model after it has been created with\n", - "the `set_params` method, which is available for all scikit-learn estimators.\n", - "For example, we can set `C=1e-3`, fit and evaluate the model:" + "We can also change the parameter of a model after it has been created with the\n", + "`set_params` method, which is available for all scikit-learn estimators. For\n", + "example, we can set `C=1e-3`, fit and evaluate the model:" ] }, { @@ -143,8 +146,10 @@ "model.set_params(classifier__C=1e-3)\n", "cv_results = cross_validate(model, data, target)\n", "scores = cv_results[\"test_score\"]\n", - "print(f\"Accuracy score via cross-validation:\\n\"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + "print(\n", + " \"Accuracy score via cross-validation:\\n\"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + ")" ] }, { @@ -156,9 +161,9 @@ "middle). In our case, `classifier` comes from the `Pipeline` definition and\n", "`C` is the parameter name of `LogisticRegression`.\n", "\n", - "In general, you can use the `get_params` method on scikit-learn models to\n", - "list all the parameters with their values. For example, if you want to\n", - "get all the parameter names, you can use:" + "In general, you can use the `get_params` method on scikit-learn models to list\n", + "all the parameters with their values. For example, if you want to get all the\n", + "parameter names, you can use:" ] }, { @@ -186,7 +191,7 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_params()['classifier__C']" + "model.get_params()[\"classifier__C\"]" ] }, { @@ -207,28 +212,28 @@ " model.set_params(classifier__C=C)\n", " cv_results = cross_validate(model, data, target)\n", " scores = cv_results[\"test_score\"]\n", - " print(f\"Accuracy score via cross-validation with C={C}:\\n\"\n", - " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\")" + " print(\n", + " f\"Accuracy score via cross-validation with C={C}:\\n\"\n", + " f\"{scores.mean():.3f} \u00b1 {scores.std():.3f}\"\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that as long as C is high enough, the model seems to perform\n", - "well.\n", + "We can see that as long as C is high enough, the model seems to perform well.\n", "\n", - "What we did here is very manual: it involves scanning the values for C\n", - "and picking the best one manually. In the next lesson, we will see how\n", - "to do this automatically.\n", + "What we did here is very manual: it involves scanning the values for C and\n", + "picking the best one manually. In the next lesson, we will see how to do this\n", + "automatically.\n", "\n", "
\n", "

Warning

\n", - "

When we evaluate a family of models on test data and pick the best\n", - "performer, we can not trust the corresponding prediction accuracy, and\n", - "we need to apply the selected model to new data. Indeed, the test data\n", - "has been used to select the model, and it is thus no longer independent\n", - "from this model.

\n", + "

When we evaluate a family of models on test data and pick the best performer,\n", "we cannot trust the corresponding prediction accuracy, and we need to apply\n", "the selected model to new data. Indeed, the test data has been used to select\n", "the model, and it is thus no longer independent from this model.

\n", "
" ] }, diff --git a/notebooks/parameter_tuning_nested.ipynb b/notebooks/parameter_tuning_nested.ipynb index 2d7d02637..b7c14a3bf 100644 --- a/notebooks/parameter_tuning_nested.ipynb +++ b/notebooks/parameter_tuning_nested.ipynb @@ -14,8 +14,8 @@ "\"Selecting the best model\" to show how to evaluate models where\n", "hyperparameters need to be tuned.\n", "\n", - "Thus, we will first load the dataset and create the predictive model that\n", - "we want to optimize and later on, evaluate.\n", + "Thus, we will first load the dataset and create the predictive model that we\n", + "want to optimize and later on, evaluate.\n", "\n", "## Loading the dataset\n", "\n", @@ -45,8 +45,8 @@ "source": [ "## Our predictive model\n", "\n", - "We now create the predictive model that we want to optimize. Note that\n", - "this pipeline is identical to the one we used in the previous notebook." + "We now create the predictive model that we want to optimize. Note that this\n", + "pipeline is identical to the one we used in the previous notebook." ] }, { @@ -67,9 +67,9 @@ ")\n", "preprocessor = ColumnTransformer(\n", " [\n", - " ('cat_preprocessor', categorical_preprocessor, categorical_columns),\n", + " (\"cat_preprocessor\", categorical_preprocessor, categorical_columns),\n", " ],\n", - " remainder='passthrough',\n", + " remainder=\"passthrough\",\n", " sparse_threshold=0,\n", ")" ] @@ -83,15 +83,15 @@ "from sklearn.ensemble import HistGradientBoostingClassifier\n", "from sklearn.pipeline import Pipeline\n", "\n", - "model = Pipeline([\n", - " (\"preprocessor\", preprocessor),\n", - " (\n", - " \"classifier\",\n", - " HistGradientBoostingClassifier(\n", - " random_state=42, max_leaf_nodes=4\n", - " )\n", - " ),\n", - "])\n", + "model = Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\n", + " \"classifier\",\n", + " HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4),\n", + " ),\n", + " ]\n", + ")\n", "model" ] }, @@ -129,8 +129,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cross-validation scores are coming from a 5-fold cross-validation. So\n", - "we can compute the mean and standard deviation of the generalization score." + "The cross-validation scores are coming from a 5-fold cross-validation. So we\n", + "can compute the mean and standard deviation of the generalization score." 
] }, { @@ -140,8 +140,9 @@ "outputs": [], "source": [ "print(\n", - " \"Generalization score without hyperparameters tuning:\\n\"\n", - " f\"{cv_results['test_score'].mean():.3f} \u00b1 {cv_results['test_score'].std():.3f}\"\n", + " \"Generalization score without hyperparameters\"\n", + " f\" tuning:\\n{cv_results['test_score'].mean():.3f} \u00b1\"\n", + " f\" {cv_results['test_score'].std():.3f}\"\n", ")" ] }, @@ -149,8 +150,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now present how to evaluate the model with hyperparameter tuning,\n", - "where an extra step is required to select the best set of parameters.\n", + "We now present how to evaluate the model with hyperparameter tuning, where an\n", + "extra step is required to select the best set of parameters.\n", "\n", "### With hyperparameter tuning\n", "\n", @@ -171,12 +172,10 @@ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = {\n", - " 'classifier__learning_rate': (0.05, 0.5),\n", - " 'classifier__max_leaf_nodes': (10, 30),\n", + " \"classifier__learning_rate\": (0.05, 0.5),\n", + " \"classifier__max_leaf_nodes\": (10, 30),\n", "}\n", - "model_grid_search = GridSearchCV(\n", - " model, param_grid=param_grid, n_jobs=2, cv=2\n", - ")\n", + "model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=2)\n", "model_grid_search.fit(data, target)" ] }, @@ -185,9 +184,9 @@ "metadata": {}, "source": [ "As previously seen, when calling the `fit` method, the model embedded in the\n", - "grid-search is trained with every possible combination of parameters\n", - "resulting from the parameter grid. The best combination is selected by\n", - "keeping the combination leading to the best mean cross-validated score." + "grid-search is trained with every possible combination of parameters resulting\n", + "from the parameter grid. The best combination is selected by keeping the\n", + "combination leading to the best mean cross-validated score." ] }, { @@ -197,13 +196,15 @@ "outputs": [], "source": [ "cv_results = pd.DataFrame(model_grid_search.cv_results_)\n", - "cv_results[[\n", - " \"param_classifier__learning_rate\",\n", - " \"param_classifier__max_leaf_nodes\",\n", - " \"mean_test_score\",\n", - " \"std_test_score\",\n", - " \"rank_test_score\"\n", - "]]" + "cv_results[\n", + " [\n", + " \"param_classifier__learning_rate\",\n", + " \"param_classifier__max_leaf_nodes\",\n", + " \"mean_test_score\",\n", + " \"std_test_score\",\n", + " \"rank_test_score\",\n", + " ]\n", + "]" ] }, { @@ -220,10 +221,10 @@ "metadata": {}, "source": [ "One important caveat here concerns the evaluation of the generalization\n", - "performance. Indeed, the mean and standard deviation of the scores computed\n", - "by the cross-validation in the grid-search are potentially not good estimates\n", - "of the generalization performance we would obtain by refitting a model with\n", - "the best combination of hyper-parameter values on the full dataset. Note that\n", + "performance. Indeed, the mean and standard deviation of the scores computed by\n", + "the cross-validation in the grid-search are potentially not good estimates of\n", + "the generalization performance we would obtain by refitting a model with the\n", + "best combination of hyper-parameter values on the full dataset. Note that\n", "scikit-learn automatically performs this refit by default when calling\n", "`model_grid_search.fit`. 
This refitted model is trained with more data than\n", "the different models trained internally during the cross-validation of the\n", @@ -260,22 +261,23 @@ "source": [ "The score measure on the final test set is almost within the range of the\n", "internal CV score for the best hyper-parameter combination. This is reassuring\n", - "as it means that the tuning procedure did not cause significant overfitting\n", - "in itself (other-wise the final test score would have been lower than the\n", + "as it means that the tuning procedure did not cause significant overfitting in\n", + "itself (other-wise the final test score would have been lower than the\n", "internal CV scores). That is expected because our grid search explored very\n", "few hyper-parameter combinations for the sake of speed. The test score of the\n", - "final model is actually a bit higher than what we could have expected from\n", - "the internal cross-validation. This is also expected because the refitted\n", - "model is trained on a larger dataset than the models evaluated in the\n", - "internal CV loop of the grid-search procedure. This is often the case that\n", - "models trained on a larger number of samples tend to generalize better.\n", + "final model is actually a bit higher than what we could have expected from the\n", + "internal cross-validation. This is also expected because the refitted model is\n", + "trained on a larger dataset than the models evaluated in the internal CV loop\n", + "of the grid-search procedure. This is often the case that models trained on a\n", + "larger number of samples tend to generalize better.\n", "\n", "In the code above, the selection of the best hyperparameters was done only on\n", "the train set from the initial train-test split. Then, we evaluated the\n", "generalization performance of our tuned model on the left out test set. This\n", "can be shown schematically as follows\n", "\n", - "![Cross-validation tuning diagram](../figures/cross_validation_train_test_diagram.png)\n", + "![Cross-validation tuning\n", + "diagram](../figures/cross_validation_train_test_diagram.png)\n", "\n", "
\n", "

Note

\n", @@ -293,8 +295,8 @@ "
\n", "\n", "However, this evaluation only provides us a single point estimate of the\n", - "generalization performance. As recall at the beginning of this notebook, it\n", - "is beneficial to have a rough idea of the uncertainty of our estimated\n", + "generalization performance. As recall at the beginning of this notebook, it is\n", + "beneficial to have a rough idea of the uncertainty of our estimated\n", "generalization performance. Therefore, we should instead use an additional\n", "cross-validation for this evaluation.\n", "\n", @@ -325,7 +327,7 @@ "outputs": [], "source": [ "cv_results = pd.DataFrame(cv_results)\n", - "cv_test_scores = cv_results['test_score']\n", + "cv_test_scores = cv_results[\"test_score\"]\n", "print(\n", " \"Generalization score with hyperparameters tuning:\\n\"\n", " f\"{cv_test_scores.mean():.3f} \u00b1 {cv_test_scores.std():.3f}\"\n", @@ -339,9 +341,9 @@ "This result is compatible with the test score measured with the string outer\n", "train-test split.\n", "\n", - "However, in this case, we can apprehend the variability of our estimate of\n", - "the generalization performance thanks to the measure of the\n", - "standard-deviation of the scores measured in the outer cross-validation.\n", + "However, in this case, we can apprehend the variability of our estimate of the\n", + "generalization performance thanks to the measure of the standard-deviation of\n", + "the scores measured in the outer cross-validation.\n", "\n", "Here is a schematic representation of the complete nested cross-validation\n", "procedure:\n", @@ -392,21 +394,20 @@ "expect that it will have an actual predictive performance close to what we\n", "measured in the outer cross-validation.\n", "\n", - "But it is also possible that some hyperparameters do not matter at all, and\n", - "as a result in different tuning sessions give different results. In this\n", - "case, any value will do. This can typically be confirmed by doing a parallel\n", - "coordinate plot of the results of a large hyperparameter search as seen in\n", - "the exercises.\n", + "But it is also possible that some hyperparameters do not matter at all, and as\n", + "a result in different tuning sessions give different results. In this case,\n", + "any value will do. This can typically be confirmed by doing a parallel\n", + "coordinate plot of the results of a large hyperparameter search as seen in the\n", + "exercises.\n", "\n", "From a deployment point of view, one could also choose to deploy all the\n", "models found by the outer cross-validation loop and make them vote to get the\n", - "final predictions. However this can cause operational problems because it\n", - "uses more memory and makes computing prediction slower, resulting in a higher\n", + "final predictions. However this can cause operational problems because it uses\n", + "more memory and makes computing prediction slower, resulting in a higher\n", "computational resource usage per prediction.\n", "\n", - "In this notebook, we have seen how to evaluate the predictive performance of\n", - "a model with tuned hyper-parameters using the nested cross-validation\n", - "procedure." + "In this notebook, we have seen how to evaluate the predictive performance of a\n", + "model with tuned hyper-parameters using the nested cross-validation procedure." 
] } ], diff --git a/notebooks/parameter_tuning_parallel_plot.ipynb b/notebooks/parameter_tuning_parallel_plot.ipynb index 2255c5488..6b2cbe200 100644 --- a/notebooks/parameter_tuning_parallel_plot.ipynb +++ b/notebooks/parameter_tuning_parallel_plot.ipynb @@ -11,40 +11,42 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the previous notebook we showed how to implement a randomized\n", - "search for tuning the hyperparameters of a `HistGradientBoostingClassifier`\n", - "to fit the `adult_census` dataset. In practice, a randomized hyperparameter\n", - "search is usually run with a large number of iterations." + "In the previous notebook we showed how to implement a randomized search for\n", + "tuning the hyperparameters of a `HistGradientBoostingClassifier` to fit the\n", + "`adult_census` dataset. In practice, a randomized hyperparameter search is\n", + "usually run with a large number of iterations." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In order to avoid the computational cost and still make a decent analysis,\n", - "we load the results obtained from a similar search with 500 iterations." + "In order to avoid the computational cost and still make a decent analysis, we\n", + "load the results obtained from a similar search with 500 iterations." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 1 - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", - "cv_results = pd.read_csv(\"../figures/randomized_search_results.csv\", index_col=0)\n", + "cv_results = pd.read_csv(\n", + " \"../figures/randomized_search_results.csv\", index_col=0\n", + ")\n", "cv_results" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ - "We define a function to remove the prefixes in the hyperparameters\n", - "column names." + "We define a function to remove the prefixes in the hyperparameters column\n", + "names." ] }, { @@ -58,6 +60,7 @@ " return param_name.rsplit(\"__\", 1)[1]\n", " return param_name\n", "\n", + "\n", "cv_results = cv_results.rename(shorten_param, axis=1)\n", "cv_results" ] @@ -66,11 +69,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we have more than 2 parameters in our randomized-search, we\n", - "cannot visualize the results using a heatmap. We could still do\n", - "it pair-wise, but having a two-dimensional projection of a\n", - "multi-dimensional problem can lead to a wrong interpretation of\n", - "the scores." + "As we have more than 2 parameters in our randomized-search, we cannot\n", + "visualize the results using a heatmap. We could still do it pair-wise, but\n", + "having a two-dimensional projection of a multi-dimensional problem can lead to\n", + "a wrong interpretation of the scores." 
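For reference, the parallel coordinate plot that this notebook builds to avoid that pitfall uses `plotly.express`; a minimal sketch, assuming the renamed `cv_results` dataframe from the cells above (column names are assumptions from that search).

```python
# Minimal sketch of the parallel coordinate plot discussed above. Note that
# log-sampled parameters are best log-transformed (e.g. with numpy.log10)
# before plotting, since the axes are linear.
import plotly.express as px

columns = [
    "learning_rate",
    "max_leaf_nodes",
    "min_samples_leaf",
    "max_bins",
    "l2_regularization",
    "mean_test_score",
]
fig = px.parallel_coordinates(cv_results[columns], color="mean_test_score")
fig.show()
```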
] }, { @@ -104,7 +106,9 @@ "ax.set_xscale(\"log\")\n", "ax.set_yscale(\"log\")\n", "\n", - "_ = ax.legend(title=\"mean_test_score\", loc=\"center left\", bbox_to_anchor=(1, 0.5))" + "_ = ax.legend(\n", + " title=\"mean_test_score\", loc=\"center left\", bbox_to_anchor=(1, 0.5)\n", + ")" ] }, { @@ -184,12 +188,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "In this notebook, we saw how to interactively explore the results of a\n", - "large randomized search with multiple interacting hyperparameters.\n", - "In particular we observed that some hyperparameters have very little\n", - "impact on the cross-validation score, while others have to be adjusted\n", - "within a specific range to get models with good predictive accuracy." + "In this notebook, we saw how to interactively explore the results of a large\n", + "randomized search with multiple interacting hyperparameters. In particular we\n", + "observed that some hyperparameters have very little impact on the\n", + "cross-validation score, while others have to be adjusted within a specific\n", + "range to get models with good predictive accuracy." ] } ], diff --git a/notebooks/parameter_tuning_randomized_search.ipynb b/notebooks/parameter_tuning_randomized_search.ipynb index 14d344ee8..37b4272ea 100644 --- a/notebooks/parameter_tuning_randomized_search.ipynb +++ b/notebooks/parameter_tuning_randomized_search.ipynb @@ -10,9 +10,9 @@ "search for the best hyperparameters maximizing the generalization performance\n", "of a predictive model.\n", "\n", - "However, a grid-search approach has limitations. It does not scale when\n", - "the number of parameters to tune is increasing. Also, the grid will impose\n", - "a regularity during the search which might be problematic.\n", + "However, a grid-search approach has limitations. It does not scale when the\n", + "number of parameters to tune is increasing. Also, the grid will impose a\n", + "regularity during the search which might be problematic.\n", "\n", "In this notebook, we will present another method to tune hyperparameters\n", "called randomized search." 
@@ -90,7 +90,8 @@ "from sklearn.model_selection import train_test_split\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=42)" + " data, target, random_state=42\n", + ")" ] }, { @@ -114,11 +115,14 @@ "categorical_columns_selector = selector(dtype_include=object)\n", "categorical_columns = categorical_columns_selector(data)\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)\n", - "preprocessor = ColumnTransformer([\n", - " ('cat_preprocessor', categorical_preprocessor, categorical_columns)],\n", - " remainder='passthrough', sparse_threshold=0)" + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")\n", + "preprocessor = ColumnTransformer(\n", + " [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n", + " remainder=\"passthrough\",\n", + " sparse_threshold=0,\n", + ")" ] }, { @@ -130,10 +134,15 @@ "from sklearn.ensemble import HistGradientBoostingClassifier\n", "from sklearn.pipeline import Pipeline\n", "\n", - "model = Pipeline([\n", - " (\"preprocessor\", preprocessor),\n", - " (\"classifier\", HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4)),\n", - "])\n", + "model = Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\n", + " \"classifier\",\n", + " HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4),\n", + " ),\n", + " ]\n", + ")\n", "\n", "model" ] @@ -148,41 +157,40 @@ "explicitly. We already mentioned that exploring a large number of values for\n", "different parameters will be quickly untractable.\n", "\n", - "Instead, we can randomly generate the parameter candidates. Indeed,\n", - "such approach avoids the regularity of the grid. Hence, adding more\n", - "evaluations can increase the resolution in each direction. This is the\n", - "case in the frequent situation where the choice of some hyperparameters\n", - "is not very important, as for hyperparameter 2 in the figure below.\n", + "Instead, we can randomly generate the parameter candidates. Indeed, such\n", + "approach avoids the regularity of the grid. Hence, adding more evaluations can\n", + "increase the resolution in each direction. This is the case in the frequent\n", + "situation where the choice of some hyperparameters is not very important, as\n", + "for hyperparameter 2 in the figure below.\n", "\n", "![Randomized vs grid search](../figures/grid_vs_random_search.svg)\n", "\n", - "Indeed, the number of evaluation points needs to be divided across the\n", - "two different hyperparameters. With a grid, the danger is that the\n", - "region of good hyperparameters fall between the line of the grid: this\n", - "region is aligned with the grid given that hyperparameter 2 has a weak\n", - "influence. Rather, stochastic search will sample hyperparameter 1\n", - "independently from hyperparameter 2 and find the optimal region.\n", + "Indeed, the number of evaluation points needs to be divided across the two\n", + "different hyperparameters. With a grid, the danger is that the region of good\n", + "hyperparameters fall between the line of the grid: this region is aligned with\n", + "the grid given that hyperparameter 2 has a weak influence. Rather, stochastic\n", + "search will sample hyperparameter 1 independently from hyperparameter 2 and\n", + "find the optimal region.\n", "\n", - "The `RandomizedSearchCV` class allows for such stochastic search. 
It is\n", - "used similarly to the `GridSearchCV` but the sampling distributions\n", - "need to be specified instead of the parameter values. For instance, we\n", - "will draw candidates using a log-uniform distribution because the parameters\n", - "we are interested in take positive values with a natural log scaling (.1 is\n", - "as close to 1 as 10 is).\n", + "The `RandomizedSearchCV` class allows for such stochastic search. It is used\n", + "similarly to the `GridSearchCV` but the sampling distributions need to be\n", + "specified instead of the parameter values. For instance, we will draw\n", + "candidates using a log-uniform distribution because the parameters we are\n", + "interested in take positive values with a natural log scaling (.1 is as close\n", + "to 1 as 10 is).\n", "\n", "
\n", "

Note

\n", - "

Random search (with RandomizedSearchCV) is typically beneficial compared\n", - "to grid search (with GridSearchCV) to optimize 3 or more\n", - "hyperparameters.

\n", + "

Random search (with RandomizedSearchCV) is typically beneficial compared to\n", + "grid search (with GridSearchCV) to optimize 3 or more hyperparameters.

\n", "
\n", "\n", - "We will optimize 3 other parameters in addition to the ones we\n", - "optimized in the notebook presenting the `GridSearchCV`:\n", + "We will optimize 3 other parameters in addition to the ones we optimized in\n", + "the notebook presenting the `GridSearchCV`:\n", "\n", "* `l2_regularization`: it corresponds to the strength of the regularization;\n", - "* `min_samples_leaf`: it corresponds to the minimum number of samples\n", - " required in a leaf;\n", + "* `min_samples_leaf`: it corresponds to the minimum number of samples required\n", + " in a leaf;\n", "* `max_bins`: it corresponds to the maximum number of bins to construct the\n", " histograms.\n", "\n", @@ -195,9 +203,9 @@ "\n", "
\n", "

Note

\n", - "

scipy.stats.loguniform can be used to generate floating numbers. To\n", - "generate random values for integer-valued parameters (e.g.\n", - "min_samples_leaf) we can adapt is as follows:

\n", + "

scipy.stats.loguniform can be used to generate floating-point numbers. To\n", "generate random values for integer-valued parameters (e.g. min_samples_leaf)\n", "we can adapt it as follows:

\n", "
" ] }, @@ -212,6 +220,7 @@ "\n", "class loguniform_int:\n", " \"\"\"Integer valued version of the log-uniform distribution\"\"\"\n", + "\n", " def __init__(self, a, b):\n", " self._distribution = loguniform(a, b)\n", "\n", @@ -224,12 +233,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Now, we can define the randomized search using the different distributions.\n", - "Executing 10 iterations of 5-fold cross-validation for random\n", - "parametrizations of this model on this dataset can take from 10 seconds to\n", - "several minutes, depending on the speed of the host computer and the number\n", - "of available processors." + "Executing 10 iterations of 5-fold cross-validation for random parametrizations\n", + "of this model on this dataset can take from 10 seconds to several minutes,\n", + "depending on the speed of the host computer and the number of available\n", + "processors." ] }, { @@ -242,16 +250,19 @@ "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "param_distributions = {\n", - " 'classifier__l2_regularization': loguniform(1e-6, 1e3),\n", - " 'classifier__learning_rate': loguniform(0.001, 10),\n", - " 'classifier__max_leaf_nodes': loguniform_int(2, 256),\n", - " 'classifier__min_samples_leaf': loguniform_int(1, 100),\n", - " 'classifier__max_bins': loguniform_int(2, 255),\n", + " \"classifier__l2_regularization\": loguniform(1e-6, 1e3),\n", + " \"classifier__learning_rate\": loguniform(0.001, 10),\n", + " \"classifier__max_leaf_nodes\": loguniform_int(2, 256),\n", + " \"classifier__min_samples_leaf\": loguniform_int(1, 100),\n", + " \"classifier__max_bins\": loguniform_int(2, 255),\n", "}\n", "\n", "model_random_search = RandomizedSearchCV(\n", - " model, param_distributions=param_distributions, n_iter=10,\n", - " cv=5, verbose=1,\n", + " model,\n", + " param_distributions=param_distributions,\n", + " n_iter=10,\n", + " cv=5,\n", + " verbose=1,\n", ")\n", "model_random_search.fit(data_train, target_train)" ] @@ -271,8 +282,7 @@ "source": [ "accuracy = model_random_search.score(data_test, target_test)\n", "\n", - "print(f\"The test accuracy score of the best model is \"\n", - " f\"{accuracy:.2f}\")" + "print(f\"The test accuracy score of the best model is {accuracy:.2f}\")" ] }, { @@ -293,7 +303,6 @@ "lines_to_next_cell": 2 }, "source": [ - "\n", "We can inspect the results using the attributes `cv_results` as we did\n", "previously." ] @@ -305,20 +314,21 @@ "outputs": [], "source": [ "# get the parameter names\n", - "column_results = [\n", - " f\"param_{name}\" for name in param_distributions.keys()]\n", - "column_results += [\n", - " \"mean_test_score\", \"std_test_score\", \"rank_test_score\"]\n", + "column_results = [f\"param_{name}\" for name in param_distributions.keys()]\n", + "column_results += [\"mean_test_score\", \"std_test_score\", \"rank_test_score\"]\n", "\n", "cv_results = pd.DataFrame(model_random_search.cv_results_)\n", "cv_results = cv_results[column_results].sort_values(\n", - " \"mean_test_score\", ascending=False)\n", + " \"mean_test_score\", ascending=False\n", + ")\n", + "\n", "\n", "def shorten_param(param_name):\n", " if \"__\" in param_name:\n", " return param_name.rsplit(\"__\", 1)[1]\n", " return param_name\n", "\n", + "\n", "cv_results = cv_results.rename(shorten_param, axis=1)\n", "cv_results" ] @@ -327,14 +337,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Keep in mind that tuning is limited by the number of different combinations\n", - "of parameters that are scored by the randomized search. 
In fact, there might\n", - "be other sets of parameters leading to similar or better generalization\n", - "performances but that were not tested in the search.\n", - "In practice, a randomized hyperparameter search is usually run with a large\n", - "number of iterations. In order to avoid the computation cost and still make a\n", - "decent analysis, we load the results obtained from a similar search with 500\n", - "iterations." + "Keep in mind that tuning is limited by the number of different combinations of\n", + "parameters that are scored by the randomized search. In fact, there might be\n", + "other sets of parameters leading to similar or better generalization\n", + "performances but that were not tested in the search. In practice, a randomized\n", + "hyperparameter search is usually run with a large number of iterations. In\n", + "order to avoid the computation cost and still make a decent analysis, we load\n", + "the results obtained from a similar search with 500 iterations." ] }, { @@ -357,20 +366,24 @@ "metadata": {}, "outputs": [], "source": [ - "cv_results = pd.read_csv(\"../figures/randomized_search_results.csv\",\n", - " index_col=0)\n", + "cv_results = pd.read_csv(\n", + " \"../figures/randomized_search_results.csv\", index_col=0\n", + ")\n", "\n", - "(cv_results[column_results].rename(\n", - " shorten_param, axis=1).sort_values(\"mean_test_score\", ascending=False))" + "(\n", + " cv_results[column_results]\n", + " .rename(shorten_param, axis=1)\n", + " .sort_values(\"mean_test_score\", ascending=False)\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this case the top performing models have test scores with a high\n", - "overlap between each other, meaning that indeed, the set of parameters\n", - "leading to the best generalization performance is not unique." + "In this case the top performing models have test scores with a high overlap\n", + "between each other, meaning that indeed, the set of parameters leading to the\n", + "best generalization performance is not unique." ] }, { @@ -378,15 +391,15 @@ "metadata": {}, "source": [ "\n", - "In this notebook, we saw how a randomized search offers a valuable\n", - "alternative to grid-search when the number of hyperparameters to tune is more\n", - "than two. It also alleviates the regularity imposed by the grid that might be\n", - "problematic sometimes.\n", + "In this notebook, we saw how a randomized search offers a valuable alternative\n", + "to grid-search when the number of hyperparameters to tune is more than two. It\n", + "also alleviates the regularity imposed by the grid that might be problematic\n", + "sometimes.\n", "\n", "In the following, we will see how to use interactive plotting tools to explore\n", - "the results of large hyperparameter search sessions and gain some\n", - "insights on range of parameter values that lead to the highest performing\n", - "models and how different hyperparameter are coupled or not." + "the results of large hyperparameter search sessions and gain some insights on\n", + "range of parameter values that lead to the highest performing models and how\n", + "different hyperparameter are coupled or not." 
] } ], diff --git a/notebooks/parameter_tuning_sol_02.ipynb b/notebooks/parameter_tuning_sol_02.ipynb index 67158f8fc..bbcb42f88 100644 --- a/notebooks/parameter_tuning_sol_02.ipynb +++ b/notebooks/parameter_tuning_sol_02.ipynb @@ -9,10 +9,9 @@ "The goal is to write an exhaustive search to find the best parameters\n", "combination maximizing the model generalization performance.\n", "\n", - "Here we use a small subset of the Adult Census dataset to make the code\n", - "faster to execute. Once your code works on the small subset, try to\n", - "change `train_size` to a larger value (e.g. 0.8 for 80% instead of\n", - "20%)." + "Here we use a small subset of the Adult Census dataset to make the code faster\n", + "to execute. Once your code works on the small subset, try to change\n", + "`train_size` to a larger value (e.g. 0.8 for 80% instead of 20%)." ] }, { @@ -32,7 +31,8 @@ "data = adult_census.drop(columns=[target_name, \"education-num\"])\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, train_size=0.2, random_state=42)" + " data, target, train_size=0.2, random_state=42\n", + ")" ] }, { @@ -45,27 +45,36 @@ "from sklearn.compose import make_column_selector as selector\n", "from sklearn.preprocessing import OrdinalEncoder\n", "\n", - "categorical_preprocessor = OrdinalEncoder(handle_unknown=\"use_encoded_value\",\n", - " unknown_value=-1)\n", + "categorical_preprocessor = OrdinalEncoder(\n", + " handle_unknown=\"use_encoded_value\", unknown_value=-1\n", + ")\n", "preprocessor = ColumnTransformer(\n", - " [('cat_preprocessor', categorical_preprocessor,\n", - " selector(dtype_include=object))],\n", - " remainder='passthrough', sparse_threshold=0)\n", + " [\n", + " (\n", + " \"cat_preprocessor\",\n", + " categorical_preprocessor,\n", + " selector(dtype_include=object),\n", + " )\n", + " ],\n", + " remainder=\"passthrough\",\n", + " sparse_threshold=0,\n", + ")\n", "\n", "from sklearn.ensemble import HistGradientBoostingClassifier\n", "from sklearn.pipeline import Pipeline\n", "\n", - "model = Pipeline([\n", - " (\"preprocessor\", preprocessor),\n", - " (\"classifier\", HistGradientBoostingClassifier(random_state=42))\n", - "])" + "model = Pipeline(\n", + " [\n", + " (\"preprocessor\", preprocessor),\n", + " (\"classifier\", HistGradientBoostingClassifier(random_state=42)),\n", + " ]\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "Use the previously defined model (called `model`) and using two nested `for`\n", "loops, make a search of the best combinations of the `learning_rate` and\n", "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n", @@ -75,8 +84,8 @@ "- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls\n", " the ability of a new tree to correct the error of the previous sequence of\n", " trees\n", - "- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the\n", - " depth of each tree." + "- `max_leaf_nodes` for the values 3, 10, 30. This parameter controls the depth\n", + " of each tree." ] }, { @@ -95,18 +104,22 @@ "best_params = {}\n", "for lr in learning_rate:\n", " for mln in max_leaf_nodes:\n", - " print(f\"Evaluating model with learning rate {lr:.3f}\"\n", - " f\" and max leaf nodes {mln}... \", end=\"\")\n", + " print(\n", + " (\n", + " f\"Evaluating model with learning rate {lr:.3f}\"\n", + " f\" and max leaf nodes {mln}... 
\"\n", + " ),\n", + " end=\"\",\n", + " )\n", " model.set_params(\n", - " classifier__learning_rate=lr,\n", - " classifier__max_leaf_nodes=mln\n", + " classifier__learning_rate=lr, classifier__max_leaf_nodes=mln\n", " )\n", " scores = cross_val_score(model, data_train, target_train, cv=2)\n", " mean_score = scores.mean()\n", " print(f\"score: {mean_score:.3f}\")\n", " if mean_score > best_score:\n", " best_score = mean_score\n", - " best_params = {'learning_rate': lr, 'max_leaf_nodes': mln}\n", + " best_params = {\"learning_rate\": lr, \"max_leaf_nodes\": mln}\n", " print(f\"Found new best model with score {best_score:.3f}!\")\n", "\n", "print(f\"The best accuracy obtained is {best_score:.3f}\")\n", @@ -129,11 +142,12 @@ "outputs": [], "source": [ "# solution\n", - "best_lr = best_params['learning_rate']\n", - "best_mln = best_params['max_leaf_nodes']\n", + "best_lr = best_params[\"learning_rate\"]\n", + "best_mln = best_params[\"max_leaf_nodes\"]\n", "\n", - "model.set_params(classifier__learning_rate=best_lr,\n", - " classifier__max_leaf_nodes=best_mln)\n", + "model.set_params(\n", + " classifier__learning_rate=best_lr, classifier__max_leaf_nodes=best_mln\n", + ")\n", "model.fit(data_train, target_train)\n", "test_score = model.score(data_test, target_test)\n", "\n", diff --git a/notebooks/parameter_tuning_sol_03.ipynb b/notebooks/parameter_tuning_sol_03.ipynb index ddc2ed034..c7e032fce 100644 --- a/notebooks/parameter_tuning_sol_03.ipynb +++ b/notebooks/parameter_tuning_sol_03.ipynb @@ -23,15 +23,16 @@ "target *= 100 # rescale the target in k$\n", "\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=42)" + " data, target, random_state=42\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this exercise, we will progressively define the regression pipeline\n", - "and later tune its hyperparameters.\n", + "In this exercise, we will progressively define the regression pipeline and\n", + "later tune its hyperparameters.\n", "\n", "Start by defining a pipeline that:\n", "* uses a `StandardScaler` to normalize the numerical data;\n", @@ -64,8 +65,8 @@ " `np.logspace(0, 3, num=10).astype(np.int32)`;\n", "- the parameter `with_mean` of the `StandardScaler` with possible values\n", " `True` or `False`;\n", - "- the parameter `with_std` of the `StandardScaler` with possible values\n", - " `True` or `False`.\n", + "- the parameter `with_std` of the `StandardScaler` with possible values `True`\n", + " or `False`.\n", "\n", "Notice that in the notebook \"Hyperparameter tuning by randomized-search\" we\n", "pass distributions to be sampled by the `RandomizedSearchCV`. 
In this case we\n", @@ -92,14 +93,21 @@ "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "param_distributions = {\n", - " \"kneighborsregressor__n_neighbors\": np.logspace(0, 3, num=10).astype(np.int32),\n", + " \"kneighborsregressor__n_neighbors\": np.logspace(0, 3, num=10).astype(\n", + " np.int32\n", + " ),\n", " \"standardscaler__with_mean\": [True, False],\n", " \"standardscaler__with_std\": [True, False],\n", "}\n", "\n", "model_random_search = RandomizedSearchCV(\n", - " model, param_distributions=param_distributions,\n", - " n_iter=20, n_jobs=2, verbose=1, random_state=1)\n", + " model,\n", + " param_distributions=param_distributions,\n", + " n_iter=20,\n", + " n_jobs=2,\n", + " verbose=1,\n", + " random_state=1,\n", + ")\n", "model_random_search.fit(data_train, target_train)\n", "model_random_search.best_params_" ] @@ -122,9 +130,9 @@ "conduct such an interactive analysis for this this pipeline using a parallel\n", "coordinate plot using the `plotly` library.\n", "\n", - "We could use `cv_results = model_random_search.cv_results_` to make a\n", - "parallel coordinate plot as we did in the previous notebook (you are more\n", - "than welcome to try!)." + "We could use `cv_results = model_random_search.cv_results_` to make a parallel\n", + "coordinate plot as we did in the previous notebook (you are more than welcome\n", + "to try!)." ] }, { @@ -173,7 +181,8 @@ "\n", "cv_results = cv_results.rename(columns=column_name_mapping)\n", "cv_results = cv_results[column_name_mapping.values()].sort_values(\n", - " \"mean test score\", ascending=False)" + " \"mean test score\", ascending=False\n", + ")" ] }, { @@ -237,8 +246,8 @@ }, "source": [ "We recall that it is possible to select a range of results by clicking and\n", - "holding on any axis of the parallel coordinate plot. You can then slide\n", - "(move) the range selection and cross two selections to see the intersections.\n", + "holding on any axis of the parallel coordinate plot. You can then slide (move)\n", + "the range selection and cross two selections to see the intersections.\n", "\n", "Selecting the best performing models (i.e. 
above an accuracy of ~0.68), we\n", "observe that **in this case**:\n", @@ -262,10 +271,9 @@ "the values of A and B will be approximately between -3 and 3 and the neighbor\n", "structure will be impacted more or less equivalently by both variables.\n", "\n", - "Note that **in this case** the models with scaled features perform better\n", - "than the models with non-scaled features because all the variables are\n", - "expected to be predictive and we rather avoid some of them being comparatively\n", - "ignored.\n", + "Note that **in this case** the models with scaled features perform better than\n", + "the models with non-scaled features because all the variables are expected to\n", + "be predictive and we rather avoid some of them being comparatively ignored.\n", "\n", "If the variables in lower scales were not predictive one may experience a\n", "decrease of the performance after scaling the features: noisy features would\n", diff --git a/notebooks/trees_classification.ipynb b/notebooks/trees_classification.ipynb index 0c629ca3a..b92504bef 100644 --- a/notebooks/trees_classification.ipynb +++ b/notebooks/trees_classification.ipynb @@ -52,20 +52,20 @@ "\n", "data, target = penguins[culmen_columns], penguins[target_column]\n", "data_train, data_test, target_train, target_test = train_test_split(\n", - " data, target, random_state=0)" + " data, target, random_state=0\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "In a previous notebook, we learnt that a linear classifier will define a\n", "linear separation to split classes using a linear combination of the input\n", "features. In our 2-dimensional space, it means that a linear classifier will\n", - "define some oblique lines that best separate our classes. We define a\n", - "function below that, given a set of data points and a classifier, will plot\n", - "the decision boundaries learnt by the classifier.\n", + "define some oblique lines that best separate our classes. We define a function\n", + "below that, given a set of data points and a classifier, will plot the\n", + "decision boundaries learnt by the classifier.\n", "\n", "Thus, for a linear classifier, we will obtain the following decision\n", "boundaries. These boundaries lines indicate where the model changes its\n", @@ -101,10 +101,15 @@ "DecisionBoundaryDisplay.from_estimator(\n", " linear_model, data_train, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=palette)\n", + "sns.scatterplot(\n", + " data=penguins,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=palette,\n", + ")\n", "# put the legend outside the plot\n", - "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "_ = plt.title(\"Decision boundary using a logistic regression\")" ] }, @@ -117,8 +122,8 @@ "parametrization that we saw in the previous notebook, controlled by the\n", "model's weights and intercept.\n", "\n", - "Besides, it seems that the linear model would be a good candidate for\n", - "such problem as it gives good accuracy." + "Besides, it seems that the linear model would be a good candidate for such\n", + "problem as it gives good accuracy." ] }, { @@ -141,8 +146,8 @@ "intercept to be optimized.\n", "\n", "Indeed, decision trees will partition the space by considering a single\n", - "feature at a time. 
Let's illustrate this behaviour by having a decision\n", - "tree make a single split to partition the feature space." + "feature at a time. Let's illustrate this behaviour by having a decision tree\n", + "make a single split to partition the feature space." ] }, { @@ -166,9 +171,14 @@ "DecisionBoundaryDisplay.from_estimator(\n", " tree, data_train, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=palette)\n", - "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "sns.scatterplot(\n", + " data=penguins,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=palette,\n", + ")\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "_ = plt.title(\"Decision boundary using a decision tree\")" ] }, @@ -191,8 +201,13 @@ "from sklearn.tree import plot_tree\n", "\n", "_, ax = plt.subplots(figsize=(8, 6))\n", - "_ = plot_tree(tree, feature_names=culmen_columns,\n", - " class_names=tree.classes_, impurity=False, ax=ax)" + "_ = plot_tree(\n", + " tree,\n", + " feature_names=culmen_columns,\n", + " class_names=tree.classes_,\n", + " impurity=False,\n", + " ax=ax,\n", + ")" ] }, { @@ -212,12 +227,12 @@ "metadata": {}, "source": [ "We see that the split was done on the culmen depth feature. The original\n", - "dataset was subdivided into 2 sets based on the culmen depth\n", - "(inferior or superior to 16.45 mm).\n", + "dataset was subdivided into 2 sets based on the culmen depth (inferior or\n", + "superior to 16.45 mm).\n", "\n", "This partition of the dataset minimizes the class diversities in each\n", - "sub-partitions. This measure is also known as a **criterion**,\n", - "and is a settable parameter.\n", + "sub-partitions. This measure is also known as a **criterion**, and is a\n", + "settable parameter.\n", "\n", "If we look more closely at the partition, we see that the sample superior to\n", "16.45 belongs mainly to the Adelie class. Looking at the values, we indeed\n", @@ -236,9 +251,7 @@ "metadata": {}, "outputs": [], "source": [ - "sample_1 = pd.DataFrame(\n", - " {\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [15]}\n", - ")\n", + "sample_1 = pd.DataFrame({\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [15]})\n", "tree.predict(sample_1)" ] }, @@ -246,8 +259,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The class predicted is the Gentoo. We can now check what happens if we pass\n", - "a culmen depth superior to the threshold." + "The class predicted is the Gentoo. We can now check what happens if we pass a\n", + "culmen depth superior to the threshold." ] }, { @@ -256,9 +269,7 @@ "metadata": {}, "outputs": [], "source": [ - "sample_2 = pd.DataFrame(\n", - " {\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [17]}\n", - ")\n", + "sample_2 = pd.DataFrame({\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [17]})\n", "tree.predict(sample_2)" ] }, @@ -271,8 +282,8 @@ "Thus, we can conclude that a decision tree classifier will predict the most\n", "represented class within a partition.\n", "\n", - "During the training, we have a count of samples in each partition, we can\n", - "also compute the probability of belonging to a specific class within this\n", + "During the training, we have a count of samples in each partition, we can also\n", + "compute the probability of belonging to a specific class within this\n", "partition." 
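As a quick check (a minimal sketch reusing the `tree` and `sample_2` objects defined above), these per-partition class fractions are exactly what `predict_proba` returns for a sample falling into the corresponding leaf:

```python
# `tree` is the depth-1 DecisionTreeClassifier fitted above and `sample_2`
# the single-row dataframe with a culmen depth of 17 mm.
proba = tree.predict_proba(sample_2)
print(dict(zip(tree.classes_, proba[0].round(3))))
```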
] }, @@ -301,8 +312,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also compute the different probabilities manually directly from the tree\n", - "structure." + "We can also compute the different probabilities manually directly from the\n", + "tree structure." ] }, { @@ -315,7 +326,7 @@ "chinstrap_proba = 52 / 161\n", "gentoo_proba = 6 / 161\n", "print(\n", - " f\"Probabilities for the different classes:\\n\"\n", + " \"Probabilities for the different classes:\\n\"\n", " f\"Adelie: {adelie_proba:.3f}\\n\"\n", " f\"Chinstrap: {chinstrap_proba:.3f}\\n\"\n", " f\"Gentoo: {gentoo_proba:.3f}\\n\"\n", @@ -327,8 +338,8 @@ "metadata": {}, "source": [ "It is also important to note that the culmen length has been disregarded for\n", - "the moment. It means that whatever the value given, it will not be used\n", - "during the prediction." + "the moment. It means that whatever the value given, it will not be used during\n", + "the prediction." ] }, { @@ -347,8 +358,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Going back to our classification problem, the split found with a maximum\n", - "depth of 1 is not powerful enough to separate the three species and the model\n", + "Going back to our classification problem, the split found with a maximum depth\n", + "of 1 is not powerful enough to separate the three species and the model\n", "accuracy is low when compared to the linear model." ] }, @@ -367,9 +378,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Indeed, it is not a surprise. We saw earlier that a single feature will not\n", - "be able to separate all three species. However, from the previous analysis we\n", - "saw that by using both features we should be able to get fairly good results.\n", + "Indeed, it is not a surprise. We saw earlier that a single feature will not be\n", + "able to separate all three species. However, from the previous analysis we saw\n", + "that by using both features we should be able to get fairly good results.\n", "\n", "In the next exercise, you will increase the size of the tree depth. You will\n", "get intuitions on how the space partitioning is repeated over time." diff --git a/notebooks/trees_dataset.ipynb b/notebooks/trees_dataset.ipynb index 11787e1f6..7202c1073 100644 --- a/notebooks/trees_dataset.ipynb +++ b/notebooks/trees_dataset.ipynb @@ -6,11 +6,10 @@ "source": [ "# The penguins datasets\n", "\n", - "In this notebook, we make a quick presentation of the\n", - "[Palmer penguins dataset](https://allisonhorst.github.io/palmerpenguins/)\n", - "dataset. We use this dataset for both classification and regression\n", - "problems by selecting a subset of the features to make our explanations\n", - "intuitive.\n", + "In this notebook, we make a quick presentation of the [Palmer penguins\n", + "dataset](https://allisonhorst.github.io/palmerpenguins/) dataset. We use this\n", + "dataset for both classification and regression problems by selecting a subset\n", + "of the features to make our explanations intuitive.\n", "\n", "## Classification dataset\n", "\n", @@ -21,15 +20,17 @@ "Chinstrap. 
See the illustration below depicting the three different penguin\n", "species:\n", "\n", - "![Image of penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png)\n", + "![Image of\n", + "penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png)\n", "\n", - "This problem is a classification problem since the target is categorical.\n", - "We will limit our input data to a subset of the original features\n", - "to simplify our explanations when presenting the decision tree algorithm.\n", - "Indeed, we will use features based on penguins' culmen measurement. You can\n", - "learn more about the penguins' culmen with the illustration below:\n", + "This problem is a classification problem since the target is categorical. We\n", + "will limit our input data to a subset of the original features to simplify our\n", + "explanations when presenting the decision tree algorithm. Indeed, we will use\n", + "features based on penguins' culmen measurement. You can learn more about the\n", + "penguins' culmen with the illustration below:\n", "\n", - "![Image of culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png)\n", + "![Image of\n", + "culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png)\n", "\n", "We will start by loading this subset of the dataset." ] @@ -134,9 +135,8 @@ "source": [ "Here, we deal with a regression problem because our target is a continuous\n", "variable ranging from 2.7 kg to 6.3 kg. From the scatter plot above, we\n", - "observe that we have a linear relationship between the flipper length\n", - "and the body mass. The longer the flipper of a penguin, the heavier the\n", - "penguin." + "observe that we have a linear relationship between the flipper length and the\n", + "body mass. The longer the flipper of a penguin, the heavier the penguin." ] } ], diff --git a/notebooks/trees_ex_01.ipynb b/notebooks/trees_ex_01.ipynb index 3fb8edfb8..99858920b 100644 --- a/notebooks/trees_ex_01.ipynb +++ b/notebooks/trees_ex_01.ipynb @@ -7,9 +7,9 @@ "# \ud83d\udcdd Exercise M5.01\n", "\n", "In the previous notebook, we showed how a tree with a depth of 1 level was\n", - "working. The aim of this exercise is to repeat part of the previous\n", - "experiment for a depth with 2 levels to show how the process of partitioning\n", - "is repeated over time.\n", + "working. The aim of this exercise is to repeat part of the previous experiment\n", + "for a depth with 2 levels to show how the process of partitioning is repeated\n", + "over time.\n", "\n", "Before to start, we will:\n", "\n", @@ -60,11 +60,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a decision tree classifier with a maximum depth of 2 levels and fit\n", - "the training data. Once this classifier trained, plot the data and the\n", - "decision boundary to see the benefit of increasing the depth. To plot the\n", - "decision boundary, you should import the class `DecisionBoundaryDisplay`\n", - "from the module `sklearn.inspection` as shown in the previous course notebook." + "Create a decision tree classifier with a maximum depth of 2 levels and fit the\n", + "training data. Once this classifier trained, plot the data and the decision\n", + "boundary to see the benefit of increasing the depth. 
To plot the decision\n", + "boundary, you should import the class `DecisionBoundaryDisplay` from the\n", + "module `sklearn.inspection` as shown in the previous course notebook." ] }, { @@ -80,8 +80,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Did we make use of the feature \"Culmen Length\"?\n", - "Plot the tree using the function `sklearn.tree.plot_tree` to find out!" + "Did we make use of the feature \"Culmen Length\"? Plot the tree using the\n", + "function `sklearn.tree.plot_tree` to find out!" ] }, { @@ -112,7 +112,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/trees_ex_02.ipynb b/notebooks/trees_ex_02.ipynb index 21d958747..3b1c0e141 100644 --- a/notebooks/trees_ex_02.ipynb +++ b/notebooks/trees_ex_02.ipynb @@ -6,11 +6,11 @@ "source": [ "# \ud83d\udcdd Exercise M5.02\n", "\n", - "The aim of this exercise is to find out whether a decision tree\n", - "model is able to extrapolate.\n", + "The aim of this exercise is to find out whether a decision tree model is able\n", + "to extrapolate.\n", "\n", - "By extrapolation, we refer to values predicted by a model outside of the\n", - "range of feature values seen during the training.\n", + "By extrapolation, we refer to values predicted by a model outside of the range\n", + "of feature values seen during the training.\n", "\n", "We will first load the regression data." ] @@ -46,8 +46,8 @@ "metadata": {}, "source": [ "First, create two models, a linear regression model and a decision tree\n", - "regression model, and fit them on the training data. Limit the depth at\n", - "3 levels for the decision tree." + "regression model, and fit them on the training data. Limit the depth at 3\n", + "levels for the decision tree." ] }, { @@ -63,9 +63,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a synthetic dataset containing all possible flipper length from\n", - "the minimum to the maximum of the training dataset. Get the predictions of\n", - "each model using this dataset." + "Create a synthetic dataset containing all possible flipper length from the\n", + "minimum to the maximum of the training dataset. Get the predictions of each\n", + "model using this dataset." ] }, { @@ -99,9 +99,9 @@ "metadata": {}, "source": [ "Now, we will check the extrapolation capabilities of each model. Create a\n", - "dataset containing a broader range of values than your previous dataset,\n", - "in other words, add values below and above the minimum and the maximum of\n", - "the flipper length seen during training." + "dataset containing a broader range of values than your previous dataset, in\n", + "other words, add values below and above the minimum and the maximum of the\n", + "flipper length seen during training." 
] }, { @@ -133,7 +133,6 @@ ], "metadata": { "jupytext": { - "encoding": "# -*- coding: utf-8 -*-", "main_language": "python" }, "kernelspec": { diff --git a/notebooks/trees_hyperparameters.ipynb b/notebooks/trees_hyperparameters.ipynb index f17a6734d..e60248e94 100644 --- a/notebooks/trees_hyperparameters.ipynb +++ b/notebooks/trees_hyperparameters.ipynb @@ -7,8 +7,8 @@ "# Importance of decision tree hyperparameters on generalization\n", "\n", "In this notebook, we will illustrate the importance of some key\n", - "hyperparameters on the decision tree; we will demonstrate their effects on\n", - "the classification and regression problems we saw previously.\n", + "hyperparameters on the decision tree; we will demonstrate their effects on the\n", + "classification and regression problems we saw previously.\n", "\n", "First, we will load the classification and regression datasets." ] @@ -77,12 +77,20 @@ " else:\n", " palette = [\"tab:red\", \"tab:blue\", \"black\"]\n", " DecisionBoundaryDisplay.from_estimator(\n", - " model, data[feature_names], response_method=\"predict\",\n", - " cmap=\"RdBu\", alpha=0.5\n", + " model,\n", + " data[feature_names],\n", + " response_method=\"predict\",\n", + " cmap=\"RdBu\",\n", + " alpha=0.5,\n", " )\n", - " sns.scatterplot(data=data, x=feature_names[0], y=feature_names[1],\n", - " hue=target_names, palette=palette)\n", - " plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " sns.scatterplot(\n", + " data=data,\n", + " x=feature_names[0],\n", + " y=feature_names[1],\n", + " hue=target_names,\n", + " palette=palette,\n", + " )\n", + " plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "\n", "\n", "def fit_and_plot_regression(model, data, feature_names, target_names):\n", @@ -94,7 +102,8 @@ " target_predicted = model.predict(data_test)\n", "\n", " sns.scatterplot(\n", - " x=data.iloc[:, 0], y=data[target_names], color=\"black\", alpha=0.5)\n", + " x=data.iloc[:, 0], y=data[target_names], color=\"black\", alpha=0.5\n", + " )\n", " plt.plot(data_test.iloc[:, 0], target_predicted, linewidth=4)" ] }, @@ -133,7 +142,8 @@ "outputs": [], "source": [ "fit_and_plot_classification(\n", - " tree_clf, data_clf, data_clf_columns, target_clf_column)\n", + " tree_clf, data_clf, data_clf_columns, target_clf_column\n", + ")\n", "_ = plt.title(f\"Shallow classification tree with max-depth of {max_depth}\")" ] }, @@ -144,7 +154,8 @@ "outputs": [], "source": [ "fit_and_plot_regression(\n", - " tree_reg, data_reg, data_reg_columns, target_reg_column)\n", + " tree_reg, data_reg, data_reg_columns, target_reg_column\n", + ")\n", "_ = plt.title(f\"Shallow regression tree with max-depth of {max_depth}\")" ] }, @@ -152,8 +163,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's increase the `max_depth` parameter value to check the difference\n", - "by observing the decision function." + "Now, let's increase the `max_depth` parameter value to check the difference by\n", + "observing the decision function." 
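As a brief aside before increasing the depth (a sketch assuming the shallow `tree_clf` and `tree_reg` fitted by the helper functions above), the effective complexity of a fitted tree can also be inspected programmatically:

```python
# Both estimators were fitted inside the plotting helpers above.
print(tree_clf.get_depth(), tree_clf.get_n_leaves())
print(tree_reg.get_depth(), tree_reg.get_n_leaves())
```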
] }, { @@ -174,7 +185,8 @@ "outputs": [], "source": [ "fit_and_plot_classification(\n", - " tree_clf, data_clf, data_clf_columns, target_clf_column)\n", + " tree_clf, data_clf, data_clf_columns, target_clf_column\n", + ")\n", "_ = plt.title(f\"Deep classification tree with max-depth of {max_depth}\")" ] }, @@ -185,7 +197,8 @@ "outputs": [], "source": [ "fit_and_plot_regression(\n", - " tree_reg, data_reg, data_reg_columns, target_reg_column)\n", + " tree_reg, data_reg, data_reg_columns, target_reg_column\n", + ")\n", "_ = plt.title(f\"Deep regression tree with max-depth of {max_depth}\")" ] }, @@ -193,12 +206,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For both classification and regression setting, we observe that\n", - "increasing the depth will make the tree model more expressive. However, a\n", - "tree that is too deep will overfit the training data, creating partitions\n", - "which are only correct for \"outliers\" (noisy samples). The `max_depth` is one\n", - "of the hyperparameters that one should optimize via cross-validation and\n", - "grid-search." + "For both classification and regression setting, we observe that increasing the\n", + "depth will make the tree model more expressive. However, a tree that is too\n", + "deep will overfit the training data, creating partitions which are only\n", + "correct for \"outliers\" (noisy samples). The `max_depth` is one of the\n", + "hyperparameters that one should optimize via cross-validation and grid-search." ] }, { @@ -221,9 +233,11 @@ "outputs": [], "source": [ "fit_and_plot_classification(\n", - " tree_clf, data_clf, data_clf_columns, target_clf_column)\n", - "_ = plt.title(f\"Optimal depth found via CV: \"\n", - " f\"{tree_clf.best_params_['max_depth']}\")" + " tree_clf, data_clf, data_clf_columns, target_clf_column\n", + ")\n", + "_ = plt.title(\n", + " f\"Optimal depth found via CV: {tree_clf.best_params_['max_depth']}\"\n", + ")" ] }, { @@ -233,17 +247,19 @@ "outputs": [], "source": [ "fit_and_plot_regression(\n", - " tree_reg, data_reg, data_reg_columns, target_reg_column)\n", - "_ = plt.title(f\"Optimal depth found via CV: \"\n", - " f\"{tree_reg.best_params_['max_depth']}\")" + " tree_reg, data_reg, data_reg_columns, target_reg_column\n", + ")\n", + "_ = plt.title(\n", + " f\"Optimal depth found via CV: {tree_reg.best_params_['max_depth']}\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "With this example, we see that there is not a single value that is optimal\n", - "for any dataset. Thus, this parameter is required to be optimized for each\n", + "With this example, we see that there is not a single value that is optimal for\n", + "any dataset. Thus, this parameter is required to be optimized for each\n", "application.\n", "\n", "## Other hyperparameters in decision trees\n", @@ -256,10 +272,9 @@ "\n", "We will build a dataset where we will illustrate this asymmetry. We will\n", "generate a dataset composed of 2 subsets: one subset where a clear separation\n", - "should be found by the tree and another subset where samples from both\n", - "classes will be mixed. It implies that a decision tree will need more splits\n", - "to classify properly samples from the second subset than from the first\n", - "subset." + "should be found by the tree and another subset where samples from both classes\n", + "will be mixed. It implies that a decision tree will need more splits to\n", + "classify properly samples from the second subset than from the first subset." 
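As an aside (a minimal sketch on hypothetical blobs, independent from the dataset built below): when `max_leaf_nodes` is set, scikit-learn grows the tree best-first, expanding at each step the split that most reduces impurity, which naturally produces such asymmetric trees.

```python
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_blobs(n_samples=200, centers=2, random_state=0)
best_first_tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_demo, y_demo)
print(best_first_tree.get_depth(), best_first_tree.get_n_leaves())
```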
] }, { @@ -275,16 +290,17 @@ "\n", "# Blobs that will be interlaced\n", "X_1, y_1 = make_blobs(\n", - " n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0)\n", + " n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0\n", + ")\n", "# Blobs that will be easily separated\n", - "X_2, y_2 = make_blobs(\n", - " n_samples=300, centers=[[3, 6], [7, 0]], random_state=0)\n", + "X_2, y_2 = make_blobs(n_samples=300, centers=[[3, 6], [7, 0]], random_state=0)\n", "\n", "X = np.concatenate([X_1, X_2], axis=0)\n", "y = np.concatenate([y_1, y_2])\n", "data_clf = np.concatenate([X, y[:, np.newaxis]], axis=1)\n", "data_clf = pd.DataFrame(\n", - " data_clf, columns=data_clf_columns + [target_clf_column])\n", + " data_clf, columns=data_clf_columns + [target_clf_column]\n", + ")\n", "data_clf[target_clf_column] = data_clf[target_clf_column].astype(np.int32)" ] }, @@ -294,8 +310,13 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(data=data_clf, x=data_clf_columns[0], y=data_clf_columns[1],\n", - " hue=target_clf_column, palette=[\"tab:red\", \"tab:blue\"])\n", + "sns.scatterplot(\n", + " data=data_clf,\n", + " x=data_clf_columns[0],\n", + " y=data_clf_columns[1],\n", + " hue=target_clf_column,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + ")\n", "_ = plt.title(\"Synthetic dataset\")" ] }, @@ -317,7 +338,8 @@ "max_depth = 2\n", "tree_clf = DecisionTreeClassifier(max_depth=max_depth)\n", "fit_and_plot_classification(\n", - " tree_clf, data_clf, data_clf_columns, target_clf_column)\n", + " tree_clf, data_clf, data_clf_columns, target_clf_column\n", + ")\n", "_ = plt.title(f\"Decision tree with max-depth of {max_depth}\")" ] }, @@ -329,9 +351,9 @@ "top are easily separated. However, more splits will be required to better\n", "split the blob were both blue and red data points are mixed.\n", "\n", - "Indeed, we see that red blob on the top and the blue blob on the right of\n", - "the plot are perfectly separated. However, the tree is still making mistakes\n", - "in the area where the blobs are mixed together. Let's check the tree\n", + "Indeed, we see that red blob on the top and the blue blob on the right of the\n", + "plot are perfectly separated. However, the tree is still making mistakes in\n", + "the area where the blobs are mixed together. Let's check the tree\n", "representation." ] }, @@ -351,8 +373,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the right branch achieves perfect classification. Now, we\n", - "increase the depth to check how the tree will grow." + "We see that the right branch achieves perfect classification. Now, we increase\n", + "the depth to check how the tree will grow." ] }, { @@ -364,7 +386,8 @@ "max_depth = 6\n", "tree_clf = DecisionTreeClassifier(max_depth=max_depth)\n", "fit_and_plot_classification(\n", - " tree_clf, data_clf, data_clf_columns, target_clf_column)\n", + " tree_clf, data_clf, data_clf_columns, target_clf_column\n", + ")\n", "_ = plt.title(f\"Decision tree with max-depth of {max_depth}\")" ] }, @@ -384,13 +407,13 @@ "source": [ "As expected, the left branch of the tree continue to grow while no further\n", "splits were done on the right branch. 
Fixing the `max_depth` parameter would\n", - "cut the tree horizontally at a specific level, whether or not it would\n", - "be more beneficial that a branch continue growing.\n", + "cut the tree horizontally at a specific level, whether or not it would be more\n", + "beneficial that a branch continue growing.\n", "\n", - "The hyperparameters `min_samples_leaf`, `min_samples_split`,\n", - "`max_leaf_nodes`, or `min_impurity_decrease` allows growing asymmetric trees\n", - "and apply a constraint at the leaves or nodes level. We will check the effect\n", - "of `min_samples_leaf`." + "The hyperparameters `min_samples_leaf`, `min_samples_split`, `max_leaf_nodes`,\n", + "or `min_impurity_decrease` allows growing asymmetric trees and apply a\n", + "constraint at the leaves or nodes level. We will check the effect of\n", + "`min_samples_leaf`." ] }, { @@ -402,9 +425,11 @@ "min_samples_leaf = 60\n", "tree_clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)\n", "fit_and_plot_classification(\n", - " tree_clf, data_clf, data_clf_columns, target_clf_column)\n", + " tree_clf, data_clf, data_clf_columns, target_clf_column\n", + ")\n", "_ = plt.title(\n", - " f\"Decision tree with leaf having at least {min_samples_leaf} samples\")" + " f\"Decision tree with leaf having at least {min_samples_leaf} samples\"\n", + ")" ] }, { @@ -421,10 +446,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This hyperparameter allows to have leaves with a minimum number of samples\n", - "and no further splits will be searched otherwise. Therefore, these\n", - "hyperparameters could be an alternative to fix the `max_depth`\n", - "hyperparameter." + "This hyperparameter allows to have leaves with a minimum number of samples and\n", + "no further splits will be searched otherwise. Therefore, these hyperparameters\n", + "could be an alternative to fix the `max_depth` hyperparameter." ] } ], diff --git a/notebooks/trees_regression.ipynb b/notebooks/trees_regression.ipynb index 865c0b4f0..ccc4ff13e 100644 --- a/notebooks/trees_regression.ipynb +++ b/notebooks/trees_regression.ipynb @@ -57,18 +57,19 @@ "source": [ "import numpy as np\n", "\n", - "data_test = pd.DataFrame(np.arange(data_train[feature_name].min(),\n", - " data_train[feature_name].max()),\n", - " columns=[feature_name])" + "data_test = pd.DataFrame(\n", + " np.arange(data_train[feature_name].min(), data_train[feature_name].max()),\n", + " columns=[feature_name],\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using the term \"test\" here refers to data that was not used for training.\n", - "It should not be confused with data coming from a train-test split, as it\n", - "was generated in equally-spaced intervals for the visual evaluation of the\n", + "Using the term \"test\" here refers to data that was not used for training. 
It\n", + "should not be confused with data coming from a train-test split, as it was\n", + "generated in equally-spaced intervals for the visual evaluation of the\n", "predictions.\n", "\n", "Note that this is methodologically valid here because our objective is to get\n", @@ -89,8 +90,9 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "_ = plt.title(\"Illustration of the regression dataset used\")" ] }, @@ -123,8 +125,9 @@ }, "outputs": [], "source": [ - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "plt.plot(data_test[feature_name], target_predicted, label=\"Linear regression\")\n", "plt.legend()\n", "_ = plt.title(\"Prediction function using a LinearRegression\")" @@ -145,12 +148,21 @@ "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", - "plt.plot(data_test[feature_name], target_predicted, label=\"Linear regression\",\n", - " linestyle=\"--\")\n", - "plt.scatter(data_test[::3], target_predicted[::3], label=\"Predictions\",\n", - " color=\"tab:orange\")\n", + "ax = sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", + "plt.plot(\n", + " data_test[feature_name],\n", + " target_predicted,\n", + " label=\"Linear regression\",\n", + " linestyle=\"--\",\n", + ")\n", + "plt.scatter(\n", + " data_test[::3],\n", + " target_predicted[::3],\n", + " label=\"Predictions\",\n", + " color=\"tab:orange\",\n", + ")\n", "plt.legend()\n", "_ = plt.title(\"Prediction function using a LinearRegression\")" ] @@ -184,8 +196,9 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "plt.plot(data_test[feature_name], target_predicted, label=\"Decision tree\")\n", "plt.legend()\n", "_ = plt.title(\"Prediction function using a DecisionTreeRegressor\")" @@ -226,8 +239,8 @@ "partition.\n", "\n", "In classification, we saw that increasing the depth of the tree allowed us to\n", - "get more complex decision boundaries.\n", - "Let's check the effect of increasing the depth in a regression setting:" + "get more complex decision boundaries. 
Let's check the effect of increasing the\n", + "depth in a regression setting:" ] }, { @@ -247,8 +260,9 @@ "metadata": {}, "outputs": [], "source": [ - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", "plt.plot(data_test[feature_name], target_predicted, label=\"Decision tree\")\n", "plt.legend()\n", "_ = plt.title(\"Prediction function using a DecisionTreeRegressor\")" diff --git a/notebooks/trees_sol_01.ipynb b/notebooks/trees_sol_01.ipynb index a66a8febe..4f1672510 100644 --- a/notebooks/trees_sol_01.ipynb +++ b/notebooks/trees_sol_01.ipynb @@ -7,9 +7,9 @@ "# \ud83d\udcc3 Solution for Exercise M5.01\n", "\n", "In the previous notebook, we showed how a tree with a depth of 1 level was\n", - "working. The aim of this exercise is to repeat part of the previous\n", - "experiment for a depth with 2 levels to show how the process of partitioning\n", - "is repeated over time.\n", + "working. The aim of this exercise is to repeat part of the previous experiment\n", + "for a depth with 2 levels to show how the process of partitioning is repeated\n", + "over time.\n", "\n", "Before to start, we will:\n", "\n", @@ -60,11 +60,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a decision tree classifier with a maximum depth of 2 levels and fit\n", - "the training data. Once this classifier trained, plot the data and the\n", - "decision boundary to see the benefit of increasing the depth. To plot the\n", - "decision boundary, you should import the class `DecisionBoundaryDisplay`\n", - "from the module `sklearn.inspection` as shown in the previous course notebook." + "Create a decision tree classifier with a maximum depth of 2 levels and fit the\n", + "training data. Once this classifier trained, plot the data and the decision\n", + "boundary to see the benefit of increasing the depth. To plot the decision\n", + "boundary, you should import the class `DecisionBoundaryDisplay` from the\n", + "module `sklearn.inspection` as shown in the previous course notebook." ] }, { @@ -99,9 +99,14 @@ "DecisionBoundaryDisplay.from_estimator(\n", " tree, data_train, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", ")\n", - "ax = sns.scatterplot(data=penguins, x=culmen_columns[0], y=culmen_columns[1],\n", - " hue=target_column, palette=palette)\n", - "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "ax = sns.scatterplot(\n", + " data=penguins,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=palette,\n", + ")\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", "_ = plt.title(\"Decision boundary using a decision tree\")" ] }, @@ -109,8 +114,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Did we make use of the feature \"Culmen Length\"?\n", - "Plot the tree using the function `sklearn.tree.plot_tree` to find out!" + "Did we make use of the feature \"Culmen Length\"? Plot the tree using the\n", + "function `sklearn.tree.plot_tree` to find out!" 
] }, { @@ -123,8 +128,13 @@ "from sklearn.tree import plot_tree\n", "\n", "_, ax = plt.subplots(figsize=(16, 12))\n", - "_ = plot_tree(tree, feature_names=culmen_columns,\n", - " class_names=tree.classes_, impurity=False, ax=ax)" + "_ = plot_tree(\n", + " tree,\n", + " feature_names=culmen_columns,\n", + " class_names=tree.classes_,\n", + " impurity=False,\n", + " ax=ax,\n", + ")" ] }, { @@ -135,11 +145,10 @@ ] }, "source": [ - "The resulting tree has 7 nodes: 3 of them are \"split nodes\" and 4\n", - "are \"leaf nodes\" (or simply \"leaves\"), organized in 2 levels.\n", - "We see that the second tree level used the \"Culmen Length\" to make\n", - "two new decisions. Qualitatively, we saw that such a simple tree was enough\n", - "to classify the penguins' species." + "The resulting tree has 7 nodes: 3 of them are \"split nodes\" and 4 are \"leaf\n", + "nodes\" (or simply \"leaves\"), organized in 2 levels. We see that the second\n", + "tree level used the \"Culmen Length\" to make two new decisions. Qualitatively,\n", + "we saw that such a simple tree was enough to classify the penguins' species." ] }, { @@ -174,8 +183,8 @@ "\n", "We predict an Adelie penguin if the feature value is below the threshold,\n", "which is not surprising since this partition was almost pure. If the feature\n", - "value is above the threshold, we predict the Gentoo penguin, the class that\n", - "is most probable." + "value is above the threshold, we predict the Gentoo penguin, the class that is\n", + "most probable." ] } ], diff --git a/notebooks/trees_sol_02.ipynb b/notebooks/trees_sol_02.ipynb index fd2c78f41..cd7de2cff 100644 --- a/notebooks/trees_sol_02.ipynb +++ b/notebooks/trees_sol_02.ipynb @@ -6,11 +6,11 @@ "source": [ "# \ud83d\udcc3 Solution for Exercise M5.02\n", "\n", - "The aim of this exercise is to find out whether a decision tree\n", - "model is able to extrapolate.\n", + "The aim of this exercise is to find out whether a decision tree model is able\n", + "to extrapolate.\n", "\n", - "By extrapolation, we refer to values predicted by a model outside of the\n", - "range of feature values seen during the training.\n", + "By extrapolation, we refer to values predicted by a model outside of the range\n", + "of feature values seen during the training.\n", "\n", "We will first load the regression data." ] @@ -46,8 +46,8 @@ "metadata": {}, "source": [ "First, create two models, a linear regression model and a decision tree\n", - "regression model, and fit them on the training data. Limit the depth at\n", - "3 levels for the decision tree." + "regression model, and fit them on the training data. Limit the depth at 3\n", + "levels for the decision tree." ] }, { @@ -71,9 +71,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Create a synthetic dataset containing all possible flipper length from\n", - "the minimum to the maximum of the training dataset. Get the predictions of\n", - "each model using this dataset." + "Create a synthetic dataset containing all possible flipper length from the\n", + "minimum to the maximum of the training dataset. Get the predictions of each\n", + "model using this dataset." 
] }, { @@ -85,9 +85,10 @@ "# solution\n", "import numpy as np\n", "\n", - "data_test = pd.DataFrame(np.arange(data_train[feature_name].min(),\n", - " data_train[feature_name].max()),\n", - " columns=[feature_name])" + "data_test = pd.DataFrame(\n", + " np.arange(data_train[feature_name].min(), data_train[feature_name].max()),\n", + " columns=[feature_name],\n", + ")" ] }, { @@ -122,10 +123,14 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", - "plt.plot(data_test[feature_name], target_predicted_linear_regression,\n", - " label=\"Linear regression\")\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", + "plt.plot(\n", + " data_test[feature_name],\n", + " target_predicted_linear_regression,\n", + " label=\"Linear regression\",\n", + ")\n", "plt.plot(data_test[feature_name], target_predicted_tree, label=\"Decision tree\")\n", "plt.legend()\n", "_ = plt.title(\"Prediction of linear model and a decision tree\")" @@ -149,9 +154,9 @@ "metadata": {}, "source": [ "Now, we will check the extrapolation capabilities of each model. Create a\n", - "dataset containing a broader range of values than your previous dataset,\n", - "in other words, add values below and above the minimum and the maximum of\n", - "the flipper length seen during training." + "dataset containing a broader range of values than your previous dataset, in\n", + "other words, add values below and above the minimum and the maximum of the\n", + "flipper length seen during training." ] }, { @@ -162,9 +167,13 @@ "source": [ "# solution\n", "offset = 30\n", - "data_test = pd.DataFrame(np.arange(data_train[feature_name].min() - offset,\n", - " data_train[feature_name].max() + offset),\n", - " columns=[feature_name])" + "data_test = pd.DataFrame(\n", + " np.arange(\n", + " data_train[feature_name].min() - offset,\n", + " data_train[feature_name].max() + offset,\n", + " ),\n", + " columns=[feature_name],\n", + ")" ] }, { @@ -196,10 +205,14 @@ }, "outputs": [], "source": [ - "sns.scatterplot(data=penguins, x=feature_name, y=target_name,\n", - " color=\"black\", alpha=0.5)\n", - "plt.plot(data_test[feature_name], target_predicted_linear_regression,\n", - " label=\"Linear regression\")\n", + "sns.scatterplot(\n", + " data=penguins, x=feature_name, y=target_name, color=\"black\", alpha=0.5\n", + ")\n", + "plt.plot(\n", + " data_test[feature_name],\n", + " target_predicted_linear_regression,\n", + " label=\"Linear regression\",\n", + ")\n", "plt.plot(data_test[feature_name], target_predicted_tree, label=\"Decision tree\")\n", "plt.legend()\n", "_ = plt.title(\"Prediction of linear model and a decision tree\")" @@ -213,9 +226,9 @@ ] }, "source": [ - "The linear model will extrapolate using the fitted model for flipper lengths\n", - "< 175 mm and > 235 mm. In fact, we are using the model parametrization to\n", - "make this predictions.\n", + "The linear model will extrapolate using the fitted model for flipper lengths <\n", + "175 mm and > 235 mm. In fact, we are using the model parametrization to make\n", + "this predictions.\n", "\n", "As mentioned, decision trees are non-parametric models and we observe that\n", "they cannot extrapolate. 
For flipper lengths below the minimum, the mass of\n",
From 5d839baddfbc904c9fdf37352fc36d04c1b546d3 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Mon, 5 Jun 2023 16:42:05 +0200
Subject: [PATCH 029/108] MAINT Use class `LearningCurveDisplay` (#690)

Co-authored-by: ArturoAmorQ
Co-authored-by: Guillaume Lemaitre
---
 python_scripts/cross_validation_ex_01.py      |  2 +-
 .../cross_validation_learning_curve.py        | 60 ++++++-------------
 python_scripts/cross_validation_sol_01.py     | 37 +++++-------
 3 files changed, 34 insertions(+), 65 deletions(-)

diff --git a/python_scripts/cross_validation_ex_01.py b/python_scripts/cross_validation_ex_01.py
index d671b8fef..abe9f22c6 100644
--- a/python_scripts/cross_validation_ex_01.py
+++ b/python_scripts/cross_validation_ex_01.py
@@ -102,7 +102,7 @@
 # %% [markdown]
 # Now, you can perform an analysis to check whether adding new samples to the
 # dataset could help our model to better generalize. Compute the learning curve
-# (using [`sklearn.model_selection.learning_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html))
+# (using [`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html))
 # by computing the train and test scores for different training dataset size.
 # Plot the train and test scores with respect to the number of samples.
 
diff --git a/python_scripts/cross_validation_learning_curve.py b/python_scripts/cross_validation_learning_curve.py
index 26e615c14..0c064c1b4 100644
--- a/python_scripts/cross_validation_learning_curve.py
+++ b/python_scripts/cross_validation_learning_curve.py
@@ -13,8 +13,8 @@
 # generalizing. Besides these aspects, it is also important to understand how
 # the different errors are influenced by the number of samples available.
 #
-# In this notebook, we will show this aspect by looking a the variability of the
-# different errors.
+# In this notebook, we will show this aspect by looking at the variability of
+# the different errors.
 #
 # Let's first load the data and create the same model as in the previous
 # notebook.
@@ -50,11 +50,11 @@
 # the validation curve, but instead of varying a hyperparameter, we vary the
 # number of training samples. This curve is called the **learning curve**.
 #
-# It gives information regarding the benefit of adding new training samples to
-# improve a model's generalization performance.
+# It gives information regarding the benefit of adding new training samples
+# to improve a model's generalization performance.
 #
-# Let's compute the learning curve for a decision tree and vary the proportion
-# of the training set from 10% to 100%.
+# Let's compute the learning curve for a decision tree and vary the
+# proportion of the training set from 10% to 100%.
 
 # %%
 import numpy as np
@@ -74,56 +74,34 @@
 # Now, we are all set to carry out the experiment.
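# %% [markdown]
# As an aside (a minimal sketch relying on the `regressor`, `data`, `target`,
# `train_sizes` and `cv` objects defined above): the display class used below
# wraps the lower-level `learning_curve` function, which remains available
# when only the raw scores are needed rather than a figure.

# %%
from sklearn.model_selection import learning_curve

train_sizes_abs, train_scores, test_scores = learning_curve(
    regressor,
    data,
    target,
    train_sizes=train_sizes,
    cv=cv,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)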
# %% -from sklearn.model_selection import learning_curve +from sklearn.model_selection import LearningCurveDisplay -results = learning_curve( +display = LearningCurveDisplay.from_estimator( regressor, data, target, train_sizes=train_sizes, cv=cv, + score_type="both", # both train and test errors scoring="neg_mean_absolute_error", + negate_score=True, # to use when metric starts with "neg_" + log_scale=True, # sets log scale for the x-axis + score_name="Mean absolute error (k$)", + std_display_style="errorbar", n_jobs=2, ) -train_size, train_scores, test_scores = results[:3] -# Convert the scores into errors -train_errors, test_errors = -train_scores, -test_scores - -# %% [markdown] -# Now, we can plot the curve. - -# %% -import matplotlib.pyplot as plt - -plt.errorbar( - train_size, - train_errors.mean(axis=1), - yerr=train_errors.std(axis=1), - label="Training error", -) -plt.errorbar( - train_size, - test_errors.mean(axis=1), - yerr=test_errors.std(axis=1), - label="Testing error", -) -plt.legend() - -plt.xscale("log") -plt.xlabel("Number of samples in the training set") -plt.ylabel("Mean absolute error (k$)") -_ = plt.title("Learning curve for decision tree") +_ = display.ax_.set_title("Learning curve for decision tree") # %% [markdown] # Looking at the training error alone, we see that we get an error of 0 k$. It # means that the trained model (i.e. decision tree) is clearly overfitting the # training data. # -# Looking at the testing error alone, we observe that the more samples are added -# into the training set, the lower the testing error becomes. Also, we are -# searching for the plateau of the testing error for which there is no benefit -# to adding samples anymore or assessing the potential gain of adding more -# samples into the training set. +# Looking at the testing error alone, we observe that the more samples are +# added into the training set, the lower the testing error becomes. Also, we +# are searching for the plateau of the testing error for which there is no +# benefit to adding samples anymore or assessing the potential gain of adding +# more samples into the training set. # # If we achieve a plateau and adding new samples in the training set does not # reduce the testing error, we might have reached the Bayes error rate using the diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index 7dcd6ad96..35aa635b3 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -156,39 +156,30 @@ # %% [markdown] # Now, you can perform an analysis to check whether adding new samples to the # dataset could help our model to better generalize. Compute the learning curve -# (using [`sklearn.model_selection.learning_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html)) +# (using [`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html)) # by computing the train and test scores for different training dataset size. # Plot the train and test scores with respect to the number of samples. 
 # %%
 # solution
-from sklearn.model_selection import learning_curve
+from sklearn.model_selection import LearningCurveDisplay
 
 train_sizes = np.linspace(0.1, 1, num=10)
-results = learning_curve(
-    model, data, target, train_sizes=train_sizes, cv=cv, n_jobs=2
+LearningCurveDisplay.from_estimator(
+    model,
+    data,
+    target,
+    train_sizes=train_sizes,
+    cv=cv,
+    score_type="both",
+    scoring="accuracy",  # this is already the default for classifiers
+    score_name="Accuracy",
+    std_display_style="errorbar",
+    errorbar_kw={"alpha": 0.7},  # transparency for better visualization
+    n_jobs=2,
 )
-train_size, train_scores, test_scores = results[:3]
 
-# %% tags=["solution"]
-plt.errorbar(
-    train_size,
-    train_scores.mean(axis=1),
-    yerr=train_scores.std(axis=1),
-    alpha=0.95,
-    label="Training score",
-)
-plt.errorbar(
-    train_size,
-    test_scores.mean(axis=1),
-    yerr=test_scores.std(axis=1),
-    alpha=0.5,
-    label="Testing score",
-)
 plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
-
-plt.xlabel("Number of samples in the training set")
-plt.ylabel("Accuracy")
 _ = plt.title("Learning curve for support vector machine")
 
 # %% [markdown] tags=["solution"]
From 4cd2e57eb87213e3e0caf333f51f712ef0584a73 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 7 Jun 2023 12:06:36 +0200
Subject: [PATCH 030/108] MAINT Use class `PredictionErrorDisplay` (#674)

Co-authored-by: ArturoAmorQ
Co-authored-by: Olivier Grisel
---
 python_scripts/metrics_regression.py | 170 +++++++++++++++++++--------
 1 file changed, 123 insertions(+), 47 deletions(-)

diff --git a/python_scripts/metrics_regression.py b/python_scripts/metrics_regression.py
index fc1b148d2..0020ced3b 100644
--- a/python_scripts/metrics_regression.py
+++ b/python_scripts/metrics_regression.py
@@ -70,8 +70,8 @@
 # %% [markdown]
 # Our linear regression model is minimizing the mean squared error on the
-# training set. It means that there is no other set of coefficients which will
-# decrease the error.
+# training set. It means that there is no other set of coefficients which
+# decreases the error.
 #
 # Then, we can compute the mean squared error on the test set.
@@ -156,40 +156,71 @@
 )
 
 # %% [markdown]
-# In addition of metrics, we can visually represent the results by plotting the
+# In addition to using metrics, we can visualize the results by plotting the
 # predicted values versus the true values.
+#
+# In an ideal scenario where all variations in the target could be perfectly
+# explained by the observed features (i.e. without any unobserved factors of
+# variation), and we have chosen an optimal model, we would expect all
+# predictions to fall along the diagonal line of the first plot below.
+#
+# In real life, this is almost never the case: some unknown fraction of the
+# variations in the target cannot be explained by variations in the data: they
+# stem from external factors not represented by the observed features.
+#
+# Therefore, the best we can hope for is that our model's predictions form a
+# cloud of points symmetrically distributed around the diagonal line, ideally
+# close enough to it for the model to be useful.
+#
+# To gain more insight, it can be helpful to plot the residuals, which represent
+# the difference between the actual and predicted values, against the predicted
+# values. This is shown in the second plot.
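# %% [markdown]
# Concretely (a one-line sketch using the variables defined above), the
# residuals plotted below are nothing more than the difference between the
# true and the predicted values:

# %%
residuals = target_test - target_predicted
print(residuals.describe())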
+# +# Residual plots make it easier to assess if the residuals exhibit a variance +# independent of the target values or if there is any systematic bias of the +# model associated with the lowest or highest predicted values. # %% import matplotlib.pyplot as plt -import seaborn as sns +from sklearn.metrics import PredictionErrorDisplay -sns.scatterplot( - data=predicted_actual, - x="True values (k$)", - y="Predicted values (k$)", - color="black", - alpha=0.5, +fig, axs = plt.subplots(ncols=2, figsize=(13, 5)) + +PredictionErrorDisplay.from_predictions( + y_true=target_test, + y_pred=target_predicted, + kind="actual_vs_predicted", + scatter_kwargs={"alpha": 0.5}, + ax=axs[0], +) +axs[0].axis("square") +axs[0].set_xlabel("Predicted values (k$)") +axs[0].set_ylabel("True values (k$)") + +PredictionErrorDisplay.from_predictions( + y_true=target_test, + y_pred=target_predicted, + kind="residual_vs_predicted", + scatter_kwargs={"alpha": 0.5}, + ax=axs[1], +) +axs[1].axis("square") +axs[1].set_xlabel("Predicted values (k$)") +axs[1].set_ylabel("Residual values (k$)") + +_ = fig.suptitle( + "Regression using a model\nwithout target transformation", y=1.1 ) -plt.axline((0, 0), slope=1, label="Perfect fit") -plt.axis("square") -_ = plt.title("Regression using a model without \ntarget transformation") # %% [markdown] -# On this plot, correct predictions would lie on the diagonal line. This plot -# allows us to detect if the model makes errors in a consistent way, i.e. has -# some bias. -# -# On this plot, we see that for the large True price values, our model tends to -# under-estimate the price of the house. Typically, this issue arises when the -# target to predict does not follow a normal distribution. In this case the -# model would benefit from target transformation. +# On these plots, we see that our model tends to under-estimate the price of the +# house both for the lowest and large True price values. This means that the +# residuals still hold some **structure typically visible as the "banana" or +# "smile" shape of the residual plot**. This is often a clue that our model +# could be improved, either by transforming the features, the target or +# sometimes changing the model type or its parameters. In this case let's try to +# see if the model would benefit from a target transformation that monotonically +# reshapes the target variable to follow a normal distribution. 
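# %% [markdown]
# As an optional check (a sketch reusing `target_train` from above, assumed to
# be a pandas Series), one can visualize how such a quantile transformation
# reshapes the target distribution before wiring it into the model:

# %%
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

target_transformed = (
    QuantileTransformer(n_quantiles=900, output_distribution="normal")
    .fit_transform(target_train.to_frame())
    .ravel()
)

fig, axs = plt.subplots(ncols=2, figsize=(10, 4))
axs[0].hist(target_train, bins=30)
axs[0].set_title("Original target (k$)")
axs[1].hist(target_transformed, bins=30)
_ = axs[1].set_title("Quantile-transformed target")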
 # %%
 from sklearn.preprocessing import QuantileTransformer
@@ -204,28 +235,73 @@
 model_transformed_target.fit(data_train, target_train)
 target_predicted = model_transformed_target.predict(data_test)
 
-# %%
-predicted_actual = {
-    "True values (k$)": target_test,
-    "Predicted values (k$)": target_predicted,
-}
-predicted_actual = pd.DataFrame(predicted_actual)
+fig, axs = plt.subplots(ncols=2, figsize=(13, 5))
+
+PredictionErrorDisplay.from_predictions(
+    y_true=target_test,
+    y_pred=target_predicted,
+    kind="actual_vs_predicted",
+    scatter_kwargs={"alpha": 0.5},
+    ax=axs[0],
+)
+axs[0].axis("square")
+axs[0].set_xlabel("Predicted values (k$)")
+axs[0].set_ylabel("True values (k$)")
+
+PredictionErrorDisplay.from_predictions(
+    y_true=target_test,
+    y_pred=target_predicted,
+    kind="residual_vs_predicted",
+    scatter_kwargs={"alpha": 0.5},
+    ax=axs[1],
+)
+axs[1].axis("square")
+axs[1].set_xlabel("Predicted values (k$)")
+axs[1].set_ylabel("Residual values (k$)")
+
+_ = fig.suptitle(
+    "Regression using a model that\ntransforms the target before fitting",
+    y=1.1,
+)
+
+# %% [markdown]
+# The model with the transformed target seems to exhibit less structure in its
+# residuals: over-estimation and under-estimation errors seem to be more
+# balanced.
+#
+# We can confirm this by computing the previously mentioned metrics and
+# observing that they all improved w.r.t. the linear regression model without
+# the target transformation.
 
 # %%
-sns.scatterplot(
-    data=predicted_actual,
-    x="True values (k$)",
-    y="Predicted values (k$)",
-    color="black",
-    alpha=0.5,
+print(
+    "Mean absolute error: "
+    f"{mean_absolute_error(target_test, target_predicted):.3f} k$"
 )
-plt.axline((0, 0), slope=1, label="Perfect fit")
-plt.axis("square")
-plt.legend()
-_ = plt.title(
-    "Regression using a model that\ntransform the target before fitting"
+
+print(
+    "Median absolute error: "
+    f"{median_absolute_error(target_test, target_predicted):.3f} k$"
+)
+
+print(
+    "Mean absolute percentage error: "
+    f"{mean_absolute_percentage_error(target_test, target_predicted) * 100:.3f} %"
 )
 
 # %% [markdown]
-# Thus, once we transformed the target, we see that we corrected some of the
-# high values.
+# While a common practice, performing such a target transformation for linear
+# regression is often disapproved by statisticians. It is mathematically more
+# justified to instead adapt the loss function of the regression model itself,
+# for instance by fitting a
+# [`PoissonRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PoissonRegressor.html)
+# or a
+# [`TweedieRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html)
+# model instead of `LinearRegression`. In particular, those models use an
+# internal "log link" function that makes them more suited for this kind of
+# positive-only target data distributions, but this analysis is beyond the scope
+# of this MOOC.
+#
+# Interested readers are encouraged to learn more about those models, in
+# particular by reading their respective docstrings and the linked sections
+# in the scikit-learn user guide reachable from the links above.
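# %% [markdown]
# To make the last point concrete, here is a minimal sketch (assuming the
# `data_train`/`target_train` splits from above and a strictly positive
# target) of fitting one of the models mentioned in the previous paragraph:

# %%
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scaling the features helps the solver of PoissonRegressor converge.
poisson_model = make_pipeline(StandardScaler(), PoissonRegressor())
poisson_model.fit(data_train, target_train)
poisson_predictions = poisson_model.predict(data_test)
print(
    "Mean absolute error: "
    f"{mean_absolute_error(target_test, poisson_predictions):.3f} k$"
)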
From c34c8cb3c2c1e9be0c1d4cc1551ac6c43f8690dc Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 14 Jun 2023 15:10:49 +0200 Subject: [PATCH 031/108] MAINT Introduce use of set_output to output dataframes (#683) --- .../02_numerical_pipeline_scaling.py | 11 +++++++- python_scripts/03_categorical_pipeline.py | 25 +++---------------- .../linear_models_regularization.py | 22 +++++++++------- 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/python_scripts/02_numerical_pipeline_scaling.py b/python_scripts/02_numerical_pipeline_scaling.py index 6516e79f9..19ac871b8 100644 --- a/python_scripts/02_numerical_pipeline_scaling.py +++ b/python_scripts/02_numerical_pipeline_scaling.py @@ -171,8 +171,17 @@ data_train_scaled = scaler.fit_transform(data_train) data_train_scaled +# %% [markdown] +# By default, all scikit-learn transformers output NumPy arrays. Since +# scikit-learn 1.2, it is possible to set the output to be a pandas dataframe, +# which makes data exploration easier as it preserves the column names. The +# method `set_output` controls this behaviour. Please refer to this [example +# from the scikit-learn +# documentation](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html) +# for more options to configure the output of transformers. # %% -data_train_scaled = pd.DataFrame(data_train_scaled, columns=data_train.columns) +scaler = StandardScaler().set_output(transform="pandas") +data_train_scaled = scaler.fit_transform(data_train) data_train_scaled.describe() # %% [markdown] diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index 3c4ce7048..17c3ddc2b 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -103,7 +103,7 @@ education_column = data_categorical[["education"]] -encoder = OrdinalEncoder() +encoder = OrdinalEncoder().set_output(transform="pandas") education_encoded = encoder.fit_transform(education_column) education_encoded @@ -168,7 +168,7 @@ # %% from sklearn.preprocessing import OneHotEncoder -encoder = OneHotEncoder(sparse_output=False) +encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas") education_encoded = encoder.fit_transform(education_column) education_encoded @@ -184,17 +184,8 @@ # ``` # %% [markdown] -# We see that encoding a single feature will give a NumPy array full of zeros -# and ones. We can get a better understanding using the associated feature names -# resulting from the transformation. - -# %% -feature_names = encoder.get_feature_names_out(input_features=["education"]) -education_encoded = pd.DataFrame(education_encoded, columns=feature_names) -education_encoded - -# %% [markdown] -# As we can see, each category (unique value) became a column; the encoding +# We see that encoding a single feature will give a dataframe full of zeros +# and ones. Each category (unique value) became a column; the encoding # returned, for each sample, a 1 to specify which category it belongs to. # # Let's apply this encoding on the full dataset. 
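# %% [markdown]
# Note as an aside (assuming scikit-learn >= 1.2) that the pandas output can
# also be requested globally, instead of calling `set_output` on each
# transformer:

# %%
import sklearn

sklearn.set_config(transform_output="pandas")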
@@ -210,14 +201,6 @@
 # %%
 print(f"The encoded dataset contains {data_encoded.shape[1]} features")

-# %% [markdown]
-# Let's wrap this NumPy array in a dataframe with informative column names as
-# provided by the encoder object:
-
-# %%
-columns_encoded = encoder.get_feature_names_out(data_categorical.columns)
-pd.DataFrame(data_encoded, columns=columns_encoded).head()
-
 # %% [markdown]
 # Look at how the `"workclass"` variable of the first 3 records has been
 # encoded and compare this to the original string representation.
diff --git a/python_scripts/linear_models_regularization.py b/python_scripts/linear_models_regularization.py
index 494192ab7..e97afc253 100644
--- a/python_scripts/linear_models_regularization.py
+++ b/python_scripts/linear_models_regularization.py
@@ -53,8 +53,8 @@
 from sklearn.linear_model import LinearRegression

 linear_regression = make_pipeline(
-    PolynomialFeatures(degree=2), LinearRegression()
-)
+    PolynomialFeatures(degree=2, include_bias=False), LinearRegression()
+).set_output(transform="pandas")
 cv_results = cross_validate(
     linear_regression,
     data,
@@ -107,9 +107,7 @@
 # names:

 # %%
-feature_names = model_first_fold[0].get_feature_names_out(
-    input_features=data.columns
-)
+feature_names = model_first_fold[-1].feature_names_in_
 feature_names

 # %% [markdown]
@@ -140,7 +138,9 @@
 # %%
 from sklearn.linear_model import Ridge

-ridge = make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=100))
+ridge = make_pipeline(
+    PolynomialFeatures(degree=2, include_bias=False), Ridge(alpha=100)
+)
 cv_results = cross_validate(
     ridge,
     data,
@@ -228,7 +228,9 @@
 from sklearn.preprocessing import StandardScaler

 ridge = make_pipeline(
-    PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=0.5)
+    PolynomialFeatures(degree=2, include_bias=False),
+    StandardScaler(),
+    Ridge(alpha=0.5),
 )
 cv_results = cross_validate(
     ridge,
@@ -279,7 +281,9 @@
 # %%
 ridge = make_pipeline(
-    PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=1_000_000)
+    PolynomialFeatures(degree=2, include_bias=False),
+    StandardScaler(),
+    Ridge(alpha=1_000_000),
 )
 cv_results = cross_validate(
     ridge,
@@ -357,7 +361,7 @@
 alphas = np.logspace(-2, 0, num=21)
 ridge = make_pipeline(
-    PolynomialFeatures(degree=2),
+    PolynomialFeatures(degree=2, include_bias=False),
     StandardScaler(),
     RidgeCV(alphas=alphas, store_cv_values=True),
 )

From 6d2a2ea2723dfa797f4d54c47c3027fbf9ae088a Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 14 Jun 2023 15:13:01 +0200
Subject: [PATCH 032/108] Add axis labels to plot about scaling (#699)

---
 figures/numerical_pipeline_quiz_scaler.py     | 59 +++++++++---------
 ...umerical_pipeline_quiz_scaler_original.png | Bin 13806 -> 15173 bytes
 ...cal_pipeline_quiz_scaler_preprocessing.png | Bin 40090 -> 42840 bytes
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/figures/numerical_pipeline_quiz_scaler.py b/figures/numerical_pipeline_quiz_scaler.py
index 6ae7b4100..9e3ce294a 100644
--- a/figures/numerical_pipeline_quiz_scaler.py
+++ b/figures/numerical_pipeline_quiz_scaler.py
@@ -9,6 +9,7 @@
 # %%
 import seaborn as sns
+
 sns.set_context("talk")

 # %%
@@ -18,21 +19,20 @@
 ticks = [-6, -4, -2, 0, 2, 4, 6]

 _, ax = plt.subplots(figsize=(5, 5))
-plt.scatter(X[:, 0], X[:, 1], s=30, edgecolors="black")
+sns.scatterplot(x=X[:, 0], y=X[:, 1], s=30, edgecolor="black")
 ax.set_xlim(-6, 6)
 ax.set_ylim(-6, 6)
-ax.spines['left'].set_position('center')
-ax.spines['right'].set_color('none')
-ax.spines['bottom'].set_position('center')
-ax.spines['top'].set_color('none')
-ax.xaxis.set_ticks_position('bottom')
-ax.yaxis.set_ticks_position('left')
+ax.set_xlabel("Feature A")
+ax.set_ylabel("Feature B")
+ax.xaxis.set_ticks_position("bottom")
+ax.yaxis.set_ticks_position("left")
 ax.set_xticklabels(ticks)
 ax.xaxis.set_major_locator(ticker.FixedLocator(ticks))
 ax.set_yticklabels(ticks)
 ax.yaxis.set_major_locator(ticker.FixedLocator(ticks))
-ax.set_title("Original dataset\n")
-plt.savefig("numerical_pipeline_quiz_scaler_original.png")
+ax.grid(visible=True)
+ax.set_title("Original dataset\n", loc="center")
+plt.savefig("numerical_pipeline_quiz_scaler_original.png", bbox_inches="tight")

 # %%
 from sklearn.preprocessing import StandardScaler
@@ -44,32 +44,35 @@
 min_max_scaler = MinMaxScaler().fit(X)

 # %%
-fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(10, 10))
-for idx, (ax, data) in enumerate(zip(
-    axs.ravel(),
-    [
-        standard_scaler_mean_only.transform(X),
-        standard_scaler.transform(X),
-        min_max_scaler.transform(X),
-        standard_scaler_scale_only.transform(X),
-    ]
-)):
-    ax.scatter(data[:, 0], data[:, 1], s=30, edgecolors="black")
+fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 10))
+for idx, (ax, data) in enumerate(
+    zip(
+        axs.ravel(),
+        [
+            standard_scaler_mean_only.transform(X),
+            standard_scaler.transform(X),
+            min_max_scaler.transform(X),
+            standard_scaler_scale_only.transform(X),
+        ],
+    )
+):
+    sns.scatterplot(x=data[:, 0], y=data[:, 1], s=30, edgecolor="black", ax=ax)
     ax.set_xlim(-6, 6)
     ax.set_ylim(-6, 6)
-    ax.spines['left'].set_position('center')
-    ax.spines['right'].set_color('none')
-    ax.spines['bottom'].set_position('center')
-    ax.spines['top'].set_color('none')
-    ax.xaxis.set_ticks_position('bottom')
-    ax.yaxis.set_ticks_position('left')
+    ax.set_xlabel("Feature A")
+    ax.set_ylabel("Feature B")
+    ax.xaxis.set_ticks_position("bottom")
+    ax.yaxis.set_ticks_position("left")
     ax.set_xticklabels(ticks)
     ax.xaxis.set_major_locator(ticker.FixedLocator(ticks))
     ax.set_yticklabels(ticks)
     ax.yaxis.set_major_locator(ticker.FixedLocator(ticks))
+    ax.grid(visible=True)
     ax.set_title(f"Preprocessing {'ABCD'[idx]}\n")

-fig.subplots_adjust(hspace=0.35)
-plt.savefig("numerical_pipeline_quiz_scaler_preprocessing.png")
+fig.subplots_adjust(hspace=0.6, wspace=0.5)
+plt.savefig(
+    "numerical_pipeline_quiz_scaler_preprocessing.png", bbox_inches="tight"
+)

 # %%
diff --git a/figures/numerical_pipeline_quiz_scaler_original.png b/figures/numerical_pipeline_quiz_scaler_original.png
index fcf14d3057b9584a0e324111c483081f80b622a3..6323bd9bfc05ca2ccaeadf5c15457830b74eee05 100644
GIT binary patch
[binary patch data omitted: numerical_pipeline_quiz_scaler_original.png, Bin 13806 -> 15173 bytes]

diff --git a/figures/numerical_pipeline_quiz_scaler_preprocessing.png b/figures/numerical_pipeline_quiz_scaler_preprocessing.png
GIT binary patch
[binary patch data omitted: numerical_pipeline_quiz_scaler_preprocessing.png, Bin 40090 -> 42840 bytes]
z&g#5gf7?*V3YtLbr{l|j2Rtl$VZ3C0q{}^8uTbdjR06#p;2}?$;eV(Hg?^v#j*DSu zXCq}{>@ti8+66)N&|_`+x#!*UcCv5ZB(j*0m~e*;o04i77H|p^a%r-U#7~XP^+@nENs(xV+36ULswo)s|}6?ae)ajB!g7%)b^1~&Fnbt zbW@8>o4H$^oXuu!NB|I#D)6_9ZncsWb(zZHKvVP*%889Vp6JX+m9euM$-ZTe=Z=Z5;cjXO~ zA6QdQ?!M5#?TURPh(GNqCN&$&^e=qF`?^7;gm1tZJEBea`ky;OSsWtz0=q<2TRk!Q z>hk>Hrp8tA5Ws@@&GjEZ>*8lYsA#JNWXnV&Z7?eXt#{X+<{fbhD`!36Ky%NQF*@0v zW_kP=L(1-JIoZELaRP7+0etzRUDySJ0Bysnc5*?|tg`Y!v$O3u-tsl!3c_gT4P-Uf z*x1y7Glz)LwcuFrOXC(1j9kbo{;KzG)3x_SR=9KS{t3ZnC9NpnLkR;Cb6h}y#V`oS z`V|I4(){OK4NnD2S_aE#%v&>*21Uy6|c9C**DMXdZvQ9yYH>ei0ng63$u zpi0>9XHOh^E;#VYy)RrYyBjZYad2_%H`i1TkB{zKE|3Xa%&a2S*u>9cj7IqtPaoeRZNTk8bvvHVHOQ4b!mLUo*t;Me(rX~LDPj@ zHhE#d)U;Q}n8V2cD)>8Y0M75ON$)hoycc=Pv}x3b0S85Lv^1&OoiP*u!EXydt7{Ea zMUlKi55uL5@xaAZ3;I{tL9&h!yA}Y_SO4H(AYhRfEgUD@Q`Obg2Cvl$^78ftT}OK> z2hQd#*ETi|a$gq$qs@1ft`JU>6^{w4z7DLXE2Zozrj|`jlGxv+1lm1SxRIp})|WAb zHw_b*wBkGSOv+gW3M(1kELhQxaYzS3(LIIkY<9-oJd*r@F{dNq$mBuDe}*@+Av_Uj zBol)QVMyzustx z*62~|r@z?zuLr|CaF?!Xk=BV+ArJ3vW|gW3|GnpT8%bC+B8iFn=9+;f#%UIHOaUji z^3&`1@jH1Gjfu4DPW|Q^sn*Xgn=YpJHkNWzr`2!!tL@E+=xXU7{hP2m8g1~o z9(3hul0UDZSU-{{u9pwl*$%vpRj`FAqW@MNFpx#(PItn`Wziy3AQa$zC4KxDsI=59 z)zaV~hxes7Oy-EbIZy4lXx8r27p)cf`SGdesJg3*OIA@4$6CgQET)2t+BZdb_}|VW zM!}sQLGF49fej2fn3y%BnxSrA{7T8v(TE|K0buz~RvzfUGlBvk%Z4IlGRfX+<~$<} z#cklK%yWULDMHRTu-uHmLTTfUC4sTsd~ktND{tR01;`KL?e`OY znLhef6!LR^w?{(f^b?t%n%}pUgTmVFW>BT^{xaY+YE1ZvdXz(t0MrrtQhFkD$8@9uVu%ySd_z_ zKM$UA8dUV)Z*MPqJNWtgcQsm%wfpYJ<7UgXvsZb;=8 z&}vgDZ)*C*YaUmR=-cqaorVjWjy_6@b-uie{rT%xSqBG)rH{&@6vyt11qg&RplOVtOzZFioqTON1wt;gMuB?u@q{IM zQzgn2?%XW&lwOV~BDj>xr?nT@a*E%u)hPl&(bE8dJn%pRC38@E8Ry@hS zkRoKK&`t#EM``^E9QYf!8p>Yx)tgMy6(ZukGmwRLtpLU%D{bXo*yuPVqXYA{H{oO} z4gkYL9&YZXFy^%PFYSk4z3v=sIOb*L;83oi>KERrV?aYgL%D!DP8x5n61(;xbSIQYQ zn+4VLhc2}`D}Cv7d+sFG*OzD9jw=d@Tt?4-Py0|lt0eyK^uR2Bt_uFgk~s~kg)KPP2YP^aAdTasiWwQ z=NPSJmGB0>CctHI#$4Md%=AYKb+|54<_ocok;KoU+^{Es3rSxv&^sE;`7rXw&8@6j z29?FO9KDJWo(1U38kci{X-1OlL6f7C#e!5|QED*6xvWcD(Ou;2a}b$SV!k=E-3d{-Lj{~wA(7Ts z-t}w3_C!$s>Pi;4U+ao@ z5AU^Q|61GGpcy#11(dlT8GCFtz}?fv8?Wtv?Q>k3o|%yWfwT(Xj8fN4SWRthiQAsx zqc`7g{q<`x8|vujto`{T>Eq+m^W0mehCZPYkj*TRZpE%DebM09U2*jVV4zKU{=t6sILNZ(?7_@&GwhMi?hgaPDPm&4J%F)VEYI>27Jb0Q zP9xpHYM=7N#1H;iFH*3l+yEg})zrj+XrO2|1nmo0qO;A69XJz-u9uX#f{rV5LA*D_ z301UjC7++6BR#ntV=gmitJyU*l*f^HhnuHp3OF5IUCqEN%K4X5(_VBc>vQC{P~5mR z^1UB8peRS-aZK^OJ}X|j13^L+bvWWwO>W>4F>AuX;d_yK3`YGR%6Pc?=lsOT@ZKCN z`H-5*4$O-0<8?jcO#9Nd+hGxyMvgvjS`=Wy5%%Dl{{Fts>j=0-mLQ+_N-}1ly&0ea6pe48ntUbCqeRsp0E` zIr2U}4TmQu^4#V&HgP>Y3TI#sZjHy|@MpNrxXV1c6ES7qPV=)+i-2(|BUQgGii)|^ z^w$Rfb0(r%(-?shq$APwMh%jclMCzH+olRA7uSj#t*^6LIb9b$dUq{9A5A07x8@qu zdAov-D?z8$Mb5^iP}{zh423tzV`FXY{lbD_Zc)+6rhFS!f0g~>Ex)J7V@BIL+jsk+ z407>?wc8=VwP3TvyIzJP>^d#Kj;%!N(Klcp0xyfJ@?|(vTkv^`9a_#aY_dgWg@mS4 zUP)wBoN}`Di{*p(pjMw`3i0@13;e2qe9dwg)X|$N%ov-Zlv7q#c4~EXb;S(;e4?m# z4ITl3NA&Ok#_hb+8JDdIMza+AV1;Bp#hRL$7Q=D^Hup$ISHj+P6cJ5HOOeG#DEEMq z(zeMn_zyb{;}HILfcXCSN`Z@ZfSdSSSEmiIuAu{s;d*8W?T7E*xt05}WF2PwfdA?N z^trlvyuO=;%EsYn;qv3!?eu2rNWHMPIsBktXZdp-!Wu>M?h`pN7a2&*E=v^B4!i@B z%X#P|?}xN?eJ;_}W?vErY--TCCm*ADf^qjPn%$?3_#V%1P;pLGmB!^2undtw`#>2J zoGW*msIt%YrCJUo6eWW8lV>lVA8$`@jQ0v{JVNFwlbxCR-va}gp2C0xZMb{SHh8On zf;jLwegt47tYg=#Hxyx6Sp7UtCw(<7=%+=goHWAaBl1wA&G1Z7!HTJj%T_HcEkAVJ zlgF-BRXs<-UO5j09!uAN#coUm_8R(k8)x;vtn@iQU%@i~Bec`_17N3}$+ivaVS1x_ zzE`1NhNh$z9J-x%>nrd!d-iPL9$?;ONm{pOz zuhqDWLs8~?3#|Y{AWBOIb1==$_Lot`kyet7RVpQ0RhHPaZ?&~rKt_ZCyXgny&KOH4 zR|aNO{EOOZfaB_uS?TFPHwrPd4|4D{a`78BEjFC9o>_$(gX8Th{Ij*lE#iUb3{Zqn z#jVbk3*9pmQN=;=^XZ=D=fT0KCArKK*SpSpidp$CDEB}s2i3827&B&^(2YL@rlxOcM^)h!HqX5|zXK1S|( 
zBvcYToeT6EHA)N(<{pY#EAqYi22DV0uBg+4gABlk9j)ziw#;SQZFh8X(!Due+G<$R zVOdoF#nn*~EIaObt>D|=4|2bLeUJQ{d6Qe2Ri2VJM|wBI^|;V&KQU+x^*x&$(Z{Uu zyN!ly+Lz%uvK~I|QFI)T@nH{o*5-KD<{Lw{uS6ilhUtl~xqo3m);pe{xN;nlL<2@8 z_mMdQRI56E5{gk@waOA9Z@J6apCuZuW&zy6fdIxm-fq0{d8w{$2&%JgC)2JaHq$~q zi99kpOu;WzRsF%nB-$7)0`Pzqq$Hl}jmB$S+&hn&z)QS!*&aS@E+5sUI^lb%oERWd zpo44z>9X?2JIn@NR^V1e+ETedjL6q(q-FYu4gP`z(Yo?0xx{|)IdW?3ZS@`iA{ILQ z?X}TJ{u`9WKtVx4xyoDL8^0CS-~doCR$-kyqqxWAd+k+sx)5=CKH3-!I6Z2VgREuVyrDSWnI*K8K<8%TkOan^ z0oTvipt3erX{$GOB1}y%e*W!mTlel&{)Y;JGI zwMLM8ddNPmULrYLq`SF$P!ZJtj+rGc=P@hb$&P~uvJb)21JUlG-_HC{>RrL(k zoUUAez>}${JvJ8-n#Q`b6Y&^OYvgl#!>UX%vGm8Hgbd#OHNy6ec!XY8R@A&5E^mS9 zXWo9`Crmc^n$gT-H{m;ysWPQ}1LqCgVT%sHhlm1kacGvZmaWCB-uJ(U;Ij-QBGJnMxmn`ai5; zIT z9c*y)k-iu0u4Hp8P=7RzPF|_)Ij(xUwy_oXbG`}mNFjF#wzQxDnp1w|5cg4d{O0bu>e9 zH2bsTvSeg`8bI5?EgIT7dWbdZNmCe-#G*d-x0yr8@jsqs7S_jY*H;$|e~5KK@0`zG zcMcF3QEmgW+{}=8v zFoa?xF);mq_$cJ(H51Z$J82+p&y$$%BvkE$aWI@LjLQwk5pC09-2NYZe?t4pKFM^A zg7?ODeD}ZIeL5T*LhFG=%B0YQ?5@m7^S}XN4um$UUZIQ4ub#1Qke;u<9bSgp1gHs* zipipWim$5TIniLku~l&HQr}h25Bq{qQ?wz_^CsI{h;dBr{XTNJo3mWeLP;`9N{B&^ zp!@R_t@gC2?Rq*xP>*#Fs2`8>KWI~IQ})>>?Yp?%qq`yIG3{yU^SyQiHgSWPZxy{) zX|k3ZpdSzy0Uu%eZXGznw*TOx89ilnwd1NGV1RCCu8lvLg~d>M)R8r_N7ZNMSA8J{ zy_8YlAacd<&)8d!mUyoT4@W<-^KUOyKybakQ#O;AL zjyR>ef5j}jI|MM?YlhxhzAFMqC581_@3V>F;UJKVmsVF}%(vC(v!qy9StU+p*x&p6 z;2pad6&FzjPSao~1~rTLIu#9u(M(qI@O{Yq$NJ^WmogK3e(v#fzk}hgmm5$BztL%t zCx2UuI5Q?A>!2bt>_uuwH)+W!qT0kx~Ep6IB zl-NzbIZKy_S;Y@+=Ip``-D5+qk{@>XG8lpW8_K?t|7lo#EA@Ys zch*r=_FbR9Xr;SD8bl-|qy-U!xg{SzrDZT&n9~bqEn^6p11a8RDn>m;fk#b zNmB?QH&?3;XTlG!rl>7Dtnes-i+@iF8Pg5f@+(U}Ozdm!a>)HUl>O5i1P`k%@BwHC zKAims70|zXGl$VF*Ke+hMEu-vrRRBu^(dJq6+`>pZB4t{FHdz8L&(p2G|KC}f}A)u zGLk$|(c9hBWx+ZqQH1#^`>yyhru&O*U*yhu?#}xlA~HXW;=7HW*|egUT>M@;k`oO0 z@o2L5PkXR-PFfoNmoHz`tRc_zYpzZ(B}6GK@O@Xlx6*-NHM(9cp9fn^*nx#&;Tj>? 
z1F$5@!aI+uuj%MJJGAx-mgWHK{o0Cf$S>?s;bB{?$lOzqgA#^?7i> zh5xqTY4$z1>XR#}{EC*SgNTM((;LNcq?vDvBv`^ijDw8pJXXszPm+ z1LOpr&V7x;@j}FVwiscaZr&!hTv=)9KP+lDwzpFY-89xs@AI<@H~qt*`rDv3m_(?* zr_=JL?Ya=t`S`M`x_@XwpzaaswS7Jq(tkL{c8zAQ)HGth-(4jB$T^1l!#Iu#VNoA8 zkPo;+UU=Mn9pKHk>GT3_Xz;)>aEmAHEoep{va08yoqu&If2=AZ{d;iCgMZk2e_t1E z&Q+e4beVPvp%{jJb>jc97XR@>FROgFWmBUMT~ZaU+Z}zL#9pT0{nm7gW0cK5Es})q{I= zd4V*tKWo<^D2Xn?;q{f_w5@Rh~VUM1hn?6)nX}k$F>r+gTq5bOlfHoau-=+5F*to% zI%}njWOYxIEM44dA>s7S$X1hRD7Ji{${U(_p(FKtoG-N;0vUmOXi$CpA7v zw7{<)fNxwtvlh0$ett21=jUDN6y4xApj1h9UWAwkk6?4Ae3T8ndng6}&1}EEm1hE_ppfQ+o+chjb!_LfHhKOZgrfl{TZ;xXmc~UMyfuI#Ed}&(WbjrNY})gJ7R<2J<;;IKYrH9}J)% zA)$Htx8+#4pG0L9FJS4VwJqdxY8x6BOYn#i*6jK!dPrmOA;ce%KLJ)Fo7cz8oMFcm z@|DrFoZD$GcOmde(&MP$7_Z!&C-mZ{H%UJw9SCdb?Y#>YAptaw*b}D@7r6E*Zw?Ep zR6zp8-r@Xt;8=%(~}z~^~H3kw(*70_hB`NqWY!dXan zsj90J1pG?qONDe>VMRp+7Z|ON85z-Sk0|t2p#_pxIhMml1k)+ZO-Tw&7-1-0)zpM- zfXrvE>smfXzsVjI0*@>c{>Bks*TLq)9c$QzmxPb-rbXNCax4B8H{9h^x}`8vY+t*A z=94N}K;MC$^vZo@1@scbPfSdN>X;54*;jL^5cQBp#+OtU&cfI;b4>D!G*qu$z0apf zTk&q~i>v0683?rYX3t)5lK4ZMxnl2J~`bO0A_>+s>XbH`4fKCN-B_j(Vu z80S^L7pO&KoZU>r+5Y9QAmX2tJ{7H(d7^YLav|b(`r`+HCn7{tZO6^fIH(3|hvF_| zVN4pAr6f01rN{hau^)n##}0R_apJpY6V3~1wwMu4i&-;8@DF3bGj0occ-Vl;6Ewkr zxd`hJMQ|C3m%dY+43)P4m$Q=sK$cl7i*XZ?TCJO2z2TS;}}5 z4`CUL`g|z@7Ann;gpXP}-F#z=%1Pe*EB^rATDmByk;yYJz@mi5aKd|c`Y^4nx5U}4 zb$(uJgw++6*Xx6a*ilqPNN8D_b$X#``h*aq&S|7HS*jaX?}r*52{35b)vtqXHk;y& zpgiL*alyjVx%04c-xA!7`>)d8g8O)}+)w_-F&UqEci*STxU`w}Py#J|qxM-TAN>Wl z44*qF$V72(2m5x4Pf2c1{rLorG;n7q=>s-HP;7fZt#ge~lDf?W%u#-JaJw=9VKZ+YIr?Dxc7QZowcy zSzfd_(;S#*^_S*iXpkSCx^_K!7Fz_p{7t1CzSbUsq~|Px6MwqygD#%NE8TgwABv@r zg$PypV=g*HffYSw-$^-Iu*qwWn-NeTzwOaU3SRFeU(@sdGAZQjl#5*0heM$hA-|2< zC*_2u$CZXY{yiz7_bme#0`HZlhL)1&f_7H#Ia;3wF(|lX7zP*r{#l<3vxYGzl803x zY)ErJnH+qNy@dC}kUAc^U)q4$CtGJMCv4{0Q516^(1hLEdL~G zy{Gwnf4@A%X^7^fPoZ1*yvUw=9IW4)VroN*v4;qxhbO>Ud z!>;ev1kghRy0fdRso{Zm!N9^IQZV`YvDv3IG!zj4g38~EA+-;SiBTE9WB^~}>h?-S zRDC@L&1hbS<4L~z2jikqt#Xs5*S+~@?(p)k;tw$y(Aw-LiN0+(A%ln! 
z6ra#YzBXNmze-&xgxr6>V;e5CAH`lV!YI)L+Nc+@S!Vcxw9*PWf(WRwJ^YYBMZ+%w z%qK!SD(PsZjMIm)TzJJCfWT6~J46?4vgpi2V*p74jff9~sNFtXh}0tD-rgS|;6eY8 z2Jz5ikl}(ih zn}GK=6IZIHVb%8Xj)oO;+2Vca{Ldr4$TGe-0W(4WF~pkW)JvStF%8Dab{Pyl&7%X7c2aTcL|98?#u%f-!;@Xrn~r1U=ShmVl-`@dCdR{q_$*!yGrUCmDt%23>vY^9 zu5YrBz)dhFHCy9wq~%e}gZ7b;isQosvmBVrS=HN4J88J1JUl92pIyX}gJJLrKMx1e z8{GO*iW0Gin(H&h=$^rpKgL}m$?MtT%=k{2@%=U?ld)~6P$1*`5GK=k)&t;m#hm#J zKp{MsaaTA)?_fr+A;xmpE9==iJ>es5OV?ksRmK9FFB`XhO87@M?ve>+3$V7gZ2C?L zA2AYMdNe8Ji!)3Cer*P>ensJOj^RX8&caP2OeTP7tts8#vEdA7g);7zDhH+*POJbe zv;3~`8H`dc?vm&IW9-J5@|UJ_%_s)qKK6t#@}3HyXRUrIg*RN%3K*`*|(*Gv=!&{Jj-FL=E6%P z>K_Gg-aQfnENeH-40ES@QOKOvUO+rOfunbUEHqmn9B$uJ3Ex3mp^B|H#l0SJZSRic_*ammzC|gwbN`NV8~P{Z7ORg$ zQnm!dPL3YWY1%NjXWyz`wsyyTYi)m9{!Mj@KH(gDPSRg1{oBb=#2sF=syEvLh^9Za z4Hh-wPyg&BwTW!oW=B%ib~r*Vs25EZn=EaY4}TKv{?tC=t>0O5xNSbF^ltg}aQk;d zM}GzYcf4f$4M2?Trb&>yh#dKJXaRDTs`^Vn4F|~_SFZxj@JDmdbRd47NX9}B+bgz@ z+DIGGS_1>m+0(!P?ZD5j8Dh;ieYY<^SKCIrO1Bw*fs|%^>1Y#*xy94DHvNtROHoOs zk^ZHEn@4hNy-DFQQ9IMyim=JDx1`x|V`Mq@U6v}YWXoDDvk&7jneVAw0O| z?e~5u&iC+`so+oCF?d3f9YgE#)B|@KrWKLEr7kb!SE*g>RU19!Y_X{n%JV8wD-ttS zsbn;7Pt>|lVFL;C#A@x^kQ~nJHS8rH^1hRmt(BHcdWseT4}J>B+kD?4aXI*WI_E@y0-l3f0`0gl(sxrPsK;C$ zD~mJC#TU9E!*E&_`xmN~=r^jCXctxc<)5fpZ9vtk{)wuU`i-hZC`&tKDLP;2$e7S2 zpTQ(i^ar~69LXukk;%HoAw^dyg~{xL^|F2BXj2c;%M|@k5knsxkn!CPsldzA7G4s$ zV5HVVl>IZ0wv$4k+gllye@YijW)#kl0_hD~UNNMcK5L0gOGp@C?58C4rl5EX@z&WB zL(CU~C0uVT_i!>r*Vuo(jIs66CwTvbnN3fhs7NlzJ^^{F^l@^eSyNY5ZBjy>csja2 z3J>SQ@#fP~JW;cc?p?K)benY8ivgoq*gkq~HRTnyW*;Y?R0_&9UaW{ZyKmCul4sj| zidw?*rL4XyvZ}tPz1h!N&i##9EBHTW);hn~W!4h$0J9cvRasBJ=7`{}zT1U}BBc$< zE4F8JoeRZc_{EsnKJ&}n%w^1F#}eDhmyRWH`O>{wpAkm+-Y4(sMriz^l1v+Zj5|wU z^DcZ$BAjnpC}W$bQXkJxO=ELXCPL^~q-6vIN%NH`n)WfSV69ELbYkCZaI4Yswc227p?KcSu0l%RpBE5(7);e2T=zUZVPA1uNT{p z3xsp}n9MEYD8de_?o*uPWEZdqxzRHMP{1Pe6iCiy_p>4*h%=EJFP0yXL=G(m1{=N* zZIJiCn_`MyV16ANUdsi?4t!uW0(~O38;WQ1a7rA=ui^X~GKMi4k&4&zUWMukoP@6I$_0RN=3dv(K!~9FJ>?Ck&FF!M z)b`s`UA)pfVdBiS;)>R%e#I%+4r!HUv(}UZ$fSQSPHc%dgJi45pv}i=Dr1 z*MMd#e`D_kDTJn3cFW?Sf6yfJP*VYV`ucz4&~;(ozmPY1Zjr(}*#w*@=>R;d=efBj zz}pE(&b`0zGee101wN|-VOQp>ne2hL2`VLkIfe__!IYQ>*m>Fk8JErw)L(mjFhX$g zGAWJ0X?PohQIG-Q`U`pKc`w`${Z)1-F1e|*$Y~zxe>Pj|OSVxQpo6)W3!X$?pn%E1 z8ibOsg(T2g8EEL}s8lw^Q6?p@E5~}+dD6nsQAmkm_rABaeK=n@pwPFbr%@BemPnTq z1{Y?Gj)gGz?f+9;mHzN>U)RLIR!3U@oN*`xueVq9QJeiPUQoyai(fO z+VJAZ@g6KQKwkbznQ&gM@5>6c$rf^p1bmtG=HxvAhZXPUo)Iq7y3x zB!bhw3xpB@U-#`M7-7UieT(N7{$5s(BLB~yp#aw6Uh1Ce|2hU7vR(g`9eNCC|4D@& z>)|7X4nWbLVdh`y^IsVNe>03qnfm&AlsQuY^=XtLdqFGGIt1%l3aTSrMNu(Io9f@=G^z28k?$cLVqSs4Si~vwKD2y6!uO~h^ z5k-u=Dk`deosT<+>!k>%*U|4!DPj?iOfM^XW%vGS_PE!_iRfQ_xZe<%31?yQdY(Q^&4}vMh4V3E)blTLm z@$XNx|IEe(u7(B9m8>C^A^YjXK{+|qS# zlDTFT%w^V~@YJFrhPAadAK9DXLI*+S{FS`mWD%bCztuXy6AgkLSvsU|^QY`8JaG^* zCZ^^_aVq)LW&@wK3Qe;yiv}E*sy*DDug>4*!{7Vg)urK0s_kqWPIHlH{${L8fW)^L zNM-0lhli{n9*@6MKAuBEo&jDZZ*T7`ZXIjQOm2rwjKtnWvQ4x@!3{0kKrJ6i@dI#= zkVA!Urlv`0D_yy)))##q2qc5^HC#2_B_7w-m!FmFT%pd|dc2)4t@D>G%BU8F1cU6+ z9#9y8V3meUrDE1EH)_4#s*Ib=N6@>u{M;VYYs``*#MEWs9Co#<_pYkq5(~ z8HK(%cqor!CT()B&2CY*N;NKPqZjwHZ|R7tF!?PMvjoaJa;<`)8->^S_R|bJcz7qq zA~3L`Az8^BWclcZDIxD@Wt9xDpz2`@T+DBHt>s|cpFTtsDg-d$=TVlxGD4`qcr*-j zBardotFlvujUI7QD?vZVB4WT-{nT4*3V$P6Qo6nzT`ggS}fdts1+lQz@RB<{L~6HA9Rs3fJ*dBd+XZ#k0w|pa*yg`1-h_q{ zgKz(q2Bx&R!4pUoyL%m2PbXP90SNohM6lu9iOXVAIE$@ML@3rD9-SY_A=zLb-p@Nz7>93U1W~*r=zk=CRmOHxD(BoUv8AJ-F`Ut7BDWC*We16_v9<}bB z2ei+pi61ZN@{9Ll=js9@f8LToi%Gi>Zi7yix~p#6BOI<<4plnD{}Z?3@w z?GkUE3LGAx8~7PSxwM|6uWV~}A`(^wD1yAA`|`GB;JRRzpx2hzU6)niL-v?^fP%VW zX2Wl>z%{jG1azevH*SQ8ZQ7mQ4S0m68=&F|RVsj#qP1UG_EvEg8dCe36l`4rbMgKX 
zRDL{ns5XsF2gdpJzrPpLgnk7OKD8J=kC#caeA=a8JFQSk6@Q|HSfzdPeC&P}hI}WJ(xC(IQ82o?CRd2MEz54{npN`BChMgc$2SAy?>(pv!Ib|k zhh5Slp7ZWx4epQ*-AMXt2t4C{(u)cJ_?-8sd7!4@4_K>T^Qdf&rHX3sS z7?8Qa&LZWaA75FudAM})9=Uq9lN&wHP${YQdm@dCixV_6Gs9PfsTCt1P^#+7e)y*W z5xy!&9#FSm{?9lvL?Zr+U7aWj2ICK0RtF3rwGcm;@XTP+_F6}zJwc&8X|@os$3iK- zJb~su_#nIoT~NHCu_-sg=dpPe&}C{cjZuUle9f*e$^5O=-h4jVZ$TNRjv9`CoR0

lMXYWDW9$NNNU6mpm_L!tHLPRl7+XCCSZ zO=(a&je8WvcuG!+eLh5h4BtNdHQ9UuRVU~nNtlu;9fsz9KE(n zk-E@sC~+ehs6rs$5=~}2y-$ZxKk=Fu=DL@K#-n?hzd9br{2CDWFt0Vt>s+{NU4}_f z-O=Bl`YbdX+~u{ZT!%_--kZ^LN!z;ZNPv}@53q_}7C3ioG>pv#7+ol&XJap5vob0Yr7Cx)7j!_~gq&zn)dx=v5C4_CBq= zes}sq9hLQ+VoJFf$uPI2dI0!rWEq=+@EM%I&?g~>yNT`6i}F)J{(8^XlI_jxlfZgM zR~=vHu<$+4tR|TP7!YL_Mc~aQM=byL?W0H=Ts)a7!VTmJqu{OQqm zPwqF#ol40{USG5XL{XKEo~=$znMN#qP3fnjNw6Z^Dg<3$lMQY=hwfg0pm;Nr_&t8u zpg_mw;6`2VOpw~V)j=`_^EJ3!(4TIm%=iQRs%k3b=d*FVkOzQZ;@El;9$CEU!mR@Y z{s^H=0tZiYQv>rw@Y+{?2yXHt0A5G?Qk@{$^#jDn%5|4Ed@0UaR5)GJF(Hl=5WYL& z{&O6y#U=t27hp1jK*SUh6}`Fiokq*6Y&lI56%Hxm<-q@9CDnW^GC01u-niy0d?1*5 zG_x|qw_6%^hm`VW4IC}4MArvc9Bhj^{05!SYBdT!xhr~nhX0MK`|Y6R#n4OE(n_U3 zpLPTLV9n2^Jqnt9!Lyza&;hnrxTfeAbb7s#Ph(L_W6(8h$ea$1BFjWil&n`&7J?=5 z1f+ms5DL)!3%^=!KjTB6MZ=YE@Q4@}Ib4W!&YL#LvGIE)SyjCNiXIG+uxrklv)fFJ zX*(KZ0!dz%ztc?^y=OkhjSt;M=s({)ZC6sPC3pTdQ7{2P9^>f)@Zhfotg=Ga1OLHi zJ9d2PMXrEuYp>SX?CS#t1mH>RVdg{ww7bcdbb70nO^Fq{Y5=SybeQq+_1+gWVljhF zIco7!@4cke`WC2n}ejiI9gL z3leS%gzjR=9C__N!I9}*x7cmHTlHTv$vs!+QqBZxF0Xq zQhu4gMvYTq^RJ;o2ncGE0!}ucA+ok8V}z+OG?b)Jq<8DUXV*4SOIDdRs^u+|u7}8Z zR2eC!jNfgG*gR1!zT14_q+Ix5jwe*p( zjCoKHT1-n(QwML=va4`VSJ`+e0KX7Kn!BA-vi*kOxzvM2iJJPxwFa|J`#N1Qk315Z zj1beN<^GCNCo}JoXPzBfr3w;C09yuI0>Pyq?m-sz3xEp)tBv|=v2N0U^Myn}Z6l6! zGN@N{;BbFs^xc5=t1r(4I8yEu3l@Y)?r$)j7Q~?;*q`2Rp%F8V&yTDFeDILM*9g}8 z^bJQmMxl(PlbPWH-vxWdQ6S%icDs^$^=?cw`@IgYOymqOYR+3lroYP4!JItlZ&%=W z<9^Vs@XnP=0;*NNoO*VKtLHjeT3S-cQZ>qWY^B=-LPvin%e?$p@>J`*aI<}LF5Nthj>~*6cl#V)x73MYf4c5BH zBDY*}DgBGv=-0`B4RZ)$OmcINM@!Wo0J8;5Kedp2hrgJ*KWkhmnXmh=j&P+p&XR~H zeEH3!Prg2`)IJN)^+7sQE$XqW+;M70ns`&dQGj6{bk6sMOdYZO0U|5b!;( zQPL~65CUAWX^M>`WsMkEj)dewgw*o~KFE=Omqr6P$c*x*3CBF*+hXNE}`;{hiqQsOCmyaxDLkHA1}hg4Wqr3impv{C>MR6 zkxvn|KN*}eGpf`sw((YsCvcgTJn@ox%~KN|&W~-dlNp^qT)Gkvo|EKipE~3=cQc=O z-OXm_2KKlqa^T5(ug7s!+g5-j^}k!lh*LM4q7ki|-s$Nc9VzIZM^7cUzm0!{)S}zZ zct@1<^(S5;1(jMMo%Ej6H-x2_f*WJ+Z7ZwBH@B^O!7_L<;X%$c^--DfAhl89Jr>Wq zeX6hU!X#6(pPnnQ0fQv(WhNSy6K@Ai3g)eZKE{}bzvI^@na-eYv;h+Ldn9e3iH-hX~-F*O~k6O{SuQe z+wTPA9MjTxRxJ|1&o(>Wx4z=4e(|C(z*ON}uS)C_X3-<|=*)WeFX>rm2{Zm>8)W&Y z+8*MCd)rnHIO|mY`I(Mb3w^4LzKE8>MpCUyx{PcnC>7cxoT;98uBmpB{9_q#2CI{!g|W20 znoofqLg{_DsXd)4ytJ&w!Sb9*`PINlD5ZfuKkNz9?ARnV00ZTvcZ-#x6LD zeLX)~TTdrs)Z8K8{jF2j_W|QWytnd9<SLf(GAKVnkhJx45WSwsgC;Dx?_+O#- zeHkrUUS1AwOp1#mLB?igG+kYDUmFPv0n>npe9>v?>&(`9HX9jrpRyw1DT@tT6zG@i z4~Lb!v)Lau-qU+Q4rD_k-D1`=l5oi65;~ra!$)ZM0Qs$+j0z^^vETxyHcXenKd4&(Lh8?LVrfx=P#Xf^$X#QXPe0#|Ul_J*=5Atv*oKvSl>l2JwyR#0B} zM;V92J=%FPt)#xF(-$3IFi-eA(h+E*Pn&5x_Vp+??&y|dr7Y*|)oqrXMzSmS`Kc%` zjB==%yU4H|IBR!!D@X5s)i&0Xa>YZlUUgn1U@?bVJB&ir{NVTK48wAB#m=LUlPdw} zKmPJ;cZm^F3vDu(mgEOm2?xIfGKkFHI1Cor((h#%^?m*?UKwdX$6gujAZFF%2t#$J)dcpU-of|Gi)hdz&ZJ(SBu>F6D~w+6dsA$j%P2*Il|XW8dBO z-9hGa`zAg>gz$w7a@2;lPu{QwVK{daNA5`F@kqKqs11fA#JGd9T+yqqmS6-&PaG&b z`xaC+hru*>UCEbkf#)(b8qbkkXR(w*VpHuU5fXze3IT~P2eWU;fl?L z=dQ{FU`Zo^jw~l~zp-2;I>eKv#eXzlye_crQnqG?Viy0)o|h}e)e>P54blvJ~?6X8g? 
zivoZ0C9g{gJV)Ztg(}5fdeF?awY;GJhm_->BVb6!J&T_M^a<>nC>^oX^6q zT-s-}hae;wYO5tbLolt9H*}a2J3qE%DX$zAP_|?SUma-Fe;zQXlmW6;+R_ zj2RbIT6?Tkj!?)Pkmz1nR8ju)=QL0Y*?;d~u$$pIgcA6dx~*z*+lC`PPcaI3L*(zZ z@9%;}PrP^~^<&Tk=!{L3rRd^Ltp=2F0^&}z(Zc%3XJ82LE9?jgaFS^(T`BqYlA+Yt zwDZasMGPKk-rv7Yq%*TG zWO#?G)BMG&BVxym=P>stZ=)#fhDx!J#_r%I<>W%yc^@^T?I!$jV_5NK5f)$)3erso5kS&kgOo|O%Z&@?m8Gz*fOJZQ%nfMohaaRSfthLIq; z#k$P(KF3G5+$@g@1WffhImuH}<}u)&2N8T;r!$&y06_cx|GvkAb?N%0lp%J?6RKrf zO$<}am}5R|_aHR6D0Jo$hw$a!=ax*y3ZCp6GVx!~R9w@ba4Wt`bBb^DOD#A-seJOi z@c^=821%cLG*?JNz`}Xdq6+b;NlNei?f4QpGtnb-`F9OT8h2~{bKs2^PVW}^->$Wq zE0YZJFCReqHqk~?Po}|Si!@s#&Mm?`P-_CB>2c+yuh8!+Z}Ni)ilb|nlixR-AKZhg>)UzaoYIMdP+{kh zm1P3@kuXe~Yvl8GV%FL8ezTC!485OEbhvL71v2iH$Y}lN34g6Mi z(Kg#OA0H!(+Max5`JaDDkR1wTyNorQ2r+zdQg|uaaN=-z*AeK4n_socz{VS44W+2# z7tPF?@9z*36GPirEKCpZn`l6PRVAnxOUQeBJdEfd2wEs{{ zr1JV0Z~obAv`3I^&(+x8FZ0kSQaUp$3kUvpEmWSrHse(x@nGTmFZZ;xzm%*KIv-WcUdL8IW9*jQeW z_DA%7AAp_dGqE1BqEBv#yJBuQML~wT=_?yfcS><66Z#|jSn=yAt07df0v7Q zuKlJ#kf`0@Rxhl&uhLB~z9egWD8YLm;74ogC9>B-{7{Nd~J+7vnuo8}yp>we7!_v>L#an;VkqmDnWA2v@G zaK%G|=G#EMQiQ&vK5s-+y}V=@#jGNsVP$H1y2k8JjjWe1$pF&^S|32xw3l8JfsuSK zHg>2maD8!{ng*l>x+$T@RW+JBJ0H=veN74PhbygtHxT=Jc@4bV8jy!Tr!^E*@l{E% zpbDKYp#06V@Gd@g*2j!nyo0w)Ecl-HlbMb&2-J=4%Id`jZjQE(1l6{>SCQl5p(E%L<(Dp5(qQW{x&BLP*oz{Rxa%y^ax)`q< zir#klro(p?W}3@@>?C=)TQ2^H(}VEUUhFm)W5^J1Kzd(uw!8p6{bIViElLJ&iQr$7 zUTXdNmAksS+R)m1wj(Zb|Kk4uc*caz6Ul(%XH^uxUx5U7+9A(;#?!SORGY>P9rk>) zw-lg@%wgzE^Wjsj+mUv4mKonWDB=y=RiQXTnT*r;!Tx5$~Fx@ zVU-KJN;u2$Li(IgCVR!=Jqxe8?<-Wu+0EC6h$xsdZ+1+XqpnCoL=N)h$Jo|j4Ngl> z|LE8Ss1oQ~eahVY_RpW^rH>qW43Gfezo9(HM~@ygcXVU|18uf%*4uw)dkby*CJrPY zw9F1|*!3=K91{jE2dIxK16JNiH7QDc)nzpV30_XcwhwUUK`YR!3e3=6-G#3$EkS2q z8V|hNm{E6fdO7M<2(2jgI@)4ZchnyjPhRfXarM1;ZHqm&c3C50UZio_Ve?kGX-%%y zWY`A|=!XMP`N8jfGLMIXciun0eYp}iDv9e|8_%JeFs>ux* z!i3~6(O{jnmO1eAMrxyZM}fid9>gkEyZz`;p`I-TA(rfLF%k$p;{l=p8WzNDG|7vY zeFp@6PYMcXE5Ls}_r*0pira-S=*>3{MZorj!gQpBtv+!5TT0LtRc-h z)AmYVAQyJIJm&b{6b=wJJZL(G)TMz=1@0IcMeKiY$c%Q*=iE7d_(M<)r>{;}>)Im_ z0QUO*()EJ_8VLCkS57*X1#o_SY!(gcIrpdB)!QdF7pz(4AVr%7xEN*S?hs%gD1P=1 zGe4IH8A`Mj=}z(Tj&@g$e)YKnNXT8$|GC4 z{OANAs*!tUh3APZYS3pj`}5}g$l!g)H9+Mg>1-Mv>aM0T~5PyvIDALf3?~%xEyX%@?g&nvSB&R3J6Gi}& z!@`|w4{sS_FA+1k3(vj@19lSyA2w^|$l@29#2lcc@B$4B2M}z-1p5?uSEz-VxyXQvoydG%{sPD>{A(yHanTebQ^x`W18Y~#{CfZ$@4EAfGY3@80ci5R z;}t{W2VY*+?`^+fVt-l8dCu1I$GkqV+Vd~ciEiwL_TPjIkWxXdBNfVeXhW*Aryf(( zZ#Y94QidwN_AcMr(zf%C#uD1rs^kqB0l^kjOcuy>A~W1vymp@00j(i<1#RzK`o=|d zcKjkaddv&@)(+aiXj}C7y50B$T2nj$2p6hb0Umy6cAj1BNcM?>#5V~$evQwR{h6Re ziTFb!xnad`ybL+hGb5kMpVfKDxaii$Ka64ct&v2vHlcfc6ED_k&40GbKj7P`i>H(W zjpIHzOVnfwwpWm>qxdY1Q|@gOwsAK<9t$a-+|`MJd=ob0SHBunR8nh-&>C|GYg@9- z&KbOoyE16wKHF?@=9Bqjmik`5-R=(Qob0TOdtP^L(3b3Qw*BryW$$VcnW$}2T_o7< zE1vpCQTCvvZz`_?T7v}yhr9P=_M5q)Tz?7(R}L2x6o>=J7*0(}a#zRhp(qbl+MzZ; zpIL0&ojw6`5YopC*wtmirax0h5lpR{Bmk%P_DKgEN)VKyp~}njt108R{z$L{W*##z zGi!i?bay|UK!0JA@&_Y*rJImdgJ6&XLTrPSZ(2%n(uei{Y@lWR8c|Ju?Zc6n@p0O} zNugsl`gXRgQTi&hU{T%xcHfQJz@3fji(gMXQGOQfs!C6@hX7(9G8u-R-KgpXZHCbr zIL$W^ zp$2A~1s#L@u^uGNq4iJR6&y5q(w>V45WU!i`qX1v?!0}=F3>~|-3Bz&O^Yru0L)UI z*m#Av@f4ghqvPV3!T=pW0V~F--(7MOH7|D&gFZ4dl%HLLdvkVYTP$EMn5!QGvr51X zE?%ajG5E+_5i~^VM=!G1H%~hadO*O_TvgDJ5rP=KPuHcmH>(jVDY51?nE?5#Ps6RLX zA~WbnmIr`JQ0=>X9My?qqlT7$Y}7+0Ak1S`p4{o{-3rn2Pr1Urvv~k>=$cpdp@a0L zT_^d}@<{eC#$2B>z4xm6g8PEavBNJ@!@~x-4YUZ23^Dj@KKtdR9~koy;@?P*Yu?OH zzrl*3$}!I4r}d(-;|aB<>PRoGn^`v<)jw~QdwY(}(T{m9Ube3z-m*lpZ=xG2^F~f7r*NEC6!+8DTP6v z9rg|HP~&!~Kb!3329;42)72mQUr{j@unRW#3`NX%(9zM!1G(@`?=l=VhanqrP~~S{ zVt?LqHTkOiiuJ^;Ya5IkHjk~dB*iyg@+PW&cncA&RTSc8K9$?`2I59(-B`gCpMyKA 
z{XfvzyY|C1R5{r!lx}QhUW4kft$dZOL9ly}eINz*hSL~D{r1E@YkLv|A;M=x5K=;E z1fkz|0YQYQn?ZqbAQcb1eX0bA>J2P0g2l(9_^pT86co}Qj{U2zHnc+c}HQdRa{2yKKABMDC5y;rBVR{ENt1{xD* z9Uwp2n6$CJ4u+Ut&~W_x`SSu!Z8=|HN|h-cu55#Qrqba(fs(w(;@H=zt~axH6@Rw3 zo5nc6DtlLvvrk?(m!w#IDp%Pt@GD+$LB-McQEruQnXImJ11i1l`}fl@+|U%15^beu zenn~-j4xj}Y=IT%f#OQH2MY|N+FGsFTAFe*oPcS*7GMMJ%$qxaz>834jpN8Ms@nH%Q*&1 z#nX%LYU5_>6}w+g&yMCE(cX?3uF-CNN8ttDkSvmz1pO_v!z4$a{^JtyFOIPxEYa&7h7s};#hvUQ8GwogrHy6pe zYQ8?}>zLSKm0y85sT&WK6SrsD%NquOkM2q^x5vb#TTesaJ0^0QiZ%Z4Bp9M!R9%YK%IqAz6zH| z=j~$(wK?_Nt>10h+-2fQwT6p1u9rGMtotpJPyW$xoc5=fO{Kb6m#*#1I1ju-HQPZm yH0avaFH3{}zh_|`Jk#o!@=)VE{iMlrc From 6006f88179805c57137abbaa4b6ad338a078fe0f Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:03:12 +0200 Subject: [PATCH 033/108] Add hyperlinks to User Guide for regression metrics (#666) --- .../linear_regression_in_sklearn.py | 27 ++++++++++--------- python_scripts/metrics_regression.py | 20 +++++++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/python_scripts/linear_regression_in_sklearn.py b/python_scripts/linear_regression_in_sklearn.py index 199b0d976..4e9dc8303 100644 --- a/python_scripts/linear_regression_in_sklearn.py +++ b/python_scripts/linear_regression_in_sklearn.py @@ -9,12 +9,12 @@ # # Linear regression using scikit-learn # # In the previous notebook, we presented the parametrization of a linear model. -# During the exercise, you saw that varying parameters will give different -# models that will fit better or worse the data. To evaluate quantitatively this +# During the exercise, you saw that varying parameters gives different models +# that may fit better or worse the data. To evaluate quantitatively this # goodness of fit, you implemented a so-called metric. # -# When doing machine learning, you are interested in selecting the model which -# will minimize the error on the data available the most. From the previous +# When doing machine learning, one is interested in selecting the model which +# minimizes the error on the data available the most. From the previous # exercise, we could implement a brute-force approach, varying the weights and # intercept and select the model with the lowest error. # @@ -46,9 +46,8 @@ linear_regression.fit(data, target) # %% [markdown] -# The instance `linear_regression` will store the parameter values in the -# attributes `coef_` and `intercept_`. We can check what the optimal model found -# is: +# The instance `linear_regression` stores the parameter values in the attributes +# `coef_` and `intercept_`. We can check what the optimal model found is: # %% weight_flipper_length = linear_regression.coef_[0] @@ -59,7 +58,7 @@ intercept_body_mass # %% [markdown] -# We will use the weight and intercept to plot the model found using the +# We can use the weight and intercept to plot the model found using the # scikit-learn. # %% @@ -80,9 +79,12 @@ # %% [markdown] # In the solution of the previous exercise, we implemented a function to compute -# the goodness of fit of a model. Indeed, we mentioned two metrics: (i) the mean -# squared error and (ii) the mean absolute error. These metrics are implemented -# in scikit-learn and we do not need to use our own implementation. +# the goodness of fit of a model. 
Indeed, we mentioned two metrics: (i) the +# [mean squared +# error](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-squared-error) +# and (ii) the [mean absolute +# error](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-error). +# Let's see how to use the implementations from scikit-learn in the following. # # We can first compute the mean squared error. @@ -97,7 +99,7 @@ # A linear regression model minimizes the mean squared error on the training # set. This means that the parameters obtained after the fit (i.e. `coef_` and # `intercept_`) are the optimal parameters that minimizes the mean squared -# error. In other words, any other choice of parameters will yield a model with +# error. In other words, any other choice of parameters would yield a model with # a higher mean squared error on the training set. # # However, the mean squared error is difficult to interpret. The mean absolute @@ -115,7 +117,6 @@ # ยฑ 313 grams when predicting the body mass of a penguin given its flipper # length. - # %% [markdown] # In this notebook, you saw how to train a linear regression model using # scikit-learn. diff --git a/python_scripts/metrics_regression.py b/python_scripts/metrics_regression.py index 0020ced3b..494447732 100644 --- a/python_scripts/metrics_regression.py +++ b/python_scripts/metrics_regression.py @@ -8,7 +8,7 @@ # %% [markdown] # # Regression # -# In this notebook, we will present the metrics that can be used in regression. +# In this notebook, we present the metrics that can be used in regression. # # A set of metrics are dedicated to regression. Indeed, classification metrics # cannot be used to evaluate the generalization performance of regression models @@ -16,9 +16,9 @@ # it is a continuous variable in regression, while a discrete variable in # classification. # -# We will use the Ames housing dataset. The goal is to predict the price of -# houses in the city of Ames, Iowa. As with classification, we will only use a -# single train-test split to focus solely on the regression metrics. +# We use the Ames housing dataset. The goal is to predict the price of houses in +# the city of Ames, Iowa. As with classification, we only use a single +# train-test split to focus solely on the regression metrics. # %% import pandas as pd @@ -53,7 +53,7 @@ # error (MSE). Thus, this metric is sometimes used to evaluate the model since # it is optimized by said model. # -# We will give an example using a linear regression model. +# We give an example using a linear regression model. # %% from sklearn.linear_model import LinearRegression @@ -86,8 +86,10 @@ # %% [markdown] # The raw MSE can be difficult to interpret. One way is to rescale the MSE by # the variance of the target. This score is known as the $R^2$ also called the -# coefficient of determination. Indeed, this is the default score used in -# scikit-learn by calling the method `score`. +# [coefficient of +# determination](https://scikit-learn.org/stable/modules/model_evaluation.html#r2-score-the-coefficient-of-determination). +# Indeed, this is the default score used in scikit-learn by calling the method +# `score`. 
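# %% [markdown]
# A minimal sketch of that equivalence, reusing the fitted `regressor` and the
# `data_test` and `target_test` objects from the cells above: the $R^2$ score
# equals one minus the mean squared error rescaled by the (biased) variance of
# the target.

# %%
import numpy as np

from sklearn.metrics import mean_squared_error, r2_score

target_predicted = regressor.predict(data_test)
mse = mean_squared_error(target_test, target_predicted)
# rescaling the MSE by the variance of the target recovers the R^2 score
print(1 - mse / np.var(target_test))
print(r2_score(target_test, target_predicted))  # same value as `score` below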
# %% regressor.score(data_test, target_test) @@ -278,15 +280,13 @@ "Mean absolute error: " f"{mean_absolute_error(target_test, target_predicted):.3f} k$" ) - print( "Median absolute error: " f"{median_absolute_error(target_test, target_predicted):.3f} k$" ) - print( "Mean absolute percentage error: " - f"{mean_absolute_percentage_error(target_test, target_predicted) * 100:.3f} %" + f"{mean_absolute_percentage_error(target_test, target_predicted):.2%}" ) # %% [markdown] From e14bc541abcb7cef19c788ab33bb83da476f6bcd Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:05:33 +0200 Subject: [PATCH 034/108] Add mention to OneHotEncoder parameter `infrequent_if_exist` (#667) --- python_scripts/03_categorical_pipeline.py | 161 ++++++++++++---------- 1 file changed, 86 insertions(+), 75 deletions(-) diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index 17c3ddc2b..5acdefc82 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -8,9 +8,9 @@ # %% [markdown] # # Encoding of categorical variables # -# In this notebook, we will present typical ways of dealing with **categorical -# variables** by encoding them, namely **ordinal encoding** and **one-hot -# encoding**. +# In this notebook, we will present typical ways of dealing with +# **categorical variables** by encoding them, namely **ordinal encoding** and +# **one-hot encoding**. # %% [markdown] # Let's first load the entire adult dataset containing both numerical and @@ -32,23 +32,25 @@ # # ## Identify categorical variables # -# As we saw in the previous section, a numerical variable is a quantity -# represented by a real or integer number. These variables can be naturally -# handled by machine learning algorithms that are typically composed of a -# sequence of arithmetic instructions such as additions and multiplications. +# As we saw in the previous section, a numerical variable is a +# quantity represented by a real or integer number. These variables can be +# naturally handled by machine learning algorithms that are typically composed +# of a sequence of arithmetic instructions such as additions and +# multiplications. # -# In contrast, categorical variables have discrete values, typically represented -# by string labels (but not only) taken from a finite list of possible choices. -# For instance, the variable `native-country` in our dataset is a categorical -# variable because it encodes the data using a finite list of possible countries -# (along with the `?` symbol when this information is missing): +# In contrast, categorical variables have discrete values, typically +# represented by string labels (but not only) taken from a finite list of +# possible choices. For instance, the variable `native-country` in our dataset +# is a categorical variable because it encodes the data using a finite list of +# possible countries (along with the `?` symbol when this information is +# missing): # %% data["native-country"].value_counts().sort_index() # %% [markdown] -# How can we easily recognize categorical columns among the dataset? Part of the -# answer lies in the columns' data type: +# How can we easily recognize categorical columns among the dataset? Part of +# the answer lies in the columns' data type: # %% data.dtypes @@ -61,8 +63,8 @@ # # In the previous notebook, we manually defined the numerical columns. We could # do a similar approach. 
Instead, we will use the scikit-learn helper function -# `make_column_selector`, which allows us to select columns based on their data -# type. We will illustrate how to use this helper. +# `make_column_selector`, which allows us to select columns based on +# their data type. We will illustrate how to use this helper. # %% from sklearn.compose import make_column_selector as selector @@ -95,8 +97,9 @@ # ### Encoding ordinal categories # # The most intuitive strategy is to encode each category with a different -# number. The `OrdinalEncoder` will transform the data in such manner. We will -# start by encoding a single column to understand how the encoding works. +# number. The `OrdinalEncoder` will transform the data in such manner. +# We will start by encoding a single column to understand how the encoding +# works. # %% from sklearn.preprocessing import OrdinalEncoder @@ -130,37 +133,37 @@ # independently. We also note that the number of features before and after the # encoding is the same. # -# However, be careful when applying this encoding strategy: using this integer -# representation leads downstream predictive models to assume that the values -# are ordered (0 < 1 < 2 < 3... for instance). +# However, be careful when applying this encoding strategy: +# using this integer representation leads downstream predictive models +# to assume that the values are ordered (0 < 1 < 2 < 3... for instance). # # By default, `OrdinalEncoder` uses a lexicographical strategy to map string -# category labels to integers. This strategy is arbitrary and often meaningless. -# For instance, suppose the dataset has a categorical variable named `"size"` -# with categories such as "S", "M", "L", "XL". We would like the integer -# representation to respect the meaning of the sizes by mapping them to -# increasing integers such as `0, 1, 2, 3`. However, the lexicographical -# strategy used by default would map the labels "S", "M", "L", "XL" to 2, 1, 0, -# 3, by following the alphabetical order. +# category labels to integers. This strategy is arbitrary and often +# meaningless. For instance, suppose the dataset has a categorical variable +# named `"size"` with categories such as "S", "M", "L", "XL". We would like the +# integer representation to respect the meaning of the sizes by mapping them to +# increasing integers such as `0, 1, 2, 3`. +# However, the lexicographical strategy used by default would map the labels +# "S", "M", "L", "XL" to 2, 1, 0, 3, by following the alphabetical order. # -# The `OrdinalEncoder` class accepts a `categories` constructor argument to pass -# categories in the expected ordering explicitly. You can find more information -# in the [scikit-learn -# documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features) +# The `OrdinalEncoder` class accepts a `categories` constructor argument to +# pass categories in the expected ordering explicitly. You can find more +# information in the +# [scikit-learn documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features) # if needed. # -# If a categorical variable does not carry any meaningful order information then -# this encoding might be misleading to downstream statistical models and you -# might consider using one-hot encoding instead (see below). 
+# If a categorical variable does not carry any meaningful order information +# then this encoding might be misleading to downstream statistical models and +# you might consider using one-hot encoding instead (see below). # # ### Encoding nominal categories (without assuming any order) # -# `OneHotEncoder` is an alternative encoder that prevents the downstream models -# to make a false assumption about the ordering of categories. For a given -# feature, it will create as many new columns as there are possible categories. -# For a given sample, the value of the column corresponding to the category will -# be set to `1` while all the columns of the other categories will be set to -# `0`. +# `OneHotEncoder` is an alternative encoder that prevents the downstream +# models to make a false assumption about the ordering of categories. For a +# given feature, it will create as many new columns as there are possible +# categories. For a given sample, the value of the column corresponding to the +# category will be set to `1` while all the columns of the other categories +# will be set to `0`. # # We will start by encoding a single feature (e.g. `"education"`) to illustrate # how the encoding works. @@ -178,8 +181,8 @@ # namely easier visualization of the data. # # Sparse matrices are efficient data structures when most of your matrix -# elements are zero. They won't be covered in detail in this course. If you want -# more details about them, you can look at +# elements are zero. They won't be covered in detail in this course. If you +# want more details about them, you can look at # [this](https://scipy-lectures.org/advanced/scipy_sparse/introduction.html#why-sparse-matrices). # ``` @@ -205,8 +208,8 @@ # Look at how the `"workclass"` variable of the 3 first records has been encoded # and compare this to the original string representation. # -# The number of features after the encoding is more than 10 times larger than in -# the original data because some variables such as `occupation` and +# The number of features after the encoding is more than 10 times larger than +# in the original data because some variables such as `occupation` and # `native-country` have many possible categories. # %% [markdown] @@ -217,26 +220,26 @@ # %% [markdown] # ```{note} -# In general `OneHotEncoder` is the encoding strategy used when the downstream -# models are **linear models** while `OrdinalEncoder` is often a good strategy -# with **tree-based models**. +# In general `OneHotEncoder` is the encoding strategy used when the +# downstream models are **linear models** while `OrdinalEncoder` is often a +# good strategy with **tree-based models**. # ``` # %% [markdown] -# Using an `OrdinalEncoder` will output ordinal categories. This means that -# there is an order in the resulting categories (e.g. `0 < 1 < 2`). The impact -# of violating this ordering assumption is really dependent on the downstream -# models. Linear models will be impacted by misordered categories while -# tree-based models will not. +# +# Using an `OrdinalEncoder` will output ordinal categories. This means +# that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The +# impact of violating this ordering assumption is really dependent on the +# downstream models. Linear models will be impacted by misordered categories +# while tree-based models will not. 
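# %% [markdown]
# A minimal sketch of how to control this ordering, reusing the hypothetical
# `"size"` column discussed above (it is not part of the adult census dataset):

# %%
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder

sizes = pd.DataFrame({"size": ["S", "XL", "M", "L"]})
# default lexicographical ordering: "L" < "M" < "S" < "XL"
print(OrdinalEncoder().fit_transform(sizes).ravel())
# explicit, meaningful ordering: "S" < "M" < "L" < "XL"
encoder = OrdinalEncoder(categories=[["S", "M", "L", "XL"]])
print(encoder.fit_transform(sizes).ravel())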
# # You can still use an `OrdinalEncoder` with linear models but you need to be # sure that: # - the original categories (before encoding) have an ordering; # - the encoded categories follow the same ordering than the original # categories. -# -# The **next exercise** shows what can happen when using an `OrdinalEncoder` -# with a liner model and the conditions above are not met. +# The **next exercise** highlights the issue of misusing `OrdinalEncoder` with +# a linear model. # # One-hot encoding categorical variables with high cardinality can cause # computational inefficiency in tree-based models. Because of this, it is not @@ -249,8 +252,8 @@ # # We can now integrate this encoder inside a machine learning pipeline like we # did with numerical data: let's train a linear classifier on the encoded data -# and check the generalization performance of this machine learning pipeline -# using cross-validation. +# and check the generalization performance of this machine learning pipeline using +# cross-validation. # # Before we create the pipeline, we have to linger on the `native-country`. # Let's recall some statistics regarding this column. @@ -259,26 +262,34 @@ data["native-country"].value_counts() # %% [markdown] -# We see that the `Holand-Netherlands` category is occurring rarely. This will +# We see that the `"Holand-Netherlands"` category is occurring rarely. This will # be a problem during cross-validation: if the sample ends up in the test set # during splitting then the classifier would not have seen the category during # training and will not be able to encode it. # -# In scikit-learn, there are two solutions to bypass this issue: +# In scikit-learn, there are some possible solutions to bypass this issue: # -# * list all the possible categories and provide it to the encoder via the -# keyword argument `categories`; -# * use the parameter `handle_unknown`, i.e. if an unknown category is +# * list all the possible categories and provide them to the encoder via the +# keyword argument `categories` instead of letting the estimator automatically +# determine them from the training data when calling fit; +# * set the parameter `handle_unknown="ignore"`, i.e. if an unknown category is # encountered during transform, the resulting one-hot encoded columns for this -# feature will be all zeros. +# feature will be all zeros; +# * adjust the `min_frequency` parameter to collapse the rarest categories +# observed in the training data into a single one-hot encoded feature. If you +# enable this option, you can also set `handle_unknown="infrequent_if_exist"` +# to encode the unknown categories (categories only observed at predict time) +# as ones in that last column. # -# Here, we will use the latter solution for simplicity. +# In this notebook we only explore the second option, namely +# `OneHotEncoder(handle_unknown="ignore")`. Feel free to evaluate the +# alternatives on your own, for instance using a sandbox notebook. # %% [markdown] # ```{tip} -# Be aware the `OrdinalEncoder` exposes as well a parameter `handle_unknown`. It -# can be set to `use_encoded_value`. If that option is chosen, you can define a -# fixed value to which all unknowns will be set to during `transform`. For +# Be aware the `OrdinalEncoder` exposes a parameter also named `handle_unknown`. +# It can be set to `use_encoded_value`. If that option is chosen, you can define +# a fixed value to which all unknowns will be set to during `transform`. 
For # example, `OrdinalEncoder(handle_unknown='use_encoded_value', # unknown_value=42)` will set all values encountered during `transform` to `42` # which are not part of the data encountered during the `fit` call. You are @@ -299,10 +310,10 @@ # %% [markdown] # ```{note} # Here, we need to increase the maximum number of iterations to obtain a fully -# converged `LogisticRegression` and silence a `ConvergenceWarning`. Contrary to -# the numerical features, the one-hot encoded categorical features are all on -# the same scale (values are 0 or 1), so they would not benefit from scaling. In -# this case, increasing `max_iter` is the right thing to do. +# converged `LogisticRegression` and silence a `ConvergenceWarning`. Contrary +# to the numerical features, the one-hot encoded categorical features are all +# on the same scale (values are 0 or 1), so they would not benefit from +# scaling. In this case, increasing `max_iter` is the right thing to do. # ``` # %% [markdown] @@ -320,9 +331,9 @@ print(f"The accuracy is: {scores.mean():.3f} ยฑ {scores.std():.3f}") # %% [markdown] -# As you can see, this representation of the categorical variables is slightly -# more predictive of the revenue than the numerical variables that we used -# previously. +# As you can see, this representation of the categorical variables is +# slightly more predictive of the revenue than the numerical variables +# that we used previously. # %% [markdown] # From 27bd570161ce3a877f7a3c329a5af11484fad4c6 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:06:27 +0200 Subject: [PATCH 035/108] MAINT Rework "Regularization of linear regression model" notebook (#698) --- .../linear_models_regularization.py | 357 +++++++++++------- 1 file changed, 214 insertions(+), 143 deletions(-) diff --git a/python_scripts/linear_models_regularization.py b/python_scripts/linear_models_regularization.py index e97afc253..7718875a8 100644 --- a/python_scripts/linear_models_regularization.py +++ b/python_scripts/linear_models_regularization.py @@ -8,19 +8,18 @@ # %% [markdown] # # Regularization of linear regression model # -# In this notebook, we will see the limitations of linear regression models and -# the advantage of using regularized models instead. +# In this notebook, we explore some limitations of linear regression models and +# demonstrate the benefits of using regularized models instead. Additionally, we +# discuss the importance of scaling the data when working with regularized +# models, especially when tuning the regularization parameter. # -# Besides, we will also present the preprocessing required when dealing with -# regularized models, furthermore when the regularization parameter needs to be -# tuned. -# -# We will start by highlighting the over-fitting issue that can arise with a +# We start by highlighting the problem of overfitting that can occur with a # simple linear regression model. # # ## Effect of regularization # -# We will first load the California housing dataset. +# We load the Ames housing dataset. We retain some specific +# `features_of_interest`. 
# %% [markdown] # ```{note} @@ -29,22 +28,31 @@ # ``` # %% -from sklearn.datasets import fetch_california_housing +import pandas as pd -data, target = fetch_california_housing(as_frame=True, return_X_y=True) -target *= 100 # rescale the target in k$ -data.head() +ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv") +features_of_interest = [ + "LotFrontage", + "LotArea", + "PoolArea", + "YearBuilt", + "YrSold", +] +target_name = "SalePrice" +data, target = ( + ames_housing[features_of_interest], + ames_housing[target_name], +) # %% [markdown] # In one of the previous notebook, we showed that linear models could be used -# even in settings where `data` and `target` are not linearly linked. -# -# We showed that one can use the `PolynomialFeatures` transformer to create -# additional features encoding non-linear interactions between features. +# even when there is no linear relationship between the `data` and `target`. +# For instance, one can use the `PolynomialFeatures` transformer to create +# additional features that capture some non-linear interactions between them. # -# Here, we will use this transformer to augment the feature space. Subsequently, -# we will train a linear regression model. We will use the out-of-sample test -# set to evaluate the generalization capabilities of our model. +# Here, we use this transformer to augment the feature space. Subsequently, we +# train a linear regression model. We use the out-of-sample test set to evaluate +# the generalization capabilities of our model. # %% from sklearn.model_selection import cross_validate @@ -73,45 +81,48 @@ train_error = -cv_results["train_score"] print( "Mean squared error of linear regression model on the train set:\n" - f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}" + f"{train_error.mean():.2e} ยฑ {train_error.std():.2e}" ) # %% test_error = -cv_results["test_score"] print( "Mean squared error of linear regression model on the test set:\n" - f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}" + f"{test_error.mean():.2e} ยฑ {test_error.std():.2e}" ) # %% [markdown] -# The score on the training set is much better. This generalization performance -# gap between the training and testing score is an indication that our model -# overfitted our training set. +# The training score is much better than the testing score. Such a gap between +# the training and testing scores is an indication that our model overfitted the +# training set. Indeed, this is one of the dangers when augmenting the number of +# features with a `PolynomialFeatures` transformer. For instance, one does not +# expect features such as `PoolArea * YrSold` to be very predictive. # -# Indeed, this is one of the danger when augmenting the number of features with -# a `PolynomialFeatures` transformer. Our model will focus on some specific -# features. We can check the weights of the model to have a confirmation. Let's -# create a dataframe: the columns will contain the name of the feature while the -# line the coefficients values stored by each model during the cross-validation. +# We can create a dataframe to check the weights of the model: the columns +# contain the name of the features whereas the rows store the coefficients values +# of each model during the cross-validation. # -# Since we used a `PolynomialFeatures` to augment the data, we will create -# feature names representative of the feature combination. Scikit-learn provides -# a `get_feature_names_out` method for this purpose. 
First, let's get the first -# fitted model from the cross-validation. +# Since we used a `PolynomialFeatures` to augment the data, we extract the +# feature names representative of each feature combination. Scikit-learn +# provides a `feature_names_in_` method for this purpose. First, let's get the +# first fitted model from the cross-validation. # %% model_first_fold = cv_results["estimator"][0] +model_first_fold # %% [markdown] -# Now, we can access to the fitted `PolynomialFeatures` to generate the feature -# names: +# Now, we can access the fitted `LinearRegression` (step `-1` i.e. the last step +# of the model) to recover the feature names. # %% feature_names = model_first_fold[-1].feature_names_in_ feature_names # %% [markdown] -# Finally, we can create the dataframe containing all the information. +# The following code creates a list by iterating through the estimators and +# querying their last step for the learned `coef_`. We can then create the +# dataframe containing all the information. # %% import pandas as pd @@ -126,50 +137,60 @@ import matplotlib.pyplot as plt color = {"whiskers": "black", "medians": "black", "caps": "black"} -weights_linear_regression.plot.box(color=color, vert=False, figsize=(6, 16)) -_ = plt.title("Linear regression coefficients") +fig, ax = plt.subplots(figsize=(8, 12)) +weights_linear_regression.plot.box(color=color, vert=False, ax=ax) +_ = ax.set( + title="Linear regression weights", + xscale="symlog", +) # %% [markdown] +# Notice that we use a (symmetric) log scale for the bar plot. Observe that some +# coefficents are extremely large while others are extremely small. Furthermore, +# the coefficient values can be very unstable accross cross-validation folds. +# # We can force the linear regression model to consider all features in a more -# homogeneous manner. In fact, we could force large positive or negative weight -# to shrink toward zero. This is known as regularization. We will use a ridge -# model which enforces such behavior. +# homogeneous manner. In fact, we could force large positive or negative +# weights to shrink toward zero. This is known as regularization. We use a +# ridge model which enforces such behavior. # %% from sklearn.linear_model import Ridge ridge = make_pipeline( - PolynomialFeatures(degree=2, include_bias=False), Ridge(alpha=100) + PolynomialFeatures(degree=2, include_bias=False), + Ridge(alpha=100, solver="cholesky"), ) cv_results = cross_validate( ridge, data, target, - cv=10, + cv=20, scoring="neg_mean_squared_error", return_train_score=True, return_estimator=True, ) # %% [markdown] -# The code cell above will generate a couple of warnings because the features -# included both extremely large and extremely small values, which are causing -# numerical problems when training the predictive model. +# The code cell above can generate a couple of warnings (depending on the +# choice of solver) because the features included both extremely large and +# extremely small values, which are causing numerical problems when training +# the predictive model. We will get to that in a bit. # -# We can explore the train and test scores of this model. +# Let us evaluate the train and test scores of this model. 
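# %% [markdown]
# A short aside on the sign flip used in the next cells: scikit-learn scorers
# follow a "greater is better" convention, so error metrics such as the mean
# squared error are exposed as negated scores and must be negated back to
# recover the error. A minimal sketch, reusing the `ridge` pipeline and the
# `data` and `target` objects assumed to be defined above:

# %%
from sklearn.model_selection import cross_val_score

neg_mse_scores = cross_val_score(
    ridge, data, target, cv=5, scoring="neg_mean_squared_error"
)
# negate the scores to recover the mean squared error itself
print(-neg_mse_scores.mean())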
# %% train_error = -cv_results["train_score"] print( - "Mean squared error of linear regression model on the train set:\n" - f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}" + "Mean squared error of ridge model on the train set:\n" + f"{train_error.mean():.2e} ยฑ {train_error.std():.2e}" ) # %% test_error = -cv_results["test_score"] print( - "Mean squared error of linear regression model on the test set:\n" - f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}" + "Mean squared error of ridge model on the test set:\n" + f"{test_error.mean():.2e} ยฑ {test_error.std():.2e}" ) # %% [markdown] @@ -182,58 +203,78 @@ weights_ridge = pd.DataFrame(coefs, columns=feature_names) # %% -weights_ridge.plot.box(color=color, vert=False, figsize=(6, 16)) -_ = plt.title("Ridge weights") +fig, ax = plt.subplots(figsize=(8, 12)) +weights_ridge.plot.box(color=color, vert=False, ax=ax) +_ = ax.set( + title="Ridge weights", + xscale="symlog", +) # %% [markdown] -# By comparing the magnitude of the weights on this plot compared to the -# previous plot, we see that a ridge model will enforce all weights to have a -# similar magnitude, while the overall magnitude of the weights is shrunk +# By comparing the order of magnitude of the weights on this plot with respect +# to the previous plot, we see that a ridge model enforces all weights to lay in +# a more similar scale, while the overall magnitude of the weights is shrunk # towards zero with respect to the linear regression model. # -# However, in this example, we omitted two important aspects: (i) the need to -# scale the data and (ii) the need to search for the best regularization -# parameter. +# You can observe that the coefficients are still unstable from one fold to +# another, and finally, the results can vary a lot depending on the choice of +# the solver (for instance try to set `solver="saga"` or `solver="lsqr"` instead +# of `solver="cholesky"` and re-run the above cells). +# +# In the following we will attempt to resolve those remaining problems, by +# focusing on two important aspects we omitted so far: +# - the need to scale the data, and +# - the need to search for the best regularization parameter. # # ## Feature scaling and regularization # -# On the one hand, weights define the link between feature values and the -# predicted target. On the other hand, regularization adds constraints on the -# weights of the model through the `alpha` parameter. Therefore, the effect that -# feature rescaling has on the final weights also interacts with regularization. +# On the one hand, weights define the association between feature values and the +# predicted target, which depends on the scales of both the feature values and +# the target. On the other hand, regularization adds constraints on the weights +# of the model through the `alpha` parameter. Therefore, the effect that feature +# rescaling has on the final weights also interacts with the use of +# regularization. # # Let's consider the case where features live on the same scale/units: if two -# features are found to be equally important by the model, they will be affected -# similarly by regularization strength. -# -# Now, let's consider the scenario where features have completely different data -# scale (for instance age in years and annual revenue in dollars). If two -# features are as important, our model will boost the weights of features with -# small scale and reduce the weights of features with high scale. -# -# We recall that regularization forces weights to be closer. 
Therefore, we get -# an intuition that if we want to use regularization, dealing with rescaled data -# would make it easier to find an optimal regularization parameter and thus an -# adequate model. -# -# As a side note, some solvers based on gradient computation are expecting such -# rescaled data. Unscaled data will be detrimental when computing the optimal -# weights. Therefore, when working with a linear model and numerical data, it is -# generally good practice to scale the data. -# -# Thus, we will add a `StandardScaler` in the machine learning pipeline. This -# scaler will be placed just before the regressor. +# features are found to be equally important by the model, they are affected +# similarly by the regularization strength. +# +# Now, let's consider the scenario where two features have completely different +# data scales (for instance age in years and annual revenue in dollars). Let's +# also assume that both features are approximately equally predictive and are +# not too correlated. Fitting a linear regression without scaling and without +# regularization would give a higher weight to the feature with the smallest +# natural scale. If we add regularization, the feature with the smallest natural +# scale would be penalized more than the other feature. This is not desirable +# given the hypothesis that both features are equally important. In such case we +# require the regularization to stay neutral. +# +# In practice, we don't know ahead of time which features are predictive, and +# therefore we want regularization to treat all features approximately equally +# by default. This can be achieved by rescaling the features. +# +# Furthermore, many numerical solvers used internally in scikit-learn behave +# better when features are approximately on the same scale. Heterogeneously +# scaled data can be detrimental when solving for the optimal weights (hence the +# warnings we tend to get when fitting linear models on raw data). Therefore, +# when working with a linear model and numerical data, it is generally a good +# practice to scale the data. +# +# Thus, we add a `MinMaxScaler` in the machine learning pipeline, which scales +# each feature individually such that its range maps into the range between zero +# and one. We place it just before the `PolynomialFeatures` transformer as +# powers of features in the range between zero and one remain in the same range. 
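+
+# %% [markdown]
+# As a quick sanity check of that last claim, we can verify it on a toy array
+# (an illustrative example of ours, not needed for the rest of the notebook):
+
+# %%
+import numpy as np
+
+# Powers of values lying in [0, 1] stay in [0, 1], while values on a larger
+# natural scale explode when squared.
+print(np.array([0.1, 0.5, 1.0]) ** 2)
+print(np.array([10.0, 500.0, 2_000.0]) ** 2)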
# %% -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MinMaxScaler -ridge = make_pipeline( +scaled_ridge = make_pipeline( + MinMaxScaler(), PolynomialFeatures(degree=2, include_bias=False), - StandardScaler(), - Ridge(alpha=0.5), + Ridge(alpha=10, solver="cholesky"), ) cv_results = cross_validate( - ridge, + scaled_ridge, data, target, cv=10, @@ -245,48 +286,57 @@ # %% train_error = -cv_results["train_score"] print( - "Mean squared error of linear regression model on the train set:\n" - f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}" + "Mean squared error of scaled ridge model on the train set:\n" + f"{train_error.mean():.2e} ยฑ {train_error.std():.2e}" ) # %% test_error = -cv_results["test_score"] print( - "Mean squared error of linear regression model on the test set:\n" - f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}" + "Mean squared error of scaled ridge model on the test set:\n" + f"{test_error.mean():.2e} ยฑ {test_error.std():.2e}" ) # %% [markdown] -# We observe that scaling data has a positive impact on the test score and that -# the test score is closer to the train score. It means that our model is less +# We observe that scaling data has a positive impact on the test error: it is +# now both lower and closer to the train error. It means that our model is less # overfitted and that we are getting closer to the best generalization sweet # spot. # +# If you want to try different solvers, you can notice that fitting this +# pipeline no longer generates any warning regardless of such choice. +# Additionally, changing the solver should no longer result in significant +# changes in the weights. +# # Let's have an additional look to the different weights. # %% coefs = [est[-1].coef_ for est in cv_results["estimator"]] -weights_ridge = pd.DataFrame(coefs, columns=feature_names) +weights_ridge_scaled_data = pd.DataFrame(coefs, columns=feature_names) # %% -weights_ridge.plot.box(color=color, vert=False, figsize=(6, 16)) -_ = plt.title("Ridge weights with data scaling") +fig, ax = plt.subplots(figsize=(8, 12)) +weights_ridge_scaled_data.plot.box(color=color, vert=False, ax=ax) +_ = ax.set( + title="Ridge weights with data scaling", + xscale="symlog", +) # %% [markdown] # Compare to the previous plots, we see that now all weight magnitudes are # closer and that all features are more equally contributing. # -# In the previous example, we fixed `alpha=0.5`. We will now check the impact of +# In the previous example, we fixed `alpha=10`. We can now check the impact of # the value of `alpha` by increasing its value. 
# %%
-ridge = make_pipeline(
+ridge_large_alpha = make_pipeline(
+    MinMaxScaler(),
     PolynomialFeatures(degree=2, include_bias=False),
-    StandardScaler(),
-    Ridge(alpha=1_000_000),
+    Ridge(alpha=1_000_000, solver="lsqr"),
 )
 cv_results = cross_validate(
-    ridge,
+    ridge_large_alpha,
     data,
     target,
     cv=10,
@@ -297,15 +347,19 @@
 
 # %%
 coefs = [est[-1].coef_ for est in cv_results["estimator"]]
-weights_ridge = pd.DataFrame(coefs, columns=feature_names)
+weights_ridge_scaled_data = pd.DataFrame(coefs, columns=feature_names)
 
 # %%
-weights_ridge.plot.box(color=color, vert=False, figsize=(6, 16))
-_ = plt.title("Ridge weights with data scaling and large alpha")
+fig, ax = plt.subplots(figsize=(8, 12))
+weights_ridge_scaled_data.plot.box(color=color, vert=False, ax=ax)
+_ = ax.set(
+    title="Ridge weights with data scaling and large alpha",
+    xscale="symlog",
+)
 
 # %% [markdown]
 # Looking specifically to weights values, we observe that increasing the value
-# of `alpha` will decrease the weight values. A negative value of `alpha` would
+# of `alpha` decreases the weight values. A negative value of `alpha` would
 # actually enhance large weights and promote overfitting.
 #
 # ```{note}
@@ -313,25 +367,22 @@
 # generally common to omit scaling when features are encoded with a
 # `OneHotEncoder` since the feature values are already on a similar scale.
 #
-# However, this choice can be questioned since scaling interacts with
-# regularization as well. For instance, scaling categorical features that are
-# imbalanced (e.g. more occurrences of a specific category) would even out the
-# impact of regularization to each category. However, scaling such features in
-# the presence of rare categories could be problematic (i.e. division by a very
+# However, this choice may depend on the scaling method and the use case. For
+# instance, standard scaling categorical features that are imbalanced (e.g. more
+# occurrences of a specific category) would even out the impact of
+# regularization to each category. However, scaling such features in the
+# presence of rare categories could be problematic (i.e. division by a very
 # small standard deviation) and it can therefore introduce numerical issues.
 # ```
 #
-# In the previous analysis, we did not study if the parameter `alpha` will have
-# an effect on the performance. We chose the parameter beforehand and fixed it
-# for the analysis.
-#
-# In the next section, we will check the impact of the regularization parameter
-# `alpha` and how it should be tuned.
+# In the previous analysis, we chose the parameter beforehand and fixed it for
+# the analysis. In the next section, we check how the regularization parameter
+# `alpha` should be tuned.
 #
-# ## Fine tuning the regularization parameter
+# ## Tuning the regularization parameter
 #
 # As mentioned, the regularization parameter needs to be tuned on each dataset.
-# The default parameter will not lead to the optimal model. Therefore, we need
+# The default parameter does not lead to the optimal model. Therefore, we need
 # to tune the `alpha` parameter.
 #
 # Model hyperparameter tuning should be done with care. Indeed, we want to find
@@ -350,26 +401,26 @@
 # these predictors finishes by `CV`. In the case of `Ridge`, scikit-learn
 # provides a `RidgeCV` regressor.
 #
-# Therefore, we can use this predictor as the last step of the pipeline.
-# Including the pipeline a cross-validation allows to make a nested -# cross-validation: the inner cross-validation will search for the best alpha, -# while the outer cross-validation will give an estimate of the testing score. +# Cross-validating a pipeline that contains such predictors allows to make a +# nested cross-validation: the inner cross-validation searches for the best +# alpha, while the outer cross-validation gives an estimate of the testing +# score. # %% import numpy as np from sklearn.linear_model import RidgeCV -alphas = np.logspace(-2, 0, num=21) +alphas = np.logspace(-7, 5, num=100) ridge = make_pipeline( + MinMaxScaler(), PolynomialFeatures(degree=2, include_bias=False), - StandardScaler(), RidgeCV(alphas=alphas, store_cv_values=True), ) # %% from sklearn.model_selection import ShuffleSplit -cv = ShuffleSplit(n_splits=5, random_state=1) +cv = ShuffleSplit(n_splits=50, random_state=0) cv_results = cross_validate( ridge, data, @@ -384,15 +435,15 @@ # %% train_error = -cv_results["train_score"] print( - "Mean squared error of linear regression model on the train set:\n" - f"{train_error.mean():.3f} ยฑ {train_error.std():.3f}" + "Mean squared error of tuned ridge model on the train set:\n" + f"{train_error.mean():.2e} ยฑ {train_error.std():.2e}" ) # %% test_error = -cv_results["test_score"] print( - "Mean squared error of linear regression model on the test set:\n" - f"{test_error.mean():.3f} ยฑ {test_error.std():.3f}" + "Mean squared error of tuned ridge model on the test set:\n" + f"{test_error.mean():.2e} ยฑ {test_error.std():.2e}" ) # %% [markdown] @@ -401,8 +452,8 @@ # # When fitting the ridge regressor, we also requested to store the error found # during cross-validation (by setting the parameter `store_cv_values=True`). We -# will plot the mean squared error for the different `alphas` regularization -# strength that we tried. The error bars represent one standard deviation of the +# can plot the mean squared error for the different `alphas` regularization +# strengths that we tried. The error bars represent one standard deviation of the # average mean square error across folds for a given value of `alpha`. # %% @@ -414,12 +465,15 @@ cv_alphas # %% -plt.errorbar(cv_alphas.index, cv_alphas["mean"], yerr=cv_alphas["std"]) -plt.xlim((0.0, 1.0)) -plt.ylim((4_500, 11_000)) -plt.ylabel("Mean squared error\n (lower is better)") -plt.xlabel("alpha") -_ = plt.title("Testing error obtained by cross-validation") +fig, ax = plt.subplots() +ax.errorbar(cv_alphas.index, cv_alphas["mean"], yerr=cv_alphas["std"]) +_ = ax.set( + xscale="log", + xlabel="alpha", + yscale="log", + ylabel="Mean squared error\n (lower is better)", + title="Testing error in RidgeCV's inner cross-validation", +) # %% [markdown] # As we can see, regularization is just like salt in cooking: one must balance @@ -444,8 +498,25 @@ ) # %% [markdown] -# This range can be reduced by decreasing the spacing between the grid of -# `alphas`. -# -# In this notebook, you learned about the concept of regularization and the -# importance of preprocessing and parameter tuning. +# This range can be reduced depending on the feature engineering and +# preprocessing. 
+# +# Here is a summary of important points highlighted in this notebook: +# - scaling features makes the effect of regularization more even: all variables +# are regularized by comparable magnitude, which would not necessarily be the +# case with the natural feature scales; +# - scaling features makes the numerical solvers more stable which is also +# helpful to tune the regularization parameter more independently of the +# choice of the solver used to fit the linear model; +# - tuning the regularization parameter of the `Ridge` estimator can be done +# very efficiently by using the `RidgeCV` class. Wrapping it into a +# `cross_validate` call makes it possible to assess the true generalization +# power of the whole pipeline by including the tuning of the regularization +# parameter as part of the learning process: this is an example of "nested +# cross-validation"; +# - doing so makes it possible to check that the optimal value of the +# regularization strength `alpha` is robust to a resampling of the dataset. If +# it wasn't the case it would hint at a problem with the dataset (e.g. +# presence of outliers in the features or the target that influence the +# learning process disproportionately) or a bad choice of other elements of +# the feature engineering pipeline. From 3895613c0dfa04898fe05b60aa14b17e899ad6c4 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 30 Aug 2023 11:47:09 +0200 Subject: [PATCH 036/108] Introduce use of plot_chance_level parameter (#703) --- python_scripts/metrics_classification.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/python_scripts/metrics_classification.py b/python_scripts/metrics_classification.py index ce91a3483..a620c3612 100644 --- a/python_scripts/metrics_classification.py +++ b/python_scripts/metrics_classification.py @@ -392,3 +392,32 @@ # the ROC-AUC is 0.5. Indeed, we show the generalization performance of a dummy # classifier (the orange dashed line) to show that even the worst generalization # performance obtained will be above this line. 
+# +# Instead of using a dummy classifier, you can use the parameter `plot_chance_level` +# available in the ROC and PR displays: + +# %% +fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 7)) + +PrecisionRecallDisplay.from_estimator( + classifier, + data_test, + target_test, + pos_label="donated", + marker="+", + plot_chance_level=True, + chance_level_kw={"color": "tab:orange", "linestyle": "--"}, + ax=axs[0], +) +RocCurveDisplay.from_estimator( + classifier, + data_test, + target_test, + pos_label="donated", + marker="+", + plot_chance_level=True, + chance_level_kw={"color": "tab:orange", "linestyle": "--"}, + ax=axs[1], +) + +_ = fig.suptitle("PR and ROC curves") From dd0b29943c5ee813fb1c74bfae0475d62c0d7ced Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 30 Aug 2023 14:00:40 +0200 Subject: [PATCH 037/108] MAINT Use class `ValidationCurveDisplay` (#702) --- python_scripts/cross_validation_ex_01.py | 27 +++--- python_scripts/cross_validation_sol_01.py | 62 +++++-------- .../cross_validation_validation_curve.py | 59 ++++--------- python_scripts/ensemble_ex_03.py | 18 ++-- python_scripts/ensemble_sol_03.py | 87 +++++-------------- 5 files changed, 78 insertions(+), 175 deletions(-) diff --git a/python_scripts/cross_validation_ex_01.py b/python_scripts/cross_validation_ex_01.py index abe9f22c6..8fd6e4ea8 100644 --- a/python_scripts/cross_validation_ex_01.py +++ b/python_scripts/cross_validation_ex_01.py @@ -22,7 +22,7 @@ # * use a learning curve to determine the usefulness of adding new samples in # the dataset when building a classifier. # -# To make these experiments we will first load the blood transfusion dataset. +# To make these experiments we first load the blood transfusion dataset. # %% [markdown] # ```{note} @@ -38,7 +38,7 @@ target = blood_transfusion["Class"] # %% [markdown] -# We will use a support vector machine classifier (SVM). In its most simple +# Here we use a support vector machine classifier (SVM). In its most simple # form, a SVM classifier is a linear classifier behaving similarly to a logistic # regression. Indeed, the optimization used to find the optimal weights of the # linear model are different but we don't need to know these details for the @@ -78,23 +78,17 @@ # As previously mentioned, the parameter `gamma` is one of the parameters # controlling under/over-fitting in support vector machine with an RBF kernel. # -# Evaluate the effect of the parameter `gamma` by using the -# [`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) -# function. You can leave the default `scoring=None` which is equivalent to +# Evaluate the effect of the parameter `gamma` by using +# [`sklearn.model_selection.ValidationCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ValidationCurveDisplay.html). +# You can leave the default `scoring=None` which is equivalent to # `scoring="accuracy"` for classification problems. You can vary `gamma` between # `10e-3` and `10e2` by generating samples on a logarithmic scale with the help # of `np.logspace(-3, 2, num=30)`. # -# Since we are manipulating a `Pipeline` the parameter name will be set to -# `svc__gamma` instead of only `gamma`. You can retrieve the parameter name -# using `model.get_params().keys()`. We will go more into detail regarding -# accessing and setting hyperparameter in the next section. - -# %% -# Write your code here. 
- -# %% [markdown] -# Plot the validation curve for the train and test scores. +# Since we are manipulating a `Pipeline` the parameter name is `svc__gamma` +# instead of only `gamma`. You can retrieve the parameter name using +# `model.get_params().keys()`. We will go more into detail regarding accessing +# and setting hyperparameter in the next section. # %% # Write your code here. @@ -102,7 +96,8 @@ # %% [markdown] # Now, you can perform an analysis to check whether adding new samples to the # dataset could help our model to better generalize. Compute the learning curve -# (using [`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html)) +# (using +# [`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html)) # by computing the train and test scores for different training dataset size. # Plot the train and test scores with respect to the number of samples. diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py index 35aa635b3..78518309c 100644 --- a/python_scripts/cross_validation_sol_01.py +++ b/python_scripts/cross_validation_sol_01.py @@ -16,7 +16,7 @@ # * use a learning curve to determine the usefulness of adding new samples in # the dataset when building a classifier. # -# To make these experiments we will first load the blood transfusion dataset. +# To make these experiments we first load the blood transfusion dataset. # %% [markdown] # ```{note} @@ -32,7 +32,7 @@ target = blood_transfusion["Class"] # %% [markdown] -# We will use a support vector machine classifier (SVM). In its most simple +# Here we use a support vector machine classifier (SVM). In its most simple # form, a SVM classifier is a linear classifier behaving similarly to a logistic # regression. Indeed, the optimization used to find the optimal weights of the # linear model are different but we don't need to know these details for the @@ -90,62 +90,44 @@ # As previously mentioned, the parameter `gamma` is one of the parameters # controlling under/over-fitting in support vector machine with an RBF kernel. # -# Evaluate the effect of the parameter `gamma` by using the -# [`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) -# function. You can leave the default `scoring=None` which is equivalent to +# Evaluate the effect of the parameter `gamma` by using +# [`sklearn.model_selection.ValidationCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ValidationCurveDisplay.html). +# You can leave the default `scoring=None` which is equivalent to # `scoring="accuracy"` for classification problems. You can vary `gamma` between # `10e-3` and `10e2` by generating samples on a logarithmic scale with the help # of `np.logspace(-3, 2, num=30)`. # -# Since we are manipulating a `Pipeline` the parameter name will be set to -# `svc__gamma` instead of only `gamma`. You can retrieve the parameter name -# using `model.get_params().keys()`. We will go more into detail regarding -# accessing and setting hyperparameter in the next section. +# Since we are manipulating a `Pipeline` the parameter name is `svc__gamma` +# instead of only `gamma`. You can retrieve the parameter name using +# `model.get_params().keys()`. 
We will go more into detail regarding accessing +# and setting hyperparameter in the next section. # %% # solution import numpy as np -from sklearn.model_selection import validation_curve + +from sklearn.model_selection import ValidationCurveDisplay gammas = np.logspace(-3, 2, num=30) param_name = "svc__gamma" -train_scores, test_scores = validation_curve( +disp = ValidationCurveDisplay.from_estimator( model, data, target, param_name=param_name, param_range=gammas, cv=cv, + scoring="accuracy", # this is already the default for classifiers + score_name="Accuracy", + std_display_style="errorbar", + errorbar_kw={"alpha": 0.7}, # transparency for better visualization n_jobs=2, ) -# %% [markdown] -# Plot the validation curve for the train and test scores. - -# %% -# solution -import matplotlib.pyplot as plt - -plt.errorbar( - gammas, - train_scores.mean(axis=1), - yerr=train_scores.std(axis=1), - alpha=0.95, - label="Training score", +_ = disp.ax_.set( + xlabel=r"Value of hyperparameter $\gamma$", + title="Validation curve of support vector machine", ) -plt.errorbar( - gammas, - test_scores.mean(axis=1), - yerr=test_scores.std(axis=1), - alpha=0.5, - label="Testing score", -) -plt.legend() - -plt.xscale("log") -plt.xlabel(r"Value of hyperparameter $\gamma$") -plt.ylabel("Accuracy score") -_ = plt.title("Validation score of support vector machine") # %% [markdown] tags=["solution"] # Looking at the curve, we can clearly identify the over-fitting regime of the @@ -156,7 +138,8 @@ # %% [markdown] # Now, you can perform an analysis to check whether adding new samples to the # dataset could help our model to better generalize. Compute the learning curve -# (using [`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html)) +# (using +# [`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html)) # by computing the train and test scores for different training dataset size. # Plot the train and test scores with respect to the number of samples. 
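+
+# %% [markdown] tags=["solution"]
+# Before looking at the solution cell below, here is a minimal sketch of the
+# expected call (the `train_sizes` grid is an illustrative choice of ours, not
+# the required one):
+
+# %%
+from sklearn.model_selection import LearningCurveDisplay
+
+sketch_disp = LearningCurveDisplay.from_estimator(
+    model,
+    data,
+    target,
+    train_sizes=np.linspace(0.1, 1.0, num=10),
+    cv=cv,
+    score_name="Accuracy",
+    std_display_style="errorbar",
+    n_jobs=2,
+)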
@@ -179,8 +162,7 @@
 n_jobs=2,
 )
 
-plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
-_ = plt.title("Learning curve for support vector machine")
+_ = disp.ax_.set(title="Learning curve for support vector machine")
 
 # %% [markdown] tags=["solution"]
 # We observe that adding new samples to the training dataset does not seem to
diff --git a/python_scripts/cross_validation_validation_curve.py b/python_scripts/cross_validation_validation_curve.py
index 79297634f..997fb0433 100644
--- a/python_scripts/cross_validation_validation_curve.py
+++ b/python_scripts/cross_validation_validation_curve.py
@@ -48,7 +48,7 @@
 import pandas as pd
 from sklearn.model_selection import cross_validate, ShuffleSplit
 
-cv = ShuffleSplit(n_splits=30, test_size=0.2)
+cv = ShuffleSplit(n_splits=30, test_size=0.2, random_state=0)
 cv_results = cross_validate(
     regressor,
     data,
@@ -104,10 +104,10 @@
 
 # %%
 # %%time
-from sklearn.model_selection import validation_curve
+from sklearn.model_selection import ValidationCurveDisplay
 
 max_depth = [1, 5, 10, 15, 20, 25]
-train_scores, test_scores = validation_curve(
+disp = ValidationCurveDisplay.from_estimator(
     regressor,
     data,
     target,
@@ -115,22 +115,15 @@
     param_range=max_depth,
     cv=cv,
     scoring="neg_mean_absolute_error",
+    negate_score=True,
+    std_display_style="errorbar",
     n_jobs=2,
 )
-train_errors, test_errors = -train_scores, -test_scores
-
-# %% [markdown]
-# Now that we collected the results, we will show the validation curve by
-# plotting the training and testing errors (as well as their deviations).
-
-# %%
-plt.plot(max_depth, train_errors.mean(axis=1), label="Training error")
-plt.plot(max_depth, test_errors.mean(axis=1), label="Testing error")
-plt.legend()
-
-plt.xlabel("Maximum depth of decision tree")
-plt.ylabel("Mean absolute error (k$)")
-_ = plt.title("Validation curve for decision tree")
+_ = disp.ax_.set(
+    xlabel="Maximum depth of decision tree",
+    ylabel="Mean absolute error (k$)",
+    title="Validation curve for decision tree",
+)
 
 # %% [markdown]
 # The validation curve can be divided into three areas:
@@ -158,33 +151,11 @@
 # could reach by just tuning this parameter.
 #
 # Be aware that looking at the mean errors is quite limiting. We should also
-# look at the standard deviation to assess the dispersion of the score. We can
-# repeat the same plot as before but this time, we will add some information to
-# show the standard deviation of the errors as well.
-
-# %%
-plt.errorbar(
-    max_depth,
-    train_errors.mean(axis=1),
-    yerr=train_errors.std(axis=1),
-    label="Training error",
-)
-plt.errorbar(
-    max_depth,
-    test_errors.mean(axis=1),
-    yerr=test_errors.std(axis=1),
-    label="Testing error",
-)
-plt.legend()
-
-plt.xlabel("Maximum depth of decision tree")
-plt.ylabel("Mean absolute error (k$)")
-_ = plt.title("Validation curve for decision tree")
-
-# %% [markdown]
-# We were lucky that the variance of the errors was small compared to their
-# respective values, and therefore the conclusions above are quite clear. This
-# is not necessarily always the case.
+# look at the standard deviation to assess the dispersion of the score. For
+# this purpose, we can use the parameter `std_display_style` to show the standard
+# deviation of the errors as well. In this case, the variance of the errors is
+# small compared to their respective values, and therefore the conclusions above
+# are quite clear. This is not necessarily always the case.
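+
+# %% [markdown]
+# If you prefer inspecting the raw values rather than the plot, the fitted
+# display object keeps them around; a short sketch using its documented
+# attributes:
+
+# %%
+# Scores computed on each cross-validation split, one row per `max_depth`.
+print(f"shape of the train scores: {disp.train_scores.shape}")
+print(f"shape of the test scores: {disp.test_scores.shape}")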
# %% [markdown] # ## Summary: diff --git a/python_scripts/ensemble_ex_03.py b/python_scripts/ensemble_ex_03.py index 35d0cf66e..72f8f362c 100644 --- a/python_scripts/ensemble_ex_03.py +++ b/python_scripts/ensemble_ex_03.py @@ -21,7 +21,7 @@ # * use the early-stopping strategy to avoid adding unnecessary trees, to get # the best generalization performances. # -# We will use the California housing dataset to conduct our experiments. +# We use the California housing dataset to conduct our experiments. # %% from sklearn.datasets import fetch_california_housing @@ -58,25 +58,25 @@ # For both the gradient-boosting and random forest models, create a validation # curve using the training set to assess the impact of the number of trees on # the performance of each model. Evaluate the list of parameters `param_range = -# [1, 2, 5, 10, 20, 50, 100]` and use the mean absolute error. +# np.array([1, 2, 5, 10, 20, 50, 100])` and use the mean absolute error. # %% # Write your code here. # %% [markdown] -# Both gradient boosting and random forest models will always improve when -# increasing the number of trees in the ensemble. However, it will reach a -# plateau where adding new trees will just make fitting and scoring slower. +# Both gradient boosting and random forest models improve when increasing the +# number of trees in the ensemble. However, the scores reach a plateau where +# adding new trees just makes fitting and scoring slower. # # To avoid adding new unnecessary tree, unlike random-forest gradient-boosting -# offers an early-stopping option. Internally, the algorithm will use an +# offers an early-stopping option. Internally, the algorithm uses an # out-of-sample set to compute the generalization performance of the model at # each addition of a tree. Thus, if the generalization performance is not -# improving for several iterations, it will stop adding trees. +# improving for several iterations, it stops adding trees. # # Now, create a gradient-boosting model with `n_estimators=1_000`. This number -# of trees will be too large. Change the parameter `n_iter_no_change` such that -# the gradient boosting fitting will stop after adding 5 trees that do not +# of trees is certainly too large. Change the parameter `n_iter_no_change` such +# that the gradient boosting fitting stops after adding 5 trees that do not # improve the overall generalization performance. # %% diff --git a/python_scripts/ensemble_sol_03.py b/python_scripts/ensemble_sol_03.py index 302eb6864..a72542464 100644 --- a/python_scripts/ensemble_sol_03.py +++ b/python_scripts/ensemble_sol_03.py @@ -15,7 +15,7 @@ # * use the early-stopping strategy to avoid adding unnecessary trees, to get # the best generalization performances. # -# We will use the California housing dataset to conduct our experiments. +# We use the California housing dataset to conduct our experiments. # %% from sklearn.datasets import fetch_california_housing @@ -58,92 +58,47 @@ # For both the gradient-boosting and random forest models, create a validation # curve using the training set to assess the impact of the number of trees on # the performance of each model. Evaluate the list of parameters `param_range = -# [1, 2, 5, 10, 20, 50, 100]` and use the mean absolute error. +# np.array([1, 2, 5, 10, 20, 50, 100])` and use the mean absolute error. 
# %% # solution -from sklearn.model_selection import validation_curve +import numpy as np -param_range = [1, 2, 5, 10, 20, 50, 100] -gbdt_train_scores, gbdt_validation_scores = validation_curve( - gbdt, - data_train, - target_train, - param_name="n_estimators", - param_range=param_range, - scoring="neg_mean_absolute_error", - n_jobs=2, -) -gbdt_train_errors, gbdt_validation_errors = ( - -gbdt_train_scores, - -gbdt_validation_scores, -) +from sklearn.model_selection import ValidationCurveDisplay -forest_train_scores, forest_validation_scores = validation_curve( +param_range = np.array([1, 2, 5, 10, 20, 50, 100]) +disp = ValidationCurveDisplay.from_estimator( forest, - data_train, - target_train, + data, + target, param_name="n_estimators", param_range=param_range, scoring="neg_mean_absolute_error", + negate_score=True, + std_display_style="errorbar", n_jobs=2, ) -forest_train_errors = -forest_train_scores -forest_validation_errors = -forest_validation_scores -# %% tags=["solution"] -import matplotlib.pyplot as plt - -fig, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(10, 4)) - -axs[0].errorbar( - param_range, - gbdt_train_errors.mean(axis=1), - yerr=gbdt_train_errors.std(axis=1), - label="Training", -) -axs[0].errorbar( - param_range, - gbdt_validation_errors.mean(axis=1), - yerr=gbdt_validation_errors.std(axis=1), - label="Cross-validation", +_ = disp.ax_.set( + xlabel="Number of trees in the forest", + ylabel="Mean absolute error (k$)", + title="Validation curve for random forest", ) -axs[0].set_title("Gradient boosting decision tree") -axs[0].set_xlabel("# estimators") -axs[0].set_ylabel("Mean absolute error in k$\n(smaller is better)") - -axs[1].errorbar( - param_range, - forest_train_errors.mean(axis=1), - yerr=forest_train_errors.std(axis=1), - label="Training", -) -axs[1].errorbar( - param_range, - forest_validation_errors.mean(axis=1), - yerr=forest_validation_errors.std(axis=1), - label="Cross-validation", -) -axs[1].set_title("Random forest") -axs[1].set_xlabel("# estimators") - -plt.legend() -_ = fig.suptitle("Validation curves", y=1.1) # %% [markdown] -# Both gradient boosting and random forest models will always improve when -# increasing the number of trees in the ensemble. However, it will reach a -# plateau where adding new trees will just make fitting and scoring slower. +# Both gradient boosting and random forest models improve when increasing the +# number of trees in the ensemble. However, the scores reach a plateau where +# adding new trees just makes fitting and scoring slower. # # To avoid adding new unnecessary tree, unlike random-forest gradient-boosting -# offers an early-stopping option. Internally, the algorithm will use an +# offers an early-stopping option. Internally, the algorithm uses an # out-of-sample set to compute the generalization performance of the model at # each addition of a tree. Thus, if the generalization performance is not -# improving for several iterations, it will stop adding trees. +# improving for several iterations, it stops adding trees. # # Now, create a gradient-boosting model with `n_estimators=1_000`. This number -# of trees will be too large. Change the parameter `n_iter_no_change` such that -# the gradient boosting fitting will stop after adding 5 trees that do not +# of trees is certainly too large. Change the parameter `n_iter_no_change` such +# that the gradient boosting fitting stops after adding 5 trees that do not # improve the overall generalization performance. 
# %% From fa58455e2501628a955f35b00f97b56003c25792 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 31 Aug 2023 11:28:34 +0200 Subject: [PATCH 038/108] Introduce that evaluation metrics and loss are not always the same (#704) Co-authored-by: ArturoAmorQ Co-authored-by: Guillaume Lemaitre Co-authored-by: Olivier Grisel --- python_scripts/metrics_sol_02.py | 63 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/python_scripts/metrics_sol_02.py b/python_scripts/metrics_sol_02.py index 6a4520811..bbd5118ee 100644 --- a/python_scripts/metrics_sol_02.py +++ b/python_scripts/metrics_sol_02.py @@ -53,8 +53,8 @@ print(f"R2 score: {scores.mean():.3f} ยฑ {scores.std():.3f}") # %% [markdown] -# Then, instead of using the $R^2$ score, use the mean absolute error. You need -# to refer to the documentation for the `scoring` parameter. +# Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You +# may need to refer to the documentation for the `scoring` parameter. # %% # solution @@ -91,3 +91,62 @@ } scores = pd.DataFrame(scores) scores + +# %% [markdown] tags=["solution"] +# In the Regression Metrics notebook, we introduced the concept of loss function, +# which is the metric optimized when training a model. In the case of +# `LinearRegression`, the fitting process consists in minimizing the mean squared +# error (MSE). Some estimators, such as `HistGradientBoostingRegressor`, can +# use different loss functions, to be set using the `loss` hyperparameter. +# +# Notice that the evaluation metrics and the loss functions are not necessarily +# the same. Let's see an example: + +# %% +# solution +from collections import defaultdict +from sklearn.ensemble import HistGradientBoostingRegressor + +scoring = ["neg_mean_squared_error", "neg_mean_absolute_error"] +loss_functions = ["squared_error", "absolute_error"] +scores = defaultdict(list) + +for loss_func in loss_functions: + model = HistGradientBoostingRegressor(loss=loss_func) + cv_results = cross_validate(model, data, target, scoring=scoring) + mse = -cv_results["test_neg_mean_squared_error"] + mae = -cv_results["test_neg_mean_absolute_error"] + scores["loss"].append(loss_func) + scores["MSE"].append(f"{mse.mean():.1f} ยฑ {mse.std():.1f}") + scores["MAE"].append(f"{mae.mean():.1f} ยฑ {mae.std():.1f}") +scores = pd.DataFrame(scores) +scores.set_index("loss") + +# %% [markdown] tags=["solution"] +# Even if the score distributions overlap due to the presence of outliers in the +# dataset, it is true that the average MSE is lower when `loss="squared_error`, +# whereas the average MAE is lower when `loss="absolute_error` as expected. +# Indeed, the choice of a loss function is made depending on the evaluation +# metric that we want to optimize for a given use case. +# +# If you feel like going beyond the contents of this MOOC, you can try different +# combinations of loss functions and evaluation metrics. +# +# Notice that there are some metrics that cannot be directly optimized by +# optimizing a loss function. This is the case for metrics that evolve in a +# discontinuous manner with respect to the internal parameters of the model, as +# learning solvers based on gradient descent or similar optimizers require +# continuity (the details are beyond the scope of this MOOC). +# +# For instance, classification models are often evaluated using metrics computed +# on hard class predictions (i.e. 
whether a sample belongs to a given class) +# rather than from continuous values such as +# [`predict_proba`](https://scikit-learn.org/stable/glossary.html#term-predict_proba) +# (i.e. the estimated probability of belonging to said given class). Because of +# this, classifiers are typically trained by optimizing a loss function computed +# from some continuous output of the model. We call it a "surrogate loss" as it +# substitutes the metric of interest. For instance `LogisticRegression` +# minimizes the `log_loss` applied to the `predict_proba` output of the model. +# By minimizing the surrogate loss, we maximize the accuracy. However +# scikit-learn does not provide surrogate losses for all possible classification +# metrics. From 6ac871949cd8f3c0e8081fb58b5043c1c6f1a923 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 31 Aug 2023 11:39:33 +0200 Subject: [PATCH 039/108] Make the navbar scroll to the current page header (#705) --- requirements-dev.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index e6b46805b..180c42915 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,9 @@ matplotlib seaborn plotly jupyter-book>=0.11 +# Partial fix for the navbar scrollToActive behavior: +# https://github.com/executablebooks/sphinx-book-theme/issues/541 +sphinx-book-theme @ git+https://github.com/ogrisel/sphinx-book-theme@fix-bd-docs-nav jupytext beautifulsoup4 IPython From cd4b75da7d02c549461fe2fbf15de16f9d79af64 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 31 Aug 2023 12:06:49 +0200 Subject: [PATCH 040/108] Various fixes and improvements in Regularization of linear regression model (#700) Co-authored-by: ArturoAmorQ Co-authored-by: Guillaume Lemaitre --- .../linear_models_regularization.py | 127 +++++++++--------- 1 file changed, 67 insertions(+), 60 deletions(-) diff --git a/python_scripts/linear_models_regularization.py b/python_scripts/linear_models_regularization.py index 7718875a8..1b221e856 100644 --- a/python_scripts/linear_models_regularization.py +++ b/python_scripts/linear_models_regularization.py @@ -45,14 +45,15 @@ ) # %% [markdown] -# In one of the previous notebook, we showed that linear models could be used +# In one of the previous notebooks, we showed that linear models could be used # even when there is no linear relationship between the `data` and `target`. # For instance, one can use the `PolynomialFeatures` transformer to create # additional features that capture some non-linear interactions between them. # # Here, we use this transformer to augment the feature space. Subsequently, we -# train a linear regression model. We use the out-of-sample test set to evaluate -# the generalization capabilities of our model. +# train a linear regression model. We use cross-validation with +# `return_train_score=True` to evaluate both the train scores and the +# generalization capabilities of our model. # %% from sklearn.model_selection import cross_validate @@ -92,20 +93,22 @@ ) # %% [markdown] -# The training score is much better than the testing score. Such a gap between -# the training and testing scores is an indication that our model overfitted the -# training set. Indeed, this is one of the dangers when augmenting the number of -# features with a `PolynomialFeatures` transformer. For instance, one does not -# expect features such as `PoolArea * YrSold` to be very predictive. 
-#
-# We can create a dataframe to check the weights of the model: the columns
-# contain the name of the features whereas the rows store the coefficients values
-# of each model during the cross-validation.
-#
-# Since we used a `PolynomialFeatures` to augment the data, we extract the
-# feature names representative of each feature combination. Scikit-learn
-# provides a `feature_names_in_` method for this purpose. First, let's get the
-# first fitted model from the cross-validation.
+# The training error is on average one order of magnitude lower than the testing
+# error (lower error is better). Such a gap between the training and testing
+# scores is an indication that our model overfitted the training set. Indeed,
+# this is one of the dangers when augmenting the number of features with a
+# `PolynomialFeatures` transformer. For instance, one does not expect features
+# such as `PoolArea * YrSold` to be very predictive.
+#
+# To analyze the weights of the model, we can create a dataframe. The columns of
+# the dataframe contain the feature names, while the rows store the coefficients
+# of each model of a given cross-validation fold.
+#
+# In order to obtain the feature names associated with each feature combination,
+# we need to extract them from the augmented data created by
+# `PolynomialFeatures`. Fortunately, scikit-learn provides the convenient
+# `feature_names_in_` attribute for this purpose. Let's begin by retrieving
+# the coefficients from the model fitted in the first cross-validation fold.
 
 # %%
 model_first_fold = cv_results["estimator"][0]
 
 # %% [markdown]
 # Now, we can access the fitted `LinearRegression` (step `-1` i.e. the last step
-# of the model) to recover the feature names.
In fact, we could force large positive or negative @@ -194,37 +209,34 @@ ) # %% [markdown] -# We see that the training and testing scores are much closer, indicating that -# our model is less overfitting. We can compare the values of the weights of -# ridge with the un-regularized linear regression. +# We see that the training and testing scores get closer, indicating that our +# model is less overfitting (yet still overfitting!). We can compare the values +# of the weights of ridge with the un-regularized linear regression. # %% coefs = [est[-1].coef_ for est in cv_results["estimator"]] weights_ridge = pd.DataFrame(coefs, columns=feature_names) # %% -fig, ax = plt.subplots(figsize=(8, 12)) +fig, ax = plt.subplots(figsize=(8, 10)) weights_ridge.plot.box(color=color, vert=False, ax=ax) -_ = ax.set( - title="Ridge weights", - xscale="symlog", -) +_ = ax.set(title="Ridge regression weights") # %% [markdown] -# By comparing the order of magnitude of the weights on this plot with respect -# to the previous plot, we see that a ridge model enforces all weights to lay in -# a more similar scale, while the overall magnitude of the weights is shrunk -# towards zero with respect to the linear regression model. +# Notice that the overall magnitudes of the weights are shrunk +# (yet non-zero!) with respect to the linear regression model. If you want to, +# feel free to use a symmetric log scale in the previous plot. # -# You can observe that the coefficients are still unstable from one fold to -# another, and finally, the results can vary a lot depending on the choice of -# the solver (for instance try to set `solver="saga"` or `solver="lsqr"` instead -# of `solver="cholesky"` and re-run the above cells). +# You can also observe that even if the weights' values are less extreme, they +# are still unstable from one fold to another. Even worst, the results can vary +# a lot depending on the choice of the solver (for instance try to set +# `solver="saga"` or `solver="lsqr"` instead of `solver="cholesky"` and re-run +# the above cells). # -# In the following we will attempt to resolve those remaining problems, by +# In the following we attempt to resolve those remaining problems, by # focusing on two important aspects we omitted so far: -# - the need to scale the data, and -# - the need to search for the best regularization parameter. +# - the need to **scale the data**, and +# - the need to **search for the best regularization parameter**. # # ## Feature scaling and regularization # @@ -315,19 +327,17 @@ weights_ridge_scaled_data = pd.DataFrame(coefs, columns=feature_names) # %% -fig, ax = plt.subplots(figsize=(8, 12)) +fig, ax = plt.subplots(figsize=(8, 10)) weights_ridge_scaled_data.plot.box(color=color, vert=False, ax=ax) -_ = ax.set( - title="Ridge weights with data scaling", - xscale="symlog", -) +_ = ax.set(title="Ridge regression weights with data scaling") # %% [markdown] -# Compare to the previous plots, we see that now all weight magnitudes are -# closer and that all features are more equally contributing. +# Compared to the previous plots, we see that now most weight magnitudes have a +# similar order of magnitude, i.e. they are more equally contributing. The +# number of unstable weights also decreased. # -# In the previous example, we fixed `alpha=10`. We can now check the impact of -# the value of `alpha` by increasing its value. +# In the previous model, we set `alpha=10`. We can now check the impact of +# `alpha` by increasing it to a very large value. 
# %% ridge_large_alpha = make_pipeline( @@ -350,17 +360,14 @@ weights_ridge_scaled_data = pd.DataFrame(coefs, columns=feature_names) # %% -fig, ax = plt.subplots(figsize=(8, 12)) +fig, ax = plt.subplots(figsize=(8, 10)) weights_ridge_scaled_data.plot.box(color=color, vert=False, ax=ax) -_ = ax.set( - title="Ridge weights with data scaling and large alpha", - xscale="symlog", -) +_ = ax.set(title="Ridge regression weights with data scaling and large alpha") # %% [markdown] -# Looking specifically to weights values, we observe that increasing the value -# of `alpha` decreases the weight values. A negative value of `alpha` would -# actually enhance large weights and promote overfitting. +# When examining the weight values, we notice that as the `alpha` value +# increases, the weights decrease. A negative value of `alpha` can lead to +# unpredictable and unstable behavior in the model. # # ```{note} # Here, we only focus on numerical features. For categorical features, it is @@ -465,7 +472,7 @@ cv_alphas # %% -fig, ax = plt.subplots() +fig, ax = plt.subplots(figsize=(8, 6)) ax.errorbar(cv_alphas.index, cv_alphas["mean"], yerr=cv_alphas["std"]) _ = ax.set( xscale="log", From 25e909320fd6054e9c341132c5ee61157ebd48c2 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 31 Aug 2023 14:00:14 +0200 Subject: [PATCH 041/108] Introduce non-linear feature engineering in more than 1D (#696) --- python_scripts/linear_models_ex_03.py | 58 ++++++---- python_scripts/linear_models_sol_03.py | 102 ++++++++++++------ .../linear_regression_non_linear_link.py | 20 +++- 3 files changed, 127 insertions(+), 53 deletions(-) diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 07ca53ac7..3ab6949a3 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -14,17 +14,24 @@ # %% [markdown] # # ๐Ÿ“ Exercise M4.03 # -# In all previous notebooks, we only used a single feature in `data`. But we -# have already shown that we could add new features to make the model more -# expressive by deriving new features, based on the original feature. +# In the previous notebook, we showed that we can add new features based on the +# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. +# In that case we only used a single feature in `data`. # # The aim of this notebook is to train a linear regression algorithm on a -# dataset with more than a single feature. +# dataset with more than a single feature. In such a "multi-dimensional" feature +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, +# etc. Products of features are usually called "non-linear or +# multiplicative interactions" between features. # -# We will load a dataset about house prices in California. The dataset consists -# of 8 features regarding the demography and geography of districts in -# California and the aim is to predict the median house price of each district. -# We will use all 8 features to predict the target, the median house price. +# Feature engineering can be an important step of a model pipeline as long as +# the new features are expected to be predictive. For instance, think of a +# classification model to decide if a patient has risk of developing a heart +# disease. This would depend on the patient's Body Mass Index which is defined +# as `weight / height ** 2`. +# +# We load the dataset penguins dataset. 
We first use a set of 3 numerical +# features to predict the target, i.e. the body mass of the penguin. # %% [markdown] # ```{note} @@ -33,10 +40,18 @@ # ``` # %% -from sklearn.datasets import fetch_california_housing +import pandas as pd + +penguins = pd.read_csv("../datasets/penguins.csv") + +columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] +target_name = "Body Mass (g)" -data, target = fetch_california_housing(as_frame=True, return_X_y=True) -target *= 100 # rescale the target in k$ +# Remove lines with missing values for the columns of interest +penguins_non_missing = penguins[columns + [target_name]].dropna() + +data = penguins_non_missing[columns] +target = penguins_non_missing[target_name] data.head() # %% [markdown] @@ -48,24 +63,31 @@ # %% [markdown] # Execute a cross-validation with 10 folds and use the mean absolute error (MAE) -# as metric. Be sure to *return* the fitted *estimators*. +# as metric. # %% # Write your code here. # %% [markdown] -# Compute the mean and std of the MAE in thousands of dollars (k$). +# Compute the mean and std of the MAE in grams (g). # %% # Write your code here. # %% [markdown] -# Inspect the fitted model using a box plot to show the distribution of values -# for the coefficients returned from the cross-validation. Hint: use the -# function -# [`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html) -# to create a box plot. +# Now create a pipeline using `make_pipeline` consisting of a +# `PolynomialFeatures` and a linear regression. Set `degree=2` and +# `interaction_only=True` to the feature engineering step. Remember not to +# include the bias to avoid redundancies with the linear's regression intercept. +# +# Use the same strategy as before to cross-validate such a pipeline. +# %% +# Write your code here. + +# %% [markdown] +# Compute the mean and std of the MAE in grams (g) and compare with the results +# without feature engineering. # %% # Write your code here. diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py index 7fadc8468..0cacfcf0d 100644 --- a/python_scripts/linear_models_sol_03.py +++ b/python_scripts/linear_models_sol_03.py @@ -8,17 +8,24 @@ # %% [markdown] # # ๐Ÿ“ƒ Solution for Exercise M4.03 # -# In all previous notebooks, we only used a single feature in `data`. But we -# have already shown that we could add new features to make the model more -# expressive by deriving new features, based on the original feature. +# In the previous notebook, we showed that we can add new features based on the +# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. +# In that case we only used a single feature in `data`. # # The aim of this notebook is to train a linear regression algorithm on a -# dataset with more than a single feature. +# dataset with more than a single feature. In such a "multi-dimensional" feature +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, +# etc. Products of features are usually called "non-linear or +# multiplicative interactions" between features. # -# We will load a dataset about house prices in California. The dataset consists -# of 8 features regarding the demography and geography of districts in -# California and the aim is to predict the median house price of each district. -# We will use all 8 features to predict the target, the median house price. 
+# Feature engineering can be an important step of a model pipeline as long as +# the new features are expected to be predictive. For instance, think of a +# classification model to decide if a patient has risk of developing a heart +# disease. This would depend on the patient's Body Mass Index which is defined +# as `weight / height ** 2`. +# +# We load the dataset penguins dataset. We first use a set of 3 numerical +# features to predict the target, i.e. the body mass of the penguin. # %% [markdown] # ```{note} @@ -27,10 +34,18 @@ # ``` # %% -from sklearn.datasets import fetch_california_housing +import pandas as pd + +penguins = pd.read_csv("../datasets/penguins.csv") + +columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] +target_name = "Body Mass (g)" -data, target = fetch_california_housing(as_frame=True, return_X_y=True) -target *= 100 # rescale the target in k$ +# Remove lines with missing values for the columns of interest +penguins_non_missing = penguins[columns + [target_name]].dropna() + +data = penguins_non_missing[columns] +target = penguins_non_missing[target_name] data.head() # %% [markdown] @@ -45,7 +60,7 @@ # %% [markdown] # Execute a cross-validation with 10 folds and use the mean absolute error (MAE) -# as metric. Be sure to *return* the fitted *estimators*. +# as metric. # %% # solution @@ -55,42 +70,65 @@ linear_regression, data, target, - scoring="neg_mean_absolute_error", - return_estimator=True, cv=10, + scoring="neg_mean_absolute_error", n_jobs=2, ) # %% [markdown] -# Compute the mean and std of the MAE in thousands of dollars (k$). +# Compute the mean and std of the MAE in grams (g). # %% # solution print( - "Mean absolute error on testing set: " - f"{-cv_results['test_score'].mean():.3f} k$ ยฑ " - f"{cv_results['test_score'].std():.3f}" + "Mean absolute error on testing set with original features: " + f"{-cv_results['test_score'].mean():.3f} ยฑ " + f"{cv_results['test_score'].std():.3f} g" ) # %% [markdown] -# Inspect the fitted model using a box plot to show the distribution of values -# for the coefficients returned from the cross-validation. Hint: use the -# function -# [`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html) -# to create a box plot. - +# Now create a pipeline using `make_pipeline` consisting of a +# `PolynomialFeatures` and a linear regression. Set `degree=2` and +# `interaction_only=True` to the feature engineering step. Remember not to +# include the bias to avoid redundancies with the linear's regression intercept. +# +# Use the same strategy as before to cross-validate such a pipeline. # %% # solution -import pandas as pd +from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import make_pipeline -weights = pd.DataFrame( - [est.coef_ for est in cv_results["estimator"]], columns=data.columns +poly_features = PolynomialFeatures( + degree=2, include_bias=False, interaction_only=True +) +linear_regression_interactions = make_pipeline( + poly_features, linear_regression ) -# %% tags=["solution"] -import matplotlib.pyplot as plt +cv_results = cross_validate( + linear_regression_interactions, + data, + target, + cv=10, + scoring="neg_mean_absolute_error", + n_jobs=2, +) + +# %% [markdown] +# Compute the mean and std of the MAE in grams (g) and compare with the results +# without feature engineering. 
+
 # %%
 # solution
 print(
+    "Mean absolute error on testing set with interactions: "
+    f"{-cv_results['test_score'].mean():.3f} ± "
+    f"{cv_results['test_score'].std():.3f} g"
+)

-color = {"whiskers": "black", "medians": "black", "caps": "black"}
-weights.plot.box(color=color, vert=False)
-_ = plt.title("Value of linear regression coefficients")
+# %% [markdown] tags=["solution"]
+# We observe that the mean absolute error is lower and less spread with the
+# enriched features. In this case the "interactions" are indeed predictive. In
+# the following notebook we will see what happens when the enriched features are
+# non-predictive and how to deal with this case.
diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py
index 2fc6699ac..9e72fb49e 100644
--- a/python_scripts/linear_regression_non_linear_link.py
+++ b/python_scripts/linear_regression_non_linear_link.py
@@ -247,9 +247,9 @@
 # line. `SVR(kernel="linear")` is indeed yet another example of a linear model.
 #
 # The estimator can also be configured to use a non-linear kernel. Then, it can
-# learn a prediction function that computes non-linear interaction between
-# samples for which we want to make a prediction and selected samples from the
-# training set.
+# learn a prediction function that computes non-linear relations between samples
+# for which we want to make a prediction and selected samples from the training
+# set.
 #
 # The result is another kind of non-linear regression model with a similar
 # expressivity as our previous polynomial regression pipeline:
@@ -315,3 +315,17 @@
 )
 ax.plot(data, target_predicted)
 _ = ax.set_title(f"Mean squared error = {mse:.2f}")
+
+# %% [markdown]
+# ## Notebook Recap
+#
+# In this notebook we explored several ways to expand a single numerical feature
+# into several non-linearly derived new features. This makes our machine
+# learning pipeline more expressive and less likely to underfit, even if the
+# last stage of the pipeline is a simple linear regression model.
+#
+# For the sake of simplicity, we introduced those transformers on a toy
+# regression problem with a single input feature. However, non-linear feature
+# transformers such as Nystroem can further improve the expressiveness of
+# machine learning pipelines to model non-linear interactions between features.
+# We will explore this possibility in the next exercise.
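The recap above names `Nystroem` as a non-linear feature transformer that keeps the final model linear. As a rough sketch of that idea (not taken from the course material: the toy data, the kernel choice and the `n_components` value below are illustrative assumptions):

```python
# Hypothetical illustration: a Nystroem kernel approximation feeding a linear
# regression, so the overall pipeline can fit a non-linear 1D function.
import numpy as np
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
data = rng.uniform(-1.4, 1.4, size=(100, 1))  # toy single feature
target = data.ravel() ** 3 - 0.5 * data.ravel() ** 2 + 0.3 * rng.randn(100)

# Nystroem expands the single column into `n_components` non-linear features;
# the linear regression then fits a linear model in that expanded space.
model = make_pipeline(
    Nystroem(kernel="poly", degree=3, n_components=5, random_state=0),
    LinearRegression(),
)
model.fit(data, target)
print(f"R2 on the training data: {model.score(data, target):.2f}")
```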
From 4cfc3423a500f82a251118f62c0f8abd440cb887 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Fri, 1 Sep 2023 10:13:49 +0200
Subject: [PATCH 042/108] Synchronize quizzes (#708)

---
 .../evaluation/evaluation_quiz_m7_02.md       | 17 +++++----
 .../evaluation/evaluation_quiz_m7_05.md       | 15 +++++---
 .../linear_models/linear_models_quiz_m4_02.md | 12 +++++++
 .../linear_models/linear_models_quiz_m4_05.md | 12 -------
 .../01_tabular_data_exploration_quiz_m1_01.md |  2 +-
 .../02_numerical_pipeline_quiz_m1_02.md       |  9 ++---
 .../wrap_up_quiz.md                           |  4 +--
 .../parameter_tuning_automated_quiz_m3_02.md  | 35 ++++++++++---------
 .../parameter_tuning_manual_quiz_m3_01.md     |  2 ++
 9 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/jupyter-book/evaluation/evaluation_quiz_m7_02.md b/jupyter-book/evaluation/evaluation_quiz_m7_02.md
index 154c4884b..d28ce5eb6 100644
--- a/jupyter-book/evaluation/evaluation_quiz_m7_02.md
+++ b/jupyter-book/evaluation/evaluation_quiz_m7_02.md
@@ -1,17 +1,20 @@
 # ✅ Quiz M7.02

 ```{admonition} Question
-Given a dataset containing records from patients in 10 different hospitals,
-we would like to predict if a patient has a disease or not. We suspect
-that different hospitals can have systematic biases in their respective
-populations (age, socioeconomic status, genetics...). Which cross-validation
-strategies are the most adequate to evaluate the ability of the model to make
-good predictions on patients from unseen hospitals.
+We have a dataset with patient records from 10 different hospitals, and our goal
+is to predict whether a patient has a disease or not. Let's also suppose that
+the classes ("disease" and "no-disease") are imbalanced. Additionally, we suspect
+that each hospital's data may have systematic biases due to factors like
+medical devices, policies, socioeconomic status of the patients, etc.
+
+Which cross-validation strategy is the most suitable for assessing the model's
+ability to make good predictions on patients from hospitals not seen during
+training?

 - a) Group stratified k-fold cross-validation
 - b) Group k-fold
 - c) Stratified k-fold cross-validation
 - d) Leave-one-out cross-validation

-_Select all answers that apply_
+_Select a single answer_
 ```
diff --git a/jupyter-book/evaluation/evaluation_quiz_m7_05.md b/jupyter-book/evaluation/evaluation_quiz_m7_05.md
index f66209b6e..708bd3b94 100644
--- a/jupyter-book/evaluation/evaluation_quiz_m7_05.md
+++ b/jupyter-book/evaluation/evaluation_quiz_m7_05.md
@@ -26,12 +26,17 @@
 _Select a single answer_

 +++

 ```{admonition} Question
-If we observe that the values returned by
-`cross_val_scores(model, X, y, scoring="neg_mean_squared_error")` increase after
-changing the model parameters, it means that the latest model:
+If all the values returned by
+`cross_val_score(model_A, X, y, scoring="neg_mean_squared_error")`
+are strictly lower than those returned by
+`cross_val_score(model_B, X, y, scoring="neg_mean_squared_error")`,
+it means that `model_B` generalizes:

-- a) generalizes better
-- b) generalizes worse
+- a) better than `model_A`
+- b) worse than `model_A`
+
+Hint: Remember that `"neg_mean_squared_error"` is an alias for the negative of
+the Mean Squared Error.
_Select a single answer_ ``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_02.md b/jupyter-book/linear_models/linear_models_quiz_m4_02.md index 2aa9382b6..883e9f167 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_02.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_02.md @@ -24,3 +24,15 @@ vector and `y` a vector, `coef_` and `intercept_` will be respectively: _Select a single answer_ ``` + ++++ + +```{admonition} Question +The decision boundaries of a logistic regression model: + +- a) split classes using only one of the input features +- b) split classes using a combination of the input features +- c) often have curved shapes + +_Select a single answer_ +``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_05.md b/jupyter-book/linear_models/linear_models_quiz_m4_05.md index 4fcf39380..fbddaddf8 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_05.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_05.md @@ -12,18 +12,6 @@ _Select a single answer_ +++ -```{admonition} Question -The decision boundaries of a logistic regression: - -- a) uses only one of the input features -- b) uses a combination of the input features -- c) separates a single class - -_Select a single answer_ -``` - -+++ - ```{admonition} Question The parameter `C` in a logistic regression is: diff --git a/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md b/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md index 37c264dc4..60d32a294 100644 --- a/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md +++ b/jupyter-book/predictive_modeling_pipeline/01_tabular_data_exploration_quiz_m1_01.md @@ -21,7 +21,7 @@ _Select all answers that apply_ In the previous notebook, we used: -- a) pandas to manipulate data +- a) pandas to gain insights about the dataset - b) pandas and seaborn to visually inspect the dataset - c) numpy and scipy to perform numerical inspection (for instance using `scipy.optimize.minimize`) diff --git a/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md b/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md index f58b22871..81676cb76 100644 --- a/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md +++ b/jupyter-book/predictive_modeling_pipeline/02_numerical_pipeline_quiz_m1_02.md @@ -66,7 +66,8 @@ _Select a single answer_ +++ ```{admonition} Question -A `StandardScaler` transformer with the default parameter will: +Look at the plots and the answers of the previous question. A `StandardScaler` +transformer with the default parameter: - a) transforms the features so that they have similar ranges - b) transforms the features to lie in the [0.0, 1.0] range @@ -75,9 +76,6 @@ A `StandardScaler` transformer with the default parameter will: - d) can help logistic regression converge faster (fewer iterations) _Select all answers that apply_ - -Hint: look at the plots and the answers of the previous question to eliminate -some of the wrong answers. 
```

+++

```{admonition} Question
Cross-validation allows us to:

- a) train the model faster
- b) measure the generalization performance of the model
-- c) reach better generalization performance
-- d) estimate the variability of the generalization score
+- c) estimate the variability of the generalization score

_Select all answers that apply_
```
diff --git a/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md b/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md
index ee160f0b7..179d9ed09 100644
--- a/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md
+++ b/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md
@@ -118,8 +118,8 @@
 Instead of solely using the numerical columns, let us build a pipeline that can
 process both the numerical and categorical features together as follows:

-- numerical features should be processed as previously done with a
-  `StandardScaler`;
+- the `numerical_features` (as defined above) should be processed as previously
+  done with a `StandardScaler`;
 - the left-out columns should be treated as categorical variables using a
   [`sklearn.preprocessing.OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).
   To avoid any issue with rare categories that could only be present during
diff --git a/jupyter-book/tuning/parameter_tuning_automated_quiz_m3_02.md b/jupyter-book/tuning/parameter_tuning_automated_quiz_m3_02.md
index fa8d0e3d4..1471af06b 100644
--- a/jupyter-book/tuning/parameter_tuning_automated_quiz_m3_02.md
+++ b/jupyter-book/tuning/parameter_tuning_automated_quiz_m3_02.md
@@ -110,20 +110,21 @@
 spread the active ranges and improve the readability of the plot.

 ```{admonition} Question
 In the parallel coordinate plot obtained by running the above code snippet,
-select the bad performing models.
-
-We define bad performing models as the models with a `mean_test_score` below
-0.8. You can select the range [0.0, 0.8] by clicking and holding on the
-`mean_test_score` axis of the parallel coordinate plot.
+select the models with a score higher than 0.85. You can select the range [0.85,
+max] by clicking and holding on the `mean_test_score` axis of the parallel
+coordinate plot.

-Looking at this plot, which parameter values always cause the model to perform badly?
+Identify ranges of values for hyperparameters that always prevent the model from
+reaching a test score higher than 0.85, irrespective of the other values. In other
+words, which hyperparameter values are never used to get a good model
+(i.e. with `mean_test_score` higher than 0.85).

- a) too large `l2_regularization`
- b) too small `l2_regularization`
- c) too large `learning_rate`
-- d) too small `learning_rate`
+- d) too low `learning_rate`
- e) too large `max_bins`
-- f) too small `max_bins`
+- f) too low `max_bins`

_Select all answers that apply_
```

+++

```{admonition} Question
-In the parallel coordinate plot shown above, select the models with a score
-higher than 0.85. You can select the range [0.85, max] by clicking and holding
-on the `mean_test_score` axis of the parallel coordinate plot.
+In the parallel coordinate plot obtained by running the above code snippet,
+select the bad performing models.

-Identify ranges of values for hyperparameters that always prevent the model to
-reach a test score higher than 0.85, irrespective of the other values. In other
-words, which hyperparameters values are never used to get a good model
-(i.e. with `mean_test_score` higher than 0.85).
+We define bad performing models as the models with a `mean_test_score` below
+0.8. You can select the range [0.0, 0.8] by clicking and holding on the
+`mean_test_score` axis of the parallel coordinate plot.
+
+Looking at this plot, which parameter values always cause the model to perform badly?

- a) too large `l2_regularization`
- b) too small `l2_regularization`
- c) too large `learning_rate`
-- d) too low `learning_rate`
+- d) too small `learning_rate`
- e) too large `max_bins`
-- f) too low `max_bins`
+- f) too small `max_bins`

_Select all answers that apply_
```
diff --git a/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md b/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md
index c602fcc71..eb1380853 100644
--- a/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md
+++ b/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md
@@ -2,6 +2,8 @@

 ```{admonition} Question
 Which parameters below are hyperparameters of `HistGradientBoostingClassifier`?
+Remember we only consider hyperparameters to be those that potentially impact
+the result of the learning procedure and subsequent predictions.

 - a) `C`
 - b) `max_leaf_nodes`
From 1506ae08f869116e1f96487c3fcde2f1e5916e08 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 1 Sep 2023 10:14:30 +0200
Subject: [PATCH 043/108] Replace GBDT by HGBT and add wrap-up table (#706)

---
 python_scripts/ensemble_hyperparameters.py | 92 ++++++++++++----------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py
index 7fd079558..ffd3c0f7d 100644
--- a/python_scripts/ensemble_hyperparameters.py
+++ b/python_scripts/ensemble_hyperparameters.py
@@ -8,22 +8,17 @@
 # %% [markdown]
 # # Hyperparameter tuning
 #
-# In the previous section, we did not discuss the parameters of random forest
-# and gradient-boosting. However, there are a couple of things to keep in mind
-# when setting these.
-#
-# This notebook gives crucial information regarding how to set the
-# hyperparameters of both random forest and gradient boosting decision tree
-# models.
+# In the previous section, we did not discuss the hyperparameters of random
+# forest and histogram gradient-boosting. This notebook gives crucial
+# information regarding how to set them.
 #
 # ```{caution}
-# For the sake of clarity, no cross-validation will be used to estimate the
+# For the sake of clarity, no nested cross-validation is used to estimate the
 # variability of the testing error. We are only showing the effect of the
-# parameters on the validation set of what should be the inner loop of a nested
-# cross-validation.
+# parameters on the validation set.
 # ```
 #
-# We will start by loading the california housing dataset.
+# We start by loading the California housing dataset.

 # %%
 from sklearn.datasets import fetch_california_housing
@@ -40,7 +35,7 @@
 #
 # The main parameter to select in random forest is the `n_estimators` parameter.
 # In general, the more trees in the forest, the better the generalization
-# performance will be. However, it will slow down the fitting and prediction
+# performance would be. However, adding trees slows down the fitting and prediction
 # time. The goal is to balance computing time and generalization performance
 # when setting the number of estimators. Here, we fix `n_estimators=100`, which
 # is already the default value.
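To make the computing time versus generalization trade-off mentioned above concrete, here is a minimal sketch (assuming the same California housing data as the notebook; the candidate forest sizes are illustrative) that relies on the fit times recorded by `cross_validate` alongside the test scores:

```python
# Hypothetical illustration: compare test error and fit time for two forest
# sizes; `cross_validate` reports both for each fold.
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
target *= 100  # rescale the target in k$, as done in the notebook

for n_estimators in (10, 100):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=n_estimators, n_jobs=2),
        data,
        target,
        cv=3,
        scoring="neg_mean_absolute_error",
    )
    print(
        f"n_estimators={n_estimators}: "
        f"MAE = {-cv_results['test_score'].mean():.2f} k$, "
        f"mean fit time = {cv_results['fit_time'].mean():.1f} s"
    )
```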
@@ -53,7 +48,7 @@
 #
 # Instead, we can tune the hyperparameter `max_features`, which controls the
 # size of the random subset of features to consider when looking for the best
-# split when growing the trees: smaller values for `max_features` will lead to
+# split when growing the trees: smaller values for `max_features` lead to
 # more random trees with hopefully more uncorrelated prediction errors. However,
 # if `max_features` is too small, predictions can be too random, even after
 # averaging with the trees in the ensemble.
@@ -69,9 +64,9 @@
 # We can also tune the different parameters that control the depth of each tree
 # in the forest. Two parameters are important for this: `max_depth` and
 # `max_leaf_nodes`. They differ in the way they control the tree structure.
-# Indeed, `max_depth` will enforce to have a more symmetric tree, while
-# `max_leaf_nodes` does not impose such constraint. If `max_leaf_nodes=None`
-# then the number of leaf nodes is unlimited.
+# Indeed, `max_depth` enforces growing symmetric trees, while `max_leaf_nodes`
+# does not impose such a constraint. If `max_leaf_nodes=None` then the number of
+# leaf nodes is unlimited.
 #
 # The hyperparameter `min_samples_leaf` controls the minimum number of samples
 # required to be at a leaf node. This means that a split point (at any depth) is
@@ -136,26 +131,33 @@
 # %% [markdown]
-# ## Gradient-boosting decision trees
+# ## Histogram gradient-boosting decision trees
 #
-# For gradient-boosting, parameters are coupled, so we cannot set the parameters
-# one after the other anymore. The important parameters are `n_estimators`,
+# For gradient-boosting, hyperparameters are coupled, so we cannot set them
+# one after the other anymore. The important hyperparameters are `max_iter`,
 # `learning_rate`, and `max_depth` or `max_leaf_nodes` (as previously discussed
 # for random forest).
 #
-# Let's first discuss the `max_depth` (or `max_leaf_nodes`) parameter. We saw in
-# the section on gradient-boosting that the algorithm fits the error of the
-# previous tree in the ensemble. Thus, fitting fully grown trees would be
+# Let's first discuss `max_iter` which, similarly to the `n_estimators`
+# hyperparameter in random forests, controls the number of trees in the
+# estimator. The difference is that the actual number of trees trained by the
+# model is not entirely set by the user, but depends also on the stopping
+# criteria: the number of trees can be lower than `max_iter` if adding a new
+# tree does not improve the model enough. We will give more details on this in
+# the next exercise.
+#
+# The depth of the trees is controlled by `max_depth` (or `max_leaf_nodes`). We
+# saw in the section on gradient-boosting that boosting algorithms fit the error
+# of the previous tree in the ensemble. Thus, fitting fully grown trees would be
 # detrimental. Indeed, the first tree of the ensemble would perfectly fit
 # (overfit) the data and thus no subsequent tree would be required, since there
 # would be no residuals. Therefore, the tree used in gradient-boosting should
 # have a low depth, typically between 3 to 8 levels, or few leaves ($2^3=8$ to
-# $2^8=256$). Having very weak learners at each step will help reducing
-# overfitting.
+# $2^8=256$). Having very weak learners at each step helps reduce overfitting.
 #
 # With this consideration in mind, the deeper the trees, the faster the
-# residuals will be corrected and less learners are required. Therefore,
-# `n_estimators` should be increased if `max_depth` is lower.
+# residuals are corrected and then fewer learners are required. Therefore,
+# it can be beneficial to increase `max_iter` if `max_depth` is low.
 #
 # Finally, we have overlooked the impact of the `learning_rate` parameter until
 # now. When fitting the residuals, we would like the tree to try to correct all
 # possible errors or only a fraction of them. The learning-rate allows you to
 # control this behaviour. A small learning-rate value would only correct the
 # residuals of very few samples. If a large learning-rate is set (e.g., 1), we
 # would fit the residuals of all samples. So, with a very low learning-rate, we
-# will need more estimators to correct the overall error. However, a too large
-# learning-rate tends to obtain an overfitted ensemble, similar to having a too
-# large tree depth.
+# would need more estimators to correct the overall error. However, a too large
+# learning-rate tends to produce an overfitted ensemble, similar to having very
+# deep trees.

 # %%
 from scipy.stats import loguniform
-from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.ensemble import HistGradientBoostingRegressor

 param_distributions = {
-    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
+    "max_iter": [3, 10, 30, 100, 300, 1000],
     "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
     "learning_rate": loguniform(0.01, 1),
 }
 search_cv = RandomizedSearchCV(
-    GradientBoostingRegressor(),
+    HistGradientBoostingRegressor(),
     param_distributions=param_distributions,
     scoring="neg_mean_absolute_error",
     n_iter=20,
@@ -196,27 +198,37 @@

 # %% [markdown]
 #
 # ```{caution}
-# Here, we tune the `n_estimators` but be aware that is better to use
-# `early_stopping` as done in the Exercise M6.04.
+# Here, we tune `max_iter` but be aware that it is better to set `max_iter` to a
+# fixed, large enough value and use parameters linked to `early_stopping` as we
+# will do in Exercise M6.04.
 # ```
 #
-# In this search, we see that the `learning_rate` is required to be large
-# enough, i.e. > 0.1. We also observe that for the best ranked models, having a
-# smaller `learning_rate`, will require more trees or a larger number of leaves
+# In this search, we observe that for the best ranked models, having a
+# smaller `learning_rate` requires more trees or a larger number of leaves
 # for each tree. However, it is particularly difficult to draw more detailed
-# conclusions since the best value of an hyperparameter depends on the other
+# conclusions since the best value of each hyperparameter depends on the other
 # hyperparameter values.

 # %% [markdown]
-# Now we estimate the generalization performance of the best model using the
+# We can now estimate the generalization performance of the best model using the
 # test set.

 # %%
 error = -search_cv.score(data_test, target_test)
-print(f"On average, our GBDT regressor makes an error of {error:.2f} k$")
+print(f"On average, our HGBT regressor makes an error of {error:.2f} k$")

 # %% [markdown]
 # The mean test score in the held-out test set is slightly better than the score
 # of the best model. The reason is that the final model is refitted on the whole
 # training set and therefore, on more data than the cross-validated models of
 # the grid search procedure.
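The caution above recommends early stopping over tuning `max_iter` directly. A minimal sketch of that approach (assuming the California housing data used in this notebook; the exact parameter values are illustrative):

```python
# Hypothetical illustration: rather than tuning `max_iter`, set it to a large
# value and let early stopping decide the effective number of trees.
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0
)

hgbt = HistGradientBoostingRegressor(
    max_iter=1_000,  # large upper bound on the number of trees
    early_stopping=True,  # monitor an internal validation split
    validation_fraction=0.1,
    n_iter_no_change=5,  # stop when 5 iterations bring no improvement
    random_state=0,
)
hgbt.fit(data_train, target_train)
print(f"Number of trees actually fitted: {hgbt.n_iter_}")
print(f"Test R2: {hgbt.score(data_test, target_test):.3f}")
```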
+# +# We summarize these details in the following table: +# +# | **Bagging & Random Forests** | **Boosting** | +# |--------------------------------------------------|-----------------------------------------------------| +# | fit trees **independently** | fit trees **sequentially** | +# | each **deep tree overfits** | each **shallow tree underfits** | +# | averaging the tree predictions **reduces overfitting** | sequentially adding trees **reduces underfitting** | +# | generalization improves with the number of trees | too many trees may cause overfitting | +# | does not have a `learning_rate` parameter | fitting the residuals is controlled by the `learning_rate` | From e802f4f02e7f043f44d0cbea767f632cb8b3f870 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 1 Sep 2023 13:49:34 +0200 Subject: [PATCH 044/108] MAINT Update scikit-learn to v 1.3 (#709) Co-authored-by: ArturoAmorQ --- check_env.py | 16 ++++++++++++---- environment-dev.yml | 2 +- environment.yml | 2 +- local-install-instructions.md | 5 ++--- requirements-dev.txt | 2 +- requirements.txt | 2 +- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/check_env.py b/check_env.py index 357cc200c..5e72c76c4 100644 --- a/check_env.py +++ b/check_env.py @@ -7,7 +7,9 @@ try: from packaging.version import Version except ImportError: - print(FAIL, "'packaging' package not installed, install it with conda or pip") + print( + FAIL, "'packaging' package not installed, install it with conda or pip" + ) sys.exit(1) # first check the python version @@ -19,7 +21,10 @@ if pyversion < Version("3.8"): print( FAIL, - "Python version 3.8 or above is required," f" but {pyversion_str} is installed.", + ( + "Python version 3.8 or above is required," + f" but {pyversion_str} is installed." + ), ) sys.exit(1) print() @@ -45,7 +50,10 @@ def import_version(pkg, min_ver, fail_msg=""): if Version(ver) < Version(min_ver): print( FAIL, - f"{lib} version {min_ver} or higher required, but {ver} installed.", + ( + f"{lib} version {min_ver} or higher required, but" + f" {ver} installed." 
+ ), ) else: print(OK, f"{pkg} version {ver}") @@ -58,7 +66,7 @@ def import_version(pkg, min_ver, fail_msg=""): "numpy": "1.16", "scipy": "1.2", "matplotlib": "3.0", - "sklearn": "1.2", + "sklearn": "1.3", "pandas": "1", "seaborn": "0.11", "notebook": "5.7", diff --git a/environment-dev.yml b/environment-dev.yml index fab94f5fc..c465dd72b 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -2,7 +2,7 @@ name: scikit-learn-course channels: - conda-forge dependencies: - - scikit-learn >= 1.2.1 + - scikit-learn >= 1.3 - pandas >= 1 - matplotlib-base - seaborn diff --git a/environment.yml b/environment.yml index 3ca255705..5dd5dfe4e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge dependencies: - - scikit-learn >= 1.2.1 + - scikit-learn >= 1.3 - pandas >= 1 - matplotlib-base - seaborn diff --git a/local-install-instructions.md b/local-install-instructions.md index cc69b9ab6..2085c5db1 100644 --- a/local-install-instructions.md +++ b/local-install-instructions.md @@ -23,7 +23,7 @@ cd scikit-learn-mooc conda env create -f environment.yml ``` -## Check your install +## Check your install To make sure you have all the necessary packages installed, we **strongly recommend** you to execute the `check_env.py` script located at the root of @@ -46,7 +46,7 @@ Using python in /home/lesteve/miniconda3/envs/scikit-learn-course [ OK ] numpy version 1.19.5 [ OK ] scipy version 1.6.0 [ OK ] matplotlib version 3.3.3 -[ OK ] sklearn version 1.2.1 +[ OK ] sklearn version 1.3 [ OK ] pandas version 1.2.0 [ OK ] seaborn version 0.11.1 [ OK ] notebook version 6.2.0 @@ -63,4 +63,3 @@ jupyter notebook full-index.ipynb `full-index.ipynb` is an index file helping to navigate the notebooks. All the Jupyter notebooks are located in the `notebooks` folder. 
- diff --git a/requirements-dev.txt b/requirements-dev.txt index 180c42915..33232cdc2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -scikit-learn>=1.2.1 +scikit-learn>=1.3 pandas>=1 matplotlib seaborn diff --git a/requirements.txt b/requirements.txt index 7a6c74f9b..e06a8123d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -scikit-learn>=1.2.1 +scikit-learn>=1.3 pandas>=1 matplotlib seaborn From cbfedd19811d0943a4ad74070f00f626d4262953 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 1 Sep 2023 13:53:33 +0200 Subject: [PATCH 045/108] Rework ordering of linear models module (#701) Co-authored-by: ArturoAmorQ Co-authored-by: Olivier Grisel --- jupyter-book/_toc.yml | 18 +- .../linear_models_classification_index.md | 5 - .../linear_models_non_linear_index.md | 2 +- .../linear_models_regression_index.md | 5 - notebooks/linear_models_ex_02.ipynb | 129 ++--- notebooks/linear_models_ex_03.ipynb | 130 ----- notebooks/linear_models_ex_04.ipynb | 165 ------ notebooks/linear_models_ex_05.ipynb | 137 ----- notebooks/linear_models_sol_02.ipynb | 187 ++----- notebooks/linear_models_sol_03.ipynb | 171 ------ notebooks/linear_models_sol_04.ipynb | 492 ------------------ notebooks/linear_models_sol_05.ipynb | 201 ------- python_scripts/linear_models_ex_02.py | 100 ++-- python_scripts/linear_models_ex_03.py | 81 ++- python_scripts/linear_models_ex_04.py | 92 ---- python_scripts/linear_models_ex_05.py | 83 --- python_scripts/linear_models_sol_02.py | 164 +++--- python_scripts/linear_models_sol_03.py | 168 +++--- python_scripts/linear_models_sol_04.py | 269 ---------- python_scripts/linear_models_sol_05.py | 123 ----- python_scripts/logistic_regression.py | 7 +- 21 files changed, 331 insertions(+), 2398 deletions(-) delete mode 100644 jupyter-book/linear_models/linear_models_classification_index.md delete mode 100644 jupyter-book/linear_models/linear_models_regression_index.md delete mode 100644 notebooks/linear_models_ex_03.ipynb delete mode 100644 notebooks/linear_models_ex_04.ipynb delete mode 100644 notebooks/linear_models_ex_05.ipynb delete mode 100644 notebooks/linear_models_sol_03.ipynb delete mode 100644 notebooks/linear_models_sol_04.ipynb delete mode 100644 notebooks/linear_models_sol_05.ipynb delete mode 100644 python_scripts/linear_models_ex_04.py delete mode 100644 python_scripts/linear_models_ex_05.py delete mode 100644 python_scripts/linear_models_sol_04.py delete mode 100644 python_scripts/linear_models_sol_05.py diff --git a/jupyter-book/_toc.yml b/jupyter-book/_toc.yml index dfc89c04f..80bb88aa3 100644 --- a/jupyter-book/_toc.yml +++ b/jupyter-book/_toc.yml @@ -91,34 +91,26 @@ parts: sections: - file: linear_models/linear_models_slides - file: linear_models/linear_models_quiz_m4_01 - - file: linear_models/linear_models_regression_index - sections: - file: python_scripts/linear_regression_without_sklearn - file: python_scripts/linear_models_ex_01 - file: python_scripts/linear_models_sol_01 - file: python_scripts/linear_regression_in_sklearn + - file: python_scripts/logistic_regression - file: linear_models/linear_models_quiz_m4_02 - file: linear_models/linear_models_non_linear_index sections: + - file: python_scripts/linear_regression_non_linear_link - file: python_scripts/linear_models_ex_02 - file: python_scripts/linear_models_sol_02 - - file: python_scripts/linear_regression_non_linear_link - - file: python_scripts/linear_models_ex_03 - - file: python_scripts/linear_models_sol_03 + - 
file: python_scripts/logistic_regression_non_linear - file: linear_models/linear_models_quiz_m4_03 - file: linear_models/linear_models_regularization_index sections: - file: linear_models/regularized_linear_models_slides - file: python_scripts/linear_models_regularization - - file: python_scripts/linear_models_ex_04 - - file: python_scripts/linear_models_sol_04 - file: linear_models/linear_models_quiz_m4_04 - - file: linear_models/linear_models_classification_index - sections: - - file: python_scripts/logistic_regression - - file: python_scripts/linear_models_ex_05 - - file: python_scripts/linear_models_sol_05 - - file: python_scripts/logistic_regression_non_linear + - file: python_scripts/linear_models_ex_03 + - file: python_scripts/linear_models_sol_03 - file: linear_models/linear_models_quiz_m4_05 - file: linear_models/linear_models_wrap_up_quiz - file: linear_models/linear_models_module_take_away diff --git a/jupyter-book/linear_models/linear_models_classification_index.md b/jupyter-book/linear_models/linear_models_classification_index.md deleted file mode 100644 index 81399c436..000000000 --- a/jupyter-book/linear_models/linear_models_classification_index.md +++ /dev/null @@ -1,5 +0,0 @@ -# Linear model for classification - -```{tableofcontents} - -``` diff --git a/jupyter-book/linear_models/linear_models_non_linear_index.md b/jupyter-book/linear_models/linear_models_non_linear_index.md index d56614515..22fe06b20 100644 --- a/jupyter-book/linear_models/linear_models_non_linear_index.md +++ b/jupyter-book/linear_models/linear_models_non_linear_index.md @@ -1,4 +1,4 @@ -# Modelling non-linear features-target relationships +# Non-linear feature engineering for linear models ```{tableofcontents} diff --git a/jupyter-book/linear_models/linear_models_regression_index.md b/jupyter-book/linear_models/linear_models_regression_index.md deleted file mode 100644 index 8b8144a84..000000000 --- a/jupyter-book/linear_models/linear_models_regression_index.md +++ /dev/null @@ -1,5 +0,0 @@ -# Linear regression - -```{tableofcontents} - -``` diff --git a/notebooks/linear_models_ex_02.ipynb b/notebooks/linear_models_ex_02.ipynb index c9c0aad96..4cf750e81 100644 --- a/notebooks/linear_models_ex_02.ipynb +++ b/notebooks/linear_models_ex_02.ipynb @@ -4,39 +4,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# \ud83d\udcdd Exercise M4.02\n", + "# \ud83d\udcdd Exercise M4.03\n", "\n", - "The goal of this exercise is to build an intuition on what will be the\n", - "parameters' values of a linear model when the link between the data and the\n", - "target is non-linear.\n", + "In all previous notebooks, we only used a single feature in `data`. But we\n", + "have already shown that we could add new features to make the model more\n", + "expressive by deriving new features, based on the original feature.\n", "\n", - "First, we will generate such non-linear data.\n", + "The aim of this notebook is to train a linear regression algorithm on a\n", + "dataset with more than a single feature.\n", "\n", - "

<div class=\"admonition tip alert alert-warning\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
    "<p class=\"last\">np.random.RandomState allows to create a random number generator which can\n",
    "be later used to get deterministic results.</p>\n",
    "</div>
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# Set the seed for reproduction\n", - "rng = np.random.RandomState(0)\n", - "\n", - "# Generate data\n", - "n_sample = 100\n", - "data_max, data_min = 1.4, -1.4\n", - "len_data = data_max - data_min\n", - "data = rng.rand(n_sample) * len_data - len_data / 2\n", - "noise = rng.randn(n_sample) * 0.3\n", - "target = data**3 - 0.5 * data**2 + noise" + "We will load a dataset about house prices in California. The dataset consists\n", + "of 8 features regarding the demography and geography of districts in\n", + "California and the aim is to predict the median house price of each district.\n", + "We will use all 8 features to predict the target, the median house price." ] }, { @@ -45,8 +25,8 @@ "source": [ "
<div class=\"admonition note alert alert-info\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
    "<p class=\"last\">To ease the plotting, we will create a Pandas dataframe containing the data\n",
    "and target</p>\n",
    "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
    "Appendix - Datasets description section at the end of this MOOC.</p>\n",
    "</div>
" ] }, @@ -56,65 +36,19 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", + "from sklearn.datasets import fetch_california_housing\n", "\n", - "full_data = pd.DataFrame({\"data\": data, \"target\": target})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "\n", - "_ = sns.scatterplot(\n", - " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "We observe that the link between the data `data` and vector `target` is\n", - "non-linear. For instance, `data` could represent the years of experience\n", - "(normalized) and `target` the salary (normalized). Therefore, the problem here\n", - "would be to infer the salary given the years of experience.\n", - "\n", - "Using the function `f` defined below, find both the `weight` and the\n", - "`intercept` that you think will lead to a good linear model. Plot both the\n", - "data and the predictions of this model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def f(data, weight=0, intercept=0):\n", - " target_predict = weight * data + intercept\n", - " return target_predict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." + "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", + "target *= 100 # rescale the target in k$\n", + "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Compute the mean squared error for this model" + "Now it is your turn to train a linear regression model on this dataset. First,\n", + "create a linear regression model." ] }, { @@ -130,16 +64,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Train a linear regression model on this dataset.\n", - "\n", - "
<div class=\"admonition warning alert alert-danger\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>\n",
    "<p class=\"last\">In scikit-learn, by convention data (also called X in the scikit-learn\n",
    "documentation) should be a 2D matrix of shape (n_samples, n_features).\n",
    "If data is a 1D vector, you need to reshape it into a matrix with a\n",
    "single column if the vector represents a feature or a single row if the\n",
    "vector represents a sample.</p>\n",
    "</div>
" + "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", + "as metric. Be sure to *return* the fitted *estimators*." ] }, { @@ -148,8 +74,6 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LinearRegression\n", - "\n", "# Write your code here." ] }, @@ -157,8 +81,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Compute predictions from the linear regression model and plot both the data\n", - "and the predictions." + "Compute the mean and std of the MAE in thousands of dollars (k$)." ] }, { @@ -172,9 +95,15 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ - "Compute the mean squared error" + "Inspect the fitted model using a box plot to show the distribution of values\n", + "for the coefficients returned from the cross-validation. Hint: use the\n", + "function\n", + "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", + "to create a box plot." ] }, { diff --git a/notebooks/linear_models_ex_03.ipynb b/notebooks/linear_models_ex_03.ipynb deleted file mode 100644 index 4cf750e81..000000000 --- a/notebooks/linear_models_ex_03.ipynb +++ /dev/null @@ -1,130 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# \ud83d\udcdd Exercise M4.03\n", - "\n", - "In all previous notebooks, we only used a single feature in `data`. But we\n", - "have already shown that we could add new features to make the model more\n", - "expressive by deriving new features, based on the original feature.\n", - "\n", - "The aim of this notebook is to train a linear regression algorithm on a\n", - "dataset with more than a single feature.\n", - "\n", - "We will load a dataset about house prices in California. The dataset consists\n", - "of 8 features regarding the demography and geography of districts in\n", - "California and the aim is to predict the median house price of each district.\n", - "We will use all 8 features to predict the target, the median house price." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
<div class=\"admonition note alert alert-info\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
    "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
    "Appendix - Datasets description section at the end of this MOOC.</p>\n",
    "</div>
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import fetch_california_housing\n", - "\n", - "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", - "target *= 100 # rescale the target in k$\n", - "data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it is your turn to train a linear regression model on this dataset. First,\n", - "create a linear regression model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", - "as metric. Be sure to *return* the fitted *estimators*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute the mean and std of the MAE in thousands of dollars (k$)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "Inspect the fitted model using a box plot to show the distribution of values\n", - "for the coefficients returned from the cross-validation. Hint: use the\n", - "function\n", - "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", - "to create a box plot." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/notebooks/linear_models_ex_04.ipynb b/notebooks/linear_models_ex_04.ipynb deleted file mode 100644 index 77086778b..000000000 --- a/notebooks/linear_models_ex_04.ipynb +++ /dev/null @@ -1,165 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# \ud83d\udcdd Exercise M4.04\n", - "\n", - "In the previous notebook, we saw the effect of applying some regularization on\n", - "the coefficient of a linear model.\n", - "\n", - "In this exercise, we will study the advantage of using some regularization\n", - "when dealing with correlated features.\n", - "\n", - "We will first create a regression dataset. This dataset will contain 2,000\n", - "samples and 5 features from which only 2 features will be informative." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import make_regression\n", - "\n", - "data, target, coef = make_regression(\n", - " n_samples=2_000,\n", - " n_features=5,\n", - " n_informative=2,\n", - " shuffle=False,\n", - " coef=True,\n", - " random_state=0,\n", - " noise=30,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When creating the dataset, `make_regression` returns the true coefficient used\n", - "to generate the dataset. Let's plot this information." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "feature_names = [\n", - " \"Relevant feature #0\",\n", - " \"Relevant feature #1\",\n", - " \"Noisy feature #0\",\n", - " \"Noisy feature #1\",\n", - " \"Noisy feature #2\",\n", - "]\n", - "coef = pd.Series(coef, index=feature_names)\n", - "coef.plot.barh()\n", - "coef" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a `LinearRegression` regressor and fit on the entire dataset and check\n", - "the value of the coefficients. Are the coefficients of the linear regressor\n", - "close to the coefficients used to generate the dataset?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, create a new dataset that will be the same as `data` with 4 additional\n", - "columns that will repeat twice features 0 and 1. This procedure will create\n", - "perfectly correlated features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fit again the linear regressor on this new dataset and check the coefficients.\n", - "What do you observe?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a ridge regressor and fit on the same dataset. Check the coefficients.\n", - "What do you observe?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Can you find the relationship between the ridge coefficients and the original\n", - "coefficients?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/notebooks/linear_models_ex_05.ipynb b/notebooks/linear_models_ex_05.ipynb deleted file mode 100644 index 866d52086..000000000 --- a/notebooks/linear_models_ex_05.ipynb +++ /dev/null @@ -1,137 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# \ud83d\udcdd Exercise M4.05\n", - "\n", - "In the previous notebook we set `penalty=\"none\"` to disable regularization\n", - "entirely. This parameter can also control the **type** of regularization to\n", - "use, whereas the regularization **strength** is set using the parameter `C`.\n", - "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n", - "this exercise, we ask you to train a logistic regression classifier using the\n", - "`penalty=\"l2\"` regularization (which happens to be the default in\n", - "scikit-learn) to find by yourself the effect of the parameter `C`.\n", - "\n", - "We will start by loading the dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
<div class=\"admonition note alert alert-info\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
    "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
    "Appendix - Datasets description section at the end of this MOOC.</p>\n",
    "</div>
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", - "# only keep the Adelie and Chinstrap classes\n", - "penguins = (\n", - " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", - ")\n", - "\n", - "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", - "target_column = \"Species\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "penguins_train, penguins_test = train_test_split(penguins, random_state=0)\n", - "\n", - "data_train = penguins_train[culmen_columns]\n", - "data_test = penguins_test[culmen_columns]\n", - "\n", - "target_train = penguins_train[target_column]\n", - "target_test = penguins_test[target_column]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's create our predictive model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Given the following candidates for the `C` parameter, find out the impact of\n", - "`C` on the classifier decision boundary. You can use\n", - "`sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the\n", - "decision function boundary." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Cs = [0.01, 0.1, 1, 10]\n", - "\n", - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the impact of the `C` hyperparameter on the magnitude of the weights." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/notebooks/linear_models_sol_02.ipynb b/notebooks/linear_models_sol_02.ipynb index d56864c4e..634c43171 100644 --- a/notebooks/linear_models_sol_02.ipynb +++ b/notebooks/linear_models_sol_02.ipynb @@ -4,39 +4,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# \ud83d\udcc3 Solution for Exercise M4.02\n", + "# \ud83d\udcc3 Solution for Exercise M4.03\n", "\n", - "The goal of this exercise is to build an intuition on what will be the\n", - "parameters' values of a linear model when the link between the data and the\n", - "target is non-linear.\n", + "In all previous notebooks, we only used a single feature in `data`. But we\n", + "have already shown that we could add new features to make the model more\n", + "expressive by deriving new features, based on the original feature.\n", "\n", - "First, we will generate such non-linear data.\n", + "The aim of this notebook is to train a linear regression algorithm on a\n", + "dataset with more than a single feature.\n", "\n", - "
\n", - "

Tip

\n", - "

np.random.RandomState allows to create a random number generator which can\n", - "be later used to get deterministic results.

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# Set the seed for reproduction\n", - "rng = np.random.RandomState(0)\n", - "\n", - "# Generate data\n", - "n_sample = 100\n", - "data_max, data_min = 1.4, -1.4\n", - "len_data = data_max - data_min\n", - "data = rng.rand(n_sample) * len_data - len_data / 2\n", - "noise = rng.randn(n_sample) * 0.3\n", - "target = data**3 - 0.5 * data**2 + noise" + "We will load a dataset about house prices in California. The dataset consists\n", + "of 8 features regarding the demography and geography of districts in\n", + "California and the aim is to predict the median house price of each district.\n", + "We will use all 8 features to predict the target, the median house price." ] }, { @@ -45,8 +25,8 @@ "source": [ "
<div class=\"admonition note alert alert-info\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
    "<p class=\"last\">To ease the plotting, we will create a Pandas dataframe containing the data\n",
    "and target</p>\n",
    "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
    "Appendix - Datasets description section at the end of this MOOC.</p>\n",
    "</div>
" ] }, @@ -56,49 +36,19 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - "full_data = pd.DataFrame({\"data\": data, \"target\": target})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import seaborn as sns\n", + "from sklearn.datasets import fetch_california_housing\n", "\n", - "_ = sns.scatterplot(\n", - " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", - ")" + "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", + "target *= 100 # rescale the target in k$\n", + "data.head()" ] }, { "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "We observe that the link between the data `data` and vector `target` is\n", - "non-linear. For instance, `data` could represent the years of experience\n", - "(normalized) and `target` the salary (normalized). Therefore, the problem here\n", - "would be to infer the salary given the years of experience.\n", - "\n", - "Using the function `f` defined below, find both the `weight` and the\n", - "`intercept` that you think will lead to a good linear model. Plot both the\n", - "data and the predictions of this model." - ] - }, - { - "cell_type": "code", - "execution_count": null, "metadata": {}, - "outputs": [], "source": [ - "def f(data, weight=0, intercept=0):\n", - " target_predict = weight * data + intercept\n", - " return target_predict" + "Now it is your turn to train a linear regression model on this dataset. First,\n", + "create a linear regression model." ] }, { @@ -108,30 +58,17 @@ "outputs": [], "source": [ "# solution\n", - "predictions = f(data, weight=1.2, intercept=-0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "_ = ax.plot(data, predictions)" + "from sklearn.linear_model import LinearRegression\n", + "\n", + "linear_regression = LinearRegression()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Compute the mean squared error for this model" + "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", + "as metric. Be sure to *return* the fitted *estimators*." ] }, { @@ -141,26 +78,24 @@ "outputs": [], "source": [ "# solution\n", - "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import cross_validate\n", "\n", - "error = mean_squared_error(target, f(data, weight=1.2, intercept=-0.2))\n", - "print(f\"The MSE is {error}\")" + "cv_results = cross_validate(\n", + " linear_regression,\n", + " data,\n", + " target,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " return_estimator=True,\n", + " cv=10,\n", + " n_jobs=2,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Train a linear regression model on this dataset.\n", - "\n", - "
<div class=\"admonition warning alert alert-danger\">\n",
    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Warning</p>\n",
    "<p class=\"last\">In scikit-learn, by convention data (also called X in the scikit-learn\n",
    "documentation) should be a 2D matrix of shape (n_samples, n_features).\n",
    "If data is a 1D vector, you need to reshape it into a matrix with a\n",
    "single column if the vector represents a feature or a single row if the\n",
    "vector represents a sample.</p>\n",
    "</div>
" + "Compute the mean and std of the MAE in thousands of dollars (k$)." ] }, { @@ -169,20 +104,25 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LinearRegression\n", - "\n", "# solution\n", - "linear_regression = LinearRegression()\n", - "data_2d = data.reshape(-1, 1)\n", - "linear_regression.fit(data_2d, target)" + "print(\n", + " \"Mean absolute error on testing set: \"\n", + " f\"{-cv_results['test_score'].mean():.3f} k$ \u00b1 \"\n", + " f\"{cv_results['test_score'].std():.3f}\"\n", + ")" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ - "Compute predictions from the linear regression model and plot both the data\n", - "and the predictions." + "Inspect the fitted model using a box plot to show the distribution of values\n", + "for the coefficients returned from the cross-validation. Hint: use the\n", + "function\n", + "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", + "to create a box plot." ] }, { @@ -192,7 +132,11 @@ "outputs": [], "source": [ "# solution\n", - "predictions = linear_regression.predict(data_2d)" + "import pandas as pd\n", + "\n", + "weights = pd.DataFrame(\n", + " [est.coef_ for est in cv_results[\"estimator\"]], columns=data.columns\n", + ")" ] }, { @@ -205,28 +149,11 @@ }, "outputs": [], "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"data\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "_ = ax.plot(data, predictions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute the mean squared error" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "error = mean_squared_error(target, predictions)\n", - "print(f\"The MSE is {error}\")" + "import matplotlib.pyplot as plt\n", + "\n", + "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", + "weights.plot.box(color=color, vert=False)\n", + "_ = plt.title(\"Value of linear regression coefficients\")" ] } ], diff --git a/notebooks/linear_models_sol_03.ipynb b/notebooks/linear_models_sol_03.ipynb deleted file mode 100644 index 634c43171..000000000 --- a/notebooks/linear_models_sol_03.ipynb +++ /dev/null @@ -1,171 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# \ud83d\udcc3 Solution for Exercise M4.03\n", - "\n", - "In all previous notebooks, we only used a single feature in `data`. But we\n", - "have already shown that we could add new features to make the model more\n", - "expressive by deriving new features, based on the original feature.\n", - "\n", - "The aim of this notebook is to train a linear regression algorithm on a\n", - "dataset with more than a single feature.\n", - "\n", - "We will load a dataset about house prices in California. The dataset consists\n", - "of 8 features regarding the demography and geography of districts in\n", - "California and the aim is to predict the median house price of each district.\n", - "We will use all 8 features to predict the target, the median house price." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "

Note

\n", - "

If you want a deeper overview regarding this dataset, you can refer to the\n", - "Appendix - Datasets description section at the end of this MOOC.

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import fetch_california_housing\n", - "\n", - "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", - "target *= 100 # rescale the target in k$\n", - "data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it is your turn to train a linear regression model on this dataset. First,\n", - "create a linear regression model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "from sklearn.linear_model import LinearRegression\n", - "\n", - "linear_regression = LinearRegression()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", - "as metric. Be sure to *return* the fitted *estimators*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "from sklearn.model_selection import cross_validate\n", - "\n", - "cv_results = cross_validate(\n", - " linear_regression,\n", - " data,\n", - " target,\n", - " scoring=\"neg_mean_absolute_error\",\n", - " return_estimator=True,\n", - " cv=10,\n", - " n_jobs=2,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute the mean and std of the MAE in thousands of dollars (k$)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "print(\n", - " \"Mean absolute error on testing set: \"\n", - " f\"{-cv_results['test_score'].mean():.3f} k$ \u00b1 \"\n", - " f\"{cv_results['test_score'].std():.3f}\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "Inspect the fitted model using a box plot to show the distribution of values\n", - "for the coefficients returned from the cross-validation. Hint: use the\n", - "function\n", - "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", - "to create a box plot." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "import pandas as pd\n", - "\n", - "weights = pd.DataFrame(\n", - " [est.coef_ for est in cv_results[\"estimator\"]], columns=data.columns\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", - "weights.plot.box(color=color, vert=False)\n", - "_ = plt.title(\"Value of linear regression coefficients\")" - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/notebooks/linear_models_sol_04.ipynb b/notebooks/linear_models_sol_04.ipynb deleted file mode 100644 index f49b0c465..000000000 --- a/notebooks/linear_models_sol_04.ipynb +++ /dev/null @@ -1,492 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# \ud83d\udcc3 Solution for Exercise M4.04\n", - "\n", - "In the previous notebook, we saw the effect of applying some regularization on\n", - "the coefficient of a linear model.\n", - "\n", - "In this exercise, we will study the advantage of using some regularization\n", - "when dealing with correlated features.\n", - "\n", - "We will first create a regression dataset. This dataset will contain 2,000\n", - "samples and 5 features from which only 2 features will be informative." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import make_regression\n", - "\n", - "data, target, coef = make_regression(\n", - " n_samples=2_000,\n", - " n_features=5,\n", - " n_informative=2,\n", - " shuffle=False,\n", - " coef=True,\n", - " random_state=0,\n", - " noise=30,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When creating the dataset, `make_regression` returns the true coefficient used\n", - "to generate the dataset. Let's plot this information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "feature_names = [\n", - " \"Relevant feature #0\",\n", - " \"Relevant feature #1\",\n", - " \"Noisy feature #0\",\n", - " \"Noisy feature #1\",\n", - " \"Noisy feature #2\",\n", - "]\n", - "coef = pd.Series(coef, index=feature_names)\n", - "coef.plot.barh()\n", - "coef" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a `LinearRegression` regressor and fit on the entire dataset and check\n", - "the value of the coefficients. Are the coefficients of the linear regressor\n", - "close to the coefficients used to generate the dataset?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "from sklearn.linear_model import LinearRegression\n", - "\n", - "linear_regression = LinearRegression()\n", - "linear_regression.fit(data, target)\n", - "linear_regression.coef_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "feature_names = [\n", - " \"Relevant feature #0\",\n", - " \"Relevant feature #1\",\n", - " \"Noisy feature #0\",\n", - " \"Noisy feature #1\",\n", - " \"Noisy feature #2\",\n", - "]\n", - "coef = pd.Series(linear_regression.coef_, index=feature_names)\n", - "_ = coef.plot.barh()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "We see that the coefficients are close to the coefficients used to generate\n", - "the dataset. The dispersion is indeed cause by the noise injected during the\n", - "dataset generation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, create a new dataset that will be the same as `data` with 4 additional\n", - "columns that will repeat twice features 0 and 1. This procedure will create\n", - "perfectly correlated features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "import numpy as np\n", - "\n", - "data = np.concatenate([data, data[:, [0, 1]], data[:, [0, 1]]], axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fit again the linear regressor on this new dataset and check the coefficients.\n", - "What do you observe?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "linear_regression = LinearRegression()\n", - "linear_regression.fit(data, target)\n", - "linear_regression.coef_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "feature_names = [\n", - " \"Relevant feature #0\",\n", - " \"Relevant feature #1\",\n", - " \"Noisy feature #0\",\n", - " \"Noisy feature #1\",\n", - " \"Noisy feature #2\",\n", - " \"First repetition of feature #0\",\n", - " \"First repetition of feature #1\",\n", - " \"Second repetition of feature #0\",\n", - " \"Second repetition of feature #1\",\n", - "]\n", - "coef = pd.Series(linear_regression.coef_, index=feature_names)\n", - "_ = coef.plot.barh()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "We see that the coefficient values are far from what one could expect. By\n", - "repeating the informative features, one would have expected these coefficients\n", - "to be similarly informative.\n", - "\n", - "Instead, we see that some coefficients have a huge norm ~1e14. It indeed means\n", - "that we try to solve an mathematical ill-posed problem. Indeed, finding\n", - "coefficients in a linear regression involves inverting the matrix\n", - "`np.dot(data.T, data)` which is not possible (or lead to high numerical\n", - "errors)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a ridge regressor and fit on the same dataset. Check the coefficients.\n", - "What do you observe?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "from sklearn.linear_model import Ridge\n", - "\n", - "ridge = Ridge()\n", - "ridge.fit(data, target)\n", - "ridge.coef_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "coef = pd.Series(ridge.coef_, index=feature_names)\n", - "_ = coef.plot.barh()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "We see that the penalty applied on the weights give a better results: the\n", - "values of the coefficients do not suffer from numerical issues. Indeed, the\n", - "matrix to be inverted internally is `np.dot(data.T, data) + alpha * I`. Adding\n", - "this penalty `alpha` allow the inversion without numerical issue." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Can you find the relationship between the ridge coefficients and the original\n", - "coefficients?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "ridge.coef_[:5] * 3" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "Repeating three times each informative features induced to divide the ridge\n", - "coefficients by three." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "
\n", - "

Tip

\n", - "

We advise to always use a penalty to shrink the magnitude of the weights\n", - "toward zero (also called \"l2 penalty\"). In scikit-learn, LogisticRegression\n", - "applies such penalty by default. However, one needs to use Ridge (and even\n", - "RidgeCV to tune the parameter alpha) instead of LinearRegression.

\n", - "

Other kinds of regularizations exist but will not be covered in this course.

\n", - "
\n", - "\n", - "## Dealing with correlation between one-hot encoded features\n", - "\n", - "In this section, we will focus on how to deal with correlated features that\n", - "arise naturally when one-hot encoding categorical features.\n", - "\n", - "Let's first load the Ames housing dataset and take a subset of features that\n", - "are only categorical features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "ames_housing = pd.read_csv(\"../datasets/house_prices.csv\", na_values=\"?\")\n", - "ames_housing = ames_housing.drop(columns=\"Id\")\n", - "\n", - "categorical_columns = [\"Street\", \"Foundation\", \"CentralAir\", \"PavedDrive\"]\n", - "target_name = \"SalePrice\"\n", - "X, y = ames_housing[categorical_columns], ames_housing[target_name]\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, test_size=0.2, random_state=0\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "\n", - "We previously presented that a `OneHotEncoder` creates as many columns as\n", - "categories. Therefore, there is always one column (i.e. one encoded category)\n", - "that can be inferred from the others. Thus, `OneHotEncoder` creates collinear\n", - "features.\n", - "\n", - "We illustrate this behaviour by considering the \"CentralAir\" feature that\n", - "contains only two categories:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "X_train[\"CentralAir\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "from sklearn.preprocessing import OneHotEncoder\n", - "\n", - "single_feature = [\"CentralAir\"]\n", - "encoder = OneHotEncoder(sparse_output=False, dtype=np.int32)\n", - "X_trans = encoder.fit_transform(X_train[single_feature])\n", - "X_trans = pd.DataFrame(\n", - " X_trans,\n", - " columns=encoder.get_feature_names_out(input_features=single_feature),\n", - ")\n", - "X_trans" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "\n", - "Here, we see that the encoded category \"CentralAir_N\" is the opposite of the\n", - "encoded category \"CentralAir_Y\". Therefore, we observe that using a\n", - "`OneHotEncoder` creates two features having the problematic pattern observed\n", - "earlier in this exercise. Training a linear regression model on such a of\n", - "one-hot encoded binary feature can therefore lead to numerical problems,\n", - "especially without regularization. Furthermore, the two one-hot features are\n", - "redundant as they encode exactly the same information in opposite ways.\n", - "\n", - "Using regularization helps to overcome the numerical issues that we\n", - "highlighted earlier in this exercise.\n", - "\n", - "Another strategy is to arbitrarily drop one of the encoded categories.\n", - "Scikit-learn provides such an option by setting the parameter `drop` in the\n", - "`OneHotEncoder`. This parameter can be set to `first` to always drop the first\n", - "encoded category or `binary_only` to only drop a column in the case of binary\n", - "categories." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "encoder = OneHotEncoder(drop=\"first\", sparse_output=False, dtype=np.int32)\n", - "X_trans = encoder.fit_transform(X_train[single_feature])\n", - "X_trans = pd.DataFrame(\n", - " X_trans,\n", - " columns=encoder.get_feature_names_out(input_features=single_feature),\n", - ")\n", - "X_trans" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "\n", - "We see that only the second column of the previous encoded data is kept.\n", - "Dropping one of the one-hot encoded column is a common practice, especially\n", - "for binary categorical features. Note however that this breaks symmetry\n", - "between categories and impacts the number of coefficients of the model, their\n", - "values, and thus their meaning, especially when applying strong\n", - "regularization.\n", - "\n", - "Let's finally illustrate how to use this option is a machine-learning\n", - "pipeline:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "from sklearn.pipeline import make_pipeline\n", - "\n", - "model = make_pipeline(OneHotEncoder(drop=\"first\", dtype=np.int32), Ridge())\n", - "model.fit(X_train, y_train)\n", - "n_categories = [X_train[col].nunique() for col in X_train.columns]\n", - "print(f\"R2 score on the testing set: {model.score(X_test, y_test):.2f}\")\n", - "print(\n", - " f\"Our model contains {model[-1].coef_.size} features while \"\n", - " f\"{sum(n_categories)} categories are originally available.\"\n", - ")" - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/notebooks/linear_models_sol_05.ipynb b/notebooks/linear_models_sol_05.ipynb deleted file mode 100644 index 08bae2e77..000000000 --- a/notebooks/linear_models_sol_05.ipynb +++ /dev/null @@ -1,201 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# \ud83d\udcc3 Solution for Exercise M4.05\n", - "\n", - "In the previous notebook we set `penalty=\"none\"` to disable regularization\n", - "entirely. This parameter can also control the **type** of regularization to\n", - "use, whereas the regularization **strength** is set using the parameter `C`.\n", - "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n", - "this exercise, we ask you to train a logistic regression classifier using the\n", - "`penalty=\"l2\"` regularization (which happens to be the default in\n", - "scikit-learn) to find by yourself the effect of the parameter `C`.\n", - "\n", - "We will start by loading the dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "

Note

\n", - "

If you want a deeper overview regarding this dataset, you can refer to the\n", - "Appendix - Datasets description section at the end of this MOOC.

\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", - "# only keep the Adelie and Chinstrap classes\n", - "penguins = (\n", - " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", - ")\n", - "\n", - "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", - "target_column = \"Species\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "penguins_train, penguins_test = train_test_split(penguins, random_state=0)\n", - "\n", - "data_train = penguins_train[culmen_columns]\n", - "data_test = penguins_test[culmen_columns]\n", - "\n", - "target_train = penguins_train[target_column]\n", - "target_test = penguins_test[target_column]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's create our predictive model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Given the following candidates for the `C` parameter, find out the impact of\n", - "`C` on the classifier decision boundary. You can use\n", - "`sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the\n", - "decision function boundary." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Cs = [0.01, 0.1, 1, 10]\n", - "\n", - "# solution\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.inspection import DecisionBoundaryDisplay\n", - "\n", - "for C in Cs:\n", - " logistic_regression.set_params(logisticregression__C=C)\n", - " logistic_regression.fit(data_train, target_train)\n", - " accuracy = logistic_regression.score(data_test, target_test)\n", - "\n", - " DecisionBoundaryDisplay.from_estimator(\n", - " logistic_regression,\n", - " data_test,\n", - " response_method=\"predict\",\n", - " cmap=\"RdBu_r\",\n", - " alpha=0.5,\n", - " )\n", - " sns.scatterplot(\n", - " data=penguins_test,\n", - " x=culmen_columns[0],\n", - " y=culmen_columns[1],\n", - " hue=target_column,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - " )\n", - " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", - " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the impact of the `C` hyperparameter on the magnitude of the weights." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "weights_ridge = []\n", - "for C in Cs:\n", - " logistic_regression.set_params(logisticregression__C=C)\n", - " logistic_regression.fit(data_train, target_train)\n", - " coefs = logistic_regression[-1].coef_[0]\n", - " weights_ridge.append(pd.Series(coefs, index=culmen_columns))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f\"C: {C}\" for C in Cs])\n", - "weights_ridge.plot.barh()\n", - "_ = plt.title(\"LogisticRegression weights depending of C\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, - "source": [ - "We see that a small `C` will shrink the weights values toward zero. It means\n", - "that a small `C` provides a more regularized model. Thus, `C` is the inverse\n", - "of the `alpha` coefficient in the `Ridge` model.\n", - "\n", - "Besides, with a strong penalty (i.e. small `C` value), the weight of the\n", - "feature \"Culmen Depth (mm)\" is almost zero. It explains why the decision\n", - "separation in the plot is almost perpendicular to the \"Culmen Length (mm)\"\n", - "feature." - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/python_scripts/linear_models_ex_02.py b/python_scripts/linear_models_ex_02.py index 640c44046..f58a1f0fe 100644 --- a/python_scripts/linear_models_ex_02.py +++ b/python_scripts/linear_models_ex_02.py @@ -14,100 +14,80 @@ # %% [markdown] # # ๐Ÿ“ Exercise M4.02 # -# The goal of this exercise is to build an intuition on what will be the -# parameters' values of a linear model when the link between the data and the -# target is non-linear. +# In the previous notebook, we showed that we can add new features based on the +# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. +# In that case we only used a single feature in `data`. # -# First, we will generate such non-linear data. +# The aim of this notebook is to train a linear regression algorithm on a +# dataset with more than a single feature. In such a "multi-dimensional" feature +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, +# etc. Products of features are usually called "non-linear or +# multiplicative interactions" between features. # -# ```{tip} -# `np.random.RandomState` allows to create a random number generator which can -# be later used to get deterministic results. -# ``` - -# %% -import numpy as np - -# Set the seed for reproduction -rng = np.random.RandomState(0) - -# Generate data -n_sample = 100 -data_max, data_min = 1.4, -1.4 -len_data = data_max - data_min -data = rng.rand(n_sample) * len_data - len_data / 2 -noise = rng.randn(n_sample) * 0.3 -target = data**3 - 0.5 * data**2 + noise +# Feature engineering can be an important step of a model pipeline as long as +# the new features are expected to be predictive. For instance, think of a +# classification model to decide if a patient has risk of developing a heart +# disease. This would depend on the patient's Body Mass Index which is defined +# as `weight / height ** 2`. +# +# We load the dataset penguins dataset. 
We first use a set of 3 numerical +# features to predict the target, i.e. the body mass of the penguin. # %% [markdown] # ```{note} -# To ease the plotting, we will create a Pandas dataframe containing the data -# and target +# If you want a deeper overview regarding this dataset, you can refer to the +# Appendix - Datasets description section at the end of this MOOC. # ``` # %% import pandas as pd -full_data = pd.DataFrame({"data": data, "target": target}) +penguins = pd.read_csv("../datasets/penguins.csv") -# %% -import seaborn as sns +columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] +target_name = "Body Mass (g)" -_ = sns.scatterplot( - data=full_data, x="data", y="target", color="black", alpha=0.5 -) +# Remove lines with missing values for the columns of interest +penguins_non_missing = penguins[columns + [target_name]].dropna() -# %% [markdown] -# We observe that the link between the data `data` and vector `target` is -# non-linear. For instance, `data` could represent the years of experience -# (normalized) and `target` the salary (normalized). Therefore, the problem here -# would be to infer the salary given the years of experience. -# -# Using the function `f` defined below, find both the `weight` and the -# `intercept` that you think will lead to a good linear model. Plot both the -# data and the predictions of this model. - - -# %% -def f(data, weight=0, intercept=0): - target_predict = weight * data + intercept - return target_predict +data = penguins_non_missing[columns] +target = penguins_non_missing[target_name] +data.head() +# %% [markdown] +# Now it is your turn to train a linear regression model on this dataset. First, +# create a linear regression model. # %% # Write your code here. # %% [markdown] -# Compute the mean squared error for this model +# Execute a cross-validation with 10 folds and use the mean absolute error (MAE) +# as metric. # %% # Write your code here. # %% [markdown] -# Train a linear regression model on this dataset. -# -# ```{warning} -# In scikit-learn, by convention `data` (also called `X` in the scikit-learn -# documentation) should be a 2D matrix of shape `(n_samples, n_features)`. -# If `data` is a 1D vector, you need to reshape it into a matrix with a -# single column if the vector represents a feature or a single row if the -# vector represents a sample. -# ``` +# Compute the mean and std of the MAE in grams (g). # %% -from sklearn.linear_model import LinearRegression - # Write your code here. # %% [markdown] -# Compute predictions from the linear regression model and plot both the data -# and the predictions. +# Now create a pipeline using `make_pipeline` consisting of a +# `PolynomialFeatures` and a linear regression. Set `degree=2` and +# `interaction_only=True` to the feature engineering step. Remember not to +# include the bias to avoid redundancies with the linear's regression intercept. +# +# Use the same strategy as before to cross-validate such a pipeline. # %% # Write your code here. # %% [markdown] -# Compute the mean squared error +# Compute the mean and std of the MAE in grams (g) and compare with the results +# without feature engineering. # %% # Write your code here. 
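A quick aside on the `interaction_only=True` setting that the reworked exercise above asks for: the following minimal sketch (not part of the patch; the `x1`, `x2`, `x3` names are placeholders) shows which columns `PolynomialFeatures` keeps with these parameters.

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Toy matrix with 2 samples and 3 features.
X = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])

# Same settings as in the exercise: keep the pairwise products
# (x1*x2, x1*x3, x2*x3) but drop squared terms and the bias column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

print(poly.get_feature_names_out(["x1", "x2", "x3"]))
# ['x1' 'x2' 'x3' 'x1 x2' 'x1 x3' 'x2 x3']
print(X_poly)
# [[ 1.  2.  3.  2.  3.  6.]
#  [ 4.  5.  6. 20. 24. 30.]]
```

These pairwise products are exactly the kind of multiplicative interaction (for instance culmen length times culmen depth) that the exercise expects to improve the body mass predictions.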
diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 3ab6949a3..9c311e817 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -14,24 +14,14 @@ # %% [markdown] # # ๐Ÿ“ Exercise M4.03 # -# In the previous notebook, we showed that we can add new features based on the -# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. -# In that case we only used a single feature in `data`. +# The parameter `penalty` can control the **type** of regularization to use, +# whereas the regularization **strength** is set using the parameter `C`. +# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. In +# this exercise, we ask you to train a logistic regression classifier using the +# `penalty="l2"` regularization (which happens to be the default in +# scikit-learn) to find by yourself the effect of the parameter `C`. # -# The aim of this notebook is to train a linear regression algorithm on a -# dataset with more than a single feature. In such a "multi-dimensional" feature -# space we can derive new features of the form `x1 * x2`, `x2 * x3`, -# etc. Products of features are usually called "non-linear or -# multiplicative interactions" between features. -# -# Feature engineering can be an important step of a model pipeline as long as -# the new features are expected to be predictive. For instance, think of a -# classification model to decide if a patient has risk of developing a heart -# disease. This would depend on the patient's Body Mass Index which is defined -# as `weight / height ** 2`. -# -# We load the dataset penguins dataset. We first use a set of 3 numerical -# features to predict the target, i.e. the body mass of the penguin. +# We start by loading the dataset. # %% [markdown] # ```{note} @@ -42,52 +32,51 @@ # %% import pandas as pd -penguins = pd.read_csv("../datasets/penguins.csv") +penguins = pd.read_csv("../datasets/penguins_classification.csv") +# only keep the Adelie and Chinstrap classes +penguins = ( + penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index() +) -columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] -target_name = "Body Mass (g)" +culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] +target_column = "Species" -# Remove lines with missing values for the columns of interest -penguins_non_missing = penguins[columns + [target_name]].dropna() +# %% +from sklearn.model_selection import train_test_split -data = penguins_non_missing[columns] -target = penguins_non_missing[target_name] -data.head() +penguins_train, penguins_test = train_test_split(penguins, random_state=0) -# %% [markdown] -# Now it is your turn to train a linear regression model on this dataset. First, -# create a linear regression model. +data_train = penguins_train[culmen_columns] +data_test = penguins_test[culmen_columns] -# %% -# Write your code here. +target_train = penguins_train[target_column] +target_test = penguins_test[target_column] # %% [markdown] -# Execute a cross-validation with 10 folds and use the mean absolute error (MAE) -# as metric. +# First, let's create our predictive model. # %% -# Write your code here. +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression -# %% [markdown] -# Compute the mean and std of the MAE in grams (g). - -# %% -# Write your code here. 
+logistic_regression = make_pipeline( + StandardScaler(), LogisticRegression(penalty="l2") +) # %% [markdown] -# Now create a pipeline using `make_pipeline` consisting of a -# `PolynomialFeatures` and a linear regression. Set `degree=2` and -# `interaction_only=True` to the feature engineering step. Remember not to -# include the bias to avoid redundancies with the linear's regression intercept. -# -# Use the same strategy as before to cross-validate such a pipeline. +# Given the following candidates for the `C` parameter, find out the impact of +# `C` on the classifier decision boundary. You can use +# `sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the +# decision function boundary. # %% +Cs = [0.01, 0.1, 1, 10] + # Write your code here. # %% [markdown] -# Compute the mean and std of the MAE in grams (g) and compare with the results -# without feature engineering. +# Look at the impact of the `C` hyperparameter on the magnitude of the weights. # %% # Write your code here. diff --git a/python_scripts/linear_models_ex_04.py b/python_scripts/linear_models_ex_04.py deleted file mode 100644 index 18191bccf..000000000 --- a/python_scripts/linear_models_ex_04.py +++ /dev/null @@ -1,92 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 -# name: python3 -# --- - -# %% [markdown] -# # ๐Ÿ“ Exercise M4.04 -# -# In the previous notebook, we saw the effect of applying some regularization on -# the coefficient of a linear model. -# -# In this exercise, we will study the advantage of using some regularization -# when dealing with correlated features. -# -# We will first create a regression dataset. This dataset will contain 2,000 -# samples and 5 features from which only 2 features will be informative. - -# %% -from sklearn.datasets import make_regression - -data, target, coef = make_regression( - n_samples=2_000, - n_features=5, - n_informative=2, - shuffle=False, - coef=True, - random_state=0, - noise=30, -) - -# %% [markdown] -# When creating the dataset, `make_regression` returns the true coefficient used -# to generate the dataset. Let's plot this information. - -# %% -import pandas as pd - -feature_names = [ - "Relevant feature #0", - "Relevant feature #1", - "Noisy feature #0", - "Noisy feature #1", - "Noisy feature #2", -] -coef = pd.Series(coef, index=feature_names) -coef.plot.barh() -coef - -# %% [markdown] -# Create a `LinearRegression` regressor and fit on the entire dataset and check -# the value of the coefficients. Are the coefficients of the linear regressor -# close to the coefficients used to generate the dataset? - -# %% -# Write your code here. - -# %% [markdown] -# Now, create a new dataset that will be the same as `data` with 4 additional -# columns that will repeat twice features 0 and 1. This procedure will create -# perfectly correlated features. - -# %% -# Write your code here. - -# %% [markdown] -# Fit again the linear regressor on this new dataset and check the coefficients. -# What do you observe? - -# %% -# Write your code here. - -# %% [markdown] -# Create a ridge regressor and fit on the same dataset. Check the coefficients. -# What do you observe? - -# %% -# Write your code here. - -# %% [markdown] -# Can you find the relationship between the ridge coefficients and the original -# coefficients? - -# %% -# Write your code here. 
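For the record, the exercise deleted above (and its solution, removed further down) boiled down to a single experiment: duplicating informative columns makes ordinary least squares ill-posed, while ridge regression remains stable. Here is a condensed sketch of that experiment, reusing the generator settings from the deleted file:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge

# Same generator settings as the deleted exercise file.
data, target = make_regression(
    n_samples=2_000, n_features=5, n_informative=2,
    shuffle=False, noise=30, random_state=0,
)
# Repeat features 0 and 1 twice to create perfectly correlated columns.
data = np.concatenate([data, data[:, [0, 1]], data[:, [0, 1]]], axis=1)

ols = LinearRegression().fit(data, target)
ridge = Ridge().fit(data, target)

print("OLS max |coef|:  ", np.abs(ols.coef_).max())
print("Ridge max |coef|:", np.abs(ridge.coef_).max())
```

The deleted solution reported OLS coefficient norms around 1e14 on this setup, whereas the ridge coefficients stay small, with each original weight spread roughly evenly across the three identical columns.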
diff --git a/python_scripts/linear_models_ex_05.py b/python_scripts/linear_models_ex_05.py deleted file mode 100644 index 1c36b83c2..000000000 --- a/python_scripts/linear_models_ex_05.py +++ /dev/null @@ -1,83 +0,0 @@ -# --- -# jupyter: -# jupytext: -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.14.5 -# kernelspec: -# display_name: Python 3 -# name: python3 -# --- - -# %% [markdown] -# # ๐Ÿ“ Exercise M4.05 -# -# In the previous notebook we set `penalty="none"` to disable regularization -# entirely. This parameter can also control the **type** of regularization to -# use, whereas the regularization **strength** is set using the parameter `C`. -# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. In -# this exercise, we ask you to train a logistic regression classifier using the -# `penalty="l2"` regularization (which happens to be the default in -# scikit-learn) to find by yourself the effect of the parameter `C`. -# -# We will start by loading the dataset. - -# %% [markdown] -# ```{note} -# If you want a deeper overview regarding this dataset, you can refer to the -# Appendix - Datasets description section at the end of this MOOC. -# ``` - -# %% -import pandas as pd - -penguins = pd.read_csv("../datasets/penguins_classification.csv") -# only keep the Adelie and Chinstrap classes -penguins = ( - penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index() -) - -culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] -target_column = "Species" - -# %% -from sklearn.model_selection import train_test_split - -penguins_train, penguins_test = train_test_split(penguins, random_state=0) - -data_train = penguins_train[culmen_columns] -data_test = penguins_test[culmen_columns] - -target_train = penguins_train[target_column] -target_test = penguins_test[target_column] - -# %% [markdown] -# First, let's create our predictive model. - -# %% -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.linear_model import LogisticRegression - -logistic_regression = make_pipeline( - StandardScaler(), LogisticRegression(penalty="l2") -) - -# %% [markdown] -# Given the following candidates for the `C` parameter, find out the impact of -# `C` on the classifier decision boundary. You can use -# `sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the -# decision function boundary. - -# %% -Cs = [0.01, 0.1, 1, 10] - -# Write your code here. - -# %% [markdown] -# Look at the impact of the `C` hyperparameter on the magnitude of the weights. - -# %% -# Write your code here. diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py index d62a4b983..3abc476da 100644 --- a/python_scripts/linear_models_sol_02.py +++ b/python_scripts/linear_models_sol_02.py @@ -8,123 +8,127 @@ # %% [markdown] # # ๐Ÿ“ƒ Solution for Exercise M4.02 # -# The goal of this exercise is to build an intuition on what will be the -# parameters' values of a linear model when the link between the data and the -# target is non-linear. +# In the previous notebook, we showed that we can add new features based on the +# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. +# In that case we only used a single feature in `data`. # -# First, we will generate such non-linear data. +# The aim of this notebook is to train a linear regression algorithm on a +# dataset with more than a single feature. 
In such a "multi-dimensional" feature +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, +# etc. Products of features are usually called "non-linear or +# multiplicative interactions" between features. # -# ```{tip} -# `np.random.RandomState` allows to create a random number generator which can -# be later used to get deterministic results. -# ``` - -# %% -import numpy as np - -# Set the seed for reproduction -rng = np.random.RandomState(0) - -# Generate data -n_sample = 100 -data_max, data_min = 1.4, -1.4 -len_data = data_max - data_min -data = rng.rand(n_sample) * len_data - len_data / 2 -noise = rng.randn(n_sample) * 0.3 -target = data**3 - 0.5 * data**2 + noise +# Feature engineering can be an important step of a model pipeline as long as +# the new features are expected to be predictive. For instance, think of a +# classification model to decide if a patient has risk of developing a heart +# disease. This would depend on the patient's Body Mass Index which is defined +# as `weight / height ** 2`. +# +# We load the dataset penguins dataset. We first use a set of 3 numerical +# features to predict the target, i.e. the body mass of the penguin. # %% [markdown] # ```{note} -# To ease the plotting, we will create a Pandas dataframe containing the data -# and target +# If you want a deeper overview regarding this dataset, you can refer to the +# Appendix - Datasets description section at the end of this MOOC. # ``` # %% import pandas as pd -full_data = pd.DataFrame({"data": data, "target": target}) - -# %% -import seaborn as sns - -_ = sns.scatterplot( - data=full_data, x="data", y="target", color="black", alpha=0.5 -) +penguins = pd.read_csv("../datasets/penguins.csv") -# %% [markdown] -# We observe that the link between the data `data` and vector `target` is -# non-linear. For instance, `data` could represent the years of experience -# (normalized) and `target` the salary (normalized). Therefore, the problem here -# would be to infer the salary given the years of experience. -# -# Using the function `f` defined below, find both the `weight` and the -# `intercept` that you think will lead to a good linear model. Plot both the -# data and the predictions of this model. +columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] +target_name = "Body Mass (g)" +# Remove lines with missing values for the columns of interest +penguins_non_missing = penguins[columns + [target_name]].dropna() -# %% -def f(data, weight=0, intercept=0): - target_predict = weight * data + intercept - return target_predict +data = penguins_non_missing[columns] +target = penguins_non_missing[target_name] +data.head() +# %% [markdown] +# Now it is your turn to train a linear regression model on this dataset. First, +# create a linear regression model. # %% # solution -predictions = f(data, weight=1.2, intercept=-0.2) +from sklearn.linear_model import LinearRegression -# %% tags=["solution"] -ax = sns.scatterplot( - data=full_data, x="data", y="target", color="black", alpha=0.5 -) -_ = ax.plot(data, predictions) +linear_regression = LinearRegression() # %% [markdown] -# Compute the mean squared error for this model +# Execute a cross-validation with 10 folds and use the mean absolute error (MAE) +# as metric. 
# %% # solution -from sklearn.metrics import mean_squared_error - -error = mean_squared_error(target, f(data, weight=1.2, intercept=-0.2)) -print(f"The MSE is {error}") +from sklearn.model_selection import cross_validate + +cv_results = cross_validate( + linear_regression, + data, + target, + cv=10, + scoring="neg_mean_absolute_error", + n_jobs=2, +) # %% [markdown] -# Train a linear regression model on this dataset. -# -# ```{warning} -# In scikit-learn, by convention `data` (also called `X` in the scikit-learn -# documentation) should be a 2D matrix of shape `(n_samples, n_features)`. -# If `data` is a 1D vector, you need to reshape it into a matrix with a -# single column if the vector represents a feature or a single row if the -# vector represents a sample. -# ``` +# Compute the mean and std of the MAE in grams (g). # %% -from sklearn.linear_model import LinearRegression - # solution -linear_regression = LinearRegression() -data_2d = data.reshape(-1, 1) -linear_regression.fit(data_2d, target) +print( + "Mean absolute error on testing set with original features: " + f"{-cv_results['test_score'].mean():.3f} ยฑ " + f"{cv_results['test_score'].std():.3f} g" +) # %% [markdown] -# Compute predictions from the linear regression model and plot both the data -# and the predictions. +# Now create a pipeline using `make_pipeline` consisting of a +# `PolynomialFeatures` and a linear regression. Set `degree=2` and +# `interaction_only=True` to the feature engineering step. Remember not to +# include the bias to avoid redundancies with the linear's regression intercept. +# +# Use the same strategy as before to cross-validate such a pipeline. # %% # solution -predictions = linear_regression.predict(data_2d) +from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import make_pipeline -# %% tags=["solution"] -ax = sns.scatterplot( - data=full_data, x="data", y="target", color="black", alpha=0.5 +poly_features = PolynomialFeatures( + degree=2, include_bias=False, interaction_only=True +) +linear_regression_interactions = make_pipeline( + poly_features, linear_regression +) + +cv_results = cross_validate( + linear_regression_interactions, + data, + target, + cv=10, + scoring="neg_mean_absolute_error", + n_jobs=2, ) -_ = ax.plot(data, predictions) # %% [markdown] -# Compute the mean squared error +# Compute the mean and std of the MAE in grams (g) and compare with the results +# without feature engineering. # %% # solution -error = mean_squared_error(target, predictions) -print(f"The MSE is {error}") +print( + "Mean absolute error on testing set with interactions: " + f"{-cv_results['test_score'].mean():.3f} ยฑ " + f"{cv_results['test_score'].std():.3f} g" +) + +# %% [markdown] tags=["solution"] +# We observe that the mean absolute error is lower and less spread with the +# enriched features. In this case the "interactions" are indeed predictive. In +# the following notebook we will see what happens when the enriched features are +# non-predictive and how to deal with this case. diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py index 0cacfcf0d..d789c8522 100644 --- a/python_scripts/linear_models_sol_03.py +++ b/python_scripts/linear_models_sol_03.py @@ -8,24 +8,14 @@ # %% [markdown] # # ๐Ÿ“ƒ Solution for Exercise M4.03 # -# In the previous notebook, we showed that we can add new features based on the -# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. 
-# In that case we only used a single feature in `data`. +# The parameter `penalty` can control the **type** of regularization to use, +# whereas the regularization **strength** is set using the parameter `C`. +# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. In +# this exercise, we ask you to train a logistic regression classifier using the +# `penalty="l2"` regularization (which happens to be the default in +# scikit-learn) to find by yourself the effect of the parameter `C`. # -# The aim of this notebook is to train a linear regression algorithm on a -# dataset with more than a single feature. In such a "multi-dimensional" feature -# space we can derive new features of the form `x1 * x2`, `x2 * x3`, -# etc. Products of features are usually called "non-linear or -# multiplicative interactions" between features. -# -# Feature engineering can be an important step of a model pipeline as long as -# the new features are expected to be predictive. For instance, think of a -# classification model to decide if a patient has risk of developing a heart -# disease. This would depend on the patient's Body Mass Index which is defined -# as `weight / height ** 2`. -# -# We load the dataset penguins dataset. We first use a set of 3 numerical -# features to predict the target, i.e. the body mass of the penguin. +# We start by loading the dataset. # %% [markdown] # ```{note} @@ -36,99 +26,97 @@ # %% import pandas as pd -penguins = pd.read_csv("../datasets/penguins.csv") - -columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] -target_name = "Body Mass (g)" - -# Remove lines with missing values for the columns of interest -penguins_non_missing = penguins[columns + [target_name]].dropna() - -data = penguins_non_missing[columns] -target = penguins_non_missing[target_name] -data.head() +penguins = pd.read_csv("../datasets/penguins_classification.csv") +# only keep the Adelie and Chinstrap classes +penguins = ( + penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index() +) -# %% [markdown] -# Now it is your turn to train a linear regression model on this dataset. First, -# create a linear regression model. +culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] +target_column = "Species" # %% -# solution -from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split -linear_regression = LinearRegression() +penguins_train, penguins_test = train_test_split(penguins, random_state=0) -# %% [markdown] -# Execute a cross-validation with 10 folds and use the mean absolute error (MAE) -# as metric. +data_train = penguins_train[culmen_columns] +data_test = penguins_test[culmen_columns] -# %% -# solution -from sklearn.model_selection import cross_validate - -cv_results = cross_validate( - linear_regression, - data, - target, - cv=10, - scoring="neg_mean_absolute_error", - n_jobs=2, -) +target_train = penguins_train[target_column] +target_test = penguins_test[target_column] # %% [markdown] -# Compute the mean and std of the MAE in grams (g). +# First, let's create our predictive model. 
# %% -# solution -print( - "Mean absolute error on testing set with original features: " - f"{-cv_results['test_score'].mean():.3f} ยฑ " - f"{cv_results['test_score'].std():.3f} g" +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression + +logistic_regression = make_pipeline( + StandardScaler(), LogisticRegression(penalty="l2") ) # %% [markdown] -# Now create a pipeline using `make_pipeline` consisting of a -# `PolynomialFeatures` and a linear regression. Set `degree=2` and -# `interaction_only=True` to the feature engineering step. Remember not to -# include the bias to avoid redundancies with the linear's regression intercept. -# -# Use the same strategy as before to cross-validate such a pipeline. +# Given the following candidates for the `C` parameter, find out the impact of +# `C` on the classifier decision boundary. You can use +# `sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the +# decision function boundary. # %% -# solution -from sklearn.preprocessing import PolynomialFeatures -from sklearn.pipeline import make_pipeline - -poly_features = PolynomialFeatures( - degree=2, include_bias=False, interaction_only=True -) -linear_regression_interactions = make_pipeline( - poly_features, linear_regression -) +Cs = [0.01, 0.1, 1, 10] -cv_results = cross_validate( - linear_regression_interactions, - data, - target, - cv=10, - scoring="neg_mean_absolute_error", - n_jobs=2, -) +# solution +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.inspection import DecisionBoundaryDisplay + +for C in Cs: + logistic_regression.set_params(logisticregression__C=C) + logistic_regression.fit(data_train, target_train) + accuracy = logistic_regression.score(data_test, target_test) + + DecisionBoundaryDisplay.from_estimator( + logistic_regression, + data_test, + response_method="predict", + cmap="RdBu_r", + alpha=0.5, + ) + sns.scatterplot( + data=penguins_test, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=["tab:red", "tab:blue"], + ) + plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") + plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}") # %% [markdown] -# Compute the mean and std of the MAE in grams (g) and compare with the results -# without feature engineering. +# Look at the impact of the `C` hyperparameter on the magnitude of the weights. # %% # solution -print( - "Mean absolute error on testing set with interactions: " - f"{-cv_results['test_score'].mean():.3f} ยฑ " - f"{cv_results['test_score'].std():.3f} g" -) +weights_ridge = [] +for C in Cs: + logistic_regression.set_params(logisticregression__C=C) + logistic_regression.fit(data_train, target_train) + coefs = logistic_regression[-1].coef_[0] + weights_ridge.append(pd.Series(coefs, index=culmen_columns)) + +# %% tags=["solution"] +weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f"C: {C}" for C in Cs]) +weights_ridge.plot.barh() +_ = plt.title("LogisticRegression weights depending of C") # %% [markdown] tags=["solution"] -# We observe that the mean absolute error is lower and less spread with the -# enriched features. In this case the "interactions" are indeed predictive. In -# the following notebook we will see what happens when the enriched features are -# non-predictive and how to deal with this case. +# We see that a small `C` will shrink the weights values toward zero. It means +# that a small `C` provides a more regularized model. 
Thus, `C` is the inverse +# of the `alpha` coefficient in the `Ridge` model. +# +# Besides, with a strong penalty (i.e. small `C` value), the weight of the +# feature "Culmen Depth (mm)" is almost zero. It explains why the decision +# separation in the plot is almost perpendicular to the "Culmen Length (mm)" +# feature. diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py deleted file mode 100644 index a759c3d24..000000000 --- a/python_scripts/linear_models_sol_04.py +++ /dev/null @@ -1,269 +0,0 @@ -# --- -# jupyter: -# kernelspec: -# display_name: Python 3 -# name: python3 -# --- - -# %% [markdown] -# # ๐Ÿ“ƒ Solution for Exercise M4.04 -# -# In the previous notebook, we saw the effect of applying some regularization on -# the coefficient of a linear model. -# -# In this exercise, we will study the advantage of using some regularization -# when dealing with correlated features. -# -# We will first create a regression dataset. This dataset will contain 2,000 -# samples and 5 features from which only 2 features will be informative. - -# %% -from sklearn.datasets import make_regression - -data, target, coef = make_regression( - n_samples=2_000, - n_features=5, - n_informative=2, - shuffle=False, - coef=True, - random_state=0, - noise=30, -) - -# %% [markdown] -# When creating the dataset, `make_regression` returns the true coefficient used -# to generate the dataset. Let's plot this information. - -# %% -import pandas as pd - -feature_names = [ - "Relevant feature #0", - "Relevant feature #1", - "Noisy feature #0", - "Noisy feature #1", - "Noisy feature #2", -] -coef = pd.Series(coef, index=feature_names) -coef.plot.barh() -coef - -# %% [markdown] -# Create a `LinearRegression` regressor and fit on the entire dataset and check -# the value of the coefficients. Are the coefficients of the linear regressor -# close to the coefficients used to generate the dataset? - -# %% -# solution -from sklearn.linear_model import LinearRegression - -linear_regression = LinearRegression() -linear_regression.fit(data, target) -linear_regression.coef_ - -# %% tags=["solution"] -feature_names = [ - "Relevant feature #0", - "Relevant feature #1", - "Noisy feature #0", - "Noisy feature #1", - "Noisy feature #2", -] -coef = pd.Series(linear_regression.coef_, index=feature_names) -_ = coef.plot.barh() - -# %% [markdown] tags=["solution"] -# We see that the coefficients are close to the coefficients used to generate -# the dataset. The dispersion is indeed cause by the noise injected during the -# dataset generation. - -# %% [markdown] -# Now, create a new dataset that will be the same as `data` with 4 additional -# columns that will repeat twice features 0 and 1. This procedure will create -# perfectly correlated features. - -# %% -# solution -import numpy as np - -data = np.concatenate([data, data[:, [0, 1]], data[:, [0, 1]]], axis=1) - -# %% [markdown] -# Fit again the linear regressor on this new dataset and check the coefficients. -# What do you observe? 
- -# %% -# solution -linear_regression = LinearRegression() -linear_regression.fit(data, target) -linear_regression.coef_ - -# %% tags=["solution"] -feature_names = [ - "Relevant feature #0", - "Relevant feature #1", - "Noisy feature #0", - "Noisy feature #1", - "Noisy feature #2", - "First repetition of feature #0", - "First repetition of feature #1", - "Second repetition of feature #0", - "Second repetition of feature #1", -] -coef = pd.Series(linear_regression.coef_, index=feature_names) -_ = coef.plot.barh() - -# %% [markdown] tags=["solution"] -# We see that the coefficient values are far from what one could expect. By -# repeating the informative features, one would have expected these coefficients -# to be similarly informative. -# -# Instead, we see that some coefficients have a huge norm ~1e14. It indeed means -# that we try to solve an mathematical ill-posed problem. Indeed, finding -# coefficients in a linear regression involves inverting the matrix -# `np.dot(data.T, data)` which is not possible (or lead to high numerical -# errors). - -# %% [markdown] -# Create a ridge regressor and fit on the same dataset. Check the coefficients. -# What do you observe? - -# %% -# solution -from sklearn.linear_model import Ridge - -ridge = Ridge() -ridge.fit(data, target) -ridge.coef_ - -# %% tags=["solution"] -coef = pd.Series(ridge.coef_, index=feature_names) -_ = coef.plot.barh() - -# %% [markdown] tags=["solution"] -# We see that the penalty applied on the weights give a better results: the -# values of the coefficients do not suffer from numerical issues. Indeed, the -# matrix to be inverted internally is `np.dot(data.T, data) + alpha * I`. Adding -# this penalty `alpha` allow the inversion without numerical issue. - -# %% [markdown] -# Can you find the relationship between the ridge coefficients and the original -# coefficients? - -# %% -# solution -ridge.coef_[:5] * 3 - -# %% [markdown] tags=["solution"] -# Repeating three times each informative features induced to divide the ridge -# coefficients by three. - -# %% [markdown] tags=["solution"] -# ```{tip} -# We advise to always use a penalty to shrink the magnitude of the weights -# toward zero (also called "l2 penalty"). In scikit-learn, `LogisticRegression` -# applies such penalty by default. However, one needs to use `Ridge` (and even -# `RidgeCV` to tune the parameter `alpha`) instead of `LinearRegression`. -# -# Other kinds of regularizations exist but will not be covered in this course. -# ``` -# -# ## Dealing with correlation between one-hot encoded features -# -# In this section, we will focus on how to deal with correlated features that -# arise naturally when one-hot encoding categorical features. -# -# Let's first load the Ames housing dataset and take a subset of features that -# are only categorical features. - -# %% tags=["solution"] -import pandas as pd -from sklearn.model_selection import train_test_split - -ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?") -ames_housing = ames_housing.drop(columns="Id") - -categorical_columns = ["Street", "Foundation", "CentralAir", "PavedDrive"] -target_name = "SalePrice" -X, y = ames_housing[categorical_columns], ames_housing[target_name] - -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=0 -) - -# %% [markdown] tags=["solution"] -# -# We previously presented that a `OneHotEncoder` creates as many columns as -# categories. Therefore, there is always one column (i.e. 
one encoded category)
-# that can be inferred from the others. Thus, `OneHotEncoder` creates collinear
-# features.
-#
-# We illustrate this behaviour by considering the "CentralAir" feature that
-# contains only two categories:
-
-# %% tags=["solution"]
-X_train["CentralAir"]
-
-# %% tags=["solution"]
-from sklearn.preprocessing import OneHotEncoder
-
-single_feature = ["CentralAir"]
-encoder = OneHotEncoder(sparse_output=False, dtype=np.int32)
-X_trans = encoder.fit_transform(X_train[single_feature])
-X_trans = pd.DataFrame(
-    X_trans,
-    columns=encoder.get_feature_names_out(input_features=single_feature),
-)
-X_trans
-
-# %% [markdown] tags=["solution"]
-#
-# Here, we see that the encoded category "CentralAir_N" is the opposite of the
-# encoded category "CentralAir_Y". Therefore, we observe that using a
-# `OneHotEncoder` creates two features having the problematic pattern observed
-# earlier in this exercise. Training a linear regression model on such a
-# one-hot encoded binary feature can therefore lead to numerical problems,
-# especially without regularization. Furthermore, the two one-hot features are
-# redundant as they encode exactly the same information in opposite ways.
-#
-# Using regularization helps to overcome the numerical issues that we
-# highlighted earlier in this exercise.
-#
-# Another strategy is to arbitrarily drop one of the encoded categories.
-# Scikit-learn provides such an option by setting the parameter `drop` in the
-# `OneHotEncoder`. This parameter can be set to `first` to always drop the first
-# encoded category or `if_binary` to only drop a column in the case of binary
-# categories.
-
-# %% tags=["solution"]
-encoder = OneHotEncoder(drop="first", sparse_output=False, dtype=np.int32)
-X_trans = encoder.fit_transform(X_train[single_feature])
-X_trans = pd.DataFrame(
-    X_trans,
-    columns=encoder.get_feature_names_out(input_features=single_feature),
-)
-X_trans
-
-# %% [markdown] tags=["solution"]
-#
-# We see that only the second column of the previous encoded data is kept.
-# Dropping one of the one-hot encoded columns is a common practice, especially
-# for binary categorical features. Note however that this breaks symmetry
-# between categories and impacts the number of coefficients of the model, their
-# values, and thus their meaning, especially when applying strong
-# regularization.
-#
-# Let's finally illustrate how to use this option in a machine-learning
-# pipeline:
-
-# %% tags=["solution"]
-from sklearn.pipeline import make_pipeline
-
-model = make_pipeline(OneHotEncoder(drop="first", dtype=np.int32), Ridge())
-model.fit(X_train, y_train)
-n_categories = [X_train[col].nunique() for col in X_train.columns]
-print(f"R2 score on the testing set: {model.score(X_test, y_test):.2f}")
-print(
-    f"Our model contains {model[-1].coef_.size} features while "
-    f"{sum(n_categories)} categories are originally available."
-)
diff --git a/python_scripts/linear_models_sol_05.py b/python_scripts/linear_models_sol_05.py
deleted file mode 100644
index bc4a15df1..000000000
--- a/python_scripts/linear_models_sol_05.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# ---
-# jupyter:
-#   kernelspec:
-#     display_name: Python 3
-#     name: python3
-# ---
-
-# %% [markdown]
-# # 📃 Solution for Exercise M4.05
-#
-# In the previous notebook we set `penalty="none"` to disable regularization
-# entirely. This parameter can also control the **type** of regularization to
-# use, whereas the regularization **strength** is set using the parameter `C`.
-# Setting `penalty="none"` is equivalent to an infinitely large value of `C`. In
-# this exercise, we ask you to train a logistic regression classifier using the
-# `penalty="l2"` regularization (which happens to be the default in
-# scikit-learn) to find out by yourself the effect of the parameter `C`.
-#
-# We will start by loading the dataset.

-# %% [markdown]
-# ```{note}
-# If you want a deeper overview regarding this dataset, you can refer to the
-# Appendix - Datasets description section at the end of this MOOC.
-# ```
-
-# %%
-import pandas as pd
-
-penguins = pd.read_csv("../datasets/penguins_classification.csv")
-# only keep the Adelie and Chinstrap classes
-penguins = (
-    penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index()
-)
-
-culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
-target_column = "Species"
-
-# %%
-from sklearn.model_selection import train_test_split
-
-penguins_train, penguins_test = train_test_split(penguins, random_state=0)
-
-data_train = penguins_train[culmen_columns]
-data_test = penguins_test[culmen_columns]
-
-target_train = penguins_train[target_column]
-target_test = penguins_test[target_column]
-
-# %% [markdown]
-# First, let's create our predictive model.
-
-# %%
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
-from sklearn.linear_model import LogisticRegression
-
-logistic_regression = make_pipeline(
-    StandardScaler(), LogisticRegression(penalty="l2")
-)
-
-# %% [markdown]
-# Given the following candidates for the `C` parameter, find out the impact of
-# `C` on the classifier decision boundary. You can use
-# `sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the
-# decision function boundary.
-
-# %%
-Cs = [0.01, 0.1, 1, 10]
-
-# solution
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.inspection import DecisionBoundaryDisplay
-
-for C in Cs:
-    logistic_regression.set_params(logisticregression__C=C)
-    logistic_regression.fit(data_train, target_train)
-    accuracy = logistic_regression.score(data_test, target_test)
-
-    DecisionBoundaryDisplay.from_estimator(
-        logistic_regression,
-        data_test,
-        response_method="predict",
-        cmap="RdBu_r",
-        alpha=0.5,
-    )
-    sns.scatterplot(
-        data=penguins_test,
-        x=culmen_columns[0],
-        y=culmen_columns[1],
-        hue=target_column,
-        palette=["tab:red", "tab:blue"],
-    )
-    plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
-    plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}")
-
-# %% [markdown]
-# Look at the impact of the `C` hyperparameter on the magnitude of the weights.
-
-# %%
-# solution
-weights_ridge = []
-for C in Cs:
-    logistic_regression.set_params(logisticregression__C=C)
-    logistic_regression.fit(data_train, target_train)
-    coefs = logistic_regression[-1].coef_[0]
-    weights_ridge.append(pd.Series(coefs, index=culmen_columns))

-# %% tags=["solution"]
-weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f"C: {C}" for C in Cs])
-weights_ridge.plot.barh()
-_ = plt.title("LogisticRegression weights depending on C")
-
-# %% [markdown] tags=["solution"]
-# We see that a small `C` will shrink the weight values toward zero. It means
-# that a small `C` provides a more regularized model. Thus, `C` is the inverse
-# of the `alpha` coefficient in the `Ridge` model.
-#
-# Besides, with a strong penalty (i.e. small `C` value), the weight of the
-# feature "Culmen Depth (mm)" is almost zero.
It explains why the decision -# separation in the plot is almost perpendicular to the "Culmen Length (mm)" -# feature. diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py index 3156ebda0..45487341b 100644 --- a/python_scripts/logistic_regression.py +++ b/python_scripts/logistic_regression.py @@ -78,9 +78,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression -logistic_regression = make_pipeline( - StandardScaler(), LogisticRegression(penalty=None) -) +logistic_regression = make_pipeline(StandardScaler(), LogisticRegression()) logistic_regression.fit(data_train, target_train) accuracy = logistic_regression.score(data_test, target_test) print(f"Accuracy on test set: {accuracy:.3f}") @@ -124,8 +122,7 @@ # %% [markdown] # Thus, we see that our decision function is represented by a line separating -# the 2 classes. We should also note that we did not impose any regularization -# by setting the parameter `penalty` to `'none'`. +# the 2 classes. # # Since the line is oblique, it means that we used a combination of both # features: From 7fb450d65c9d007af3a4e35a112458e7f1212451 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 12 Sep 2023 10:35:12 +0200 Subject: [PATCH 046/108] FIX LearningCurveDisplay FutureWarning (#714) Co-authored-by: ArturoAmorQ --- python_scripts/cross_validation_learning_curve.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python_scripts/cross_validation_learning_curve.py b/python_scripts/cross_validation_learning_curve.py index 0c064c1b4..b2bb23335 100644 --- a/python_scripts/cross_validation_learning_curve.py +++ b/python_scripts/cross_validation_learning_curve.py @@ -85,12 +85,11 @@ score_type="both", # both train and test errors scoring="neg_mean_absolute_error", negate_score=True, # to use when metric starts with "neg_" - log_scale=True, # sets log scale for the x-axis score_name="Mean absolute error (k$)", std_display_style="errorbar", n_jobs=2, ) -_ = display.ax_.set_title("Learning curve for decision tree") +_ = display.ax_.set(xscale="log", title="Learning curve for decision tree") # %% [markdown] # Looking at the training error alone, we see that we get an error of 0 k$. 
It

From 6a582a7f3c910692ecde98b61174c7b3cebd1c01 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 12 Sep 2023 10:56:57 +0200
Subject: [PATCH 047/108] Always pass a numpy array to ValidationCurveDisplay's
 param_range (#712)

Co-authored-by: ArturoAmorQ
---
 python_scripts/cross_validation_validation_curve.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python_scripts/cross_validation_validation_curve.py b/python_scripts/cross_validation_validation_curve.py
index 997fb0433..b91dda141 100644
--- a/python_scripts/cross_validation_validation_curve.py
+++ b/python_scripts/cross_validation_validation_curve.py
@@ -104,9 +104,10 @@

 # %%
 # %%time
+import numpy as np
 from sklearn.model_selection import ValidationCurveDisplay

-max_depth = [1, 5, 10, 15, 20, 25]
+max_depth = np.array([1, 5, 10, 15, 20, 25])
 disp = ValidationCurveDisplay.from_estimator(
     regressor,
     data,

From b28cd3e4b1410dfa00081b63e8cb9c0a750d0032 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 13 Sep 2023 11:35:49 +0200
Subject: [PATCH 048/108] MAINT fix narrative according to new order of M4
 (#713)

Co-authored-by: ArturoAmorQ
Co-authored-by: Olivier Grisel
---
 python_scripts/linear_models_sol_02.py        |  4 +-
 .../linear_regression_non_linear_link.py      | 38 +++++++++----------
 2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py
index 3abc476da..4ae5d6f87 100644
--- a/python_scripts/linear_models_sol_02.py
+++ b/python_scripts/linear_models_sol_02.py
@@ -90,7 +90,9 @@
 # Now create a pipeline using `make_pipeline` consisting of a
 # `PolynomialFeatures` and a linear regression. Set `degree=2` and
 # `interaction_only=True` to the feature engineering step. Remember not to
-# include the bias to avoid redundancies with the linear's regression intercept.
+# include a "bias" feature (that is a constant-valued feature) to avoid
+# introducing a redundancy with the intercept of the subsequent linear
+# regression model.
 #
 # Use the same strategy as before to cross-validate such a pipeline.

diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py
index 9e72fb49e..2da7d435a 100644
--- a/python_scripts/linear_regression_non_linear_link.py
+++ b/python_scripts/linear_regression_non_linear_link.py
@@ -8,20 +8,19 @@
 # %% [markdown]
 # # Linear regression for a non-linear features-target relationship
 #
-# In the previous exercise, you were asked to train a linear regression model on
-# a dataset where the matrix `data` and the vector `target` do not have a linear
-# link.
-#
-# In this notebook, we show that even if the parametrization of linear models is
-# not natively adapted to the problem at hand, it is still possible to make
-# linear models more expressive by engineering additional features.
+# In this notebook, we show that even if linear models are not natively adapted
+# to express a `target` that is not a linear function of the `data`, it is still
+# possible to make linear models more expressive by engineering additional
+# features.
 #
 # A machine learning pipeline that combines a non-linear feature engineering
-# step followed by a linear regression step can therefore be considered
+# step followed by a linear regression step can therefore be considered a
 # non-linear regression model as a whole.
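To make the statement above concrete, here is a self-contained sketch on synthetic data (not taken from the MOOC notebooks) of a pipeline whose final step is linear but whose overall prediction function is not:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.RandomState(0)
x = rng.uniform(-1, 1, size=(100, 1))
y = x.ravel() ** 3 - 0.5 * x.ravel() ** 2  # non-linear function of the input

model = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
model.fit(x, y)
# R^2 close to 1.0: the pipeline as a whole captures the cubic relationship
# even though its last stage is a plain linear regression.
print(f"{model.score(x, y):.3f}")
```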
 #
-# To illustrate these concepts, we will reuse the same dataset generated in the
-# previous exercise.
+# ```{tip}
+# `np.random.RandomState` allows creating a random number generator which can
+# be later used to get deterministic results.
+# ```

 # %%
 import numpy as np
@@ -38,8 +37,8 @@

 # %% [markdown]
 # ```{note}
-# To ease the plotting, we will create a pandas dataframe containing the data
-# and target:
+# To ease the plotting, we create a pandas dataframe containing the data and
+# target:
 # ```

 # %%
@@ -55,8 +54,7 @@
 )

 # %% [markdown]
-# We will highlight the limitations of fitting a linear regression model as done
-# in the previous exercise.
+# We now observe the limitations of fitting a linear regression model.
 #
 # ```{warning}
 # In scikit-learn, by convention `data` (also called `X` in the scikit-learn
@@ -102,7 +100,7 @@
 )

 # %% [markdown]
-# It is important to note that the learnt model will not be able to handle the
+# It is important to note that the learnt model is not able to handle the
 # non-linear relationship between `data` and `target` since linear models assume
 # the relationship between `data` and `target` to be linear.
 #
@@ -212,9 +210,11 @@

 # %% [markdown]
 # The last possibility to make a linear model more expressive is to use a
-# "kernel". Instead of learning a weight per feature as we previously
-# emphasized, a weight will be assigned to each sample. However, not all samples
-# will be used. This is the base of the support vector machine algorithm.
+# "kernel". Instead of learning one weight per feature as we previously did, a
+# weight is assigned to each sample. However, not all samples are used: some
+# redundant data points of the training set are assigned a weight of 0 so
+# that they do not influence the model's prediction function. This is the
+# main intuition of the support vector machine algorithm.
 #
 # The mathematical definition of "kernels" and "support vector machines" is
 # beyond the scope of this course. We encourage interested readers with a
@@ -323,7 +323,7 @@
 # into several non-linearly derived new features. This makes our machine
 # learning pipeline more expressive and less likely to underfit, even if the
 # last stage of the pipeline is a simple linear regression model.
-
+#
 # For the sake of simplicity, we introduced those transformers on a toy
 # regression problem with a single input feature. However, non-linear feature
 # transformers such as Nystroem can further improve the expressiveness of

From 41f329618d7e9c55ebb82e457a2913fa4d246e4e Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 13 Sep 2023 12:06:25 +0200
Subject: [PATCH 049/108] ENH Introduce predict_proba earlier in the module
 (#715)

---
 python_scripts/logistic_regression.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py
index 45487341b..0ea25a2a1 100644
--- a/python_scripts/logistic_regression.py
+++ b/python_scripts/logistic_regression.py
@@ -8,11 +8,11 @@
 # %% [markdown]
 # # Linear model for classification
 #
-# In regression, we saw that the target to be predicted was a continuous
-# variable. In classification, this target will be discrete (e.g. categorical).
+# In regression, we saw that the target to be predicted is a continuous
+# variable. In classification, the target is discrete (e.g. categorical).
 #
-# We will go back to our penguin dataset.
However, this time we will try to
-# predict the penguin species using the culmen information. We will also
+# In this notebook we go back to the penguin dataset. However, this time the
+# task is to predict the penguin species using the culmen information. We also
 # simplify our classification problem by selecting only 2 of the penguin species
 # to solve a binary classification problem.
@@ -51,8 +51,8 @@
 # increases, the probability that the penguin is a Chinstrap is closer to 1.
 # However, the culmen depth is not helpful for predicting the penguin species.
 #
-# For model fitting, we will separate the target from the data and we will
-# create a training and a testing set.
+# For model fitting, we separate the target from the data and we create a
+# training and a testing set.

 # %%
 from sklearn.model_selection import train_test_split
@@ -66,7 +66,7 @@
 target_test = penguins_test[target_column]

 # %% [markdown]
-# The linear regression that we previously saw will predict a continuous output.
+# The linear regression that we previously saw predicts a continuous output.
 # When the target is a binary outcome, one can use the logistic function to
 # model the probability. This model is known as logistic regression.
 #
@@ -90,9 +90,9 @@
 # feature values of the sample.
 #
 # ```{note}
-# Here, we will use the class `DecisionBoundaryDisplay`. This educational tool
-# allows us to gain some insights by plotting the decision function boundary
-# learned by the classifier in a 2 dimensional feature space.
+# Here, we use the class `DecisionBoundaryDisplay`. This educational tool allows
+# us to gain some insights by plotting the decision function boundary learned by
+# the classifier in a 2 dimensional feature space.
 #
 # Notice however that in more realistic machine learning contexts, one would
 # typically fit on more than two features at once and therefore it would not be
@@ -154,3 +154,10 @@
 # x1 = coef0 / coef1 * x0 - intercept / coef1
 #
 # which is the equation of a straight line.
+#
+# ```{note}
+# If you want to go further, try changing the `response_method` to
+# `"predict_proba"` in the `DecisionBoundaryDisplay` above. The boundaries now
+# use color to encode the estimated probability of belonging to either class,
+# as mentioned in the introductory slides 🎥 Intuitions on linear models.
+# ```

From fccc6367769afd45a6ac39e6126a9be070b9648f Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 13 Sep 2023 13:58:19 +0200
Subject: [PATCH 050/108] MAINT improve wording and fix typo (#716)

---
 .../parameter_tuning_randomized_search.py     | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/python_scripts/parameter_tuning_randomized_search.py b/python_scripts/parameter_tuning_randomized_search.py
index 81d45786d..c69573aad 100644
--- a/python_scripts/parameter_tuning_randomized_search.py
+++ b/python_scripts/parameter_tuning_randomized_search.py
@@ -12,12 +12,13 @@
 # search for the best hyperparameters maximizing the generalization performance
 # of a predictive model.
 #
-# However, a grid-search approach has limitations. It does not scale when the
-# number of parameters to tune is increasing. Also, the grid will impose a
-# regularity during the search which might be problematic.
+# However, a grid-search approach has limitations. It does not scale well when
+# the number of parameters to tune increases.
Also, the grid imposes a
+# regularity during the search which might miss better parameter
+# values lying between two consecutive points of the grid.
 #
-# In this notebook, we will present another method to tune hyperparameters
-# called randomized search.
+# In this notebook, we present a different method to tune hyperparameters called
+# randomized search.

 # %% [markdown]
 # ## Our predictive model
@@ -56,8 +57,7 @@
 )

 # %% [markdown]
-# We will create the same predictive pipeline as seen in the grid-search
-# section.
+# We create the same predictive pipeline as done for the grid-search section.

 # %%
 from sklearn.compose import ColumnTransformer
@@ -97,26 +97,26 @@
 #
 # With the `GridSearchCV` estimator, the parameters need to be specified
 # explicitly. We already mentioned that exploring a large number of values for
-# different parameters will be quickly untractable.
+# different parameters quickly becomes intractable.
 #
 # Instead, we can randomly generate the parameter candidates. Indeed, such an
 # approach avoids the regularity of the grid. Hence, adding more evaluations can
 # increase the resolution in each direction. This is the case in the frequent
 # situation where the choice of some hyperparameters is not very important, as
-# for hyperparameter 2 in the figure below.
+# for the hyperparameter 2 in the figure below.
 #
 # ![Randomized vs grid search](../figures/grid_vs_random_search.svg)
 #
 # Indeed, the number of evaluation points needs to be divided across the two
 # different hyperparameters. With a grid, the danger is that the region of good
-# hyperparameters fall between the line of the grid: this region is aligned with
-# the grid given that hyperparameter 2 has a weak influence. Rather, stochastic
-# search will sample hyperparameter 1 independently from hyperparameter 2 and
-# find the optimal region.
+# hyperparameters may fall between lines of the grid. In the figure, such a
+# region is aligned with the grid given that hyperparameter 2 has a weak
+# influence. Rather, stochastic search samples hyperparameter 1 independently
+# from hyperparameter 2 and finds the optimal region.
 #
 # The `RandomizedSearchCV` class allows for such stochastic search. It is used
 # similarly to the `GridSearchCV` but the sampling distributions need to be
-# specified instead of the parameter values. For instance, we will draw
+# specified instead of the parameter values. For instance, we can draw
 # candidates using a log-uniform distribution because the parameters we are
 # interested in take positive values with a natural log scaling (.1 is as close
 # to 1 as 10 is).
@@ -126,7 +126,7 @@
 # grid search (with `GridSearchCV`) to optimize 3 or more hyperparameters.
 # ```
 #
-# We will optimize 3 other parameters in addition to the ones we optimized in
+# We now optimize 3 other parameters in addition to the ones we optimized in
 # the notebook presenting the `GridSearchCV`:
 #
 # * `l2_regularization`: it corresponds to the strength of the regularization;
@@ -138,14 +138,14 @@
 # We recall the meaning of the 2 remaining parameters:
 #
 # * `learning_rate`: it corresponds to the speed at which the gradient-boosting
-#   will correct the residuals at each boosting iteration;
+#   corrects the residuals at each boosting iteration;
 # * `max_leaf_nodes`: it corresponds to the maximum number of leaves for each
 #   tree in the ensemble.
 #
 # ```{note}
 # `scipy.stats.loguniform` can be used to generate floating-point numbers. To
 # generate random values for integer-valued parameters (e.g.
`min_samples_leaf`) we can
-# adapt is as follows:
+# adapt it as follows:
 # ```

 # %%

From ffa8e3f28bf444e3b68304ff8333fc039b219bd2 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:27:33 +0200
Subject: [PATCH 051/108] ENH Add explicit definitions for technical terms
 (#718)

---
 jupyter-book/appendix/glossary.md             | 21 ++++++++
 .../02_numerical_pipeline_introduction.py     | 54 +++++++++++--------
 .../02_numerical_pipeline_scaling.py          |  3 ++
 .../cross_validation_validation_curve.py      | 17 ++++--
 python_scripts/ensemble_bagging.py            |  2 +-
 python_scripts/parameter_tuning_manual.py     | 47 +++++++---------
 6 files changed, 88 insertions(+), 56 deletions(-)

diff --git a/jupyter-book/appendix/glossary.md b/jupyter-book/appendix/glossary.md
index 58334871a..30721687e 100644
--- a/jupyter-book/appendix/glossary.md
+++ b/jupyter-book/appendix/glossary.md
@@ -6,6 +6,27 @@
 of this page.

 ## Main terms used in this course

+### API
+
+Acronym that stands for "Application Programming Interface". It can have a
+slightly different meaning in different contexts: in some cases it can be used
+to designate an online service that can be accessed by remote programs. In the
+context of an online service, the term "API" can be used both to designate the
+service itself, and the technical specification of the programming interface
+used by people who write client applications that connect to this service.
+
+In the context of an offline library such as scikit-learn, it means the list of
+all (public) functions, classes and methods in the library, along with their
+documentation via docstrings. It can be browsed online at:
+
+- https://scikit-learn.org/stable/modules/classes.html
+
+In scikit-learn we try to adopt simple conventions and limit to a minimum the
+number of methods an object must implement. Furthermore, scikit-learn tries to
+use consistent method names for different estimators of the same category: e.g.
+all transformers expose `fit`, `fit_transform` and `transform` methods and
+generally accept arguments of similar types and shapes for those methods.
+
 ### classification

 Type of problems where the goal is to [predict](#predict-prediction) a
diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py
index 51171c320..ca56a13fb 100644
--- a/python_scripts/02_numerical_pipeline_introduction.py
+++ b/python_scripts/02_numerical_pipeline_introduction.py
@@ -11,20 +11,23 @@
 # In this notebook, we present how to build predictive models on tabular
 # datasets, with only numerical features.
 #
-# In particular we will highlight:
+# In particular we highlight:
 #
 # * the scikit-learn API: `.fit(X, y)`/`.predict(X)`/`.score(X, y)`;
 # * how to evaluate the generalization performance of a model with a train-test
 #   split.
 #
+# Here API stands for "Application Programming Interface" and refers to a set of
+# conventions to build self-consistent software. Notice that you can visit the
+# Glossary for more info on technical jargon.
+#
 # ## Loading the dataset with Pandas
 #
-# We will use the same dataset "adult_census" described in the previous
-# notebook. For more details about the dataset see
-# <http://www.openml.org/d/1590>.
+# We use the "adult_census" dataset described in the previous notebook. For more
+# details about the dataset see <http://www.openml.org/d/1590>.
 #
 # Numerical data is the most natural type of data used in machine learning and
-# can (almost) directly be fed into predictive models. We will load a subset of
+# can (almost) directly be fed into predictive models. Here we load a subset of
We will load a subset of +# can (almost) directly be fed into predictive models. Here we load a subset of # the original data with only the numerical columns. # %% @@ -56,7 +59,7 @@ data.head() # %% [markdown] -# We can now linger on the variables, also denominated features, that we will +# We can now linger on the variables, also denominated features, that we later # use to build our predictive model. In addition, we can also check how many # samples are available in our dataset. @@ -72,7 +75,7 @@ # %% [markdown] # ## Fit a model and make predictions # -# We will build a classification model using the "K-nearest neighbors" strategy. +# We now build a classification model using the "K-nearest neighbors" strategy. # To predict the target of a new sample, a k-nearest neighbors takes into # account its `k` closest samples in the training set and predicts the majority # target of these samples. @@ -97,10 +100,11 @@ # # ![Predictor fit diagram](../figures/api_diagram-predictor.fit.svg) # +# In scikit-learn an object that has a `fit` method is called an **estimator**. # The method `fit` is composed of two elements: (i) a **learning algorithm** and # (ii) some **model states**. The learning algorithm takes the training data and -# training target as input and sets the model states. These model states will be -# used later to either predict (for classifiers and regressors) or transform +# training target as input and sets the model states. These model states are +# later used to either predict (for classifiers and regressors) or transform # data (for transformers). # # Both the learning algorithm and the type of model states are specific to each @@ -120,17 +124,18 @@ target_predicted = model.predict(data) # %% [markdown] -# We can illustrate the prediction mechanism as follows: +# An estimator (an object with a `fit` method) with a `predict` method is called +# a **predictor**. We can illustrate the prediction mechanism as follows: # # ![Predictor predict diagram](../figures/api_diagram-predictor.predict.svg) # -# To predict, a model uses a **prediction function** that will use the input -# data together with the model states. As for the learning algorithm and the -# model states, the prediction function is specific for each type of model. +# To predict, a model uses a **prediction function** that uses the input data +# together with the model states. As for the learning algorithm and the model +# states, the prediction function is specific for each type of model. # %% [markdown] # Let's now have a look at the computed predictions. For the sake of simplicity, -# we will look at the five first predicted targets. +# we look at the five first predicted targets. # %% target_predicted[:5] @@ -214,7 +219,9 @@ print(f"The test accuracy using a {model_name} is {accuracy:.3f}") # %% [markdown] -# Let's check the underlying mechanism when the `score` method is called: +# We use the generic term **model** for objects whose goodness of fit can be +# measured using the `score` method. Let's check the underlying mechanism when +# calling `score`: # # ![Predictor score diagram](../figures/api_diagram-predictor.score.svg) # @@ -234,13 +241,13 @@ # %% [markdown] # ```{note} -# In this MOOC, we will refer to **generalization performance** of a model when -# referring to the test score or test error obtained by comparing the -# prediction of a model and the true targets. Equivalent terms for -# **generalization performance** are predictive performance and statistical -# performance. 
We will refer to **computational performance** of a predictive
-# model when assessing the computational costs of training a predictive model
-# or using it to make predictions.
+# In this MOOC, we refer to **generalization performance** of a model when
+# referring to the test score or test error obtained by comparing the prediction
+# of a model and the true targets. Equivalent terms for **generalization
+# performance** are predictive performance and statistical performance. We refer
+# to **computational performance** of a predictive model when assessing the
+# computational costs of training a predictive model or using it to make
+# predictions.
 # ```

 # %% [markdown]
@@ -252,4 +259,5 @@
 #   * evaluated its generalization performance on the testing data;
 # * introduced the scikit-learn API `.fit(X, y)` (to train a model),
 #   `.predict(X)` (to make predictions) and `.score(X, y)` (to evaluate a
-#   model).
+#   model);
+# * introduced the jargon for estimator, predictor and model.
diff --git a/python_scripts/02_numerical_pipeline_scaling.py b/python_scripts/02_numerical_pipeline_scaling.py
index 19ac871b8..66370921d 100644
--- a/python_scripts/02_numerical_pipeline_scaling.py
+++ b/python_scripts/02_numerical_pipeline_scaling.py
@@ -166,6 +166,9 @@
 # `fit` and then `transform`.
 #
 # ![Transformer fit_transform diagram](../figures/api_diagram-transformer.fit_transform.svg)
+#
+# In scikit-learn jargon, a **transformer** is defined as an estimator (an
+# object with a `fit` method) supporting `transform` or `fit_transform`.

 # %%
 data_train_scaled = scaler.fit_transform(data_train)
diff --git a/python_scripts/cross_validation_validation_curve.py b/python_scripts/cross_validation_validation_curve.py
index b91dda141..7f2373720 100644
--- a/python_scripts/cross_validation_validation_curve.py
+++ b/python_scripts/cross_validation_validation_curve.py
@@ -12,8 +12,8 @@
 # and how it helps us quantify the training and testing errors as well as their
 # fluctuations.
 #
-# In this notebook, we will put these two errors into perspective and show how
-# they can help us know if our model generalizes, overfits, or underfits.
+# In this notebook, we put these two errors into perspective and show how they
+# can help us know if our model generalizes, overfits, or underfits.
 #
 # Let's first load the data and create the same model as in the previous
 # notebook.
@@ -40,7 +40,7 @@
 # ## Overfitting vs. underfitting
 #
 # To better understand the generalization performance of our model and maybe
-# find insights on how to improve it, we will compare the testing error with the
+# find insights on how to improve it, we compare the testing error with the
 # training error. Thus, we need to compute the error on the training set, which
 # is possible using the `cross_validate` function.
@@ -93,13 +93,20 @@
 #
 # ## Validation curve
 #
+# We call **hyperparameters** those parameters that potentially impact the
+# result of the learning and subsequent predictions of a predictor. For example:
+#
+# - the number of neighbors in a k-nearest neighbor model;
+#
+# - the degree of the polynomial.
+#
 # Some model hyperparameters are usually the key to go from a model that
 # underfits to a model that overfits, hopefully going through a region where we
 # can get a good balance between the two. We can acquire knowledge by plotting a
 # curve called the validation curve. This curve can also be applied to the above
 # experiment and varies the value of a hyperparameter.
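As a complement to the validation curve presented in this notebook, the underfit/overfit transition can also be inspected by hand. The sketch below uses a k-NN classifier on synthetic data (both arbitrary choices, not part of the patched files) and prints the train-test gap for a few hyperparameter values:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=500, random_state=0)
for n_neighbors in [1, 10, 100]:
    cv_results = cross_validate(
        KNeighborsClassifier(n_neighbors=n_neighbors),
        X,
        y,
        cv=5,
        return_train_score=True,
    )
    gap = cv_results["train_score"].mean() - cv_results["test_score"].mean()
    # A large train-test gap suggests overfitting; low scores on both sides
    # suggest underfitting.
    print(f"n_neighbors={n_neighbors}: train-test gap = {gap:.2f}")
```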
# -# For the decision tree, the `max_depth` parameter is used to control the +# For the decision tree, the `max_depth` hyperparameter is used to control the # tradeoff between under-fitting and over-fitting. # %% @@ -165,4 +172,4 @@ # # * how to identify whether a model is generalizing, overfitting, or # underfitting; -# * how to check influence of a hyperparameter on the tradeoff underfit/overfit. +# * how to check influence of a hyperparameter on the underfit/overfit tradeoff. diff --git a/python_scripts/ensemble_bagging.py b/python_scripts/ensemble_bagging.py index 84696187c..111ef9684 100644 --- a/python_scripts/ensemble_bagging.py +++ b/python_scripts/ensemble_bagging.py @@ -270,7 +270,7 @@ def bootstrap_sample(data, target): # # ## Bagging in scikit-learn # -# Scikit-learn implements the bagging procedure as a "meta-estimator", that is +# Scikit-learn implements the bagging procedure as a **meta-estimator**, that is, # an estimator that wraps another estimator: it takes a base model that is # cloned several times and trained independently on each bootstrap sample. # diff --git a/python_scripts/parameter_tuning_manual.py b/python_scripts/parameter_tuning_manual.py index 59072bca8..15d047a80 100644 --- a/python_scripts/parameter_tuning_manual.py +++ b/python_scripts/parameter_tuning_manual.py @@ -8,26 +8,19 @@ # %% [markdown] # # Set and get hyperparameters in scikit-learn # -# The process of learning a predictive model is driven by a set of internal -# parameters and a set of training data. These internal parameters are called -# hyperparameters and are specific for each family of models. In addition, a -# specific set of hyperparameters are optimal for a specific dataset and thus -# they need to be optimized. -# -# ```{note} -# In this notebook we will use the words "hyperparameters" and "parameters" -# interchangeably. -# ``` +# Recall that hyperparameters refer to the parameters that control the learning +# process of a predictive model and are specific for each family of models. In +# addition, the optimal set of hyperparameters is specific to each dataset and +# thus they always need to be optimized. # # This notebook shows how one can get and set the value of a hyperparameter in a -# scikit-learn estimator. We recall that hyperparameters refer to the parameter -# that will control the learning process. +# scikit-learn estimator. # # They should not be confused with the fitted parameters, resulting from the # training. These fitted parameters are recognizable in scikit-learn because # they are spelled with a final underscore `_`, for instance `model.coef_`. # -# We will start by loading the adult census dataset and only use the numerical +# We start by loading the adult census dataset and only use the numerical # features. # %% @@ -83,7 +76,7 @@ # %% [markdown] # We created a model with the default `C` value that is equal to 1. If we wanted -# to use a different `C` parameter we could have done so when we created the +# to use a different `C` hyperparameter we could have done so when we created the # `LogisticRegression` object with something like `LogisticRegression(C=1e-3)`. # # ```{note} @@ -92,9 +85,9 @@ # Be aware that we will focus on linear models in an upcoming module. # ``` # -# We can also change the parameter of a model after it has been created with the -# `set_params` method, which is available for all scikit-learn estimators. 
For -# example, we can set `C=1e-3`, fit and evaluate the model: +# We can also change the hyperparameter of a model after it has been created +# with the `set_params` method, which is available for all scikit-learn +# estimators. For example, we can set `C=1e-3`, fit and evaluate the model: # %% model.set_params(classifier__C=1e-3) @@ -106,23 +99,23 @@ ) # %% [markdown] -# When the model of interest is a `Pipeline`, the parameter names are of the -# form `__` (note the double underscore in the -# middle). In our case, `classifier` comes from the `Pipeline` definition and -# `C` is the parameter name of `LogisticRegression`. +# When the model of interest is a `Pipeline`, the hyperparameter names are of +# the form `__` (note the double underscore in +# the middle). In our case, `classifier` comes from the `Pipeline` definition +# and `C` is the hyperparameter name of `LogisticRegression`. # # In general, you can use the `get_params` method on scikit-learn models to list -# all the parameters with their values. For example, if you want to get all the -# parameter names, you can use: +# all the hyperparameters with their values. For example, if you want to get all +# the hyperparameter names, you can use: # %% for parameter in model.get_params(): print(parameter) # %% [markdown] -# `.get_params()` returns a `dict` whose keys are the parameter names and whose -# values are the parameter values. If you want to get the value of a single -# parameter, for example `classifier__C`, you can use: +# `.get_params()` returns a `dict` whose keys are the hyperparameter names and +# whose values are the hyperparameter values. If you want to get the value of a +# single hyperparameter, for example `classifier__C`, you can use: # %% model.get_params()["classifier__C"] @@ -158,5 +151,5 @@ # %% [markdown] # In this notebook we have seen: # -# - how to use `get_params` and `set_params` to get the parameters of a model +# - how to use `get_params` and `set_params` to get the hyperparameters of a model # and set them. From 0739e1802726253864b6288ffa676e1c54d8cbbf Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 19 Sep 2023 12:27:55 +0200 Subject: [PATCH 052/108] ENH Expand contents related to Nystroem (#717) --- python_scripts/linear_models_ex_02.py | 61 ++++++- python_scripts/linear_models_sol_02.py | 157 ++++++++++++++++-- .../linear_regression_non_linear_link.py | 8 +- 3 files changed, 201 insertions(+), 25 deletions(-) diff --git a/python_scripts/linear_models_ex_02.py b/python_scripts/linear_models_ex_02.py index f58a1f0fe..fdfdaf610 100644 --- a/python_scripts/linear_models_ex_02.py +++ b/python_scripts/linear_models_ex_02.py @@ -15,14 +15,14 @@ # # ๐Ÿ“ Exercise M4.02 # # In the previous notebook, we showed that we can add new features based on the -# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. -# In that case we only used a single feature in `data`. +# original feature `x` to make the model more expressive, for instance `x ** 2` or +# `x ** 3`. In that case we only used a single feature in `data`. # # The aim of this notebook is to train a linear regression algorithm on a # dataset with more than a single feature. In such a "multi-dimensional" feature -# space we can derive new features of the form `x1 * x2`, `x2 * x3`, -# etc. Products of features are usually called "non-linear or -# multiplicative interactions" between features. +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, etc. 
+# Products of features are usually called "non-linear" or "multiplicative" +# interactions between features. # # Feature engineering can be an important step of a model pipeline as long as # the new features are expected to be predictive. For instance, think of a @@ -69,7 +69,9 @@ # Write your code here. # %% [markdown] -# Compute the mean and std of the MAE in grams (g). +# Compute the mean and std of the MAE in grams (g). Remember you have to revert +# the sign introduced when metrics start with `neg_`, such as in +# `"neg_mean_absolute_error"`. # %% # Write your code here. @@ -78,16 +80,57 @@ # Now create a pipeline using `make_pipeline` consisting of a # `PolynomialFeatures` and a linear regression. Set `degree=2` and # `interaction_only=True` to the feature engineering step. Remember not to -# include the bias to avoid redundancies with the linear's regression intercept. +# include a "bias" feature (that is a constant-valued feature) to avoid +# introducing a redundancy with the intercept of the subsequent linear +# regression model. # -# Use the same strategy as before to cross-validate such a pipeline. +# You may want to use the `.set_output(transform="pandas")` method of the +# pipeline to answer the next question. # %% # Write your code here. # %% [markdown] -# Compute the mean and std of the MAE in grams (g) and compare with the results +# Transform the first 5 rows of the dataset and look at the column names. How +# many features are generated at the output of the `PolynomialFeatures` step in +# the previous pipeline? + +# %% +# Write your code here. + +# %% [markdown] +# Check that the values for the new interaction features are correct for a few +# of them. + +# %% +# Write your code here. + +# %% [markdown] +# Use the same cross-validation strategy as done previously to estimate the mean +# and std of the MAE in grams (g) for such a pipeline. Compare with the results # without feature engineering. # %% # Write your code here. + +# %% [markdown] +# +# Now let's try to build an alternative pipeline with an adjustable number of +# intermediate features while keeping a similar predictive power. To do so, try +# using the `Nystroem` transformer instead of `PolynomialFeatures`. Set the +# kernel parameter to `"poly"` and `degree` to 2. Adjust the number of +# components to be as small as possible while keeping a good cross-validation +# performance. +# +# Hint: Use a `ValidationCurveDisplay` with `param_range = np.array([5, 10, 50, +# 100])` to find the optimal `n_components`. + +# %% +# Write your code here. + +# %% [markdown] +# How do the mean and std of the MAE for the Nystroem pipeline with optimal +# `n_components` compare to the other previous models? + +# %% +# Write your code here. diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py index 4ae5d6f87..170533dcf 100644 --- a/python_scripts/linear_models_sol_02.py +++ b/python_scripts/linear_models_sol_02.py @@ -9,14 +9,14 @@ # # ๐Ÿ“ƒ Solution for Exercise M4.02 # # In the previous notebook, we showed that we can add new features based on the -# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. -# In that case we only used a single feature in `data`. +# original feature `x` to make the model more expressive, for instance `x ** 2` or +# `x ** 3`. In that case we only used a single feature in `data`. # # The aim of this notebook is to train a linear regression algorithm on a # dataset with more than a single feature. 
In such a "multi-dimensional" feature -# space we can derive new features of the form `x1 * x2`, `x2 * x3`, -# etc. Products of features are usually called "non-linear or -# multiplicative interactions" between features. +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, etc. +# Products of features are usually called "non-linear" or "multiplicative" +# interactions between features. # # Feature engineering can be an important step of a model pipeline as long as # the new features are expected to be predictive. For instance, think of a @@ -76,7 +76,9 @@ ) # %% [markdown] -# Compute the mean and std of the MAE in grams (g). +# Compute the mean and std of the MAE in grams (g). Remember you have to revert +# the sign introduced when metrics start with `neg_`, such as in +# `"neg_mean_absolute_error"`. # %% # solution @@ -94,7 +96,8 @@ # introducing a redundancy with the intercept of the subsequent linear # regression model. # -# Use the same strategy as before to cross-validate such a pipeline. +# You may want to use the `.set_output(transform="pandas")` method of the +# pipeline to answer the next question. # %% # solution @@ -106,8 +109,46 @@ ) linear_regression_interactions = make_pipeline( poly_features, linear_regression -) +).set_output(transform="pandas") + +# %% [markdown] +# Transform the first 5 rows of the dataset and look at the column names. How +# many features are generated at the output of the `PolynomialFeatures` step in +# the previous pipeline? + +# %% +# solution +linear_regression_interactions.fit(data, target) +linear_regression_interactions[0].transform(data[:5]) + +# %% [markdown] tags=["solution"] +# We observe that 3 features are generated, corresponding to the different +# combinations of products of the 3 original features, i.e. we have 6 +# intermediate features in total. In general, given `p` original features, one +# has `p * (p - 1) / 2` interactions. + +# %% [markdown] +# Check that the values for the new interaction features are correct for a few +# of them. + +# %% [markdown] tags=["solution"] +# Let's now check that the value in the 1st row and the 5th column (3384.7) is +# the product of the values at the first and third columns (respectively 181.0 +# and 18.7) of the same row: + +# %% +# solution +culmen_length_first_sample = 181.0 +culmen_depth_first_sample = 18.7 +culmen_length_first_sample * culmen_depth_first_sample + +# %% [markdown] +# Use the same cross-validation strategy as done previously to estimate the mean +# and std of the MAE in grams (g) for such a pipeline. Compare with the results +# without feature engineering. +# %% +# solution cv_results = cross_validate( linear_regression_interactions, data, @@ -116,21 +157,107 @@ scoring="neg_mean_absolute_error", n_jobs=2, ) +print( + "Mean absolute error on testing set with interactions: " + f"{-cv_results['test_score'].mean():.3f} ยฑ " + f"{cv_results['test_score'].std():.3f} g" +) + +# %% [markdown] tags=["solution"] +# We observe that the MAE is lower and less spread with the enriched features. +# In this case the additional "interaction" features are indeed predictive. +# Later in this module we will see what happens when the enriched features are +# non-predictive and how to deal with this case. # %% [markdown] -# Compute the mean and std of the MAE in grams (g) and compare with the results -# without feature engineering. +# +# Now let's try to build an alternative pipeline with an adjustable number of +# intermediate features while keeping a similar predictive power. 
To do so, try +# using the `Nystroem` transformer instead of `PolynomialFeatures`. Set the +# kernel parameter to `"poly"` and `degree` to 2. Adjust the number of +# components to be as small as possible while keeping a good cross-validation +# performance. +# +# Hint: Use a `ValidationCurveDisplay` with `param_range = np.array([5, 10, 50, +# 100])` to find the optimal `n_components`. # %% # solution +import numpy as np + +from sklearn.kernel_approximation import Nystroem +from sklearn.model_selection import ValidationCurveDisplay + +nystroem_regression = make_pipeline( + Nystroem(kernel="poly", degree=2, random_state=0), + linear_regression, +) + +param_range = np.array([5, 10, 50, 100]) +disp = ValidationCurveDisplay.from_estimator( + nystroem_regression, + data, + target, + param_name="nystroem__n_components", + param_range=param_range, + cv=10, + scoring="neg_mean_absolute_error", + negate_score=True, + std_display_style="errorbar", + n_jobs=2, +) + +_ = disp.ax_.set( + xlabel="Number of components", + ylabel="Mean absolute error (g)", + title="Validation curve for Nystroem regression", +) + +# %% [markdown] tags=["solution"] +# In the validation curve above we can observe that a small number of components +# leads to an underfitting model, whereas a large number of components leads to +# an overfitting model. The optimal number of Nystrรถm components is around 10 +# for this dataset. + +# %% [markdown] +# How do the mean and std of the MAE for the Nystroem pipeline with optimal +# `n_components` compare to the other previous models? + +# %% +# solution +nystroem_regression.set_params(nystroem__n_components=10) +cv_results = cross_validate( + nystroem_regression, + data, + target, + cv=10, + scoring="neg_mean_absolute_error", + n_jobs=2, +) print( - "Mean absolute error on testing set with interactions: " + "Mean absolute error on testing set with nystroem: " f"{-cv_results['test_score'].mean():.3f} ยฑ " f"{cv_results['test_score'].std():.3f} g" ) # %% [markdown] tags=["solution"] -# We observe that the mean absolute error is lower and less spread with the -# enriched features. In this case the "interactions" are indeed predictive. In -# the following notebook we will see what happens when the enriched features are -# non-predictive and how to deal with this case. +# In this case we have a model with 10 features instead of 6, and which has +# approximately the same prediction error as the model with interactions. +# +# Notice that if we had `p = 100` original features (instead of 3), the +# `PolynomialFeatures` transformer would have generated `100 * (100 - 1) / 2 = +# 4950` additional interaction features (so we would have 5050 features in +# total). The resulting pipeline would have been much slower to train and +# predict and would have had a much larger memory footprint. Furthermore, the +# large number of interaction features would probably have resulted in an +# overfitting model. +# +# On the other hand, the `Nystroem` transformer generates a user-adjustable +# number of features (`n_components`). Furthermore, the optimal number of +# components is usually much smaller than that. So the `Nystroem` transformer +# can be more scalable when the number of original features is too large for +# `PolynomialFeatures` to be used. +# +# The main downside of the `Nystroem` transformer is that it is not possible to +# easily interpret the meaning of the generated features and therefore the +# meaning of the learned coefficients for the downstream linear model. 
diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py index 2da7d435a..996e5fcce 100644 --- a/python_scripts/linear_regression_non_linear_link.py +++ b/python_scripts/linear_regression_non_linear_link.py @@ -299,11 +299,17 @@ ax.plot(data, target_predicted) _ = ax.set_title(f"Mean squared error = {mse:.2f}") +# %% [markdown] +# `Nystroem` is a nice alternative to `PolynomialFeatures` that makes it +# possible to keep the memory usage of the transformed dataset under control. +# However, interpreting the meaning of the intermediate features can be +# challenging. + # %% from sklearn.kernel_approximation import Nystroem nystroem_regression = make_pipeline( - Nystroem(n_components=5), + Nystroem(kernel="poly", degree=3, n_components=5, random_state=0), LinearRegression(), ) nystroem_regression.fit(data, target) From 3417163cb282918ecf717c73e47dd014e5f9634e Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 20 Sep 2023 11:34:27 +0200 Subject: [PATCH 053/108] Synchronize quizzes to use ValidationCurveDisplay (#719) Co-authored-by: ArturoAmorQ --- jupyter-book/ensemble/ensemble_wrap_up_quiz.md | 7 +++++-- jupyter-book/overfit/overfit_wrap_up_quiz.md | 13 ++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/jupyter-book/ensemble/ensemble_wrap_up_quiz.md b/jupyter-book/ensemble/ensemble_wrap_up_quiz.md index 5d1f596bd..14532ce08 100644 --- a/jupyter-book/ensemble/ensemble_wrap_up_quiz.md +++ b/jupyter-book/ensemble/ensemble_wrap_up_quiz.md @@ -85,7 +85,10 @@ _Select a single answer_ Plot the validation curve of the `n_estimators` parameters defined by: ```python -n_estimators = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000] +import numpy as np + + +n_estimators = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000]) ``` ```{admonition} Question @@ -182,7 +185,7 @@ Build a validation curve for a `sklearn.ensemble.HistGradientBoostingRegressor` varying `max_iter` as follows: ```python -max_iter = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000] +max_iters = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500]) ``` We recall that `max_iter` corresponds to the number of trees in the boosted diff --git a/jupyter-book/overfit/overfit_wrap_up_quiz.md b/jupyter-book/overfit/overfit_wrap_up_quiz.md index 8b27695fa..ef8da43e8 100644 --- a/jupyter-book/overfit/overfit_wrap_up_quiz.md +++ b/jupyter-book/overfit/overfit_wrap_up_quiz.md @@ -131,11 +131,14 @@ function to also compute the train score. +++ -We will now study the effect of the parameter `n_neighbors` on the train and -test score using a validation curve. You can use the following parameter range: +We now study the effect of the parameter `n_neighbors` on the train and test +score using a validation curve. 
You can use the following parameter range: ```python -param_range = [1, 2, 5, 10, 20, 50, 100, 200, 500] +import numpy as np + + +param_range = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500]) ``` Also, use a 5-fold cross-validation and compute the balanced accuracy score @@ -157,7 +160,7 @@ _Select a single answer_ +++ ```{admonition} Question -Select the true affirmations stated below: +Select the most correct of the affirmations stated below: - a) The model overfits for a range of `n_neighbors` values between 1 to 10 - b) The model overfits for a range of `n_neighbors` values between 10 to 100 @@ -169,7 +172,7 @@ _Select a single answer_ +++ ```{admonition} Question -Select which of the following statements are true: +Select the most correct of the affirmations stated below: - a) The model best generalizes for a range of `n_neighbors` values between 1 to 10 - b) The model best generalizes for a range of `n_neighbors` values between 10 to 100 From 5b4505e6d5cb04b3871c05d2463f96c4fa3084fe Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:10:31 +0200 Subject: [PATCH 054/108] Fix invalidparametererror in plot_tree (#720) Co-authored-by: ArturoAmorQ --- python_scripts/trees_classification.py | 2 +- python_scripts/trees_regression.py | 2 +- python_scripts/trees_sol_01.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python_scripts/trees_classification.py b/python_scripts/trees_classification.py index 21b772666..d83b5203e 100644 --- a/python_scripts/trees_classification.py +++ b/python_scripts/trees_classification.py @@ -133,7 +133,7 @@ _ = plot_tree( tree, feature_names=culmen_columns, - class_names=tree.classes_, + class_names=tree.classes_.tolist(), impurity=False, ax=ax, ) diff --git a/python_scripts/trees_regression.py b/python_scripts/trees_regression.py index 56fbfd3f0..8431c025c 100644 --- a/python_scripts/trees_regression.py +++ b/python_scripts/trees_regression.py @@ -144,7 +144,7 @@ from sklearn.tree import plot_tree _, ax = plt.subplots(figsize=(8, 6)) -_ = plot_tree(tree, feature_names=feature_name, ax=ax) +_ = plot_tree(tree, feature_names=[feature_name], ax=ax) # %% [markdown] # The threshold for our feature (flipper length) is 206.5 mm. 
The predicted diff --git a/python_scripts/trees_sol_01.py b/python_scripts/trees_sol_01.py index bdf10b00e..34dcbf81c 100644 --- a/python_scripts/trees_sol_01.py +++ b/python_scripts/trees_sol_01.py @@ -86,7 +86,7 @@ _ = plot_tree( tree, feature_names=culmen_columns, - class_names=tree.classes_, + class_names=tree.classes_.tolist(), impurity=False, ax=ax, ) From 86ad85864497eed500a53adde59166e19b6ac4de Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 20 Sep 2023 16:01:29 +0200 Subject: [PATCH 055/108] Synchronize notebooks by running 'make notebooks' (#721) --- .../02_numerical_pipeline_introduction.ipynb | 54 ++- notebooks/02_numerical_pipeline_scaling.ipynb | 23 +- notebooks/03_categorical_pipeline.ipynb | 208 ++++----- notebooks/cross_validation_ex_01.ipynb | 37 +- .../cross_validation_learning_curve.ipynb | 69 +-- notebooks/cross_validation_sol_01.ipynb | 119 ++---- .../cross_validation_validation_curve.ipynb | 99 ++--- notebooks/ensemble_bagging.ipynb | 2 +- notebooks/ensemble_ex_03.ipynb | 18 +- notebooks/ensemble_hyperparameters.ipynb | 94 ++-- notebooks/ensemble_sol_03.ipynb | 99 +---- notebooks/linear_models_ex_02.ipynb | 160 ++++++- notebooks/linear_models_ex_03.ipynb | 136 ++++++ notebooks/linear_models_regularization.ipynb | 400 +++++++++++------- notebooks/linear_models_sol_02.ipynb | 312 ++++++++++++-- notebooks/linear_models_sol_03.ipynb | 200 +++++++++ notebooks/linear_regression_in_sklearn.ipynb | 30 +- .../linear_regression_non_linear_link.ipynb | 73 +++- notebooks/logistic_regression.ipynb | 37 +- notebooks/metrics_classification.ipynb | 37 +- notebooks/metrics_regression.ipynb | 196 ++++++--- notebooks/metrics_sol_02.ipynb | 85 +++- notebooks/parameter_tuning_manual.ipynb | 48 +-- .../parameter_tuning_randomized_search.ipynb | 34 +- notebooks/trees_classification.ipynb | 2 +- notebooks/trees_regression.ipynb | 2 +- notebooks/trees_sol_01.ipynb | 2 +- 27 files changed, 1698 insertions(+), 878 deletions(-) create mode 100644 notebooks/linear_models_ex_03.ipynb create mode 100644 notebooks/linear_models_sol_03.ipynb diff --git a/notebooks/02_numerical_pipeline_introduction.ipynb b/notebooks/02_numerical_pipeline_introduction.ipynb index 0e39aa5b1..102280de9 100644 --- a/notebooks/02_numerical_pipeline_introduction.ipynb +++ b/notebooks/02_numerical_pipeline_introduction.ipynb @@ -9,20 +9,23 @@ "In this notebook, we present how to build predictive models on tabular\n", "datasets, with only numerical features.\n", "\n", - "In particular we will highlight:\n", + "In particular we highlight:\n", "\n", "* the scikit-learn API: `.fit(X, y)`/`.predict(X)`/`.score(X, y)`;\n", "* how to evaluate the generalization performance of a model with a train-test\n", " split.\n", "\n", + "Here API stands for \"Application Programming Interface\" and refers to a set of\n", + "conventions to build self-consistent software. Notice that you can visit the\n", + "Glossary for more info on technical jargon.\n", + "\n", "## Loading the dataset with Pandas\n", "\n", - "We will use the same dataset \"adult_census\" described in the previous\n", - "notebook. For more details about the dataset see\n", - ".\n", + "We use the \"adult_census\" dataset described in the previous notebook. For more\n", + "details about the dataset see .\n", "\n", "Numerical data is the most natural type of data used in machine learning and\n", - "can (almost) directly be fed into predictive models. We will load a subset of\n", + "can (almost) directly be fed into predictive models. 
Here we load a subset of\n", "the original data with only the numerical columns." ] }, @@ -90,7 +93,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can now linger on the variables, also denominated features, that we will\n", + "We can now linger on the variables, also denominated features, that we later\n", "use to build our predictive model. In addition, we can also check how many\n", "samples are available in our dataset." ] @@ -122,7 +125,7 @@ "source": [ "## Fit a model and make predictions\n", "\n", - "We will build a classification model using the \"K-nearest neighbors\" strategy.\n", + "We now build a classification model using the \"K-nearest neighbors\" strategy.\n", "To predict the target of a new sample, a k-nearest neighbors takes into\n", "account its `k` closest samples in the training set and predicts the majority\n", "target of these samples.\n", @@ -158,10 +161,11 @@ "\n", "![Predictor fit diagram](../figures/api_diagram-predictor.fit.svg)\n", "\n", + "In scikit-learn an object that has a `fit` method is called an **estimator**.\n", "The method `fit` is composed of two elements: (i) a **learning algorithm** and\n", "(ii) some **model states**. The learning algorithm takes the training data and\n", - "training target as input and sets the model states. These model states will be\n", - "used later to either predict (for classifiers and regressors) or transform\n", + "training target as input and sets the model states. These model states are\n", + "later used to either predict (for classifiers and regressors) or transform\n", "data (for transformers).\n", "\n", "Both the learning algorithm and the type of model states are specific to each\n", @@ -200,13 +204,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can illustrate the prediction mechanism as follows:\n", + "An estimator (an object with a `fit` method) with a `predict` method is called\n", + "a **predictor**. We can illustrate the prediction mechanism as follows:\n", "\n", "![Predictor predict diagram](../figures/api_diagram-predictor.predict.svg)\n", "\n", - "To predict, a model uses a **prediction function** that will use the input\n", - "data together with the model states. As for the learning algorithm and the\n", - "model states, the prediction function is specific for each type of model." + "To predict, a model uses a **prediction function** that uses the input data\n", + "together with the model states. As for the learning algorithm and the model\n", + "states, the prediction function is specific for each type of model." ] }, { @@ -214,7 +219,7 @@ "metadata": {}, "source": [ "Let's now have a look at the computed predictions. For the sake of simplicity,\n", - "we will look at the five first predicted targets." + "we look at the five first predicted targets." ] }, { @@ -384,7 +389,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's check the underlying mechanism when the `score` method is called:\n", + "We use the generic term **model** for objects whose goodness of fit can be\n", + "measured using the `score` method. Let's check the underlying mechanism when\n", + "calling `score`:\n", "\n", "![Predictor score diagram](../figures/api_diagram-predictor.score.svg)\n", "\n", @@ -413,13 +420,13 @@ "source": [ "
\n", "

Note

\n", - "

In this MOOC, we will refer to generalization performance of a model when\n", - "referring to the test score or test error obtained by comparing the\n", - "prediction of a model and the true targets. Equivalent terms for\n", - "generalization performance are predictive performance and statistical\n", - "performance. We will refer to computational performance of a predictive\n", - "model when assessing the computational costs of training a predictive model\n", - "or using it to make predictions.

\n", + "

In this MOOC, we refer to generalization performance of a model when\n", + "referring to the test score or test error obtained by comparing the prediction\n", + "of a model and the true targets. Equivalent terms for generalization\n", + "performance are predictive performance and statistical performance. We refer\n", + "to computational performance of a predictive model when assessing the\n", + "computational costs of training a predictive model or using it to make\n", + "predictions.

\n", "
" ] }, @@ -435,7 +442,8 @@ "* evaluated its generalization performance on the testing data;\n", "* introduced the scikit-learn API `.fit(X, y)` (to train a model),\n", " `.predict(X)` (to make predictions) and `.score(X, y)` (to evaluate a\n", - " model)." + " model);\n", + "* introduced the jargon for estimator, predictor and model." ] } ], diff --git a/notebooks/02_numerical_pipeline_scaling.ipynb b/notebooks/02_numerical_pipeline_scaling.ipynb index d695c632b..c7bd8d751 100644 --- a/notebooks/02_numerical_pipeline_scaling.ipynb +++ b/notebooks/02_numerical_pipeline_scaling.ipynb @@ -260,7 +260,10 @@ "Finally, the method `fit_transform` is a shorthand method to call successively\n", "`fit` and then `transform`.\n", "\n", - "![Transformer fit_transform diagram](../figures/api_diagram-transformer.fit_transform.svg)" + "![Transformer fit_transform diagram](../figures/api_diagram-transformer.fit_transform.svg)\n", + "\n", + "In scikit-learn jargon, a **transformer** is defined as an estimator (an\n", + "object with a `fit` method) supporting `transform` or `fit_transform`." ] }, { @@ -273,13 +276,29 @@ "data_train_scaled" ] }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 0 + }, + "source": [ + "By default, all scikit-learn transformers output NumPy arrays. Since\n", + "scikit-learn 1.2, it is possible to set the output to be a pandas dataframe,\n", + "which makes data exploration easier as it preserves the column names. The\n", + "method `set_output` controls this behaviour. Please refer to this [example\n", + "from the scikit-learn\n", + "documentation](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html)\n", + "for more options to configure the output of transformers." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "data_train_scaled = pd.DataFrame(data_train_scaled, columns=data_train.columns)\n", + "scaler = StandardScaler().set_output(transform=\"pandas\")\n", + "data_train_scaled = scaler.fit_transform(data_train)\n", "data_train_scaled.describe()" ] }, diff --git a/notebooks/03_categorical_pipeline.ipynb b/notebooks/03_categorical_pipeline.ipynb index 5f91c713b..3972842a5 100644 --- a/notebooks/03_categorical_pipeline.ipynb +++ b/notebooks/03_categorical_pipeline.ipynb @@ -6,9 +6,9 @@ "source": [ "# Encoding of categorical variables\n", "\n", - "In this notebook, we will present typical ways of dealing with **categorical\n", - "variables** by encoding them, namely **ordinal encoding** and **one-hot\n", - "encoding**." + "In this notebook, we will present typical ways of dealing with\n", + "**categorical variables** by encoding them, namely **ordinal encoding** and\n", + "**one-hot encoding**." ] }, { @@ -44,16 +44,18 @@ "\n", "## Identify categorical variables\n", "\n", - "As we saw in the previous section, a numerical variable is a quantity\n", - "represented by a real or integer number. These variables can be naturally\n", - "handled by machine learning algorithms that are typically composed of a\n", - "sequence of arithmetic instructions such as additions and multiplications.\n", + "As we saw in the previous section, a numerical variable is a\n", + "quantity represented by a real or integer number. 
These variables can be\n", + "naturally handled by machine learning algorithms that are typically composed\n", + "of a sequence of arithmetic instructions such as additions and\n", + "multiplications.\n", "\n", - "In contrast, categorical variables have discrete values, typically represented\n", - "by string labels (but not only) taken from a finite list of possible choices.\n", - "For instance, the variable `native-country` in our dataset is a categorical\n", - "variable because it encodes the data using a finite list of possible countries\n", - "(along with the `?` symbol when this information is missing):" + "In contrast, categorical variables have discrete values, typically\n", + "represented by string labels (but not only) taken from a finite list of\n", + "possible choices. For instance, the variable `native-country` in our dataset\n", + "is a categorical variable because it encodes the data using a finite list of\n", + "possible countries (along with the `?` symbol when this information is\n", + "missing):" ] }, { @@ -69,8 +71,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "How can we easily recognize categorical columns among the dataset? Part of the\n", - "answer lies in the columns' data type:" + "How can we easily recognize categorical columns among the dataset? Part of\n", + "the answer lies in the columns' data type:" ] }, { @@ -93,8 +95,8 @@ "\n", "In the previous notebook, we manually defined the numerical columns. We could\n", "do a similar approach. Instead, we will use the scikit-learn helper function\n", - "`make_column_selector`, which allows us to select columns based on their data\n", - "type. We will illustrate how to use this helper." + "`make_column_selector`, which allows us to select columns based on\n", + "their data type. We will illustrate how to use this helper." ] }, { @@ -157,8 +159,9 @@ "### Encoding ordinal categories\n", "\n", "The most intuitive strategy is to encode each category with a different\n", - "number. The `OrdinalEncoder` will transform the data in such manner. We will\n", - "start by encoding a single column to understand how the encoding works." + "number. The `OrdinalEncoder` will transform the data in such manner.\n", + "We will start by encoding a single column to understand how the encoding\n", + "works." ] }, { @@ -171,7 +174,7 @@ "\n", "education_column = data_categorical[[\"education\"]]\n", "\n", - "encoder = OrdinalEncoder()\n", + "encoder = OrdinalEncoder().set_output(transform=\"pandas\")\n", "education_encoded = encoder.fit_transform(education_column)\n", "education_encoded" ] @@ -228,37 +231,37 @@ "independently. We also note that the number of features before and after the\n", "encoding is the same.\n", "\n", - "However, be careful when applying this encoding strategy: using this integer\n", - "representation leads downstream predictive models to assume that the values\n", - "are ordered (0 < 1 < 2 < 3... for instance).\n", + "However, be careful when applying this encoding strategy:\n", + "using this integer representation leads downstream predictive models\n", + "to assume that the values are ordered (0 < 1 < 2 < 3... for instance).\n", "\n", "By default, `OrdinalEncoder` uses a lexicographical strategy to map string\n", - "category labels to integers. This strategy is arbitrary and often meaningless.\n", - "For instance, suppose the dataset has a categorical variable named `\"size\"`\n", - "with categories such as \"S\", \"M\", \"L\", \"XL\". 
We would like the integer\n", - "representation to respect the meaning of the sizes by mapping them to\n", - "increasing integers such as `0, 1, 2, 3`. However, the lexicographical\n", - "strategy used by default would map the labels \"S\", \"M\", \"L\", \"XL\" to 2, 1, 0,\n", - "3, by following the alphabetical order.\n", - "\n", - "The `OrdinalEncoder` class accepts a `categories` constructor argument to pass\n", - "categories in the expected ordering explicitly. You can find more information\n", - "in the [scikit-learn\n", - "documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)\n", + "category labels to integers. This strategy is arbitrary and often\n", + "meaningless. For instance, suppose the dataset has a categorical variable\n", + "named `\"size\"` with categories such as \"S\", \"M\", \"L\", \"XL\". We would like the\n", + "integer representation to respect the meaning of the sizes by mapping them to\n", + "increasing integers such as `0, 1, 2, 3`.\n", + "However, the lexicographical strategy used by default would map the labels\n", + "\"S\", \"M\", \"L\", \"XL\" to 2, 1, 0, 3, by following the alphabetical order.\n", + "\n", + "The `OrdinalEncoder` class accepts a `categories` constructor argument to\n", + "pass categories in the expected ordering explicitly. You can find more\n", + "information in the\n", + "[scikit-learn documentation](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)\n", "if needed.\n", "\n", - "If a categorical variable does not carry any meaningful order information then\n", - "this encoding might be misleading to downstream statistical models and you\n", - "might consider using one-hot encoding instead (see below).\n", + "If a categorical variable does not carry any meaningful order information\n", + "then this encoding might be misleading to downstream statistical models and\n", + "you might consider using one-hot encoding instead (see below).\n", "\n", "### Encoding nominal categories (without assuming any order)\n", "\n", - "`OneHotEncoder` is an alternative encoder that prevents the downstream models\n", - "to make a false assumption about the ordering of categories. For a given\n", - "feature, it will create as many new columns as there are possible categories.\n", - "For a given sample, the value of the column corresponding to the category will\n", - "be set to `1` while all the columns of the other categories will be set to\n", - "`0`.\n", + "`OneHotEncoder` is an alternative encoder that prevents the downstream\n", + "models to make a false assumption about the ordering of categories. For a\n", + "given feature, it will create as many new columns as there are possible\n", + "categories. For a given sample, the value of the column corresponding to the\n", + "category will be set to `1` while all the columns of the other categories\n", + "will be set to `0`.\n", "\n", "We will start by encoding a single feature (e.g. `\"education\"`) to illustrate\n", "how the encoding works." @@ -272,7 +275,7 @@ "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "encoder = OneHotEncoder(sparse_output=False)\n", + "encoder = OneHotEncoder(sparse_output=False).set_output(transform=\"pandas\")\n", "education_encoded = encoder.fit_transform(education_column)\n", "education_encoded" ] @@ -286,8 +289,8 @@ "

sparse_output=False is used in the OneHotEncoder for didactic purposes,\n", "namely easier visualization of the data.

\n", "

Sparse matrices are efficient data structures when most of your matrix\n", - "elements are zero. They won't be covered in detail in this course. If you want\n", - "more details about them, you can look at\n", + "elements are zero. They won't be covered in detail in this course. If you\n", + "want more details about them, you can look at\n", "this.

\n", "" ] @@ -296,27 +299,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that encoding a single feature will give a NumPy array full of zeros\n", - "and ones. We can get a better understanding using the associated feature names\n", - "resulting from the transformation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_names = encoder.get_feature_names_out(input_features=[\"education\"])\n", - "education_encoded = pd.DataFrame(education_encoded, columns=feature_names)\n", - "education_encoded" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we can see, each category (unique value) became a column; the encoding\n", + "We see that encoding a single feature will give a dataframe full of zeros\n", + "and ones. Each category (unique value) became a column; the encoding\n", "returned, for each sample, a 1 to specify which category it belongs to.\n", "\n", "Let's apply this encoding on the full dataset." @@ -351,24 +335,6 @@ "print(f\"The encoded dataset contains {data_encoded.shape[1]} features\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's wrap this NumPy array in a dataframe with informative column names as\n", - "provided by the encoder object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns_encoded = encoder.get_feature_names_out(data_categorical.columns)\n", - "pd.DataFrame(data_encoded, columns=columns_encoded).head()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -376,8 +342,8 @@ "Look at how the `\"workclass\"` variable of the 3 first records has been encoded\n", "and compare this to the original string representation.\n", "\n", - "The number of features after the encoding is more than 10 times larger than in\n", - "the original data because some variables such as `occupation` and\n", + "The number of features after the encoding is more than 10 times larger than\n", + "in the original data because some variables such as `occupation` and\n", "`native-country` have many possible categories." ] }, @@ -397,9 +363,9 @@ "source": [ "
\n", "

Note

\n", - "

In general OneHotEncoder is the encoding strategy used when the downstream\n", - "models are linear models while OrdinalEncoder is often a good strategy\n", - "with tree-based models.

\n", + "

In general OneHotEncoder is the encoding strategy used when the\n", + "downstream models are linear models while OrdinalEncoder is often a\n", + "good strategy with tree-based models.

\n", "
" ] }, @@ -407,20 +373,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using an `OrdinalEncoder` will output ordinal categories. This means that\n", - "there is an order in the resulting categories (e.g. `0 < 1 < 2`). The impact\n", - "of violating this ordering assumption is really dependent on the downstream\n", - "models. Linear models will be impacted by misordered categories while\n", - "tree-based models will not.\n", + "\n", + "Using an `OrdinalEncoder` will output ordinal categories. This means\n", + "that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The\n", + "impact of violating this ordering assumption is really dependent on the\n", + "downstream models. Linear models will be impacted by misordered categories\n", + "while tree-based models will not.\n", "\n", "You can still use an `OrdinalEncoder` with linear models but you need to be\n", "sure that:\n", "- the original categories (before encoding) have an ordering;\n", "- the encoded categories follow the same ordering than the original\n", " categories.\n", - "\n", - "The **next exercise** shows what can happen when using an `OrdinalEncoder`\n", - "with a liner model and the conditions above are not met.\n", + "The **next exercise** highlights the issue of misusing `OrdinalEncoder` with\n", + "a linear model.\n", "\n", "One-hot encoding categorical variables with high cardinality can cause\n", "computational inefficiency in tree-based models. Because of this, it is not\n", @@ -437,8 +403,8 @@ "\n", "We can now integrate this encoder inside a machine learning pipeline like we\n", "did with numerical data: let's train a linear classifier on the encoded data\n", - "and check the generalization performance of this machine learning pipeline\n", - "using cross-validation.\n", + "and check the generalization performance of this machine learning pipeline using\n", + "cross-validation.\n", "\n", "Before we create the pipeline, we have to linger on the `native-country`.\n", "Let's recall some statistics regarding this column." @@ -457,20 +423,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the `Holand-Netherlands` category is occurring rarely. This will\n", + "We see that the `\"Holand-Netherlands\"` category is occurring rarely. This will\n", "be a problem during cross-validation: if the sample ends up in the test set\n", "during splitting then the classifier would not have seen the category during\n", "training and will not be able to encode it.\n", "\n", - "In scikit-learn, there are two solutions to bypass this issue:\n", + "In scikit-learn, there are some possible solutions to bypass this issue:\n", "\n", - "* list all the possible categories and provide it to the encoder via the\n", - " keyword argument `categories`;\n", - "* use the parameter `handle_unknown`, i.e. if an unknown category is\n", + "* list all the possible categories and provide them to the encoder via the\n", + " keyword argument `categories` instead of letting the estimator automatically\n", + " determine them from the training data when calling fit;\n", + "* set the parameter `handle_unknown=\"ignore\"`, i.e. if an unknown category is\n", " encountered during transform, the resulting one-hot encoded columns for this\n", - " feature will be all zeros.\n", + " feature will be all zeros;\n", + "* adjust the `min_frequency` parameter to collapse the rarest categories\n", + " observed in the training data into a single one-hot encoded feature. 
If you\n", + " enable this option, you can also set `handle_unknown=\"infrequent_if_exist\"`\n", + " to encode the unknown categories (categories only observed at predict time)\n", + " as ones in that last column.\n", "\n", - "Here, we will use the latter solution for simplicity." + "In this notebook we only explore the second option, namely\n", + "`OneHotEncoder(handle_unknown=\"ignore\")`. Feel free to evaluate the\n", + "alternatives on your own, for instance using a sandbox notebook." ] }, { @@ -479,9 +453,9 @@ "source": [ "
\n", "

Tip

\n", - "

Be aware the OrdinalEncoder exposes as well a parameter handle_unknown. It\n", - "can be set to use_encoded_value. If that option is chosen, you can define a\n", - "fixed value to which all unknowns will be set to during transform. For\n", + "

Be aware the OrdinalEncoder exposes a parameter also named handle_unknown.\n", + "It can be set to use_encoded_value. If that option is chosen, you can define\n", + "a fixed value to which all unknowns will be set to during transform. For\n", "example, OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=42) will set all values encountered during transform to 42\n", "which are not part of the data encountered during the fit call. You are\n", "going to use these parameters in the next exercise.

\n", @@ -516,10 +490,10 @@ "
\n", "

Note

\n", "

Here, we need to increase the maximum number of iterations to obtain a fully\n", - "converged LogisticRegression and silence a ConvergenceWarning. Contrary to\n", - "the numerical features, the one-hot encoded categorical features are all on\n", - "the same scale (values are 0 or 1), so they would not benefit from scaling. In\n", - "this case, increasing max_iter is the right thing to do.

\n", + "converged LogisticRegression and silence a ConvergenceWarning. Contrary\n", + "to the numerical features, the one-hot encoded categorical features are all\n", + "on the same scale (values are 0 or 1), so they would not benefit from\n", + "scaling. In this case, increasing max_iter is the right thing to do.

\n", "
" ] }, @@ -557,9 +531,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you can see, this representation of the categorical variables is slightly\n", - "more predictive of the revenue than the numerical variables that we used\n", - "previously." + "As you can see, this representation of the categorical variables is\n", + "slightly more predictive of the revenue than the numerical variables\n", + "that we used previously." ] }, { diff --git a/notebooks/cross_validation_ex_01.ipynb b/notebooks/cross_validation_ex_01.ipynb index 695981a3b..381202093 100644 --- a/notebooks/cross_validation_ex_01.ipynb +++ b/notebooks/cross_validation_ex_01.ipynb @@ -14,7 +14,7 @@ "* use a learning curve to determine the usefulness of adding new samples in\n", " the dataset when building a classifier.\n", "\n", - "To make these experiments we will first load the blood transfusion dataset." + "To make these experiments we first load the blood transfusion dataset." ] }, { @@ -45,7 +45,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will use a support vector machine classifier (SVM). In its most simple\n", + "Here we use a support vector machine classifier (SVM). In its most simple\n", "form, a SVM classifier is a linear classifier behaving similarly to a logistic\n", "regression. Indeed, the optimization used to find the optimal weights of the\n", "linear model are different but we don't need to know these details for the\n", @@ -105,33 +105,17 @@ "As previously mentioned, the parameter `gamma` is one of the parameters\n", "controlling under/over-fitting in support vector machine with an RBF kernel.\n", "\n", - "Evaluate the effect of the parameter `gamma` by using the\n", - "[`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html)\n", - "function. You can leave the default `scoring=None` which is equivalent to\n", + "Evaluate the effect of the parameter `gamma` by using\n", + "[`sklearn.model_selection.ValidationCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ValidationCurveDisplay.html).\n", + "You can leave the default `scoring=None` which is equivalent to\n", "`scoring=\"accuracy\"` for classification problems. You can vary `gamma` between\n", "`10e-3` and `10e2` by generating samples on a logarithmic scale with the help\n", "of `np.logspace(-3, 2, num=30)`.\n", "\n", - "Since we are manipulating a `Pipeline` the parameter name will be set to\n", - "`svc__gamma` instead of only `gamma`. You can retrieve the parameter name\n", - "using `model.get_params().keys()`. We will go more into detail regarding\n", - "accessing and setting hyperparameter in the next section." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Plot the validation curve for the train and test scores." + "Since we are manipulating a `Pipeline` the parameter name is `svc__gamma`\n", + "instead of only `gamma`. You can retrieve the parameter name using\n", + "`model.get_params().keys()`. We will go more into detail regarding accessing\n", + "and setting hyperparameter in the next section." ] }, { @@ -149,7 +133,8 @@ "source": [ "Now, you can perform an analysis to check whether adding new samples to the\n", "dataset could help our model to better generalize. 
Compute the learning curve\n",
    "(using\n",
    "[`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html))\n",
    "by computing the train and test scores for different training dataset size.\n",
    "Plot the train and test scores with respect to the number of samples."
   ]
diff --git a/notebooks/cross_validation_learning_curve.ipynb b/notebooks/cross_validation_learning_curve.ipynb
index 44f04e7d2..aaac1c12d 100644
--- a/notebooks/cross_validation_learning_curve.ipynb
+++ b/notebooks/cross_validation_learning_curve.ipynb
    "generalizing. Besides these aspects, it is also important to understand how\n",
    "the different errors are influenced by the number of samples available.\n",
    "\n",
    "In this notebook, we will show this aspect by looking at the variability of\n",
    "the different errors.\n",
    "\n",
    "Let's first load the data and create the same model as in the previous\n",
    "notebook."
    "the validation curve, but instead of varying a hyperparameter, we vary the\n",
    "number of training samples. This curve is called the **learning curve**.\n",
    "\n",
    "It gives information regarding the benefit of adding new training samples\n",
    "to improve a model's generalization performance.\n",
    "\n",
    "Let's compute the learning curve for a decision tree and vary the\n",
    "proportion of the training set from 10% to 100%."
    "from sklearn.model_selection import LearningCurveDisplay\n",
    "\n",
    "display = LearningCurveDisplay.from_estimator(\n",
    "    regressor,\n",
    "    data,\n",
    "    target,\n",
    "    train_sizes=train_sizes,\n",
    "    cv=cv,\n",
    "    score_type=\"both\",  # both train and test errors\n",
    "    scoring=\"neg_mean_absolute_error\",\n",
    "    negate_score=True,  # to use when metric starts with \"neg_\"\n",
    "    score_name=\"Mean absolute error (k$)\",\n",
    "    std_display_style=\"errorbar\",\n",
    "    n_jobs=2,\n",
    ")\n",
    "_ = display.ax_.set(xscale=\"log\", title=\"Learning curve for decision tree\")"
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we can plot the curve." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "plt.errorbar(\n", - " train_size,\n", - " train_errors.mean(axis=1),\n", - " yerr=train_errors.std(axis=1),\n", - " label=\"Training error\",\n", - ")\n", - "plt.errorbar(\n", - " train_size,\n", - " test_errors.mean(axis=1),\n", - " yerr=test_errors.std(axis=1),\n", - " label=\"Testing error\",\n", - ")\n", - "plt.legend()\n", - "\n", - "plt.xscale(\"log\")\n", - "plt.xlabel(\"Number of samples in the training set\")\n", - "plt.ylabel(\"Mean absolute error (k$)\")\n", - "_ = plt.title(\"Learning curve for decision tree\")" + "_ = display.ax_.set(xscale=\"log\", title=\"Learning curve for decision tree\")" ] }, { @@ -178,11 +145,11 @@ "means that the trained model (i.e. decision tree) is clearly overfitting the\n", "training data.\n", "\n", - "Looking at the testing error alone, we observe that the more samples are added\n", - "into the training set, the lower the testing error becomes. Also, we are\n", - "searching for the plateau of the testing error for which there is no benefit\n", - "to adding samples anymore or assessing the potential gain of adding more\n", - "samples into the training set.\n", + "Looking at the testing error alone, we observe that the more samples are\n", + "added into the training set, the lower the testing error becomes. Also, we\n", + "are searching for the plateau of the testing error for which there is no\n", + "benefit to adding samples anymore or assessing the potential gain of adding\n", + "more samples into the training set.\n", "\n", "If we achieve a plateau and adding new samples in the training set does not\n", "reduce the testing error, we might have reached the Bayes error rate using the\n", diff --git a/notebooks/cross_validation_sol_01.ipynb b/notebooks/cross_validation_sol_01.ipynb index 04780c59d..925c7e379 100644 --- a/notebooks/cross_validation_sol_01.ipynb +++ b/notebooks/cross_validation_sol_01.ipynb @@ -14,7 +14,7 @@ "* use a learning curve to determine the usefulness of adding new samples in\n", " the dataset when building a classifier.\n", "\n", - "To make these experiments we will first load the blood transfusion dataset." + "To make these experiments we first load the blood transfusion dataset." ] }, { @@ -45,7 +45,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will use a support vector machine classifier (SVM). In its most simple\n", + "Here we use a support vector machine classifier (SVM). In its most simple\n", "form, a SVM classifier is a linear classifier behaving similarly to a logistic\n", "regression. Indeed, the optimization used to find the optimal weights of the\n", "linear model are different but we don't need to know these details for the\n", @@ -133,17 +133,17 @@ "As previously mentioned, the parameter `gamma` is one of the parameters\n", "controlling under/over-fitting in support vector machine with an RBF kernel.\n", "\n", - "Evaluate the effect of the parameter `gamma` by using the\n", - "[`sklearn.model_selection.validation_curve`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html)\n", - "function. 
You can leave the default `scoring=None` which is equivalent to\n", + "Evaluate the effect of the parameter `gamma` by using\n", + "[`sklearn.model_selection.ValidationCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ValidationCurveDisplay.html).\n", + "You can leave the default `scoring=None` which is equivalent to\n", "`scoring=\"accuracy\"` for classification problems. You can vary `gamma` between\n", "`10e-3` and `10e2` by generating samples on a logarithmic scale with the help\n", "of `np.logspace(-3, 2, num=30)`.\n", "\n", - "Since we are manipulating a `Pipeline` the parameter name will be set to\n", - "`svc__gamma` instead of only `gamma`. You can retrieve the parameter name\n", - "using `model.get_params().keys()`. We will go more into detail regarding\n", - "accessing and setting hyperparameter in the next section." + "Since we are manipulating a `Pipeline` the parameter name is `svc__gamma`\n", + "instead of only `gamma`. You can retrieve the parameter name using\n", + "`model.get_params().keys()`. We will go more into detail regarding accessing\n", + "and setting hyperparameter in the next section." ] }, { @@ -154,57 +154,29 @@ "source": [ "# solution\n", "import numpy as np\n", - "from sklearn.model_selection import validation_curve\n", + "\n", + "from sklearn.model_selection import ValidationCurveDisplay\n", "\n", "gammas = np.logspace(-3, 2, num=30)\n", "param_name = \"svc__gamma\"\n", - "train_scores, test_scores = validation_curve(\n", + "disp = ValidationCurveDisplay.from_estimator(\n", " model,\n", " data,\n", " target,\n", " param_name=param_name,\n", " param_range=gammas,\n", " cv=cv,\n", + " scoring=\"accuracy\", # this is already the default for classifiers\n", + " score_name=\"Accuracy\",\n", + " std_display_style=\"errorbar\",\n", + " errorbar_kw={\"alpha\": 0.7}, # transparency for better visualization\n", " n_jobs=2,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Plot the validation curve for the train and test scores." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# solution\n", - "import matplotlib.pyplot as plt\n", - "\n", - "plt.errorbar(\n", - " gammas,\n", - " train_scores.mean(axis=1),\n", - " yerr=train_scores.std(axis=1),\n", - " alpha=0.95,\n", - " label=\"Training score\",\n", ")\n", - "plt.errorbar(\n", - " gammas,\n", - " test_scores.mean(axis=1),\n", - " yerr=test_scores.std(axis=1),\n", - " alpha=0.5,\n", - " label=\"Testing score\",\n", - ")\n", - "plt.legend()\n", "\n", - "plt.xscale(\"log\")\n", - "plt.xlabel(r\"Value of hyperparameter $\\gamma$\")\n", - "plt.ylabel(\"Accuracy score\")\n", - "_ = plt.title(\"Validation score of support vector machine\")" + "_ = disp.ax_.set(\n", + " xlabel=r\"Value of hyperparameter $\\gamma$\",\n", + " title=\"Validation curve of support vector machine\",\n", + ")" ] }, { @@ -227,7 +199,8 @@ "source": [ "Now, you can perform an analysis to check whether adding new samples to the\n", "dataset could help our model to better generalize. 
Compute the learning curve\n",
    "(using\n",
    "[`sklearn.model_selection.LearningCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LearningCurveDisplay.html))\n",
    "by computing the train and test scores for different training dataset size.\n",
    "Plot the train and test scores with respect to the number of samples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# solution\n",
    "from sklearn.model_selection import LearningCurveDisplay\n",
    "\n",
    "train_sizes = np.linspace(0.1, 1, num=10)\n",
    "disp = LearningCurveDisplay.from_estimator(\n",
    "    model,\n",
    "    data,\n",
    "    target,\n",
    "    train_sizes=train_sizes,\n",
    "    cv=cv,\n",
    "    score_type=\"both\",\n",
    "    scoring=\"accuracy\",  # this is already the default for classifiers\n",
    "    score_name=\"Accuracy\",\n",
    "    std_display_style=\"errorbar\",\n",
    "    errorbar_kw={\"alpha\": 0.7},  # transparency for better visualization\n",
    "    n_jobs=2,\n",
    ")\n",
    "\n",
    "_ = disp.ax_.set(title=\"Learning curve for support vector machine\")"
  },
diff --git a/notebooks/cross_validation_validation_curve.ipynb b/notebooks/cross_validation_validation_curve.ipynb
index c2d64c039..79208cf24 100644
--- a/notebooks/cross_validation_validation_curve.ipynb
+++ b/notebooks/cross_validation_validation_curve.ipynb
    "and how it helps us quantify the training and testing errors as well as their\n",
    "fluctuations.\n",
    "\n",
    "In this notebook, we put these two errors into perspective and show how they\n",
    "can help us know if our model generalizes, overfits, or underfits.\n",
    "\n",
    "Let's first load the data and create the same model as in the previous\n",
    "notebook."
    "## Overfitting vs. underfitting\n",
    "\n",
    "To better understand the generalization performance of our model and maybe\n",
    "find insights on how to improve it, we compare the testing error with the\n",
    "training error. Thus, we need to compute the error on the training set, which\n",
    "is possible using the `cross_validate` function." 
]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import cross_validate, ShuffleSplit\n",
    "\n",
    "cv = ShuffleSplit(n_splits=30, test_size=0.2, random_state=0)\n",
    "cv_results = cross_validate(\n",
    "    regressor,\n",
    "    data,\n",
    "\n",
    "## Validation curve\n",
    "\n",
    "We call **hyperparameters** those parameters that potentially impact the\n",
    "result of the learning and subsequent predictions of a predictor. For example:\n",
    "\n",
    "- the number of neighbors in a k-nearest neighbor model;\n",
    "\n",
    "- the degree of the polynomial.\n",
    "\n",
    "Some model hyperparameters are usually the key to go from a model that\n",
    "underfits to a model that overfits, hopefully going through a region where we\n",
    "can get a good balance between the two. We can acquire knowledge by plotting a\n",
    "curve called the validation curve. This curve can also be applied to the above\n",
    "experiment and varies the value of a hyperparameter.\n",
    "\n",
    "For the decision tree, the `max_depth` hyperparameter is used to control the\n",
    "tradeoff between under-fitting and over-fitting."
    "%%time\n",
    "import numpy as np\n",
    "from sklearn.model_selection import ValidationCurveDisplay\n",
    "\n",
    "max_depth = np.array([1, 5, 10, 15, 20, 25])\n",
    "disp = ValidationCurveDisplay.from_estimator(\n",
    "    regressor,\n",
    "    data,\n",
    "    target,\n",
    "    param_range=max_depth,\n",
    "    cv=cv,\n",
    "    scoring=\"neg_mean_absolute_error\",\n",
    "    negate_score=True,\n",
    "    std_display_style=\"errorbar\",\n",
    "    n_jobs=2,\n",
    ")\n",
    "_ = disp.ax_.set(\n",
    "    xlabel=\"Maximum depth of decision tree\",\n",
    "    ylabel=\"Mean absolute error (k$)\",\n",
    "    title=\"Validation curve for decision tree\",\n",
    ")"
    "could reach by just tuning this parameter.\n",
    "\n",
    "Be aware that looking at the mean errors is quite limiting. We should also\n",
    "look at the standard deviation to assess the dispersion of the score. We can\n",
    "repeat the same plot as before but this time, we will add some information to\n",
    "show the standard deviation of the errors as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.errorbar(\n", - " max_depth,\n", - " train_errors.mean(axis=1),\n", - " yerr=train_errors.std(axis=1),\n", - " label=\"Training error\",\n", - ")\n", - "plt.errorbar(\n", - " max_depth,\n", - " test_errors.mean(axis=1),\n", - " yerr=test_errors.std(axis=1),\n", - " label=\"Testing error\",\n", - ")\n", - "plt.legend()\n", - "\n", - "plt.xlabel(\"Maximum depth of decision tree\")\n", - "plt.ylabel(\"Mean absolute error (k$)\")\n", - "_ = plt.title(\"Validation curve for decision tree\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We were lucky that the variance of the errors was small compared to their\n", - "respective values, and therefore the conclusions above are quite clear. This\n", - "is not necessarily always the case." + "look at the standard deviation to assess the dispersion of the score. For such\n", + "purpose, we can use the parameter `std_display_style` to show the standard\n", + "deviation of the errors as well. In this case, the variance of the errors is\n", + "small compared to their respective values, and therefore the conclusions above\n", + "are quite clear. This is not necessarily always the case." ] }, { @@ -272,7 +231,7 @@ "\n", "* how to identify whether a model is generalizing, overfitting, or\n", " underfitting;\n", - "* how to check influence of a hyperparameter on the tradeoff underfit/overfit." + "* how to check influence of a hyperparameter on the underfit/overfit tradeoff." ] } ], diff --git a/notebooks/ensemble_bagging.ipynb b/notebooks/ensemble_bagging.ipynb index 3eebca268..5fc5a0328 100644 --- a/notebooks/ensemble_bagging.ipynb +++ b/notebooks/ensemble_bagging.ipynb @@ -363,7 +363,7 @@ "\n", "## Bagging in scikit-learn\n", "\n", - "Scikit-learn implements the bagging procedure as a \"meta-estimator\", that is\n", + "Scikit-learn implements the bagging procedure as a **meta-estimator**, that is,\n", "an estimator that wraps another estimator: it takes a base model that is\n", "cloned several times and trained independently on each bootstrap sample.\n", "\n", diff --git a/notebooks/ensemble_ex_03.ipynb b/notebooks/ensemble_ex_03.ipynb index 1e2b9f9c3..895d786c5 100644 --- a/notebooks/ensemble_ex_03.ipynb +++ b/notebooks/ensemble_ex_03.ipynb @@ -13,7 +13,7 @@ "* use the early-stopping strategy to avoid adding unnecessary trees, to get\n", " the best generalization performances.\n", "\n", - "We will use the California housing dataset to conduct our experiments." + "We use the California housing dataset to conduct our experiments." ] }, { @@ -85,7 +85,7 @@ "For both the gradient-boosting and random forest models, create a validation\n", "curve using the training set to assess the impact of the number of trees on\n", "the performance of each model. Evaluate the list of parameters `param_range =\n", - "[1, 2, 5, 10, 20, 50, 100]` and use the mean absolute error." + "np.array([1, 2, 5, 10, 20, 50, 100])` and use the mean absolute error." ] }, { @@ -101,19 +101,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Both gradient boosting and random forest models will always improve when\n", - "increasing the number of trees in the ensemble. However, it will reach a\n", - "plateau where adding new trees will just make fitting and scoring slower.\n", + "Both gradient boosting and random forest models improve when increasing the\n", + "number of trees in the ensemble. 
However, the scores reach a plateau where\n", + "adding new trees just makes fitting and scoring slower.\n", "\n", "To avoid adding new unnecessary tree, unlike random-forest gradient-boosting\n", - "offers an early-stopping option. Internally, the algorithm will use an\n", + "offers an early-stopping option. Internally, the algorithm uses an\n", "out-of-sample set to compute the generalization performance of the model at\n", "each addition of a tree. Thus, if the generalization performance is not\n", - "improving for several iterations, it will stop adding trees.\n", + "improving for several iterations, it stops adding trees.\n", "\n", "Now, create a gradient-boosting model with `n_estimators=1_000`. This number\n", - "of trees will be too large. Change the parameter `n_iter_no_change` such that\n", - "the gradient boosting fitting will stop after adding 5 trees that do not\n", + "of trees is certainly too large. Change the parameter `n_iter_no_change` such\n", + "that the gradient boosting fitting stops after adding 5 trees that do not\n", "improve the overall generalization performance." ] }, diff --git a/notebooks/ensemble_hyperparameters.ipynb b/notebooks/ensemble_hyperparameters.ipynb index fb872005b..cd2e02b51 100644 --- a/notebooks/ensemble_hyperparameters.ipynb +++ b/notebooks/ensemble_hyperparameters.ipynb @@ -6,23 +6,18 @@ "source": [ "# Hyperparameter tuning\n", "\n", - "In the previous section, we did not discuss the parameters of random forest\n", - "and gradient-boosting. However, there are a couple of things to keep in mind\n", - "when setting these.\n", - "\n", - "This notebook gives crucial information regarding how to set the\n", - "hyperparameters of both random forest and gradient boosting decision tree\n", - "models.\n", + "In the previous section, we did not discuss the hyperparameters of random\n", + "forest and histogram gradient-boosting. This notebook gives crucial\n", + "information regarding how to set them.\n", "\n", "
\n", "

Caution!

\n", - "

For the sake of clarity, no cross-validation will be used to estimate the\n", + "

For the sake of clarity, no nested cross-validation is used to estimate the\n", "variability of the testing error. We are only showing the effect of the\n", - "parameters on the validation set of what should be the inner loop of a nested\n", - "cross-validation.

\n", + "parameters on the validation set.

\n", "
\n", "\n", - "We will start by loading the california housing dataset." + "We start by loading the california housing dataset." ] }, { @@ -49,7 +44,7 @@ "\n", "The main parameter to select in random forest is the `n_estimators` parameter.\n", "In general, the more trees in the forest, the better the generalization\n", - "performance will be. However, it will slow down the fitting and prediction\n", + "performance would be. However, adding trees slows down the fitting and prediction\n", "time. The goal is to balance computing time and generalization performance\n", "when setting the number of estimators. Here, we fix `n_estimators=100`, which\n", "is already the default value.\n", @@ -63,7 +58,7 @@ "\n", "Instead, we can tune the hyperparameter `max_features`, which controls the\n", "size of the random subset of features to consider when looking for the best\n", - "split when growing the trees: smaller values for `max_features` will lead to\n", + "split when growing the trees: smaller values for `max_features` lead to\n", "more random trees with hopefully more uncorrelated prediction errors. However\n", "if `max_features` is too small, predictions can be too random, even after\n", "averaging with the trees in the ensemble.\n", @@ -89,9 +84,9 @@ "We can also tune the different parameters that control the depth of each tree\n", "in the forest. Two parameters are important for this: `max_depth` and\n", "`max_leaf_nodes`. They differ in the way they control the tree structure.\n", - "Indeed, `max_depth` will enforce to have a more symmetric tree, while\n", - "`max_leaf_nodes` does not impose such constraint. If `max_leaf_nodes=None`\n", - "then the number of leaf nodes is unlimited.\n", + "Indeed, `max_depth` enforces growing symmetric trees, while `max_leaf_nodes`\n", + "does not impose such constraint. If `max_leaf_nodes=None` then the number of\n", + "leaf nodes is unlimited.\n", "\n", "The hyperparameter `min_samples_leaf` controls the minimum number of samples\n", "required to be at a leaf node. This means that a split point (at any depth) is\n", @@ -176,26 +171,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Gradient-boosting decision trees\n", + "## Histogram gradient-boosting decision trees\n", "\n", - "For gradient-boosting, parameters are coupled, so we cannot set the parameters\n", - "one after the other anymore. The important parameters are `n_estimators`,\n", + "For gradient-boosting, hyperparameters are coupled, so we cannot set them\n", + "one after the other anymore. The important hyperparameters are `max_iter`,\n", "`learning_rate`, and `max_depth` or `max_leaf_nodes` (as previously discussed\n", "random forest).\n", "\n", - "Let's first discuss the `max_depth` (or `max_leaf_nodes`) parameter. We saw in\n", - "the section on gradient-boosting that the algorithm fits the error of the\n", - "previous tree in the ensemble. Thus, fitting fully grown trees would be\n", + "Let's first discuss `max_iter` which, similarly to the `n_estimators`\n", + "hyperparameter in random forests, controls the number of trees in the\n", + "estimator. The difference is that the actual number of trees trained by the\n", + "model is not entirely set by the user, but depends also on the stopping\n", + "criteria: the number of trees can be lower than `max_iter` if adding a new\n", + "tree does not improve the model enough. We will give more details on this in\n", + "the next exercise.\n", + "\n", + "The depth of the trees is controlled by `max_depth` (or `max_leaf_nodes`). 
We\n", + "saw in the section on gradient-boosting that boosting algorithms fit the error\n", + "of the previous tree in the ensemble. Thus, fitting fully grown trees would be\n", "detrimental. Indeed, the first tree of the ensemble would perfectly fit\n", "(overfit) the data and thus no subsequent tree would be required, since there\n", "would be no residuals. Therefore, the tree used in gradient-boosting should\n", "have a low depth, typically between 3 to 8 levels, or few leaves ($2^3=8$ to\n", - "$2^8=256$). Having very weak learners at each step will help reducing\n", - "overfitting.\n", + "$2^8=256$). Having very weak learners at each step helps reducing overfitting.\n", "\n", "With this consideration in mind, the deeper the trees, the faster the\n", - "residuals will be corrected and less learners are required. Therefore,\n", - "`n_estimators` should be increased if `max_depth` is lower.\n", + "residuals are corrected and then less learners are required. Therefore,\n", + "it can be beneficial to increase `max_iter` if `max_depth` is low.\n", "\n", "Finally, we have overlooked the impact of the `learning_rate` parameter until\n", "now. When fitting the residuals, we would like the tree to try to correct all\n", @@ -203,9 +205,9 @@ "control this behaviour. A small learning-rate value would only correct the\n", "residuals of very few samples. If a large learning-rate is set (e.g., 1), we\n", "would fit the residuals of all samples. So, with a very low learning-rate, we\n", - "will need more estimators to correct the overall error. However, a too large\n", - "learning-rate tends to obtain an overfitted ensemble, similar to having a too\n", - "large tree depth." + "would need more estimators to correct the overall error. However, a too large\n", + "learning-rate tends to obtain an overfitted ensemble, similar to having very\n", + "deep trees." ] }, { @@ -215,15 +217,15 @@ "outputs": [], "source": [ "from scipy.stats import loguniform\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", + "from sklearn.ensemble import HistGradientBoostingRegressor\n", "\n", "param_distributions = {\n", - " \"n_estimators\": [1, 2, 5, 10, 20, 50, 100, 200, 500],\n", + " \"max_iter\": [3, 10, 30, 100, 300, 1000],\n", " \"max_leaf_nodes\": [2, 5, 10, 20, 50, 100],\n", " \"learning_rate\": loguniform(0.01, 1),\n", "}\n", "search_cv = RandomizedSearchCV(\n", - " GradientBoostingRegressor(),\n", + " HistGradientBoostingRegressor(),\n", " param_distributions=param_distributions,\n", " scoring=\"neg_mean_absolute_error\",\n", " n_iter=20,\n", @@ -247,15 +249,15 @@ "\n", "
\n", "

Caution!

\n", - "

Here, we tune the n_estimators but be aware that is better to use\n", - "early_stopping as done in the Exercise M6.04.

\n", + "

Here, we tune max_iter but be aware that it is better to set max_iter to a\n", + "fixed, large enough value and use parameters linked to early_stopping as we\n", + "will do in Exercise M6.04.

\n", "
\n", "\n", - "In this search, we see that the `learning_rate` is required to be large\n", - "enough, i.e. > 0.1. We also observe that for the best ranked models, having a\n", - "smaller `learning_rate`, will require more trees or a larger number of leaves\n", + "In this search, we observe that for the best ranked models, having a\n", + "smaller `learning_rate`, requires more trees or a larger number of leaves\n", "for each tree. However, it is particularly difficult to draw more detailed\n", - "conclusions since the best value of an hyperparameter depends on the other\n", + "conclusions since the best value of each hyperparameter depends on the other\n", "hyperparameter values." ] }, @@ -263,7 +265,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we estimate the generalization performance of the best model using the\n", + "We can now estimate the generalization performance of the best model using the\n", "test set." ] }, @@ -274,7 +276,7 @@ "outputs": [], "source": [ "error = -search_cv.score(data_test, target_test)\n", - "print(f\"On average, our GBDT regressor makes an error of {error:.2f} k$\")" + "print(f\"On average, our HGBT regressor makes an error of {error:.2f} k$\")" ] }, { @@ -284,7 +286,17 @@ "The mean test score in the held-out test set is slightly better than the score\n", "of the best model. The reason is that the final model is refitted on the whole\n", "training set and therefore, on more data than the cross-validated models of\n", - "the grid search procedure." + "the grid search procedure.\n", + "\n", + "We summarize these details in the following table:\n", + "\n", + "| **Bagging & Random Forests** | **Boosting** |\n", + "|--------------------------------------------------|-----------------------------------------------------|\n", + "| fit trees **independently** | fit trees **sequentially** |\n", + "| each **deep tree overfits** | each **shallow tree underfits** |\n", + "| averaging the tree predictions **reduces overfitting** | sequentially adding trees **reduces underfitting** |\n", + "| generalization improves with the number of trees | too many trees may cause overfitting |\n", + "| does not have a `learning_rate` parameter | fitting the residuals is controlled by the `learning_rate` |" ] } ], diff --git a/notebooks/ensemble_sol_03.ipynb b/notebooks/ensemble_sol_03.ipynb index c4213984b..7fc5dae16 100644 --- a/notebooks/ensemble_sol_03.ipynb +++ b/notebooks/ensemble_sol_03.ipynb @@ -13,7 +13,7 @@ "* use the early-stopping strategy to avoid adding unnecessary trees, to get\n", " the best generalization performances.\n", "\n", - "We will use the California housing dataset to conduct our experiments." + "We use the California housing dataset to conduct our experiments." ] }, { @@ -91,7 +91,7 @@ "For both the gradient-boosting and random forest models, create a validation\n", "curve using the training set to assess the impact of the number of trees on\n", "the performance of each model. Evaluate the list of parameters `param_range =\n", - "[1, 2, 5, 10, 20, 50, 100]` and use the mean absolute error." + "np.array([1, 2, 5, 10, 20, 50, 100])` and use the mean absolute error." 
] }, { @@ -101,102 +101,47 @@ "outputs": [], "source": [ "# solution\n", - "from sklearn.model_selection import validation_curve\n", + "import numpy as np\n", "\n", - "param_range = [1, 2, 5, 10, 20, 50, 100]\n", - "gbdt_train_scores, gbdt_validation_scores = validation_curve(\n", - " gbdt,\n", - " data_train,\n", - " target_train,\n", - " param_name=\"n_estimators\",\n", - " param_range=param_range,\n", - " scoring=\"neg_mean_absolute_error\",\n", - " n_jobs=2,\n", - ")\n", - "gbdt_train_errors, gbdt_validation_errors = (\n", - " -gbdt_train_scores,\n", - " -gbdt_validation_scores,\n", - ")\n", + "from sklearn.model_selection import ValidationCurveDisplay\n", "\n", - "forest_train_scores, forest_validation_scores = validation_curve(\n", + "param_range = np.array([1, 2, 5, 10, 20, 50, 100])\n", + "disp = ValidationCurveDisplay.from_estimator(\n", " forest,\n", - " data_train,\n", - " target_train,\n", + " data,\n", + " target,\n", " param_name=\"n_estimators\",\n", " param_range=param_range,\n", " scoring=\"neg_mean_absolute_error\",\n", + " negate_score=True,\n", + " std_display_style=\"errorbar\",\n", " n_jobs=2,\n", ")\n", - "forest_train_errors = -forest_train_scores\n", - "forest_validation_errors = -forest_validation_scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "fig, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(10, 4))\n", - "\n", - "axs[0].errorbar(\n", - " param_range,\n", - " gbdt_train_errors.mean(axis=1),\n", - " yerr=gbdt_train_errors.std(axis=1),\n", - " label=\"Training\",\n", - ")\n", - "axs[0].errorbar(\n", - " param_range,\n", - " gbdt_validation_errors.mean(axis=1),\n", - " yerr=gbdt_validation_errors.std(axis=1),\n", - " label=\"Cross-validation\",\n", - ")\n", - "axs[0].set_title(\"Gradient boosting decision tree\")\n", - "axs[0].set_xlabel(\"# estimators\")\n", - "axs[0].set_ylabel(\"Mean absolute error in k$\\n(smaller is better)\")\n", - "\n", - "axs[1].errorbar(\n", - " param_range,\n", - " forest_train_errors.mean(axis=1),\n", - " yerr=forest_train_errors.std(axis=1),\n", - " label=\"Training\",\n", - ")\n", - "axs[1].errorbar(\n", - " param_range,\n", - " forest_validation_errors.mean(axis=1),\n", - " yerr=forest_validation_errors.std(axis=1),\n", - " label=\"Cross-validation\",\n", - ")\n", - "axs[1].set_title(\"Random forest\")\n", - "axs[1].set_xlabel(\"# estimators\")\n", "\n", - "plt.legend()\n", - "_ = fig.suptitle(\"Validation curves\", y=1.1)" + "_ = disp.ax_.set(\n", + " xlabel=\"Number of trees in the forest\",\n", + " ylabel=\"Mean absolute error (k$)\",\n", + " title=\"Validation curve for random forest\",\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Both gradient boosting and random forest models will always improve when\n", - "increasing the number of trees in the ensemble. However, it will reach a\n", - "plateau where adding new trees will just make fitting and scoring slower.\n", + "Both gradient boosting and random forest models improve when increasing the\n", + "number of trees in the ensemble. However, the scores reach a plateau where\n", + "adding new trees just makes fitting and scoring slower.\n", "\n", "To avoid adding new unnecessary tree, unlike random-forest gradient-boosting\n", - "offers an early-stopping option. Internally, the algorithm will use an\n", + "offers an early-stopping option. 
Internally, the algorithm uses an\n",
     "out-of-sample set to compute the generalization performance of the model at\n",
     "each addition of a tree. Thus, if the generalization performance is not\n",
     "improving for several iterations, it stops adding trees.\n",
     "\n",
     "Now, create a gradient-boosting model with `n_estimators=1_000`. This number\n",
     "of trees is certainly too large. Change the parameter `n_iter_no_change` such\n",
     "that the gradient boosting fitting stops after adding 5 trees that do not\n",
     "improve the overall generalization performance."
   ]
  },
diff --git a/notebooks/linear_models_ex_02.ipynb b/notebooks/linear_models_ex_02.ipynb
index 4cf750e81..89bae664e 100644
--- a/notebooks/linear_models_ex_02.ipynb
+++ b/notebooks/linear_models_ex_02.ipynb
@@ -4,19 +4,26 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# \ud83d\udcdd Exercise M4.03\n",
+    "# \ud83d\udcdd Exercise M4.02\n",
     "\n",
-    "In all previous notebooks, we only used a single feature in `data`. But we\n",
-    "have already shown that we could add new features to make the model more\n",
-    "expressive by deriving new features, based on the original feature.\n",
+    "In the previous notebook, we showed that we can add new features based on the\n",
+    "original feature `x` to make the model more expressive, for instance `x ** 2` or\n",
+    "`x ** 3`. In that case we only used a single feature in `data`.\n",
     "\n",
     "The aim of this notebook is to train a linear regression algorithm on a\n",
-    "dataset with more than a single feature.\n",
+    "dataset with more than a single feature. In such a \"multi-dimensional\" feature\n",
+    "space we can derive new features of the form `x1 * x2`, `x2 * x3`, etc.\n",
+    "Products of features are usually called \"non-linear\" or \"multiplicative\"\n",
+    "interactions between features.\n",
     "\n",
-    "We will load a dataset about house prices in California. The dataset consists\n",
-    "of 8 features regarding the demography and geography of districts in\n",
-    "California and the aim is to predict the median house price of each district.\n",
-    "We will use all 8 features to predict the target, the median house price."
+    "Feature engineering can be an important step of a model pipeline as long as\n",
+    "the new features are expected to be predictive. For instance, think of a\n",
+    "classification model to decide if a patient is at risk of developing a heart\n",
+    "disease. This would depend on the patient's Body Mass Index, which is defined\n",
+    "as `weight / height ** 2`.\n",
+    "\n",
+    "We load the penguins dataset. We first use a set of 3 numerical\n",
+    "features to predict the target, i.e. the body mass of the penguin."
] }, { @@ -36,10 +43,18 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import fetch_california_housing\n", + "import pandas as pd\n", + "\n", + "penguins = pd.read_csv(\"../datasets/penguins.csv\")\n", + "\n", + "columns = [\"Flipper Length (mm)\", \"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", + "target_name = \"Body Mass (g)\"\n", "\n", - "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", - "target *= 100 # rescale the target in k$\n", + "# Remove lines with missing values for the columns of interest\n", + "penguins_non_missing = penguins[columns + [target_name]].dropna()\n", + "\n", + "data = penguins_non_missing[columns]\n", + "target = penguins_non_missing[target_name]\n", "data.head()" ] }, @@ -65,7 +80,84 @@ "metadata": {}, "source": [ "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n", - "as metric. Be sure to *return* the fitted *estimators*." + "as metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute the mean and std of the MAE in grams (g). Remember you have to revert\n", + "the sign introduced when metrics start with `neg_`, such as in\n", + "`\"neg_mean_absolute_error\"`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create a pipeline using `make_pipeline` consisting of a\n", + "`PolynomialFeatures` and a linear regression. Set `degree=2` and\n", + "`interaction_only=True` to the feature engineering step. Remember not to\n", + "include a \"bias\" feature (that is a constant-valued feature) to avoid\n", + "introducing a redundancy with the intercept of the subsequent linear\n", + "regression model.\n", + "\n", + "You may want to use the `.set_output(transform=\"pandas\")` method of the\n", + "pipeline to answer the next question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform the first 5 rows of the dataset and look at the column names. How\n", + "many features are generated at the output of the `PolynomialFeatures` step in\n", + "the previous pipeline?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check that the values for the new interaction features are correct for a few\n", + "of them." ] }, { @@ -81,7 +173,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Compute the mean and std of the MAE in thousands of dollars (k$)." + "Use the same cross-validation strategy as done previously to estimate the mean\n", + "and std of the MAE in grams (g) for such a pipeline. Compare with the results\n", + "without feature engineering." ] }, { @@ -95,15 +189,35 @@ }, { "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "Inspect the fitted model using a box plot to show the distribution of values\n", - "for the coefficients returned from the cross-validation. 
Hint: use the\n", - "function\n", - "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", - "to create a box plot." + "metadata": {}, + "source": [ + "\n", + "Now let's try to build an alternative pipeline with an adjustable number of\n", + "intermediate features while keeping a similar predictive power. To do so, try\n", + "using the `Nystroem` transformer instead of `PolynomialFeatures`. Set the\n", + "kernel parameter to `\"poly\"` and `degree` to 2. Adjust the number of\n", + "components to be as small as possible while keeping a good cross-validation\n", + "performance.\n", + "\n", + "Hint: Use a `ValidationCurveDisplay` with `param_range = np.array([5, 10, 50,\n", + "100])` to find the optimal `n_components`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How do the mean and std of the MAE for the Nystroem pipeline with optimal\n", + "`n_components` compare to the other previous models?" ] }, { diff --git a/notebooks/linear_models_ex_03.ipynb b/notebooks/linear_models_ex_03.ipynb new file mode 100644 index 000000000..36b516f3c --- /dev/null +++ b/notebooks/linear_models_ex_03.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# \ud83d\udcdd Exercise M4.03\n", + "\n", + "The parameter `penalty` can control the **type** of regularization to use,\n", + "whereas the regularization **strength** is set using the parameter `C`.\n", + "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n", + "this exercise, we ask you to train a logistic regression classifier using the\n", + "`penalty=\"l2\"` regularization (which happens to be the default in\n", + "scikit-learn) to find by yourself the effect of the parameter `C`.\n", + "\n", + "We start by loading the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "

Note

\n", + "

If you want a deeper overview regarding this dataset, you can refer to the\n", + "Appendix - Datasets description section at the end of this MOOC.

\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", + "# only keep the Adelie and Chinstrap classes\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", + "\n", + "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", + "target_column = \"Species\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "penguins_train, penguins_test = train_test_split(penguins, random_state=0)\n", + "\n", + "data_train = penguins_train[culmen_columns]\n", + "data_test = penguins_test[culmen_columns]\n", + "\n", + "target_train = penguins_train[target_column]\n", + "target_test = penguins_test[target_column]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's create our predictive model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_regression = make_pipeline(\n", + " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given the following candidates for the `C` parameter, find out the impact of\n", + "`C` on the classifier decision boundary. You can use\n", + "`sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the\n", + "decision function boundary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Cs = [0.01, 0.1, 1, 10]\n", + "\n", + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at the impact of the `C` hyperparameter on the magnitude of the weights." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/linear_models_regularization.ipynb b/notebooks/linear_models_regularization.ipynb index ff52b4849..fc1129695 100644 --- a/notebooks/linear_models_regularization.ipynb +++ b/notebooks/linear_models_regularization.ipynb @@ -6,19 +6,18 @@ "source": [ "# Regularization of linear regression model\n", "\n", - "In this notebook, we will see the limitations of linear regression models and\n", - "the advantage of using regularized models instead.\n", + "In this notebook, we explore some limitations of linear regression models and\n", + "demonstrate the benefits of using regularized models instead. 
Additionally, we\n", + "discuss the importance of scaling the data when working with regularized\n", + "models, especially when tuning the regularization parameter.\n", "\n", - "Besides, we will also present the preprocessing required when dealing with\n", - "regularized models, furthermore when the regularization parameter needs to be\n", - "tuned.\n", - "\n", - "We will start by highlighting the over-fitting issue that can arise with a\n", + "We start by highlighting the problem of overfitting that can occur with a\n", "simple linear regression model.\n", "\n", "## Effect of regularization\n", "\n", - "We will first load the California housing dataset." + "We load the Ames housing dataset. We retain some specific\n", + "`features_of_interest`." ] }, { @@ -38,26 +37,36 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import fetch_california_housing\n", + "import pandas as pd\n", "\n", - "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n", - "target *= 100 # rescale the target in k$\n", - "data.head()" + "ames_housing = pd.read_csv(\"../datasets/ames_housing_no_missing.csv\")\n", + "features_of_interest = [\n", + " \"LotFrontage\",\n", + " \"LotArea\",\n", + " \"PoolArea\",\n", + " \"YearBuilt\",\n", + " \"YrSold\",\n", + "]\n", + "target_name = \"SalePrice\"\n", + "data, target = (\n", + " ames_housing[features_of_interest],\n", + " ames_housing[target_name],\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In one of the previous notebook, we showed that linear models could be used\n", - "even in settings where `data` and `target` are not linearly linked.\n", - "\n", - "We showed that one can use the `PolynomialFeatures` transformer to create\n", - "additional features encoding non-linear interactions between features.\n", + "In one of the previous notebooks, we showed that linear models could be used\n", + "even when there is no linear relationship between the `data` and `target`.\n", + "For instance, one can use the `PolynomialFeatures` transformer to create\n", + "additional features that capture some non-linear interactions between them.\n", "\n", - "Here, we will use this transformer to augment the feature space. Subsequently,\n", - "we will train a linear regression model. We will use the out-of-sample test\n", - "set to evaluate the generalization capabilities of our model." + "Here, we use this transformer to augment the feature space. Subsequently, we\n", + "train a linear regression model. We use cross-validation with\n", + "`return_train_score=True` to evaluate both the train scores and the\n", + "generalization capabilities of our model." 
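Before running the cross-validation below, a quick sanity check of how much `PolynomialFeatures` enlarges the feature space can help interpret the results. A minimal sketch, assuming `data` holds the 5 retained Ames features from the cell above:

```python
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(data)
feature_names = poly.get_feature_names_out()
# 5 original features + 5 squares + 10 pairwise products = 20 columns
print(len(feature_names))
print(feature_names[:8])
```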
] }, { @@ -72,8 +81,8 @@ "from sklearn.linear_model import LinearRegression\n", "\n", "linear_regression = make_pipeline(\n", - " PolynomialFeatures(degree=2), LinearRegression()\n", - ")\n", + " PolynomialFeatures(degree=2, include_bias=False), LinearRegression()\n", + ").set_output(transform=\"pandas\")\n", "cv_results = cross_validate(\n", " linear_regression,\n", " data,\n", @@ -102,7 +111,7 @@ "train_error = -cv_results[\"train_score\"]\n", "print(\n", " \"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + " f\"{train_error.mean():.2e} \u00b1 {train_error.std():.2e}\"\n", ")" ] }, @@ -115,7 +124,7 @@ "test_error = -cv_results[\"test_score\"]\n", "print(\n", " \"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + " f\"{test_error.mean():.2e} \u00b1 {test_error.std():.2e}\"\n", ")" ] }, @@ -123,20 +132,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The score on the training set is much better. This generalization performance\n", - "gap between the training and testing score is an indication that our model\n", - "overfitted our training set.\n", + "The training error is in average one order of magnitude lower than the testing\n", + "error (lower error is better). Such a gap between the training and testing\n", + "scores is an indication that our model overfitted the training set. Indeed,\n", + "this is one of the dangers when augmenting the number of features with a\n", + "`PolynomialFeatures` transformer. For instance, one does not expect features\n", + "such as `PoolArea * YrSold` to be very predictive.\n", "\n", - "Indeed, this is one of the danger when augmenting the number of features with\n", - "a `PolynomialFeatures` transformer. Our model will focus on some specific\n", - "features. We can check the weights of the model to have a confirmation. Let's\n", - "create a dataframe: the columns will contain the name of the feature while the\n", - "line the coefficients values stored by each model during the cross-validation.\n", + "To analyze the weights of the model, we can create a dataframe. The columns of\n", + "the dataframe contain the feature names, while the rows store the coefficients\n", + "of each model of a given cross-validation fold.\n", "\n", - "Since we used a `PolynomialFeatures` to augment the data, we will create\n", - "feature names representative of the feature combination. Scikit-learn provides\n", - "a `get_feature_names_out` method for this purpose. First, let's get the first\n", - "fitted model from the cross-validation." + "In order to obtain the feature names associated with each feature combination,\n", + "we need to extract them from the augmented data created by\n", + "`PolynomialFeatures`. Fortunately, scikit-learn provides a convenient method\n", + "called `feature_names_in_` for this purpose. Let's begin by retrieving the\n", + "coefficients from the model fitted in the first cross-validation fold." ] }, { @@ -145,15 +156,16 @@ "metadata": {}, "outputs": [], "source": [ - "model_first_fold = cv_results[\"estimator\"][0]" + "model_first_fold = cv_results[\"estimator\"][0]\n", + "model_first_fold" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we can access to the fitted `PolynomialFeatures` to generate the feature\n", - "names:" + "Now, we can access the fitted `LinearRegression` (step `-1` i.e. 
the last step\n", + "of the `linear_regression` pipeline) to recover the feature names." ] }, { @@ -162,9 +174,7 @@ "metadata": {}, "outputs": [], "source": [ - "feature_names = model_first_fold[0].get_feature_names_out(\n", - " input_features=data.columns\n", - ")\n", + "feature_names = model_first_fold[-1].feature_names_in_\n", "feature_names" ] }, @@ -172,7 +182,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we can create the dataframe containing all the information." + "The following code creates a list by iterating through the estimators and\n", + "querying their last step for the learned `coef_`. We can then create the\n", + "dataframe containing all the information." ] }, { @@ -203,18 +215,48 @@ "import matplotlib.pyplot as plt\n", "\n", "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", - "weights_linear_regression.plot.box(color=color, vert=False, figsize=(6, 16))\n", - "_ = plt.title(\"Linear regression coefficients\")" + "fig, ax = plt.subplots(figsize=(10, 10))\n", + "weights_linear_regression.plot.box(color=color, vert=False, ax=ax)\n", + "_ = ax.set(title=\"Linear regression weights (linear scale)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By looking at the bar plot above it would seem that most of the features are\n", + "very close to zero, but this is just an effect of visualizing them on the same\n", + "scale as the extremely large span of `\"YrSold\"`. Instead we can use a\n", + "symmetric log scale for the plot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", + "fig, ax = plt.subplots(figsize=(10, 10))\n", + "weights_linear_regression.plot.box(color=color, vert=False, ax=ax)\n", + "_ = ax.set(\n", + " title=\"Linear regression weights (symmetric log scale)\",\n", + " xscale=\"symlog\",\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "Observe that some coefficients are extremely large while others are extremely\n", + "small, yet non-zero. Furthermore, the coefficient values can be very unstable\n", + "across cross-validation folds.\n", + "\n", "We can force the linear regression model to consider all features in a more\n", - "homogeneous manner. In fact, we could force large positive or negative weight\n", - "to shrink toward zero. This is known as regularization. We will use a ridge\n", - "model which enforces such behavior." + "homogeneous manner. In fact, we could force large positive or negative\n", + "weights to shrink toward zero. This is known as regularization. We use a\n", + "ridge model which enforces such behavior." 
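For reference, ridge regression implements this shrinkage by penalizing the squared magnitude of the weights: with design matrix $X$, target $y$, weights $w$ and regularization strength $\alpha \ge 0$, it minimizes

```latex
\min_{w} \; \lVert y - X w \rVert_2^2 + \alpha \lVert w \rVert_2^2
```

Setting $\alpha = 0$ recovers ordinary least squares, while increasing $\alpha$ shrinks all weights toward zero.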
] }, { @@ -225,12 +267,15 @@ "source": [ "from sklearn.linear_model import Ridge\n", "\n", - "ridge = make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=100))\n", + "ridge = make_pipeline(\n", + " PolynomialFeatures(degree=2, include_bias=False),\n", + " Ridge(alpha=100, solver=\"cholesky\"),\n", + ")\n", "cv_results = cross_validate(\n", " ridge,\n", " data,\n", " target,\n", - " cv=10,\n", + " cv=20,\n", " scoring=\"neg_mean_squared_error\",\n", " return_train_score=True,\n", " return_estimator=True,\n", @@ -241,11 +286,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The code cell above will generate a couple of warnings because the features\n", - "included both extremely large and extremely small values, which are causing\n", - "numerical problems when training the predictive model.\n", + "The code cell above can generate a couple of warnings (depending on the\n", + "choice of solver) because the features included both extremely large and\n", + "extremely small values, which are causing numerical problems when training\n", + "the predictive model. We will get to that in a bit.\n", "\n", - "We can explore the train and test scores of this model." + "Let us evaluate the train and test scores of this model." ] }, { @@ -256,8 +302,8 @@ "source": [ "train_error = -cv_results[\"train_score\"]\n", "print(\n", - " \"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + " \"Mean squared error of ridge model on the train set:\\n\"\n", + " f\"{train_error.mean():.2e} \u00b1 {train_error.std():.2e}\"\n", ")" ] }, @@ -269,8 +315,8 @@ "source": [ "test_error = -cv_results[\"test_score\"]\n", "print(\n", - " \"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + " \"Mean squared error of ridge model on the test set:\\n\"\n", + " f\"{test_error.mean():.2e} \u00b1 {test_error.std():.2e}\"\n", ")" ] }, @@ -278,9 +324,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the training and testing scores are much closer, indicating that\n", - "our model is less overfitting. We can compare the values of the weights of\n", - "ridge with the un-regularized linear regression." + "We see that the training and testing scores get closer, indicating that our\n", + "model is less overfitting (yet still overfitting!). We can compare the values\n", + "of the weights of ridge with the un-regularized linear regression." ] }, { @@ -299,51 +345,68 @@ "metadata": {}, "outputs": [], "source": [ - "weights_ridge.plot.box(color=color, vert=False, figsize=(6, 16))\n", - "_ = plt.title(\"Ridge weights\")" + "fig, ax = plt.subplots(figsize=(8, 10))\n", + "weights_ridge.plot.box(color=color, vert=False, ax=ax)\n", + "_ = ax.set(title=\"Ridge regression weights\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By comparing the magnitude of the weights on this plot compared to the\n", - "previous plot, we see that a ridge model will enforce all weights to have a\n", - "similar magnitude, while the overall magnitude of the weights is shrunk\n", - "towards zero with respect to the linear regression model.\n", + "Notice that the overall magnitudes of the weights are shrunk\n", + "(yet non-zero!) with respect to the linear regression model. 
If you want to,\n",
+    "feel free to use a symmetric log scale in the previous plot.\n",
+    "\n",
+    "You can also observe that even if the weights' values are less extreme, they\n",
+    "are still unstable from one fold to another. Even worse, the results can vary\n",
+    "a lot depending on the choice of the solver (for instance try to set\n",
+    "`solver=\"saga\"` or `solver=\"lsqr\"` instead of `solver=\"cholesky\"` and re-run\n",
+    "the above cells).\n",
     "\n",
-    "However, in this example, we omitted two important aspects: (i) the need to\n",
-    "scale the data and (ii) the need to search for the best regularization\n",
-    "parameter.\n",
+    "In the following we attempt to resolve those remaining problems by\n",
+    "focusing on two important aspects we omitted so far:\n",
+    "- the need to **scale the data**, and\n",
+    "- the need to **search for the best regularization parameter**.\n",
     "\n",
     "## Feature scaling and regularization\n",
     "\n",
-    "On the one hand, weights define the link between feature values and the\n",
-    "predicted target. On the other hand, regularization adds constraints on the\n",
-    "weights of the model through the `alpha` parameter. Therefore, the effect that\n",
-    "feature rescaling has on the final weights also interacts with regularization.\n",
+    "On the one hand, weights define the association between feature values and the\n",
+    "predicted target, which depends on the scales of both the feature values and\n",
+    "the target. On the other hand, regularization adds constraints on the weights\n",
+    "of the model through the `alpha` parameter. Therefore, the effect that feature\n",
+    "rescaling has on the final weights also interacts with the use of\n",
+    "regularization.\n",
     "\n",
     "Let's consider the case where features live on the same scale/units: if two\n",
-    "features are found to be equally important by the model, they will be affected\n",
-    "similarly by regularization strength.\n",
+    "features are found to be equally important by the model, they are affected\n",
+    "similarly by the regularization strength.\n",
     "\n",
-    "Now, let's consider the scenario where features have completely different data\n",
-    "scale (for instance age in years and annual revenue in dollars). If two\n",
-    "features are as important, our model will boost the weights of features with\n",
-    "small scale and reduce the weights of features with high scale.\n",
+    "Now, let's consider the scenario where two features have completely different\n",
+    "data scales (for instance age in years and annual revenue in dollars). Let's\n",
+    "also assume that both features are approximately equally predictive and are\n",
+    "not too correlated. Fitting a linear regression without scaling and without\n",
+    "regularization would give a higher weight to the feature with the smallest\n",
+    "natural scale. If we add regularization, the feature with the smallest natural\n",
+    "scale would be penalized more than the other feature. This is not desirable\n",
+    "given the hypothesis that both features are equally important. In such a case\n",
+    "we require the regularization to stay neutral.\n",
     "\n",
-    "We recall that regularization forces weights to be closer. 
Therefore, we get\n", - "an intuition that if we want to use regularization, dealing with rescaled data\n", - "would make it easier to find an optimal regularization parameter and thus an\n", - "adequate model.\n", + "In practice, we don't know ahead of time which features are predictive, and\n", + "therefore we want regularization to treat all features approximately equally\n", + "by default. This can be achieved by rescaling the features.\n", "\n", - "As a side note, some solvers based on gradient computation are expecting such\n", - "rescaled data. Unscaled data will be detrimental when computing the optimal\n", - "weights. Therefore, when working with a linear model and numerical data, it is\n", - "generally good practice to scale the data.\n", + "Furthermore, many numerical solvers used internally in scikit-learn behave\n", + "better when features are approximately on the same scale. Heterogeneously\n", + "scaled data can be detrimental when solving for the optimal weights (hence the\n", + "warnings we tend to get when fitting linear models on raw data). Therefore,\n", + "when working with a linear model and numerical data, it is generally a good\n", + "practice to scale the data.\n", "\n", - "Thus, we will add a `StandardScaler` in the machine learning pipeline. This\n", - "scaler will be placed just before the regressor." + "Thus, we add a `MinMaxScaler` in the machine learning pipeline, which scales\n", + "each feature individually such that its range maps into the range between zero\n", + "and one. We place it just before the `PolynomialFeatures` transformer as\n", + "powers of features in the range between zero and one remain in the same range." ] }, { @@ -352,13 +415,15 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import MinMaxScaler\n", "\n", - "ridge = make_pipeline(\n", - " PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=0.5)\n", + "scaled_ridge = make_pipeline(\n", + " MinMaxScaler(),\n", + " PolynomialFeatures(degree=2, include_bias=False),\n", + " Ridge(alpha=10, solver=\"cholesky\"),\n", ")\n", "cv_results = cross_validate(\n", - " ridge,\n", + " scaled_ridge,\n", " data,\n", " target,\n", " cv=10,\n", @@ -376,8 +441,8 @@ "source": [ "train_error = -cv_results[\"train_score\"]\n", "print(\n", - " \"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + " \"Mean squared error of scaled ridge model on the train set:\\n\"\n", + " f\"{train_error.mean():.2e} \u00b1 {train_error.std():.2e}\"\n", ")" ] }, @@ -389,8 +454,8 @@ "source": [ "test_error = -cv_results[\"test_score\"]\n", "print(\n", - " \"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + " \"Mean squared error of scaled ridge model on the test set:\\n\"\n", + " f\"{test_error.mean():.2e} \u00b1 {test_error.std():.2e}\"\n", ")" ] }, @@ -398,11 +463,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We observe that scaling data has a positive impact on the test score and that\n", - "the test score is closer to the train score. It means that our model is less\n", + "We observe that scaling data has a positive impact on the test error: it is\n", + "now both lower and closer to the train error. 
It means that our model is less\n", "overfitted and that we are getting closer to the best generalization sweet\n", "spot.\n", "\n", + "If you want to try different solvers, you can notice that fitting this\n", + "pipeline no longer generates any warning regardless of such choice.\n", + "Additionally, changing the solver should no longer result in significant\n", + "changes in the weights.\n", + "\n", "Let's have an additional look to the different weights." ] }, @@ -413,7 +483,7 @@ "outputs": [], "source": [ "coefs = [est[-1].coef_ for est in cv_results[\"estimator\"]]\n", - "weights_ridge = pd.DataFrame(coefs, columns=feature_names)" + "weights_ridge_scaled_data = pd.DataFrame(coefs, columns=feature_names)" ] }, { @@ -422,19 +492,21 @@ "metadata": {}, "outputs": [], "source": [ - "weights_ridge.plot.box(color=color, vert=False, figsize=(6, 16))\n", - "_ = plt.title(\"Ridge weights with data scaling\")" + "fig, ax = plt.subplots(figsize=(8, 10))\n", + "weights_ridge_scaled_data.plot.box(color=color, vert=False, ax=ax)\n", + "_ = ax.set(title=\"Ridge regression weights with data scaling\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Compare to the previous plots, we see that now all weight magnitudes are\n", - "closer and that all features are more equally contributing.\n", + "Compared to the previous plots, we see that now most weight magnitudes have a\n", + "similar order of magnitude, i.e. they are more equally contributing. The\n", + "number of unstable weights also decreased.\n", "\n", - "In the previous example, we fixed `alpha=0.5`. We will now check the impact of\n", - "the value of `alpha` by increasing its value." + "In the previous model, we set `alpha=10`. We can now check the impact of\n", + "`alpha` by increasing it to a very large value." ] }, { @@ -443,11 +515,13 @@ "metadata": {}, "outputs": [], "source": [ - "ridge = make_pipeline(\n", - " PolynomialFeatures(degree=2), StandardScaler(), Ridge(alpha=1_000_000)\n", + "ridge_large_alpha = make_pipeline(\n", + " MinMaxScaler(),\n", + " PolynomialFeatures(degree=2, include_bias=False),\n", + " Ridge(alpha=1_000_000, solver=\"lsqr\"),\n", ")\n", "cv_results = cross_validate(\n", - " ridge,\n", + " ridge_large_alpha,\n", " data,\n", " target,\n", " cv=10,\n", @@ -464,7 +538,7 @@ "outputs": [], "source": [ "coefs = [est[-1].coef_ for est in cv_results[\"estimator\"]]\n", - "weights_ridge = pd.DataFrame(coefs, columns=feature_names)" + "weights_ridge_scaled_data = pd.DataFrame(coefs, columns=feature_names)" ] }, { @@ -473,42 +547,40 @@ "metadata": {}, "outputs": [], "source": [ - "weights_ridge.plot.box(color=color, vert=False, figsize=(6, 16))\n", - "_ = plt.title(\"Ridge weights with data scaling and large alpha\")" + "fig, ax = plt.subplots(figsize=(8, 10))\n", + "weights_ridge_scaled_data.plot.box(color=color, vert=False, ax=ax)\n", + "_ = ax.set(title=\"Ridge regression weights with data scaling and large alpha\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Looking specifically to weights values, we observe that increasing the value\n", - "of `alpha` will decrease the weight values. A negative value of `alpha` would\n", - "actually enhance large weights and promote overfitting.\n", + "When examining the weight values, we notice that as the `alpha` value\n", + "increases, the weights decrease. A negative value of `alpha` can lead to\n", + "unpredictable and unstable behavior in the model.\n", "\n", "
\n", "

Note

\n", "

Here, we only focus on numerical features. For categorical features, it is\n", "generally common to omit scaling when features are encoded with a\n", "OneHotEncoder since the feature values are already on a similar scale.

\n", - "

However, this choice can be questioned since scaling interacts with\n", - "regularization as well. For instance, scaling categorical features that are\n", - "imbalanced (e.g. more occurrences of a specific category) would even out the\n", - "impact of regularization to each category. However, scaling such features in\n", - "the presence of rare categories could be problematic (i.e. division by a very\n", + "

However, this choice may depend on the scaling method and the use case. For\n",
instance, standard scaling categorical features that are imbalanced (e.g. more\n",
occurrences of a specific category) would even out the impact of\n",
regularization to each category. Yet, scaling such features in the\n",
presence of rare categories could be problematic (i.e. division by a very\n",
small standard deviation) and it can therefore introduce numerical issues.

\n", "
\n", "\n", - "In the previous analysis, we did not study if the parameter `alpha` will have\n", - "an effect on the performance. We chose the parameter beforehand and fixed it\n", - "for the analysis.\n", + "In the previous analysis, we chose the parameter beforehand and fixed it for\n", + "the analysis. In the next section, we check how the regularization parameter\n", + "`alpha` should be tuned.\n", "\n", - "In the next section, we will check the impact of the regularization parameter\n", - "`alpha` and how it should be tuned.\n", - "\n", - "## Fine tuning the regularization parameter\n", + "## Tuning the regularization parameter\n", "\n", "As mentioned, the regularization parameter needs to be tuned on each dataset.\n", - "The default parameter will not lead to the optimal model. Therefore, we need\n", + "The default parameter does not lead to the optimal model. Therefore, we need\n", "to tune the `alpha` parameter.\n", "\n", "Model hyperparameter tuning should be done with care. Indeed, we want to find\n", @@ -527,10 +599,10 @@ "these predictors finishes by `CV`. In the case of `Ridge`, scikit-learn\n", "provides a `RidgeCV` regressor.\n", "\n", - "Therefore, we can use this predictor as the last step of the pipeline.\n", - "Including the pipeline a cross-validation allows to make a nested\n", - "cross-validation: the inner cross-validation will search for the best alpha,\n", - "while the outer cross-validation will give an estimate of the testing score." + "Cross-validating a pipeline that contains such predictors allows to make a\n", + "nested cross-validation: the inner cross-validation searches for the best\n", + "alpha, while the outer cross-validation gives an estimate of the testing\n", + "score." ] }, { @@ -542,10 +614,10 @@ "import numpy as np\n", "from sklearn.linear_model import RidgeCV\n", "\n", - "alphas = np.logspace(-2, 0, num=21)\n", + "alphas = np.logspace(-7, 5, num=100)\n", "ridge = make_pipeline(\n", - " PolynomialFeatures(degree=2),\n", - " StandardScaler(),\n", + " MinMaxScaler(),\n", + " PolynomialFeatures(degree=2, include_bias=False),\n", " RidgeCV(alphas=alphas, store_cv_values=True),\n", ")" ] @@ -558,7 +630,7 @@ "source": [ "from sklearn.model_selection import ShuffleSplit\n", "\n", - "cv = ShuffleSplit(n_splits=5, random_state=1)\n", + "cv = ShuffleSplit(n_splits=50, random_state=0)\n", "cv_results = cross_validate(\n", " ridge,\n", " data,\n", @@ -579,8 +651,8 @@ "source": [ "train_error = -cv_results[\"train_score\"]\n", "print(\n", - " \"Mean squared error of linear regression model on the train set:\\n\"\n", - " f\"{train_error.mean():.3f} \u00b1 {train_error.std():.3f}\"\n", + " \"Mean squared error of tuned ridge model on the train set:\\n\"\n", + " f\"{train_error.mean():.2e} \u00b1 {train_error.std():.2e}\"\n", ")" ] }, @@ -592,8 +664,8 @@ "source": [ "test_error = -cv_results[\"test_score\"]\n", "print(\n", - " \"Mean squared error of linear regression model on the test set:\\n\"\n", - " f\"{test_error.mean():.3f} \u00b1 {test_error.std():.3f}\"\n", + " \"Mean squared error of tuned ridge model on the test set:\\n\"\n", + " f\"{test_error.mean():.2e} \u00b1 {test_error.std():.2e}\"\n", ")" ] }, @@ -606,8 +678,8 @@ "\n", "When fitting the ridge regressor, we also requested to store the error found\n", "during cross-validation (by setting the parameter `store_cv_values=True`). We\n", - "will plot the mean squared error for the different `alphas` regularization\n", - "strength that we tried. 
The error bars represent one standard deviation of the\n", + "can plot the mean squared error for the different `alphas` regularization\n", + "strengths that we tried. The error bars represent one standard deviation of the\n", "average mean square error across folds for a given value of `alpha`." ] }, @@ -631,12 +703,15 @@ "metadata": {}, "outputs": [], "source": [ - "plt.errorbar(cv_alphas.index, cv_alphas[\"mean\"], yerr=cv_alphas[\"std\"])\n", - "plt.xlim((0.0, 1.0))\n", - "plt.ylim((4_500, 11_000))\n", - "plt.ylabel(\"Mean squared error\\n (lower is better)\")\n", - "plt.xlabel(\"alpha\")\n", - "_ = plt.title(\"Testing error obtained by cross-validation\")" + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "ax.errorbar(cv_alphas.index, cv_alphas[\"mean\"], yerr=cv_alphas[\"std\"])\n", + "_ = ax.set(\n", + " xscale=\"log\",\n", + " xlabel=\"alpha\",\n", + " yscale=\"log\",\n", + " ylabel=\"Mean squared error\\n (lower is better)\",\n", + " title=\"Testing error in RidgeCV's inner cross-validation\",\n", + ")" ] }, { @@ -685,11 +760,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This range can be reduced by decreasing the spacing between the grid of\n", - "`alphas`.\n", - "\n", - "In this notebook, you learned about the concept of regularization and the\n", - "importance of preprocessing and parameter tuning." + "This range can be reduced depending on the feature engineering and\n", + "preprocessing.\n", + "\n", + "Here is a summary of important points highlighted in this notebook:\n", + "- scaling features makes the effect of regularization more even: all variables\n", + " are regularized by comparable magnitude, which would not necessarily be the\n", + " case with the natural feature scales;\n", + "- scaling features makes the numerical solvers more stable which is also\n", + " helpful to tune the regularization parameter more independently of the\n", + " choice of the solver used to fit the linear model;\n", + "- tuning the regularization parameter of the `Ridge` estimator can be done\n", + " very efficiently by using the `RidgeCV` class. Wrapping it into a\n", + " `cross_validate` call makes it possible to assess the true generalization\n", + " power of the whole pipeline by including the tuning of the regularization\n", + " parameter as part of the learning process: this is an example of \"nested\n", + " cross-validation\";\n", + "- doing so makes it possible to check that the optimal value of the\n", + " regularization strength `alpha` is robust to a resampling of the dataset. If\n", + " it wasn't the case it would hint at a problem with the dataset (e.g.\n", + " presence of outliers in the features or the target that influence the\n", + " learning process disproportionately) or a bad choice of other elements of\n", + " the feature engineering pipeline." ] } ], diff --git a/notebooks/linear_models_sol_02.ipynb b/notebooks/linear_models_sol_02.ipynb index 634c43171..38ac00ef6 100644 --- a/notebooks/linear_models_sol_02.ipynb +++ b/notebooks/linear_models_sol_02.ipynb @@ -4,19 +4,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# \ud83d\udcc3 Solution for Exercise M4.03\n", + "# \ud83d\udcc3 Solution for Exercise M4.02\n", "\n", - "In all previous notebooks, we only used a single feature in `data`. 
But we\n",
-    "have already shown that we could add new features to make the model more\n",
-    "expressive by deriving new features, based on the original feature.\n",
+    "In the previous notebook, we showed that we can add new features based on the\n",
+    "original feature `x` to make the model more expressive, for instance `x ** 2` or\n",
+    "`x ** 3`. In that case we only used a single feature in `data`.\n",
     "\n",
     "The aim of this notebook is to train a linear regression algorithm on a\n",
-    "dataset with more than a single feature.\n",
+    "dataset with more than a single feature. In such a \"multi-dimensional\" feature\n",
+    "space we can derive new features of the form `x1 * x2`, `x2 * x3`, etc.\n",
+    "Products of features are usually called \"non-linear\" or \"multiplicative\"\n",
+    "interactions between features.\n",
     "\n",
-    "We will load a dataset about house prices in California. The dataset consists\n",
-    "of 8 features regarding the demography and geography of districts in\n",
-    "California and the aim is to predict the median house price of each district.\n",
-    "We will use all 8 features to predict the target, the median house price."
+    "Feature engineering can be an important step of a model pipeline as long as\n",
+    "the new features are expected to be predictive. For instance, think of a\n",
+    "classification model to decide if a patient is at risk of developing a heart\n",
+    "disease. This would depend on the patient's Body Mass Index, which is defined\n",
+    "as `weight / height ** 2`.\n",
+    "\n",
+    "We load the penguins dataset. We first use a set of 3 numerical\n",
+    "features to predict the target, i.e. the body mass of the penguin."
   ]
  },
@@ -36,10 +43,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sklearn.datasets import fetch_california_housing\n",
+    "import pandas as pd\n",
+    "\n",
+    "penguins = pd.read_csv(\"../datasets/penguins.csv\")\n",
+    "\n",
+    "columns = [\"Flipper Length (mm)\", \"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n",
+    "target_name = \"Body Mass (g)\"\n",
+    "\n",
+    "# Remove lines with missing values for the columns of interest\n",
+    "penguins_non_missing = penguins[columns + [target_name]].dropna()\n",
     "\n",
-    "data, target = fetch_california_housing(as_frame=True, return_X_y=True)\n",
-    "target *= 100  # rescale the target in k$\n",
+    "data = penguins_non_missing[columns]\n",
+    "target = penguins_non_missing[target_name]\n",
     "data.head()"
   ]
  },
@@ -68,7 +83,7 @@
    "metadata": {},
    "source": [
     "Execute a cross-validation with 10 folds and use the mean absolute error (MAE)\n",
-    "as metric. Be sure to *return* the fitted *estimators*."
+    "as metric."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# solution\n",
    "from sklearn.model_selection import cross_validate\n",
    "\n",
    "cv_results = cross_validate(\n",
    "    linear_regression,\n",
    "    data,\n",
    "    target,\n",
-    "    scoring=\"neg_mean_absolute_error\",\n",
-    "    return_estimator=True,\n",
     "    cv=10,\n",
+    "    scoring=\"neg_mean_absolute_error\",\n",
     "    n_jobs=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Compute the mean and std of the MAE in thousands of dollars (k$)."
+    "Compute the mean and std of the MAE in grams (g). Remember you have to revert\n",
+    "the sign introduced when metrics start with `neg_`, such as in\n",
+    "`\"neg_mean_absolute_error\"`."
] }, { @@ -106,23 +122,98 @@ "source": [ "# solution\n", "print(\n", - " \"Mean absolute error on testing set: \"\n", - " f\"{-cv_results['test_score'].mean():.3f} k$ \u00b1 \"\n", - " f\"{cv_results['test_score'].std():.3f}\"\n", + " \"Mean absolute error on testing set with original features: \"\n", + " f\"{-cv_results['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results['test_score'].std():.3f} g\"\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create a pipeline using `make_pipeline` consisting of a\n", + "`PolynomialFeatures` and a linear regression. Set `degree=2` and\n", + "`interaction_only=True` to the feature engineering step. Remember not to\n", + "include a \"bias\" feature (that is a constant-valued feature) to avoid\n", + "introducing a redundancy with the intercept of the subsequent linear\n", + "regression model.\n", + "\n", + "You may want to use the `.set_output(transform=\"pandas\")` method of the\n", + "pipeline to answer the next question." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# solution\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "poly_features = PolynomialFeatures(\n", + " degree=2, include_bias=False, interaction_only=True\n", + ")\n", + "linear_regression_interactions = make_pipeline(\n", + " poly_features, linear_regression\n", + ").set_output(transform=\"pandas\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform the first 5 rows of the dataset and look at the column names. How\n", + "many features are generated at the output of the `PolynomialFeatures` step in\n", + "the previous pipeline?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# solution\n", + "linear_regression_interactions.fit(data, target)\n", + "linear_regression_interactions[0].transform(data[:5])" + ] + }, { "cell_type": "markdown", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "solution" + ] + }, + "source": [ + "We observe that 3 features are generated, corresponding to the different\n", + "combinations of products of the 3 original features, i.e. we have 6\n", + "intermediate features in total. In general, given `p` original features, one\n", + "has `p * (p - 1) / 2` interactions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check that the values for the new interaction features are correct for a few\n", + "of them." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] }, "source": [ - "Inspect the fitted model using a box plot to show the distribution of values\n", - "for the coefficients returned from the cross-validation. Hint: use the\n", - "function\n", - "[`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html)\n", - "to create a box plot." 
+ "Let's now check that the value in the 1st row and the 5th column (3384.7) is\n", + "the product of the values at the first and third columns (respectively 181.0\n", + "and 18.7) of the same row:" ] }, { @@ -132,28 +223,183 @@ "outputs": [], "source": [ "# solution\n", - "import pandas as pd\n", - "\n", - "weights = pd.DataFrame(\n", - " [est.coef_ for est in cv_results[\"estimator\"]], columns=data.columns\n", + "culmen_length_first_sample = 181.0\n", + "culmen_depth_first_sample = 18.7\n", + "culmen_length_first_sample * culmen_depth_first_sample" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the same cross-validation strategy as done previously to estimate the mean\n", + "and std of the MAE in grams (g) for such a pipeline. Compare with the results\n", + "without feature engineering." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# solution\n", + "cv_results = cross_validate(\n", + " linear_regression_interactions,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_jobs=2,\n", + ")\n", + "print(\n", + " \"Mean absolute error on testing set with interactions: \"\n", + " f\"{-cv_results['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results['test_score'].std():.3f} g\"\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "We observe that the MAE is lower and less spread with the enriched features.\n", + "In this case the additional \"interaction\" features are indeed predictive.\n", + "Later in this module we will see what happens when the enriched features are\n", + "non-predictive and how to deal with this case." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Now let's try to build an alternative pipeline with an adjustable number of\n", + "intermediate features while keeping a similar predictive power. To do so, try\n", + "using the `Nystroem` transformer instead of `PolynomialFeatures`. Set the\n", + "kernel parameter to `\"poly\"` and `degree` to 2. Adjust the number of\n", + "components to be as small as possible while keeping a good cross-validation\n", + "performance.\n", + "\n", + "Hint: Use a `ValidationCurveDisplay` with `param_range = np.array([5, 10, 50,\n", + "100])` to find the optimal `n_components`." 
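As background for this hint (a sketch of the underlying idea, not required to solve the exercise): given $m$ = `n_components` landmark samples $x_{i_1}, \dots, x_{i_m}$ drawn from the training set, the Nyström method builds an explicit feature map that approximates the kernel,

```latex
\phi(x) = W^{-1/2}
\begin{pmatrix} k(x, x_{i_1}) \\ \vdots \\ k(x, x_{i_m}) \end{pmatrix},
\qquad W_{jl} = k(x_{i_j}, x_{i_l}),
\qquad \phi(x)^\top \phi(x') \approx k(x, x'),
```

so the approximation has rank at most $m$: a small `n_components` directly caps the effective complexity of the downstream linear model.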
+ ] + }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# solution\n", + "import numpy as np\n", + "\n", + "from sklearn.kernel_approximation import Nystroem\n", + "from sklearn.model_selection import ValidationCurveDisplay\n", + "\n", + "nystroem_regression = make_pipeline(\n", + " Nystroem(kernel=\"poly\", degree=2, random_state=0),\n", + " linear_regression,\n", + ")\n", + "\n", + "param_range = np.array([5, 10, 50, 100])\n", + "disp = ValidationCurveDisplay.from_estimator(\n", + " nystroem_regression,\n", + " data,\n", + " target,\n", + " param_name=\"nystroem__n_components\",\n", + " param_range=param_range,\n", + " cv=10,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " negate_score=True,\n", + " std_display_style=\"errorbar\",\n", + " n_jobs=2,\n", + ")\n", + "\n", + "_ = disp.ax_.set(\n", + " xlabel=\"Number of components\",\n", + " ylabel=\"Mean absolute error (g)\",\n", + " title=\"Validation curve for Nystroem regression\",\n", + ")" + ] + }, + { + "cell_type": "markdown", "metadata": { "tags": [ "solution" ] }, + "source": [ + "In the validation curve above we can observe that a small number of components\n", + "leads to an underfitting model, whereas a large number of components leads to\n", + "an overfitting model. The optimal number of Nystr\u00f6m components is around 10\n", + "for this dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How do the mean and std of the MAE for the Nystroem pipeline with optimal\n", + "`n_components` compare to the other previous models?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", + "# solution\n", + "nystroem_regression.set_params(nystroem__n_components=10)\n", + "cv_results = cross_validate(\n", + " nystroem_regression,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " scoring=\"neg_mean_absolute_error\",\n", + " n_jobs=2,\n", + ")\n", + "print(\n", + " \"Mean absolute error on testing set with nystroem: \"\n", + " f\"{-cv_results['test_score'].mean():.3f} \u00b1 \"\n", + " f\"{cv_results['test_score'].std():.3f} g\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "In this case we have a model with 10 features instead of 6, and which has\n", + "approximately the same prediction error as the model with interactions.\n", + "\n", + "Notice that if we had `p = 100` original features (instead of 3), the\n", + "`PolynomialFeatures` transformer would have generated `100 * (100 - 1) / 2 =\n", + "4950` additional interaction features (so we would have 5050 features in\n", + "total). The resulting pipeline would have been much slower to train and\n", + "predict and would have had a much larger memory footprint. Furthermore, the\n", + "large number of interaction features would probably have resulted in an\n", + "overfitting model.\n", + "\n", + "On the other hand, the `Nystroem` transformer generates a user-adjustable\n", + "number of features (`n_components`). Furthermore, the optimal number of\n", + "components is usually much smaller than that. 
So the `Nystroem` transformer\n",
+ "can be more scalable when the number of original features is too large for\n",
+ "`PolynomialFeatures` to be used.\n",
 "\n",
- "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n",
- "weights.plot.box(color=color, vert=False)\n",
- "_ = plt.title(\"Value of linear regression coefficients\")"
+ "The main downside of the `Nystroem` transformer is that it is not possible to\n",
+ "easily interpret the meaning of the generated features and therefore the\n",
+ "meaning of the learned coefficients for the downstream linear model."
 ]
 }
 ],
diff --git a/notebooks/linear_models_sol_03.ipynb b/notebooks/linear_models_sol_03.ipynb
new file mode 100644
index 000000000..0eabeeb54
--- /dev/null
+++ b/notebooks/linear_models_sol_03.ipynb
@@ -0,0 +1,200 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# \ud83d\udcc3 Solution for Exercise M4.03\n",
+ "\n",
+ "The parameter `penalty` can control the **type** of regularization to use,\n",
+ "whereas the regularization **strength** is set using the parameter `C`.\n",
+ "Setting `penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n",
+ "this exercise, we ask you to train a logistic regression classifier using the\n",
+ "`penalty=\"l2\"` regularization (which happens to be the default in\n",
+ "scikit-learn) to find out by yourself the effect of the parameter `C`.\n",
+ "\n",
+ "We start by loading the dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n", + "

Note

\n", + "

If you want a deeper overview regarding this dataset, you can refer to the\n", + "Appendix - Datasets description section at the end of this MOOC.

\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", + "# only keep the Adelie and Chinstrap classes\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", + "\n", + "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", + "target_column = \"Species\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "penguins_train, penguins_test = train_test_split(penguins, random_state=0)\n", + "\n", + "data_train = penguins_train[culmen_columns]\n", + "data_test = penguins_test[culmen_columns]\n", + "\n", + "target_train = penguins_train[target_column]\n", + "target_test = penguins_test[target_column]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's create our predictive model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_regression = make_pipeline(\n", + " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given the following candidates for the `C` parameter, find out the impact of\n", + "`C` on the classifier decision boundary. You can use\n", + "`sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the\n", + "decision function boundary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Cs = [0.01, 0.1, 1, 10]\n", + "\n", + "# solution\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.inspection import DecisionBoundaryDisplay\n", + "\n", + "for C in Cs:\n", + " logistic_regression.set_params(logisticregression__C=C)\n", + " logistic_regression.fit(data_train, target_train)\n", + " accuracy = logistic_regression.score(data_test, target_test)\n", + "\n", + " DecisionBoundaryDisplay.from_estimator(\n", + " logistic_regression,\n", + " data_test,\n", + " response_method=\"predict\",\n", + " cmap=\"RdBu_r\",\n", + " alpha=0.5,\n", + " )\n", + " sns.scatterplot(\n", + " data=penguins_test,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=[\"tab:red\", \"tab:blue\"],\n", + " )\n", + " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", + " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at the impact of the `C` hyperparameter on the magnitude of the weights." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# solution\n",
+ "weights_ridge = []\n",
+ "for C in Cs:\n",
+ "    logistic_regression.set_params(logisticregression__C=C)\n",
+ "    logistic_regression.fit(data_train, target_train)\n",
+ "    coefs = logistic_regression[-1].coef_[0]\n",
+ "    weights_ridge.append(pd.Series(coefs, index=culmen_columns))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "tags": [
+ "solution"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f\"C: {C}\" for C in Cs])\n",
+ "weights_ridge.plot.barh()\n",
+ "_ = plt.title(\"LogisticRegression weights depending on C\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": [
+ "solution"
+ ]
+ },
+ "source": [
+ "We see that a small `C` shrinks the weight values toward zero. It means\n",
+ "that a small `C` provides a more regularized model. Thus, `C` is the inverse\n",
+ "of the `alpha` coefficient in the `Ridge` model.\n",
+ "\n",
+ "Besides, with a strong penalty (i.e. small `C` value), the weight of the\n",
+ "feature \"Culmen Depth (mm)\" is almost zero. It explains why the decision\n",
+ "separation in the plot is almost perpendicular to the \"Culmen Length (mm)\"\n",
+ "feature."
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "main_language": "python"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/notebooks/linear_regression_in_sklearn.ipynb b/notebooks/linear_regression_in_sklearn.ipynb
index 75ea3b566..d9060d0ed 100644
--- a/notebooks/linear_regression_in_sklearn.ipynb
+++ b/notebooks/linear_regression_in_sklearn.ipynb
@@ -7,12 +7,12 @@
 "# Linear regression using scikit-learn\n",
 "\n",
 "In the previous notebook, we presented the parametrization of a linear model.\n",
- "During the exercise, you saw that varying parameters will give different\n",
- "models that will fit better or worse the data. To evaluate quantitatively this\n",
+ "During the exercise, you saw that varying parameters gives different models\n",
+ "that may fit the data better or worse. To quantitatively evaluate this\n",
 "goodness of fit, you implemented a so-called metric.\n",
 "\n",
- "When doing machine learning, you are interested in selecting the model which\n",
- "will minimize the error on the data available the most. From the previous\n",
+ "When doing machine learning, one is interested in selecting the model which\n",
+ "minimizes the error on the available data. From the previous\n",
 "exercise, we could implement a brute-force approach, varying the weights and\n",
 "intercept and select the model with the lowest error.\n",
 "\n",
@@ -65,9 +65,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "The instance `linear_regression` will store the parameter values in the\n",
- "attributes `coef_` and `intercept_`. We can check what the optimal model found\n",
- "is:"
+ "The instance `linear_regression` stores the parameter values in the attributes\n",
+ "`coef_` and `intercept_`. We can check what the optimal model found is:"
 ]
 },
 {
@@ -94,7 +93,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "We will use the weight and intercept to plot the model found using the\n",
+ "We can use the weight and intercept to plot the model found using\n",
 "scikit-learn."
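+ "\n",
+ "A minimal sketch of what such a plot can look like (an editor's illustration,\n",
+ "not part of the original notebook; it assumes the single-feature `data`\n",
+ "(flipper length) and `target` (body mass) variables used earlier in this\n",
+ "notebook):\n",
+ "\n",
+ "```python\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "# the fitted model is a straight line: coef_ * flipper length + intercept_\n",
+ "flipper_length_range = np.linspace(\n",
+ "    data.to_numpy().min(), data.to_numpy().max(), num=300\n",
+ ")\n",
+ "predicted_body_mass = (\n",
+ "    linear_regression.coef_[0] * flipper_length_range\n",
+ "    + linear_regression.intercept_\n",
+ ")\n",
+ "\n",
+ "plt.scatter(data, target, color=\"black\", alpha=0.5)\n",
+ "_ = plt.plot(flipper_length_range, predicted_body_mass)\n",
+ "```"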
]
 },
@@ -131,9 +130,12 @@
 "metadata": {},
 "source": [
 "In the solution of the previous exercise, we implemented a function to compute\n",
- "the goodness of fit of a model. Indeed, we mentioned two metrics: (i) the mean\n",
- "squared error and (ii) the mean absolute error. These metrics are implemented\n",
- "in scikit-learn and we do not need to use our own implementation.\n",
+ "the goodness of fit of a model. Indeed, we mentioned two metrics: (i) the\n",
+ "[mean squared\n",
+ "error](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-squared-error)\n",
+ "and (ii) the [mean absolute\n",
+ "error](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-error).\n",
+ "Let's see how to use the implementations from scikit-learn in the following.\n",
 "\n",
 "We can first compute the mean squared error."
 ]
 },
@@ -158,7 +160,7 @@
 "A linear regression model minimizes the mean squared error on the training\n",
 "set. This means that the parameters obtained after the fit (i.e. `coef_` and\n",
 "`intercept_`) are the optimal parameters that minimize the mean squared\n",
- "error. In other words, any other choice of parameters will yield a model with\n",
+ "error. In other words, any other choice of parameters would yield a model with\n",
 "a higher mean squared error on the training set.\n",
 "\n",
 "However, the mean squared error is difficult to interpret. The mean absolute\n",
@@ -180,9 +182,7 @@
 },
 {
 "cell_type": "markdown",
- "metadata": {
- "lines_to_next_cell": 2
- },
+ "metadata": {},
 "source": [
 "A mean absolute error of 313 means that on average, our model makes an error of\n",
 "\u00b1 313 grams when predicting the body mass of a penguin given its flipper\n",
diff --git a/notebooks/linear_regression_non_linear_link.ipynb b/notebooks/linear_regression_non_linear_link.ipynb
index e2783cfb5..d56505e65 100644
--- a/notebooks/linear_regression_non_linear_link.ipynb
+++ b/notebooks/linear_regression_non_linear_link.ipynb
@@ -6,20 +6,20 @@
 "source": [
 "# Linear regression for a non-linear features-target relationship\n",
 "\n",
- "In the previous exercise, you were asked to train a linear regression model on\n",
- "a dataset where the matrix `data` and the vector `target` do not have a linear\n",
- "link.\n",
- "\n",
- "In this notebook, we show that even if the parametrization of linear models is\n",
- "not natively adapted to the problem at hand, it is still possible to make\n",
- "linear models more expressive by engineering additional features.\n",
+ "In this notebook, we show that even if linear models are not natively adapted\n",
+ "to express a `target` that is not a linear function of the `data`, it is still\n",
+ "possible to make linear models more expressive by engineering additional\n",
+ "features.\n",
 "\n",
 "A machine learning pipeline that combines a non-linear feature engineering\n",
- "step followed by a linear regression step can therefore be considered\n",
+ "step followed by a linear regression step can therefore be considered a\n",
 "non-linear regression model as a whole.\n",
 "\n",
- "To illustrate these concepts, we will reuse the same dataset generated in the\n",
- "previous exercise."
+ "
\n", + "

Tip

\n", + "

np.random.RandomState allows to create a random number generator which can\n", + "be later used to get deterministic results.

\n", + "
" ] }, { @@ -47,8 +47,8 @@ "source": [ "
\n", "

Note

\n", - "

To ease the plotting, we will create a pandas dataframe containing the data\n", - "and target:

\n", + "

To ease the plotting, we create a pandas dataframe containing the data and\n", + "target:

\n", "
" ] }, @@ -80,8 +80,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will highlight the limitations of fitting a linear regression model as done\n", - "in the previous exercise.\n", + "We now observe the limitations of fitting a linear regression model.\n", "\n", "
\n", "

Warning

\n", @@ -166,7 +165,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It is important to note that the learnt model will not be able to handle the\n", + "It is important to note that the learnt model is not able to handle the\n", "non-linear relationship between `data` and `target` since linear models assume\n", "the relationship between `data` and `target` to be linear.\n", "\n", @@ -351,9 +350,11 @@ "metadata": {}, "source": [ "The last possibility is to make a linear model more expressive is to use a\n", - "\"kernel\". Instead of learning a weight per feature as we previously\n", - "emphasized, a weight will be assigned to each sample. However, not all samples\n", - "will be used. This is the base of the support vector machine algorithm.\n", + "\"kernel\". Instead of learning one weight per feature as we previously did, a\n", + "weight is assigned to each sample. However, not all samples are used: some\n", + "redundant data points of the training set are assigned a weight of 0 so\n", + "that they do no influence the model's prediction function. This is the\n", + "main intuition of the support vector machine algorithm.\n", "\n", "The mathematical definition of \"kernels\" and \"support vector machines\" is\n", "beyond the scope of this course. We encourage interested readers with a\n", @@ -402,9 +403,9 @@ "line. `SVR(kernel=\"linear\")` is indeed yet another example of a linear model.\n", "\n", "The estimator can also be configured to use a non-linear kernel. Then, it can\n", - "learn a prediction function that computes non-linear interaction between\n", - "samples for which we want to make a prediction and selected samples from the\n", - "training set.\n", + "learn a prediction function that computes non-linear relations between samples\n", + "for which we want to make a prediction and selected samples from the training\n", + "set.\n", "\n", "The result is another kind of non-linear regression model with a similar\n", "expressivity as our previous polynomial regression pipeline:" @@ -477,6 +478,16 @@ "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Nystroem` is a nice alternative to `PolynomialFeatures` that makes it\n", + "possible to keep the memory usage of the transformed dataset under control.\n", + "However, interpreting the meaning of the intermediate features can be\n", + "challenging." + ] + }, { "cell_type": "code", "execution_count": null, @@ -486,7 +497,7 @@ "from sklearn.kernel_approximation import Nystroem\n", "\n", "nystroem_regression = make_pipeline(\n", - " Nystroem(n_components=5),\n", + " Nystroem(kernel=\"poly\", degree=3, n_components=5, random_state=0),\n", " LinearRegression(),\n", ")\n", "nystroem_regression.fit(data, target)\n", @@ -499,6 +510,24 @@ "ax.plot(data, target_predicted)\n", "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notebook Recap\n", + "\n", + "In this notebook we explored several ways to expand a single numerical feature\n", + "into several non-linearly derived new features. This makes our machine\n", + "learning pipeline more expressive and less likely to underfit, even if the\n", + "last stage of the pipeline is a simple linear regression model.\n", + "\n", + "For the sake of simplicity, we introduced those transformers on a toy\n", + "regression problem with a single input feature. 
However, non-linear feature\n", + "transformers such as Nystroem can further improve the expressiveness of\n", + "machine learning pipelines to model non-linear interactions between features.\n", + "We will explore this possibility in the next exercise." + ] } ], "metadata": { diff --git a/notebooks/logistic_regression.ipynb b/notebooks/logistic_regression.ipynb index fc41a3402..4c4cf0de7 100644 --- a/notebooks/logistic_regression.ipynb +++ b/notebooks/logistic_regression.ipynb @@ -6,11 +6,11 @@ "source": [ "# Linear model for classification\n", "\n", - "In regression, we saw that the target to be predicted was a continuous\n", - "variable. In classification, this target will be discrete (e.g. categorical).\n", + "In regression, we saw that the target to be predicted is a continuous\n", + "variable. In classification, the target is discrete (e.g. categorical).\n", "\n", - "We will go back to our penguin dataset. However, this time we will try to\n", - "predict the penguin species using the culmen information. We will also\n", + "In this notebook we go back to the penguin dataset. However, this time the\n", + "task is to predict the penguin species using the culmen information. We also\n", "simplify our classification problem by selecting only 2 of the penguin species\n", "to solve a binary classification problem." ] @@ -74,8 +74,8 @@ "increases, the probability that the penguin is a Chinstrap is closer to 1.\n", "However, the culmen depth is not helpful for predicting the penguin species.\n", "\n", - "For model fitting, we will separate the target from the data and we will\n", - "create a training and a testing set." + "For model fitting, we separate the target from the data and we create a\n", + "training and a testing set." ] }, { @@ -99,7 +99,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The linear regression that we previously saw will predict a continuous output.\n", + "The linear regression that we previously saw predicts a continuous output.\n", "When the target is a binary outcome, one can use the logistic function to\n", "model the probability. This model is known as logistic regression.\n", "\n", @@ -117,9 +117,7 @@ "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "\n", - "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=None)\n", - ")\n", + "logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())\n", "logistic_regression.fit(data_train, target_train)\n", "accuracy = logistic_regression.score(data_test, target_test)\n", "print(f\"Accuracy on test set: {accuracy:.3f}\")" @@ -136,9 +134,9 @@ "\n", "
\n", "

Note

\n", - "

Here, we will use the class DecisionBoundaryDisplay. This educational tool\n", - "allows us to gain some insights by plotting the decision function boundary\n", - "learned by the classifier in a 2 dimensional feature space.

\n", + "

Here, we use the class DecisionBoundaryDisplay. This educational tool allows\n", + "us to gain some insights by plotting the decision function boundary learned by\n", + "the classifier in a 2 dimensional feature space.

\n", "

Notice, however, that in more realistic machine learning contexts, one would\n",
 "typically fit on more than two features at once and therefore it would not be\n",
 "possible to display such a visualization of the decision boundary in\n",
@@ -177,8 +175,7 @@
 "metadata": {},
 "source": [
 "Thus, we see that our decision function is represented by a line separating\n",
- "the 2 classes. We should also note that we did not impose any regularization\n",
- "by setting the parameter `penalty` to `'none'`.\n",
+ "the 2 classes.\n",
 "\n",
 "Since the line is oblique, it means that we used a combination of both\n",
 "features:"
 ]
 },
@@ -225,7 +222,15 @@
 "\n",
 "    x1 = -coef0 / coef1 * x0 - intercept / coef1\n",
 "\n",
- "which is the equation of a straight line."
+ "which is the equation of a straight line.\n",
+ "\n",
+ "

\n", + "

Note

\n", + "

If you want to go further, try changing the response_method to\n", + "\"predict_proba\" in the DecisionBoundaryDisplay above. Now the boundaries\n", + "encode by color the estimated probability of belonging to either class, as\n", + "mentioned in the introductory slides \ud83c\udfa5 Intuitions on linear models.

\n", + "
" ] } ], diff --git a/notebooks/metrics_classification.ipynb b/notebooks/metrics_classification.ipynb index 4cda309ba..9191bae79 100644 --- a/notebooks/metrics_classification.ipynb +++ b/notebooks/metrics_classification.ipynb @@ -614,7 +614,42 @@ "our classifier. However, it is important to observe that the lower bound of\n", "the ROC-AUC is 0.5. Indeed, we show the generalization performance of a dummy\n", "classifier (the orange dashed line) to show that even the worst generalization\n", - "performance obtained will be above this line." + "performance obtained will be above this line.\n", + "\n", + "Instead of using a dummy classifier, you can use the parameter `plot_chance_level`\n", + "available in the ROC and PR displays:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(15, 7))\n", + "\n", + "PrecisionRecallDisplay.from_estimator(\n", + " classifier,\n", + " data_test,\n", + " target_test,\n", + " pos_label=\"donated\",\n", + " marker=\"+\",\n", + " plot_chance_level=True,\n", + " chance_level_kw={\"color\": \"tab:orange\", \"linestyle\": \"--\"},\n", + " ax=axs[0],\n", + ")\n", + "RocCurveDisplay.from_estimator(\n", + " classifier,\n", + " data_test,\n", + " target_test,\n", + " pos_label=\"donated\",\n", + " marker=\"+\",\n", + " plot_chance_level=True,\n", + " chance_level_kw={\"color\": \"tab:orange\", \"linestyle\": \"--\"},\n", + " ax=axs[1],\n", + ")\n", + "\n", + "_ = fig.suptitle(\"PR and ROC curves\")" ] } ], diff --git a/notebooks/metrics_regression.ipynb b/notebooks/metrics_regression.ipynb index 36b27bb14..cf9199f37 100644 --- a/notebooks/metrics_regression.ipynb +++ b/notebooks/metrics_regression.ipynb @@ -6,7 +6,7 @@ "source": [ "# Regression\n", "\n", - "In this notebook, we will present the metrics that can be used in regression.\n", + "In this notebook, we present the metrics that can be used in regression.\n", "\n", "A set of metrics are dedicated to regression. Indeed, classification metrics\n", "cannot be used to evaluate the generalization performance of regression models\n", @@ -14,9 +14,9 @@ "it is a continuous variable in regression, while a discrete variable in\n", "classification.\n", "\n", - "We will use the Ames housing dataset. The goal is to predict the price of\n", - "houses in the city of Ames, Iowa. As with classification, we will only use a\n", - "single train-test split to focus solely on the regression metrics." + "We use the Ames housing dataset. The goal is to predict the price of houses in\n", + "the city of Ames, Iowa. As with classification, we only use a single\n", + "train-test split to focus solely on the regression metrics." ] }, { @@ -76,7 +76,7 @@ "error (MSE). Thus, this metric is sometimes used to evaluate the model since\n", "it is optimized by said model.\n", "\n", - "We will give an example using a linear regression model." + "We give an example using a linear regression model." ] }, { @@ -103,8 +103,8 @@ "metadata": {}, "source": [ "Our linear regression model is minimizing the mean squared error on the\n", - "training set. It means that there is no other set of coefficients which will\n", - "decrease the error.\n", + "training set. It means that there is no other set of coefficients which\n", + "decreases the error.\n", "\n", "Then, we can compute the mean squared error on the test set." ] @@ -129,8 +129,10 @@ "source": [ "The raw MSE can be difficult to interpret. 
One way is to rescale the MSE by\n",
 "the variance of the target. This score is known as the $R^2$, also called the\n",
- "coefficient of determination. Indeed, this is the default score used in\n",
- "scikit-learn by calling the method `score`."
+ "[coefficient of\n",
+ "determination](https://scikit-learn.org/stable/modules/model_evaluation.html#r2-score-the-coefficient-of-determination).\n",
+ "Indeed, this is the default score used in scikit-learn by calling the method\n",
+ "`score`."
 ]
 },
@@ -249,21 +251,29 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "In addition of metrics, we can visually represent the results by plotting the\n",
- "predicted values versus the true values."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "predicted_actual = {\n",
- "    \"True values (k$)\": target_test,\n",
- "    \"Predicted values (k$)\": target_predicted,\n",
- "}\n",
- "predicted_actual = pd.DataFrame(predicted_actual)"
+ "In addition to using metrics, we can visualize the results by plotting the\n",
+ "predicted values versus the true values.\n",
+ "\n",
+ "In an ideal scenario where all variations in the target could be perfectly\n",
+ "explained by the observed features (i.e. without any unobserved factors of\n",
+ "variation), and we have chosen an optimal model, we would expect all\n",
+ "predictions to fall along the diagonal line of the first plot below.\n",
+ "\n",
+ "In real life, this is almost never the case: some unknown fraction of the\n",
+ "variations in the target cannot be explained by variations in the data: they\n",
+ "stem from external factors not represented by the observed features.\n",
+ "\n",
+ "Therefore, the best we can hope for is that our model's predictions form a\n",
+ "cloud of points symmetrically distributed around the diagonal line, ideally\n",
+ "close enough to it for the model to be useful.\n",
+ "\n",
+ "To gain more insight, it can be helpful to plot the residuals, which represent\n",
+ "the difference between the actual and predicted values, against the predicted\n",
+ "values. This is shown in the second plot.\n",
+ "\n",
+ "Residual plots make it easier to assess if the residuals exhibit a variance\n",
+ "independent of the target values or if there is any systematic bias of the\n",
+ "model associated with the lowest or highest predicted values."
]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
+ "from sklearn.metrics import PredictionErrorDisplay\n",
+ "\n",
+ "fig, axs = plt.subplots(ncols=2, figsize=(13, 5))\n",
+ "\n",
+ "PredictionErrorDisplay.from_predictions(\n",
+ "    y_true=target_test,\n",
+ "    y_pred=target_predicted,\n",
+ "    kind=\"actual_vs_predicted\",\n",
+ "    scatter_kwargs={\"alpha\": 0.5},\n",
+ "    ax=axs[0],\n",
+ ")\n",
+ "axs[0].axis(\"square\")\n",
+ "axs[0].set_xlabel(\"Predicted values (k$)\")\n",
+ "axs[0].set_ylabel(\"True values (k$)\")\n",
 "\n",
- "sns.scatterplot(\n",
- "    data=predicted_actual,\n",
- "    x=\"True values (k$)\",\n",
- "    y=\"Predicted values (k$)\",\n",
- "    color=\"black\",\n",
- "    alpha=0.5,\n",
+ "PredictionErrorDisplay.from_predictions(\n",
+ "    y_true=target_test,\n",
+ "    y_pred=target_predicted,\n",
+ "    kind=\"residual_vs_predicted\",\n",
+ "    scatter_kwargs={\"alpha\": 0.5},\n",
+ "    ax=axs[1],\n",
 ")\n",
- "plt.axline((0, 0), slope=1, label=\"Perfect fit\")\n",
- "plt.axis(\"square\")\n",
- "_ = plt.title(\"Regression using a model without \\ntarget transformation\")"
+ "axs[1].axis(\"square\")\n",
+ "axs[1].set_xlabel(\"Predicted values (k$)\")\n",
+ "axs[1].set_ylabel(\"Residual values (k$)\")\n",
+ "\n",
+ "_ = fig.suptitle(\n",
+ "    \"Regression using a model\\nwithout target transformation\", y=1.1\n",
+ ")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "On this plot, correct predictions would lie on the diagonal line. This plot\n",
- "allows us to detect if the model makes errors in a consistent way, i.e. has\n",
- "some bias.\n",
- "\n",
- "On this plot, we see that for the large True price values, our model tends to\n",
- "under-estimate the price of the house. Typically, this issue arises when the\n",
- "target to predict does not follow a normal distribution. In this case the\n",
- "model would benefit from target transformation."
+ "On these plots, we see that our model tends to under-estimate the price of the\n",
+ "house both for the lowest and largest True price values. This means that the\n",
+ "residuals still hold some **structure, typically visible as the \"banana\" or\n",
+ "\"smile\" shape of the residual plot**. This is often a clue that our model\n",
+ "could be improved, either by transforming the features, the target, or\n",
+ "sometimes changing the model type or its parameters. In this case, let's try to\n",
+ "see if the model would benefit from a target transformation that monotonically\n",
+ "reshapes the target variable to follow a normal distribution."
]
 },
@@ -317,20 +344,49 @@
 "    regressor=regressor, transformer=transformer\n",
 ")\n",
 "model_transformed_target.fit(data_train, target_train)\n",
- "target_predicted = model_transformed_target.predict(data_test)"
+ "target_predicted = model_transformed_target.predict(data_test)\n",
+ "\n",
+ "fig, axs = plt.subplots(ncols=2, figsize=(13, 5))\n",
+ "\n",
+ "PredictionErrorDisplay.from_predictions(\n",
+ "    y_true=target_test,\n",
+ "    y_pred=target_predicted,\n",
+ "    kind=\"actual_vs_predicted\",\n",
+ "    scatter_kwargs={\"alpha\": 0.5},\n",
+ "    ax=axs[0],\n",
+ ")\n",
+ "axs[0].axis(\"square\")\n",
+ "axs[0].set_xlabel(\"Predicted values (k$)\")\n",
+ "axs[0].set_ylabel(\"True values (k$)\")\n",
+ "\n",
+ "PredictionErrorDisplay.from_predictions(\n",
+ "    y_true=target_test,\n",
+ "    y_pred=target_predicted,\n",
+ "    kind=\"residual_vs_predicted\",\n",
+ "    scatter_kwargs={\"alpha\": 0.5},\n",
+ "    ax=axs[1],\n",
+ ")\n",
+ "axs[1].axis(\"square\")\n",
+ "axs[1].set_xlabel(\"Predicted values (k$)\")\n",
+ "axs[1].set_ylabel(\"Residual values (k$)\")\n",
+ "\n",
+ "_ = fig.suptitle(\n",
+ "    \"Regression using a model that\\ntransforms the target before fitting\",\n",
+ "    y=1.1,\n",
+ ")"
 ]
 },
 {
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
 "metadata": {},
- "outputs": [],
 "source": [
- "predicted_actual = {\n",
- "    \"True values (k$)\": target_test,\n",
- "    \"Predicted values (k$)\": target_predicted,\n",
- "}\n",
- "predicted_actual = pd.DataFrame(predicted_actual)"
+ "The model with the transformed target seems to exhibit less structure in its\n",
+ "residuals: over-estimation and under-estimation errors seem to be more\n",
+ "balanced.\n",
+ "\n",
+ "We can confirm this by computing the previously mentioned metrics and observing\n",
+ "that they all improve w.r.t. the linear regression model without the target\n",
+ "transformation."
 ]
 },
 {
@@ -339,18 +395,17 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "sns.scatterplot(\n",
- "    data=predicted_actual,\n",
- "    x=\"True values (k$)\",\n",
- "    y=\"Predicted values (k$)\",\n",
- "    color=\"black\",\n",
- "    alpha=0.5,\n",
+ "print(\n",
+ "    \"Mean absolute error: \"\n",
+ "    f\"{mean_absolute_error(target_test, target_predicted):.3f} k$\"\n",
+ ")\n",
+ "print(\n",
+ "    \"Median absolute error: \"\n",
+ "    f\"{median_absolute_error(target_test, target_predicted):.3f} k$\"\n",
 ")\n",
- "plt.axline((0, 0), slope=1, label=\"Perfect fit\")\n",
- "plt.axis(\"square\")\n",
- "plt.legend()\n",
- "_ = plt.title(\n",
- "    \"Regression using a model that\\ntransform the target before fitting\"\n",
+ "print(\n",
+ "    \"Mean absolute percentage error: \"\n",
+ "    f\"{mean_absolute_percentage_error(target_test, target_predicted):.2%}\"\n",
 ")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Thus, once we transformed the target, we see that we corrected some of the\n",
- "high values."
+ "While a common practice, performing such a target transformation for linear\n",
+ "regression is often disapproved by statisticians. It is mathematically more\n",
+ "justified to instead adapt the loss function of the regression model itself,\n",
+ "for instance by fitting a\n",
+ "[`PoissonRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PoissonRegressor.html)\n",
+ "or a\n",
+ "[`TweedieRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html)\n",
+ "model instead of `LinearRegression`. 
In particular, those models indeed use an\n",
+ "internal \"log link\" function that makes them more suited for this kind of\n",
+ "positive-only target data distributions, but this analysis is beyond the scope\n",
+ "of this MOOC.\n",
+ "\n",
+ "Interested readers are encouraged to learn more about those models, in\n",
+ "particular by reading their respective docstrings and the linked sections\n",
+ "in the scikit-learn user guide reachable from the links above."
 ]
 }
 ],
diff --git a/notebooks/metrics_sol_02.ipynb b/notebooks/metrics_sol_02.ipynb
index 9dff91ae7..2efef3f68 100644
--- a/notebooks/metrics_sol_02.ipynb
+++ b/notebooks/metrics_sol_02.ipynb
@@ -87,8 +87,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Then, instead of using the $R^2$ score, use the mean absolute error. You need\n",
- "to refer to the documentation for the `scoring` parameter."
+ "Then, instead of using the $R^2$ score, use the mean absolute error (MAE). You\n",
+ "may need to refer to the documentation for the `scoring` parameter."
 ]
 },
@@ -160,6 +160,87 @@
 "scores = pd.DataFrame(scores)\n",
 "scores"
 ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": [
+ "solution"
+ ]
+ },
+ "source": [
+ "In the Regression Metrics notebook, we introduced the concept of loss function,\n",
+ "which is the metric optimized when training a model. In the case of\n",
+ "`LinearRegression`, the fitting process consists in minimizing the mean squared\n",
+ "error (MSE). Some estimators, such as `HistGradientBoostingRegressor`, can\n",
+ "use different loss functions, to be set using the `loss` hyperparameter.\n",
+ "\n",
+ "Notice that the evaluation metrics and the loss functions are not necessarily\n",
+ "the same. Let's see an example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# solution\n",
+ "from collections import defaultdict\n",
+ "from sklearn.ensemble import HistGradientBoostingRegressor\n",
+ "\n",
+ "scoring = [\"neg_mean_squared_error\", \"neg_mean_absolute_error\"]\n",
+ "loss_functions = [\"squared_error\", \"absolute_error\"]\n",
+ "scores = defaultdict(list)\n",
+ "\n",
+ "for loss_func in loss_functions:\n",
+ "    model = HistGradientBoostingRegressor(loss=loss_func)\n",
+ "    cv_results = cross_validate(model, data, target, scoring=scoring)\n",
+ "    mse = -cv_results[\"test_neg_mean_squared_error\"]\n",
+ "    mae = -cv_results[\"test_neg_mean_absolute_error\"]\n",
+ "    scores[\"loss\"].append(loss_func)\n",
+ "    scores[\"MSE\"].append(f\"{mse.mean():.1f} \u00b1 {mse.std():.1f}\")\n",
+ "    scores[\"MAE\"].append(f\"{mae.mean():.1f} \u00b1 {mae.std():.1f}\")\n",
+ "scores = pd.DataFrame(scores)\n",
+ "scores.set_index(\"loss\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": [
+ "solution"
+ ]
+ },
+ "source": [
+ "Even if the score distributions overlap due to the presence of outliers in the\n",
+ "dataset, it is true that the average MSE is lower when `loss=\"squared_error\"`,\n",
+ "whereas the average MAE is lower when `loss=\"absolute_error\"` as expected.\n",
+ "Indeed, the choice of a loss function is made depending on the evaluation\n",
+ "metric that we want to optimize for a given use case.\n",
+ "\n",
+ "If you feel like going beyond the contents of this MOOC, you can try different\n",
+ "combinations of loss functions and evaluation metrics.\n",
+ "\n",
+ "Notice that there are some metrics that cannot be directly optimized by\n",
+ "optimizing a loss function. 
This is the case for metrics that evolve in a\n",
+ "discontinuous manner with respect to the internal parameters of the model, as\n",
+ "learning solvers based on gradient descent or similar optimizers require\n",
+ "continuity (the details are beyond the scope of this MOOC).\n",
+ "\n",
+ "For instance, classification models are often evaluated using metrics computed\n",
+ "on hard class predictions (i.e. whether a sample belongs to a given class)\n",
+ "rather than from continuous values such as\n",
+ "[`predict_proba`](https://scikit-learn.org/stable/glossary.html#term-predict_proba)\n",
+ "(i.e. the estimated probability of belonging to said given class). Because of\n",
+ "this, classifiers are typically trained by optimizing a loss function computed\n",
+ "from some continuous output of the model. We call it a \"surrogate loss\" as it\n",
+ "substitutes the metric of interest. For instance `LogisticRegression`\n",
+ "minimizes the `log_loss` applied to the `predict_proba` output of the model.\n",
+ "By minimizing the surrogate loss, we hope to also maximize the accuracy.\n",
+ "However, scikit-learn does not provide surrogate losses for all possible\n",
+ "classification metrics."
+ ]
 }
 ],
 "metadata": {
diff --git a/notebooks/parameter_tuning_manual.ipynb b/notebooks/parameter_tuning_manual.ipynb
index 585d8a5fb..fcd4d99ca 100644
--- a/notebooks/parameter_tuning_manual.ipynb
+++ b/notebooks/parameter_tuning_manual.ipynb
@@ -6,27 +6,19 @@
 "source": [
 "# Set and get hyperparameters in scikit-learn\n",
 "\n",
- "The process of learning a predictive model is driven by a set of internal\n",
- "parameters and a set of training data. These internal parameters are called\n",
- "hyperparameters and are specific for each family of models. In addition, a\n",
- "specific set of hyperparameters are optimal for a specific dataset and thus\n",
- "they need to be optimized.\n",
- "\n",
- "
\n", - "

Note

\n", - "

In this notebook we will use the words \"hyperparameters\" and \"parameters\"\n", - "interchangeably.

\n", - "
\n", + "Recall that hyperparameters refer to the parameters that control the learning\n", + "process of a predictive model and are specific for each family of models. In\n", + "addition, the optimal set of hyperparameters is specific to each dataset and\n", + "thus they always need to be optimized.\n", "\n", "This notebook shows how one can get and set the value of a hyperparameter in a\n", - "scikit-learn estimator. We recall that hyperparameters refer to the parameter\n", - "that will control the learning process.\n", + "scikit-learn estimator.\n", "\n", "They should not be confused with the fitted parameters, resulting from the\n", "training. These fitted parameters are recognizable in scikit-learn because\n", "they are spelled with a final underscore `_`, for instance `model.coef_`.\n", "\n", - "We will start by loading the adult census dataset and only use the numerical\n", + "We start by loading the adult census dataset and only use the numerical\n", "features." ] }, @@ -122,7 +114,7 @@ "metadata": {}, "source": [ "We created a model with the default `C` value that is equal to 1. If we wanted\n", - "to use a different `C` parameter we could have done so when we created the\n", + "to use a different `C` hyperparameter we could have done so when we created the\n", "`LogisticRegression` object with something like `LogisticRegression(C=1e-3)`.\n", "\n", "
\n", @@ -132,9 +124,9 @@ "Be aware that we will focus on linear models in an upcoming module.

\n", "
\n", "\n", - "We can also change the parameter of a model after it has been created with the\n", - "`set_params` method, which is available for all scikit-learn estimators. For\n", - "example, we can set `C=1e-3`, fit and evaluate the model:" + "We can also change the hyperparameter of a model after it has been created\n", + "with the `set_params` method, which is available for all scikit-learn\n", + "estimators. For example, we can set `C=1e-3`, fit and evaluate the model:" ] }, { @@ -156,14 +148,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When the model of interest is a `Pipeline`, the parameter names are of the\n", - "form `__` (note the double underscore in the\n", - "middle). In our case, `classifier` comes from the `Pipeline` definition and\n", - "`C` is the parameter name of `LogisticRegression`.\n", + "When the model of interest is a `Pipeline`, the hyperparameter names are of\n", + "the form `__` (note the double underscore in\n", + "the middle). In our case, `classifier` comes from the `Pipeline` definition\n", + "and `C` is the hyperparameter name of `LogisticRegression`.\n", "\n", "In general, you can use the `get_params` method on scikit-learn models to list\n", - "all the parameters with their values. For example, if you want to get all the\n", - "parameter names, you can use:" + "all the hyperparameters with their values. For example, if you want to get all\n", + "the hyperparameter names, you can use:" ] }, { @@ -180,9 +172,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`.get_params()` returns a `dict` whose keys are the parameter names and whose\n", - "values are the parameter values. If you want to get the value of a single\n", - "parameter, for example `classifier__C`, you can use:" + "`.get_params()` returns a `dict` whose keys are the hyperparameter names and\n", + "whose values are the hyperparameter values. If you want to get the value of a\n", + "single hyperparameter, for example `classifier__C`, you can use:" ] }, { @@ -243,7 +235,7 @@ "source": [ "In this notebook we have seen:\n", "\n", - "- how to use `get_params` and `set_params` to get the parameters of a model\n", + "- how to use `get_params` and `set_params` to get the hyperparameters of a model\n", " and set them." ] } diff --git a/notebooks/parameter_tuning_randomized_search.ipynb b/notebooks/parameter_tuning_randomized_search.ipynb index 37b4272ea..11bfac389 100644 --- a/notebooks/parameter_tuning_randomized_search.ipynb +++ b/notebooks/parameter_tuning_randomized_search.ipynb @@ -10,12 +10,13 @@ "search for the best hyperparameters maximizing the generalization performance\n", "of a predictive model.\n", "\n", - "However, a grid-search approach has limitations. It does not scale when the\n", - "number of parameters to tune is increasing. Also, the grid will impose a\n", - "regularity during the search which might be problematic.\n", + "However, a grid-search approach has limitations. It does not scale well when\n", + "the number of parameters to tune increases. Also, the grid imposes a\n", + "regularity during the search which might miss better parameter\n", + "values between two consecutive values on the grid.\n", "\n", - "In this notebook, we will present another method to tune hyperparameters\n", - "called randomized search." + "In this notebook, we present a different method to tune hyperparameters called\n", + "randomized search." 
] }, { @@ -98,8 +99,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will create the same predictive pipeline as seen in the grid-search\n", - "section." + "We create the same predictive pipeline as done for the grid-search section." ] }, { @@ -155,26 +155,26 @@ "\n", "With the `GridSearchCV` estimator, the parameters need to be specified\n", "explicitly. We already mentioned that exploring a large number of values for\n", - "different parameters will be quickly untractable.\n", + "different parameters quickly becomes untractable.\n", "\n", "Instead, we can randomly generate the parameter candidates. Indeed, such\n", "approach avoids the regularity of the grid. Hence, adding more evaluations can\n", "increase the resolution in each direction. This is the case in the frequent\n", "situation where the choice of some hyperparameters is not very important, as\n", - "for hyperparameter 2 in the figure below.\n", + "for the hyperparameter 2 in the figure below.\n", "\n", "![Randomized vs grid search](../figures/grid_vs_random_search.svg)\n", "\n", "Indeed, the number of evaluation points needs to be divided across the two\n", "different hyperparameters. With a grid, the danger is that the region of good\n", - "hyperparameters fall between the line of the grid: this region is aligned with\n", - "the grid given that hyperparameter 2 has a weak influence. Rather, stochastic\n", - "search will sample hyperparameter 1 independently from hyperparameter 2 and\n", - "find the optimal region.\n", + "hyperparameters may fall between lines of the grid. In the figure such region\n", + "is aligned with the grid given that hyperparameter 2 has a weak influence.\n", + "Rather, stochastic search samples the hyperparameter 1 independently from the\n", + "hyperparameter 2 and find the optimal region.\n", "\n", "The `RandomizedSearchCV` class allows for such stochastic search. It is used\n", "similarly to the `GridSearchCV` but the sampling distributions need to be\n", - "specified instead of the parameter values. For instance, we will draw\n", + "specified instead of the parameter values. For instance, we can draw\n", "candidates using a log-uniform distribution because the parameters we are\n", "interested in take positive values with a natural log scaling (.1 is as close\n", "to 1 as 10 is).\n", @@ -185,7 +185,7 @@ "grid search (with GridSearchCV) to optimize 3 or more hyperparameters.

\n", "
\n", "\n", - "We will optimize 3 other parameters in addition to the ones we optimized in\n", + "We now optimize 3 other parameters in addition to the ones we optimized in\n", "the notebook presenting the `GridSearchCV`:\n", "\n", "* `l2_regularization`: it corresponds to the strength of the regularization;\n", @@ -197,7 +197,7 @@ "We recall the meaning of the 2 remaining parameters:\n", "\n", "* `learning_rate`: it corresponds to the speed at which the gradient-boosting\n", - " will correct the residuals at each boosting iteration;\n", + " corrects the residuals at each boosting iteration;\n", "* `max_leaf_nodes`: it corresponds to the maximum number of leaves for each\n", " tree in the ensemble.\n", "\n", @@ -205,7 +205,7 @@ "

Note

\n", "

scipy.stats.loguniform can be used to generate floating numbers. To generate\n", "random values for integer-valued parameters (e.g. min_samples_leaf) we can\n", - "adapt is as follows:

\n", + "adapt it as follows:

\n", "
" ] }, diff --git a/notebooks/trees_classification.ipynb b/notebooks/trees_classification.ipynb index b92504bef..dfcae831c 100644 --- a/notebooks/trees_classification.ipynb +++ b/notebooks/trees_classification.ipynb @@ -204,7 +204,7 @@ "_ = plot_tree(\n", " tree,\n", " feature_names=culmen_columns,\n", - " class_names=tree.classes_,\n", + " class_names=tree.classes_.tolist(),\n", " impurity=False,\n", " ax=ax,\n", ")" diff --git a/notebooks/trees_regression.ipynb b/notebooks/trees_regression.ipynb index ccc4ff13e..5e137e01e 100644 --- a/notebooks/trees_regression.ipynb +++ b/notebooks/trees_regression.ipynb @@ -226,7 +226,7 @@ "from sklearn.tree import plot_tree\n", "\n", "_, ax = plt.subplots(figsize=(8, 6))\n", - "_ = plot_tree(tree, feature_names=feature_name, ax=ax)" + "_ = plot_tree(tree, feature_names=[feature_name], ax=ax)" ] }, { diff --git a/notebooks/trees_sol_01.ipynb b/notebooks/trees_sol_01.ipynb index 4f1672510..c126f23fa 100644 --- a/notebooks/trees_sol_01.ipynb +++ b/notebooks/trees_sol_01.ipynb @@ -131,7 +131,7 @@ "_ = plot_tree(\n", " tree,\n", " feature_names=culmen_columns,\n", - " class_names=tree.classes_,\n", + " class_names=tree.classes_.tolist(),\n", " impurity=False,\n", " ax=ax,\n", ")" From fa7e2dcd0b825a9a72c61100575558965e6732eb Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 22 Sep 2023 11:59:22 +0200 Subject: [PATCH 056/108] ENH Expand contents related to pedict_proba (#722) --- python_scripts/logistic_regression.py | 87 +++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 5 deletions(-) diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py index 0ea25a2a1..b595e97a2 100644 --- a/python_scripts/logistic_regression.py +++ b/python_scripts/logistic_regression.py @@ -155,9 +155,86 @@ # # which is the equation of a straight line. # -# ```{note} -# If you want to go further, try changing the `response_method` to -# `"predict_proba"` in the `DecisionBoundaryDisplay` above. Now the boundaries -# encode by color the estimated probability of belonging to either class, as -# mentioned in the introductory slides ๐ŸŽฅ Intuitions on linear models. +# ## (Estimated) predicted probabilities +# +# The `predict` method in classification models returns what we call a "hard +# class prediction", i.e. the most likely class a given data point would belong +# to. We can confirm the intuition given by the `DecisionBoundaryDisplay` by +# testing on a hypothetical `sample`: + +# %% +test_penguin = pd.DataFrame( + {"Culmen Length (mm)": [45], "Culmen Depth (mm)": [17]} +) +logistic_regression.predict(test_penguin) + +# %% [markdown] +# In this case, our logistic regression classifier predicts the Chinstrap +# species. Note that this agrees with the decision boundary plot above: the +# coordinates of this test data point match a location close to the decision +# boundary, in the red region. +# +# As mentioned in the introductory slides ๐ŸŽฅ **Intuitions on linear models**, +# one can alternatively use the `predict_proba` method to compute continuous +# values ("soft predictions") that correspond to an estimation of the confidence +# of the target belonging to each class. 
+

# %%
y_pred_proba = logistic_regression.predict_proba(test_penguin)
y_pred_proba

# %%
y_proba_sample = pd.Series(
    y_pred_proba.ravel(), index=logistic_regression.classes_
)
y_proba_sample.plot.bar()
plt.ylabel("Estimated probability")
_ = plt.title("Probability of the sample belonging to a penguin class")

# %% [markdown]
# Notice that the (estimated) predicted probabilities sum to one.
#
# ```{warning}
# We insist that the outputs of `predict_proba` are just estimations. Their
# reliability as estimates of the true conditional class-assignment
# probabilities depends on the quality of the model. Even classifiers with a
# high accuracy on a test set may be overconfident for some individuals and
# underconfident for others.
# ```
#
# Similarly to the hard decision boundary shown above, one can set the
# `response_method` to `"predict_proba"` in the `DecisionBoundaryDisplay` to
# rather show the confidence on individual classifications. In such a case the
# boundaries encode the estimated probabilities by color. In particular, when
# using [matplotlib diverging
# colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging)
# such as `"RdBu_r"`, the softer the color, the more unsure the model is about
# which class to choose (the probability of 0.5 is mapped to white).

# %%
DecisionBoundaryDisplay.from_estimator(
    logistic_regression,
    data_test,
    response_method="predict_proba",
    cmap="RdBu_r",
    alpha=0.5,
)
sns.scatterplot(
    data=penguins_test,
    x=culmen_columns[0],
    y=culmen_columns[1],
    hue=target_column,
    palette=["tab:red", "tab:blue"],
)
_ = plt.title("Predicted probability of the trained\n LogisticRegression")

# %% [markdown]
# The [scikit-learn user guide](
# https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression)
# gives a more precise description of the `predict_proba` method of the
# `LogisticRegression`. More detailed info can be found on Wikipedia about the
# normalization functions: [softmax
# function](https://en.wikipedia.org/wiki/Softmax_function) used by logistic
# regression on multi-class problems and the [logistic
# function](https://en.wikipedia.org/wiki/Logistic_function) used for binary
# classification problems. 
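# %% [markdown]
# As a quick, hedged illustration (an editor's sketch, not part of the original
# script): for a binary problem such as this one, the positive-class output of
# `predict_proba` can be recovered by applying the logistic function to the
# signed distance returned by `decision_function`. The names below reuse the
# `logistic_regression` pipeline and the `test_penguin` sample defined above.

# %%
import numpy as np

# signed distance of the sample to the decision boundary
decision_score = logistic_regression.decision_function(test_penguin)

# the logistic function maps this score to a probability in (0, 1); it matches
# logistic_regression.predict_proba(test_penguin)[:, 1]
probability_positive_class = 1 / (1 + np.exp(-decision_score))
probability_positive_class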
From 0ac64f7c4ce28eb9e90a72c3b8e8f17f17ccaea9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 25 Sep 2023 09:34:40 +0200 Subject: [PATCH 057/108] ENH new non-linear engineering for LogisticRegression (#711) --- jupyter-book/_toc.yml | 1 + ...dels_feature_engineering_classification.py | 457 ++++++++++++++++++ 2 files changed, 458 insertions(+) create mode 100644 python_scripts/linear_models_feature_engineering_classification.py diff --git a/jupyter-book/_toc.yml b/jupyter-book/_toc.yml index 80bb88aa3..ac643fa2f 100644 --- a/jupyter-book/_toc.yml +++ b/jupyter-book/_toc.yml @@ -102,6 +102,7 @@ parts: - file: python_scripts/linear_regression_non_linear_link - file: python_scripts/linear_models_ex_02 - file: python_scripts/linear_models_sol_02 + - file: python_scripts/linear_models_feature_engineering_classification.py - file: python_scripts/logistic_regression_non_linear - file: linear_models/linear_models_quiz_m4_03 - file: linear_models/linear_models_regularization_index diff --git a/python_scripts/linear_models_feature_engineering_classification.py b/python_scripts/linear_models_feature_engineering_classification.py new file mode 100644 index 000000000..9fd203f34 --- /dev/null +++ b/python_scripts/linear_models_feature_engineering_classification.py @@ -0,0 +1,457 @@ +# --- +# jupyter: +# kernelspec: +# display_name: Python 3 +# name: python3 +# --- + +# %% [markdown] +# +# # Non-linear feature engineering for Logistic Regression +# +# In the slides at the beginning of the module we mentioned that linear +# classification models are not suited to non-linearly separable data. +# Nevertheless, one can still use feature engineering as previously done for +# regression models to overcome this issue. To do so, we use non-linear +# transformations that typically map the original feature space into a higher +# dimension space, where the linear model can separate the data more easily. +# +# Let us illustrate this on three synthetic datasets. Each dataset has two +# original features and two classes to make it easy to visualize. The first +# dataset is called the "moons" dataset as the data points from each class are +# shaped as a crescent moon: + +# %% +import numpy as np +import pandas as pd +from sklearn.datasets import make_moons + +feature_names = ["Feature #0", "Feature #1"] +target_name = "class" + +X, y = make_moons(n_samples=100, noise=0.13, random_state=42) + +# We store both the data and target in a dataframe to ease plotting +moons = pd.DataFrame( + np.concatenate([X, y[:, np.newaxis]], axis=1), + columns=feature_names + [target_name], +) +data_moons, target_moons = moons[feature_names], moons[target_name] + +# %% [markdown] +# +# The second dataset is called the "Gaussian quantiles" dataset as all data +# points are sampled from a 2D Gaussian distribution regardless of the class. +# The points closest to the center are assigned to the class 1 while the points +# in the outer edges are assigned to the class 0, resulting in concentric +# circles. 
+ +# %% +from sklearn.datasets import make_gaussian_quantiles + +X, y = make_gaussian_quantiles( + n_samples=100, n_features=2, n_classes=2, random_state=42 +) +gauss = pd.DataFrame( + np.concatenate([X, y[:, np.newaxis]], axis=1), + columns=feature_names + [target_name], +) +data_gauss, target_gauss = gauss[feature_names], gauss[target_name] + +# %% [markdown] +# +# The third dataset is called the "XOR" dataset as the data points are sampled +# from a uniform distribution in a 2D space and the class is defined by the +# Exclusive OR (XOR) operation on the two features: the target class is 1 if +# only one of the two features is greater than 0. The target class is 0 +# otherwise. + +# %% +xor = pd.DataFrame( + np.random.RandomState(0).uniform(low=-1, high=1, size=(200, 2)), + columns=feature_names, +) +target_xor = np.logical_xor(xor["Feature #0"] > 0, xor["Feature #1"] > 0) +target_xor = target_xor.astype(np.int32) +xor["class"] = target_xor +data_xor = xor[feature_names] + +# %% [markdown] +# +# We use matplotlib to visualize all the datasets at a glance: + +# %% +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap + + +_, axs = plt.subplots(ncols=3, figsize=(14, 4), constrained_layout=True) + +common_scatter_plot_params = dict( + cmap=ListedColormap(["tab:red", "tab:blue"]), + edgecolor="white", + linewidth=1, +) + +axs[0].scatter( + data_moons[feature_names[0]], + data_moons[feature_names[1]], + c=target_moons, + **common_scatter_plot_params, +) +axs[1].scatter( + data_gauss[feature_names[0]], + data_gauss[feature_names[1]], + c=target_gauss, + **common_scatter_plot_params, +) +axs[2].scatter( + data_xor[feature_names[0]], + data_xor[feature_names[1]], + c=target_xor, + **common_scatter_plot_params, +) +axs[0].set( + title="The moons dataset", + xlabel=feature_names[0], + ylabel=feature_names[1], +) +axs[1].set( + title="The Gaussian quantiles dataset", + xlabel=feature_names[0], +) +axs[2].set( + title="The XOR dataset", + xlabel=feature_names[0], +) + + +# %% [markdown] +# +# We intuitively observe that there is no (single) straight line that can +# separate the two classes in any of the datasets. We can confirm this by +# fitting a linear model, such as a logistic regression, to each dataset and +# plot the decision boundary of the model. +# +# Let's first define a function to help us fit a given model and plot its +# decision boundary on the previous datasets at a glance: + +# %% +from sklearn.inspection import DecisionBoundaryDisplay + + +def plot_decision_boundary(model, title=None): + datasets = [ + (data_moons, target_moons), + (data_gauss, target_gauss), + (data_xor, target_xor), + ] + fig, axs = plt.subplots( + ncols=3, + figsize=(14, 4), + constrained_layout=True, + ) + + for i, ax, (data, target) in zip( + range(len(datasets)), + axs, + datasets, + ): + model.fit(data, target) + DecisionBoundaryDisplay.from_estimator( + model, + data, + response_method="predict_proba", + plot_method="pcolormesh", + cmap="RdBu", + alpha=0.8, + # Setting vmin and vmax to the extreme values of the probability to + # ensure that 0.5 is mapped to white (the middle) of the blue-red + # colormap. 
+ vmin=0, + vmax=1, + ax=ax, + ) + DecisionBoundaryDisplay.from_estimator( + model, + data, + response_method="predict_proba", + plot_method="contour", + alpha=0.8, + levels=[0.5], # 0.5 probability contour line + linestyles="--", + linewidths=2, + ax=ax, + ) + ax.scatter( + data[feature_names[0]], + data[feature_names[1]], + c=target, + **common_scatter_plot_params, + ) + if i > 0: + ax.set_ylabel(None) + if title is not None: + fig.suptitle(title) + + +# %% [markdown] +# +# Now let's define our logistic regression model and plot its decision boundary +# on the three datasets: + +# %% +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression + +logistic_regression = make_pipeline(StandardScaler(), LogisticRegression()) +logistic_regression + +# %% +plot_decision_boundary(logistic_regression, title="Linear classifier") + +# %% [markdown] +# +# This confirms that it is not possible to separate the two classes with a +# linear model. On each plot we see a **significant number of misclassified +# samples on the training set**! The three plots show typical cases of +# **underfitting** for linear models. +# +# Also, the last two plots show soft colors, meaning that the model is highly +# unsure about which class to choose. + +# %% [markdown] +# +# ## Engineering non-linear features +# +# As we did for the linear regression models, we now attempt to build a more +# expressive machine learning pipeline by leveraging non-linear feature +# engineering, with techniques such as binning, splines, polynomial features, +# and kernel approximation. +# +# Let's start with the binning transformation of the features: + +# %% +from sklearn.preprocessing import KBinsDiscretizer + +classifier = make_pipeline(KBinsDiscretizer(n_bins=5), LogisticRegression()) +classifier + +# %% +plot_decision_boundary(classifier, title="Binning classifier") + +# %% [markdown] +# +# We can see that the resulting decision boundary is constrained to follow +# **axis-aligned segments**, which is very similar to what a decision tree would +# do as we will see in the next Module. Furthermore, as for decision trees, the +# model makes piecewise constant predictions within each rectangular region. +# +# This axis-aligned decision boundary is not necessarily the natural decision +# boundary a human would have intuitively drawn for the moons dataset and the +# Gaussian quantiles datasets. It still makes it possible for the model to +# successfully separate the data. However, binning alone does not help the +# classifier separate the data for the XOR dataset. This is because **the +# binning transformation is a feature-wise transformation** and thus **cannot +# capture interactions** between features that are necessary to separate the +# XOR dataset. +# +# Let's now consider a **spline** transformation of the original features. This +# transformation can be considered a **smooth version of the binning +# transformation**. You can find more details in the [scikit-learn user guide]( +# https://scikit-learn.org/stable/modules/preprocessing.html#spline-transformer). 
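+
+# %% [markdown]
+#
+# Before fitting the spline pipeline, here is a small added illustration of
+# what such feature-wise expansions produce. With the default one-hot
+# encoding, we expect `KBinsDiscretizer(n_bins=5)` to output 5 indicator
+# columns per original feature, that is a (100, 10) matrix for the
+# two-feature moons data:
+
+# %%
+from sklearn.preprocessing import KBinsDiscretizer
+
+KBinsDiscretizer(n_bins=5).fit_transform(data_moons).shape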
+# %%
+from sklearn.preprocessing import SplineTransformer
+
+classifier = make_pipeline(
+    SplineTransformer(degree=3, n_knots=5),
+    LogisticRegression(),
+)
+classifier
+
+# %%
+plot_decision_boundary(classifier, title="Spline classifier")
+
+# %% [markdown]
+#
+# We can see that the decision boundary is now smooth, and while it favors
+# axis-aligned decision rules when extrapolating in low density regions, it can
+# adopt a more curvy decision boundary in the high density regions.
+#
+# Note, however, that the number of knots is a hyperparameter that needs to be
+# tuned. If we use too few knots, the model would underfit the data, as shown on
+# the moons dataset. If we use too many knots, the model would overfit the data.
+#
+# However, as for the binning transformation, the model still fails to separate
+# the data for the XOR dataset, irrespective of the number of knots, for the
+# same reasons: **the spline transformation is a feature-wise transformation**
+# and thus **cannot capture interactions** between features.

+# %% [markdown]
+#
+# ## Modeling non-additive feature interactions
+#
+# We now consider feature engineering techniques that non-linearly combine the
+# original features in the hope of capturing interactions between them. We will
+# consider polynomial features and kernel approximation.
+#
+# Let's start with the polynomial features:

+# %%
+from sklearn.preprocessing import PolynomialFeatures
+
+classifier = make_pipeline(
+    StandardScaler(),
+    PolynomialFeatures(degree=3, include_bias=False),
+    LogisticRegression(C=10),
+)
+classifier

+# %%
+plot_decision_boundary(classifier, title="Polynomial classifier")

+# %% [markdown]
+#
+# We can see that the decision boundary of this polynomial classifier is
+# **smooth** and can successfully separate the data on all three datasets
+# (depending on how we set the values of the `degree` and `C`
+# hyperparameters).
+#
+# It is interesting to observe that this model extrapolates very differently
+# from the previous models: its decision boundary can take a diagonal
+# direction. Furthermore, we can observe that predictions are very confident in
+# the low density regions of the feature space, even very close to the decision
+# boundary.
+#
+# We can obtain very similar results by using a kernel approximation technique
+# such as the Nyström method with a polynomial kernel:

+# %%
+from sklearn.kernel_approximation import Nystroem
+
+classifier = make_pipeline(
+    StandardScaler(),
+    Nystroem(kernel="poly", degree=3, coef0=1, n_components=100),
+    LogisticRegression(C=10),
+)
+classifier
+# %%
+plot_decision_boundary(classifier, title="Polynomial Nystroem classifier")

+# %% [markdown]
+#
+# The polynomial kernel approach would be interesting in cases where the
+# original feature space is already of high dimension: in these cases,
+# **computing the complete polynomial expansion** with `PolynomialFeatures`
+# could be **intractable**, while the Nyström method can control the output
+# dimensionality with the `n_components` parameter.
+#
+# Let's now explore the use of a radial basis function (RBF) kernel:

+# %%
+from sklearn.kernel_approximation import Nystroem
+
+classifier = make_pipeline(
+    StandardScaler(),
+    Nystroem(kernel="rbf", gamma=1, n_components=100),
+    LogisticRegression(C=5),
+)
+classifier
+# %%
+plot_decision_boundary(classifier, title="RBF Nystroem classifier")

+# %% [markdown]
+#
+# The resulting decision boundary is **smooth** and can successfully separate
+# the classes for all three datasets.
Furthermore, the model extrapolates very
+# differently: in particular, it tends to be **much less confident in its
+# predictions in the low density regions** of the feature space.
+#
+# As for the previous polynomial pipelines, this pipeline **does not favor
+# axis-aligned decision rules**. It can be shown mathematically that the
+# [inductive bias](https://en.wikipedia.org/wiki/Inductive_bias) of our RBF
+# pipeline is actually rotationally invariant.

+# %% [markdown]
+#
+# ## Multi-step feature engineering
+#
+# It is possible to combine several feature engineering transformers in a
+# single pipeline to blend their respective inductive biases. For instance, we
+# can combine the binning transformation with a kernel approximation:

+# %%
+classifier = make_pipeline(
+    KBinsDiscretizer(n_bins=5),
+    Nystroem(kernel="rbf", gamma=1.0, n_components=100),
+    LogisticRegression(),
+)
+classifier
+# %%
+plot_decision_boundary(classifier, title="Binning + Nystroem classifier")

+# %% [markdown]
+#
+# It is interesting to observe that this model is still piecewise constant with
+# axis-aligned decision boundaries everywhere, but it can now successfully deal
+# with the XOR problem thanks to the second step of the pipeline that can
+# model the interactions between the features transformed by the first step.
+#
+# We can also combine the spline transformation with a kernel approximation:

+# %%
+from sklearn.kernel_approximation import Nystroem
+
+classifier = make_pipeline(
+    SplineTransformer(n_knots=5),
+    Nystroem(kernel="rbf", gamma=1.0, n_components=100),
+    LogisticRegression(),
+)
+classifier

+# %%
+plot_decision_boundary(classifier, title="Spline + RBF Nystroem classifier")

+# %% [markdown]
+#
+# The decision boundary of this pipeline is smooth, but with axis-aligned
+# extrapolation.
+#
+# Depending on the task, this can be considered an advantage or a drawback.

+# %% [markdown]
+#
+# ## Summary and take-away messages
+#
+# - Linear models such as logistic regression can be used for classification on
+#   non-linearly separable datasets by leveraging non-linear feature
+#   engineering.
+# - Transformers such as `KBinsDiscretizer` and `SplineTransformer` can be used
+#   to engineer non-linear features independently for each original feature.
+# - As a result, these transformers cannot capture interactions between the
+#   original features (and would therefore fail on the XOR classification task).
+# - Despite this limitation, they already augment the expressivity of the
+#   pipeline, which can be sufficient for some datasets.
+# - They also favor axis-aligned decision boundaries, in particular in the low
+#   density regions of the feature space (axis-aligned extrapolation).
+# - Transformers such as `PolynomialFeatures` and `Nystroem` can be used to
+#   engineer non-linear features that capture interactions between the original
+#   features.
+# - It can be useful to combine several feature engineering transformers in a
+#   single pipeline to build a more expressive model, for instance to favor
+#   axis-aligned extrapolation while also capturing interactions.
+# - In particular, if the original dataset has both numerical and categorical
+#   features, it can be useful to apply binning or a spline transformation to the
+#   numerical features and one-hot encoding to the categorical features. Then,
+#   the resulting features can be combined with a kernel approximation to model
+#   interactions between numerical and categorical features. This can be
+#   achieved with the help of `ColumnTransformer`, as sketched in the added
+#   example right below.
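+
+# %% [markdown]
+#
+# The following cell gives a minimal sketch of this last mixed-type pattern.
+# It is an added illustration rather than part of the datasets studied above:
+# the column names `"age"` (numerical) and `"city"` (categorical) are
+# hypothetical placeholders for the columns of some input dataframe.
+
+# %%
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import OneHotEncoder
+
+# Hypothetical columns: "age" is numerical and "city" is categorical.
+mixed_preprocessor = ColumnTransformer(
+    [
+        ("num", SplineTransformer(n_knots=5), ["age"]),
+        ("cat", OneHotEncoder(handle_unknown="ignore"), ["city"]),
+    ]
+)
+mixed_classifier = make_pipeline(
+    mixed_preprocessor,
+    Nystroem(kernel="rbf", gamma=1.0, n_components=100),
+    LogisticRegression(),
+)
+mixed_classifier
+
+# %% [markdown]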
+#
+# In subsequent notebooks and exercises, we will further explore the interplay
+# between regularization, feature engineering, and the underfitting /
+# overfitting trade-off.
+#
+# But first we will do an exercise to illustrate the relationship between the
+# Nyström kernel approximation and support vector machines.

From fc7700483afa45f1a82922e3a38fb8844613a29b Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 5 Oct 2023 14:22:55 +0200
Subject: [PATCH 058/108] MAINT Fix pandas version to 2.0 to avoid seaborn
 warning (#727)

* MAINT Fix pandas version to avoid seaborn warning

* Iter

---------

Co-authored-by: ArturoAmorQ
---
 environment-dev.yml           | 2 +-
 environment.yml               | 2 +-
 local-install-instructions.md | 2 +-
 requirements-dev.txt          | 2 +-
 requirements.txt              | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/environment-dev.yml b/environment-dev.yml
index c465dd72b..260ae54a8 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
 dependencies:
   - scikit-learn >= 1.3
-  - pandas >= 1
+  - pandas == 2.0 # avoid seaborn warning
   - matplotlib-base
   - seaborn
   - plotly >= 5.10
diff --git a/environment.yml b/environment.yml
index 5dd5dfe4e..6b9c4fcfa 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,7 +5,7 @@ channels:
 
 dependencies:
   - scikit-learn >= 1.3
-  - pandas >= 1
+  - pandas == 2.0 # avoid seaborn warning
   - matplotlib-base
   - seaborn
   - jupyterlab
diff --git a/local-install-instructions.md b/local-install-instructions.md
index 2085c5db1..4cd7cffd1 100644
--- a/local-install-instructions.md
+++ b/local-install-instructions.md
@@ -47,7 +47,7 @@ Using python in /home/lesteve/miniconda3/envs/scikit-learn-course
 [ OK ] scipy version 1.6.0
 [ OK ] matplotlib version 3.3.3
 [ OK ] sklearn version 1.3
-[ OK ] pandas version 1.2.0
+[ OK ] pandas version 2.0
 [ OK ] seaborn version 0.11.1
 [ OK ] notebook version 6.2.0
 [ OK ] plotly version 5.10.0
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 33232cdc2..77caf5786 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,5 @@
 scikit-learn>=1.3
-pandas>=1
+pandas==2.0 # avoid seaborn warning
 matplotlib
 seaborn
 plotly
diff --git a/requirements.txt b/requirements.txt
index e06a8123d..6e006d442 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 scikit-learn>=1.3
-pandas>=1
+pandas==2.0 # avoid seaborn warning
 matplotlib
 seaborn
 plotly

From 2128862e94f9fc63f316f1264d0a41a900426788 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 5 Oct 2023 14:23:51 +0200
Subject: [PATCH 059/108] Update dates for FUN session (#726)

Co-authored-by: ArturoAmorQ
---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index dc52e5b95..7e8318e17 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
 📢 📢 📢 A new session of the [Machine learning in Python with
 scikit-learn MOOC](https://www.fun-mooc.fr/en/courses/machine-learning-python-scikit-learn),
-is available starting on October 18, 2022 and will last for 3 months. Enroll for
-the full MOOC experience (quizz solutions, executable notebooks, discussion
-forum, etc ...) !
+is available starting on November 8th, 2023 and will remain open in self-paced
+mode. Enroll for the full MOOC experience (quizz solutions, executable
+notebooks, discussion forum, etc ...) !
The MOOC is free and hosted on the [FUN-MOOC](https://fun-mooc.fr/) platform which does not use the student data for any other purpose than improving the From 35934dc4e9fc4791e7fc99c4a416009affca26ef Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 6 Oct 2023 15:42:40 +0200 Subject: [PATCH 060/108] Synchronize quizzes related to predict_proba (#728) Co-authored-by: ArturoAmorQ --- .../linear_models/linear_models_quiz_m4_01.md | 2 +- .../linear_models/linear_models_quiz_m4_02.md | 26 +++++++++++++++++++ .../linear_models/linear_models_quiz_m4_03.md | 13 ++++++++++ .../linear_models/linear_models_quiz_m4_05.md | 14 +++++++++- 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_01.md b/jupyter-book/linear_models/linear_models_quiz_m4_01.md index 6539f5b9d..3023edf04 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_01.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_01.md @@ -17,7 +17,7 @@ _Select a single answer_ ```{admonition} Question Is it possible to get a perfect fit (zero prediction error on the training set) -with a linear classifier on a non-linearly separable dataset? +with a linear classifier by itself on a non-linearly separable dataset? - a) yes - b) no diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_02.md b/jupyter-book/linear_models/linear_models_quiz_m4_02.md index 883e9f167..e0c5f6c9d 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_02.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_02.md @@ -36,3 +36,29 @@ The decision boundaries of a logistic regression model: _Select a single answer_ ``` + ++++ + +```{admonition} Question +For a binary classification task, what is the shape of the array returned by the +`predict_proba` method for 10 input samples? + +- a) (10,) +- b) (10, 2) +- c) (2, 10) + +_Select a single answer_ +``` + ++++ + +```{admonition} Question +In logistic regression's `predict_proba` method in scikit-learn, which of the +following statements is true regarding the predicted probabilities? + +- a) The sum of probabilities across different classes for a given sample is always equal to 1.0. +- b) The sum of probabilities across all samples for a given class is always equal to 1.0. +- c) The sum of probabilities across all features for a given class is always equal to 1.0. + +_Select a single answer_ +``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_03.md b/jupyter-book/linear_models/linear_models_quiz_m4_03.md index 1e852f362..eb458ad46 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_03.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_03.md @@ -26,3 +26,16 @@ and `intercept_`? 
 _Select a single answer_
 ```
+
++++
+
+```{admonition} Question
+Combining (one or more) feature engineering transformers in a single pipeline:
+
+- a) increases the expressivity of the model
+- b) ensures that models extrapolate accurately regardless of the data distribution
+- c) may require tuning additional hyperparameters
+- d) inherently prevents any underfitting
+
+_Select all answers that apply_
+```
diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_05.md b/jupyter-book/linear_models/linear_models_quiz_m4_05.md
index fbddaddf8..1fe12883c 100644
--- a/jupyter-book/linear_models/linear_models_quiz_m4_05.md
+++ b/jupyter-book/linear_models/linear_models_quiz_m4_05.md
@@ -5,7 +5,7 @@ By default, a [`LogisticRegression`](https://scikit-learn.org/stable/modules/gen
 
 - a) no penalty
 - b) a penalty that shrinks the magnitude of the weights towards zero (also
   called "l2 penalty")
-- c) a penalty that sets some weights exactly to zero (also called "l1 penalty")
+- c) a penalty that ensures all weights are equal
 
 _Select a single answer_
 ```
@@ -21,3 +21,15 @@ The parameter `C` in a logistic regression is:
 
 _Select a single answer_
 ```
+
++++
+
+```{admonition} Question
+In logistic regression, increasing the regularization strength makes the model:
+
+- a) more likely to overfit to the training data
+- b) more flexible, fitting closely to the training data
+- c) less complex, potentially underfitting the training data
+
+_Select a single answer_
+```

From e08a74e963a616d0b0526e1a7b26d2c436bbf8be Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 6 Oct 2023 15:44:32 +0200
Subject: [PATCH 061/108] MAINT Improve wording in linear regression intro
 (#725)

Co-authored-by: ArturoAmorQ
---
 .../linear_regression_without_sklearn.py      | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/python_scripts/linear_regression_without_sklearn.py b/python_scripts/linear_regression_without_sklearn.py
index 3ce72bc2d..acc06a0ec 100644
--- a/python_scripts/linear_regression_without_sklearn.py
+++ b/python_scripts/linear_regression_without_sklearn.py
@@ -9,8 +9,8 @@
 # # Linear regression without scikit-learn
 #
 # In this notebook, we introduce linear regression. Before presenting the
-# available scikit-learn classes, we will provide some insights with a simple
-# example. We will use a dataset that contains measurements taken on penguins.
+# available scikit-learn classes, here we provide some insights with a simple
+# example. We use a dataset that contains measurements taken on penguins.
 
 # %% [markdown]
 # ```{note}
@@ -25,8 +25,8 @@
 penguins.head()
 
 # %% [markdown]
-# We will formulate the following problem: using the flipper length of a
-# penguin, we would like to infer its mass.
+# We aim to solve the following problem: using the flipper length of a penguin,
+# we would like to infer its mass.
 
 # %%
 import seaborn as sns
@@ -72,8 +72,8 @@ def linear_model_flipper_mass(
 
 # %% [markdown]
 # Using the model we defined above, we can check the body mass values predicted
-# for a range of flipper lengths. We will set `weight_flipper_length` to be 45
-# and `intercept_body_mass` to be -5000.
+# for a range of flipper lengths. We set `weight_flipper_length` and
+# `intercept_body_mass` to arbitrary values of 45 and -5000, respectively.
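+#
+# As a quick arithmetic check of these arbitrary values (an added aside): a
+# 200 mm flipper would be mapped to 45 * 200 - 5000 = 4000 g, which is at
+# least the right order of magnitude for a penguin body mass.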
 # %%
 import numpy as np
 
@@ -101,7 +101,7 @@ def linear_model_flipper_mass(
 # %% [markdown]
 # The variable `weight_flipper_length` is a weight applied to the feature
 # `flipper_length` in order to make the inference. When this coefficient is
-# positive, it means that penguins with longer flipper lengths will have larger
+# positive, it means that penguins with longer flipper lengths have larger
 # body masses. If the coefficient is negative, it means that penguins with
 # shorter flipper lengths have larger body masses. Graphically, this coefficient
 # is represented by the slope of the curve in the plot. Below we show what the
@@ -129,7 +129,7 @@ def linear_model_flipper_mass(
 # %% [markdown]
 # In our case, this coefficient has a meaningful unit: g/mm. For instance, a
 # coefficient of 40 g/mm, means that for each additional millimeter in flipper
-# length, the body weight predicted will increase by 40 g.
+# length, the body weight predicted increases by 40 g.
 
 # %%
 body_mass_180 = linear_model_flipper_mass(
@@ -150,8 +150,8 @@ def linear_model_flipper_mass(
 # This parameter corresponds to the value on the y-axis if `flipper_length=0`
 # (which in our case is only a mathematical consideration, as in our data, the
 # value of `flipper_length` only goes from 170mm to 230mm). This y-value when
-# x=0 is called the y-intercept. If `intercept_body_mass` is 0, the curve will
-# pass through the origin:
+# x=0 is called the y-intercept. If `intercept_body_mass` is 0, the curve passes
+# through the origin:
 
 # %%
 weight_flipper_length = 25
@@ -171,7 +171,7 @@ def linear_model_flipper_mass(
 _ = ax.set_title(label.format(weight_flipper_length, intercept_body_mass))
 
 # %% [markdown]
-# Otherwise, it will pass through the `intercept_body_mass` value:
+# Otherwise, it passes through the `intercept_body_mass` value:
 
 # %%
 weight_flipper_length = 45

From d62c5f0eb22a95171a86f9bca22d0cfa895a7c85 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 6 Oct 2023 15:48:54 +0200
Subject: [PATCH 062/108] ENH Remove boilerplate from non-linear regression
 notebook (#724)

* ENH Remove boilerplate from non-linear regression notebook

* Add preamble to building custom dataset

* Fix typo in plot title.

---------

Co-authored-by: ArturoAmorQ
Co-authored-by: Olivier Grisel
---
 .../linear_regression_non_linear_link.py      | 187 ++++++++----------
 1 file changed, 84 insertions(+), 103 deletions(-)

diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py
index 996e5fcce..ca88b8799 100644
--- a/python_scripts/linear_regression_non_linear_link.py
+++ b/python_scripts/linear_regression_non_linear_link.py
@@ -6,7 +6,7 @@
 # ---
 
 # %% [markdown]
-# # Linear regression for a non-linear features-target relationship
+# # Non-linear feature engineering for Linear Regression
 #
 # In this notebook, we show that even if linear models are not natively adapted
 # to express a `target` that is not a linear function of the `data`, it is still
@@ -17,10 +17,10 @@
 # step followed by a linear regression step can therefore be considered a
 # non-linear regression model as a whole.
 #
-# ```{tip}
-# `np.random.RandomState` allows to create a random number generator which can
-# be later used to get deterministic results.
-# ```
+# On this occasion we are not loading a dataset, but creating our own custom
+# data consisting of a single feature. The target is built as a cubic polynomial
+# on said feature.
To make things a bit more challenging, we add some random +# fluctuations to the target. # %% import numpy as np @@ -36,10 +36,13 @@ target = data**3 - 0.5 * data**2 + noise # %% [markdown] -# ```{note} +# ```{tip} +# `np.random.RandomState` allows to create a random number generator which can +# be later used to get deterministic results. +# ``` +# # To ease the plotting, we create a pandas dataframe containing the data and # target: -# ``` # %% import pandas as pd @@ -54,8 +57,6 @@ ) # %% [markdown] -# We now observe the limitations of fitting a linear regression model. -# # ```{warning} # In scikit-learn, by convention `data` (also called `X` in the scikit-learn # documentation) should be a 2D matrix of shape `(n_samples, n_features)`. @@ -69,26 +70,40 @@ data = data.reshape((-1, 1)) data.shape +# %% [markdown] +# To avoid writing the same code in multiple places we define a helper function +# that fits, scores and plots the different regression models. + + +# %% +def fit_score_plot_regression(model, title=None): + model.fit(data, target) + target_predicted = model.predict(data) + mse = mean_squared_error(target, target_predicted) + ax = sns.scatterplot( + data=full_data, x="input_feature", y="target", color="black", alpha=0.5 + ) + ax.plot(data, target_predicted) + if title is not None: + _ = ax.set_title(title + f" (MSE = {mse:.2f})") + else: + _ = ax.set_title(f"Mean squared error = {mse:.2f}") + + +# %% [markdown] +# We now observe the limitations of fitting a linear regression model. + # %% from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error linear_regression = LinearRegression() -linear_regression.fit(data, target) -target_predicted = linear_regression.predict(data) +linear_regression # %% -mse = mean_squared_error(target, target_predicted) - -# %% -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 -) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") +fit_score_plot_regression(linear_regression, title="Simple linear regression") # %% [markdown] -# # Here the coefficient and intercept learnt by `LinearRegression` define the # best "straight line" that fits the data. We can inspect the coefficients using # the attributes of the model learnt as follows: @@ -100,10 +115,8 @@ ) # %% [markdown] -# It is important to note that the learnt model is not able to handle the -# non-linear relationship between `data` and `target` since linear models assume -# the relationship between `data` and `target` to be linear. -# +# Notice that the learnt model cannot handle the non-linear relationship between +# `data` and `target` because linear models assume a linear relationship. # Indeed, there are 3 possibilities to solve this issue: # # 1. 
choose a model that can natively deal with non-linearity, @@ -119,15 +132,10 @@ from sklearn.tree import DecisionTreeRegressor tree = DecisionTreeRegressor(max_depth=3).fit(data, target) -target_predicted = tree.predict(data) -mse = mean_squared_error(target, target_predicted) +tree # %% -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 -) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") +fit_score_plot_regression(tree, title="Decision tree regression") # %% [markdown] # Instead of having a model which can natively deal with non-linearity, we could @@ -147,68 +155,49 @@ data_expanded = np.concatenate([data, data**2, data**3], axis=1) data_expanded.shape - -# %% -linear_regression.fit(data_expanded, target) -target_predicted = linear_regression.predict(data_expanded) -mse = mean_squared_error(target, target_predicted) - -# %% -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 -) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") - # %% [markdown] -# We can see that even with a linear model, we can overcome the linearity -# limitation of the model by adding the non-linear components in the design of -# additional features. Here, we created new features by knowing the way the -# target was generated. -# # Instead of manually creating such polynomial features one could directly use # [sklearn.preprocessing.PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html). -# -# To demonstrate the use of the `PolynomialFeatures` class, we use a -# scikit-learn pipeline which first transforms the features and then fit the -# regression model. # %% -from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures -polynomial_regression = make_pipeline( - PolynomialFeatures(degree=3, include_bias=False), - LinearRegression(), -) -polynomial_regression.fit(data, target) -target_predicted = polynomial_regression.predict(data) -mse = mean_squared_error(target, target_predicted) +polynomial_expansion = PolynomialFeatures(degree=3, include_bias=False) # %% [markdown] # In the previous cell we had to set `include_bias=False` as otherwise we would -# create a column perfectly correlated to the `intercept_` introduced by the -# `LinearRegression`. We can verify that this procedure is equivalent to +# create a constant feature perfectly correlated to the `intercept_` introduced +# by the `LinearRegression`. We can verify that this procedure is equivalent to # creating the features by hand up to numerical error by computing the maximum # of the absolute values of the differences between the features generated by # both methods and checking that it is close to zero: -# %% -np.abs(polynomial_regression[0].fit_transform(data) - data_expanded).max() +np.abs(polynomial_expansion.fit_transform(data) - data_expanded).max() # %% [markdown] -# Then it should not be surprising that the predictions of the -# `PolynomialFeatures` pipeline match the predictions of the linear model fit on -# manually engineered features. +# To demonstrate the use of the `PolynomialFeatures` class, we use a +# scikit-learn pipeline which first transforms the features and then fit the +# regression model. 
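+#
+# Before that, a short added aside: once fitted, the transformer can be asked
+# which terms it generated. For a single input feature and `degree=3` we
+# expect the original feature together with its square and its cube:
+
+# %%
+polynomial_expansion.get_feature_names_out()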
# %% -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures + +polynomial_regression = make_pipeline( + PolynomialFeatures(degree=3, include_bias=False), + LinearRegression(), ) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") +polynomial_regression + +# %% +fit_score_plot_regression(polynomial_regression, title="Polynomial regression") # %% [markdown] +# We can see that even with a linear model, we can overcome the linearity +# limitation of the model by adding the non-linear components in the design of +# additional features. Here, we created new features by knowing the way the +# target was generated. +# # The last possibility is to make a linear model more expressive is to use a # "kernel". Instead of learning one weight per feature as we previously did, a # weight is assigned to each sample. However, not all samples are used: some @@ -231,16 +220,10 @@ from sklearn.svm import SVR svr = SVR(kernel="linear") -svr.fit(data, target) -target_predicted = svr.predict(data) -mse = mean_squared_error(target, target_predicted) +svr # %% -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 -) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") +fit_score_plot_regression(svr, title="Linear support vector machine") # %% [markdown] # The predictions of our SVR with a linear kernel are all aligned on a straight @@ -256,16 +239,10 @@ # %% svr = SVR(kernel="poly", degree=3) -svr.fit(data, target) -target_predicted = svr.predict(data) -mse = mean_squared_error(target, target_predicted) +svr # %% -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 -) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") +fit_score_plot_regression(svr, title="Polynomial support vector machine") # %% [markdown] # Kernel methods such as SVR are very efficient for small to medium datasets. @@ -276,7 +253,7 @@ # as # [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html) # or -# [Nystroem](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html). +# [SplineTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.SplineTransformer.html). # # Here again we refer the interested reader to the documentation to get a proper # definition of those methods. 
The following just gives an intuitive overview of @@ -289,15 +266,22 @@ KBinsDiscretizer(n_bins=8), LinearRegression(), ) -binned_regression.fit(data, target) -target_predicted = binned_regression.predict(data) -mse = mean_squared_error(target, target_predicted) +binned_regression -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +# %% +fit_score_plot_regression(binned_regression, title="Binned regression") + +# %% +from sklearn.preprocessing import SplineTransformer + +spline_regression = make_pipeline( + SplineTransformer(degree=3, include_bias=False), + LinearRegression(), ) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") +spline_regression + +# %% +fit_score_plot_regression(spline_regression, title="Spline regression") # %% [markdown] # `Nystroem` is a nice alternative to `PolynomialFeatures` that makes it @@ -312,15 +296,12 @@ Nystroem(kernel="poly", degree=3, n_components=5, random_state=0), LinearRegression(), ) -nystroem_regression.fit(data, target) -target_predicted = nystroem_regression.predict(data) -mse = mean_squared_error(target, target_predicted) +nystroem_regression -ax = sns.scatterplot( - data=full_data, x="input_feature", y="target", color="black", alpha=0.5 +# %% +fit_score_plot_regression( + nystroem_regression, title="Polynomial Nystroem regression" ) -ax.plot(data, target_predicted) -_ = ax.set_title(f"Mean squared error = {mse:.2f}") # %% [markdown] # ## Notebook Recap From 5d00d2f559d71bc7ccd4beee0c76b2f06e1e33c1 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:47:02 +0200 Subject: [PATCH 063/108] Iter on contents related to predict_proba (#723) --- python_scripts/logistic_regression.py | 96 ++++++++++++++++++--------- 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/python_scripts/logistic_regression.py b/python_scripts/logistic_regression.py index b595e97a2..59f9045a5 100644 --- a/python_scripts/logistic_regression.py +++ b/python_scripts/logistic_regression.py @@ -6,7 +6,7 @@ # --- # %% [markdown] -# # Linear model for classification +# # Linear models for classification # # In regression, we saw that the target to be predicted is a continuous # variable. In classification, the target is discrete (e.g. categorical). @@ -121,26 +121,8 @@ _ = plt.title("Decision boundary of the trained\n LogisticRegression") # %% [markdown] -# Thus, we see that our decision function is represented by a line separating -# the 2 classes. -# -# Since the line is oblique, it means that we used a combination of both -# features: - -# %% -coefs = logistic_regression[-1].coef_[0] # the coefficients is a 2d array -weights = pd.Series(coefs, index=culmen_columns) - -# %% -weights.plot.barh() -_ = plt.title("Weights of the logistic regression") - -# %% [markdown] -# Indeed, both coefficients are non-null. If one of them had been zero, the -# decision boundary would have been either horizontal or vertical. -# -# Furthermore the intercept is also non-zero, which means that the decision does -# not go through the point with (0, 0) coordinates. +# Thus, we see that our decision function is represented by a straight line +# separating the 2 classes. # # For the mathematically inclined reader, the equation of the decision boundary # is: @@ -155,6 +137,36 @@ # # which is the equation of a straight line. 
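+#
+# As a concrete reading of this equation (an added aside with hypothetical
+# numbers): if the fitted model had `coef_ = [a, b]` and `intercept_ = c`,
+# the boundary would be the set of points where a * x0 + b * x1 + c = 0,
+# that is the straight line x1 = -(a / b) * x0 - c / b whenever b is non-zero.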
 #
+# Since the line is oblique, it means that both coefficients (also called
+# weights) are non-null:
+
+# %%
+coefs = logistic_regression[-1].coef_[0]
+weights = pd.Series(coefs, index=[f"Weight for '{c}'" for c in culmen_columns])
+weights
+
+# %% [markdown]
+# You can [access pipeline
+# steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)
+# by name or position. In the code above `logistic_regression[-1]` means the
+# last step of the pipeline. Then you can access the attributes of that step such
+# as `coef_`. Notice also that the `coef_` attribute is an array of shape (1,
+# `n_features`) and then we access it via its first entry. Alternatively one
+# could use `coef_.ravel()`.
+#
+# We are now ready to visualize the weight values as a barplot:
+
+# %%
+weights.plot.barh()
+_ = plt.title("Weights of the logistic regression")
+
+# %% [markdown]
+# If one of the weights had been zero, the decision boundary would have been
+# either horizontal or vertical.
+#
+# Furthermore the intercept is also non-zero, which means that the decision does
+# not go through the point with (0, 0) coordinates.
+#
 # ## (Estimated) predicted probabilities
 #
 # The `predict` method in classification models returns what we call a "hard
@@ -178,11 +190,29 @@
 # one can alternatively use the `predict_proba` method to compute continuous
 # values ("soft predictions") that correspond to an estimation of the confidence
 # of the target belonging to each class.
+#
+# For a binary classification scenario, the logistic regression makes both hard
+# and soft predictions based on the [logistic
+# function](https://en.wikipedia.org/wiki/Logistic_function) (also called
+# sigmoid function), which is S-shaped and maps any input into a value between 0
+# and 1.
 
 # %%
 y_pred_proba = logistic_regression.predict_proba(test_penguin)
 y_pred_proba
 
+# %% [markdown]
+# More generally, the output of `predict_proba` is an array of shape
+# (`n_samples`, `n_classes`):
+
+# %%
+y_pred_proba.shape
+
+# %% [markdown]
+# Also notice that the sum of (estimated) predicted probabilities across classes
+# is 1.0 for each given sample. We can visualize them for our `test_penguin` as
+# follows:
+
 # %%
 y_proba_sample = pd.Series(
     y_pred_proba.ravel(), index=logistic_regression.classes_
@@ -192,8 +222,6 @@
 _ = plt.title("Probability of the sample belonging to a penguin class")
 
 # %% [markdown]
-# Notice that the (estimated) predicted probabilities sum to one.
-#
 # ```{warning}
 # We insist that the output of `predict_proba` are just estimations. Their
 # reliability on being a good estimate of the true conditional class-assignment
@@ -209,7 +237,12 @@
 # using [matplotlib diverging
 # colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging)
 # such as `"RdBu_r"`, the softer the color, the more unsure about which class to
-# choose (the probability of 0.5 is mapped to white).
+# choose (the probability of 0.5 is mapped to white color).
+#
+# Equivalently, towards the tails of the curve the sigmoid function approaches
+# its asymptotic values of 0 or 1, which are mapped to darker colors. Indeed,
+# the closer the predicted probability is to 0 or 1, the more confident the
+# classifier is in its predictions.
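+
+# %% [markdown]
+# As a small numerical illustration of this saturation effect (added here for
+# clarity, not part of the original notebook), we can evaluate the logistic
+# function on a few raw scores:
+
+# %%
+import numpy as np
+
+scores = np.array([-5.0, -1.0, 0.0, 1.0, 5.0])
+1 / (1 + np.exp(-scores))  # approximately [0.007, 0.269, 0.5, 0.731, 0.993]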
 # %%
 DecisionBoundaryDisplay.from_estimator(
@@ -229,12 +262,11 @@
 _ = plt.title("Predicted probability of the trained\n LogisticRegression")
 
 # %% [markdown]
-# The [scikit-learn user guide](
-# https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression)
-# gives a more precise description of the `predict_proba` method of the
-# `LogisticRegression`. More detailed info can be found on Wikipedia about the
-# normalization functions: [softmax
-# function](https://en.wikipedia.org/wiki/Softmax_function) used by logistic
-# regression on multi-class problems and the [logistic
-# function](https://en.wikipedia.org/wiki/Logistic_function) used for binary
-# classifications problems.
+# For multi-class classification the logistic regression uses the [softmax
+# function](https://en.wikipedia.org/wiki/Softmax_function) to make predictions.
+# Giving more details on that scenario is beyond the scope of this MOOC.
+#
+# In any case, interested users are referred to the [scikit-learn user guide](
+# https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression)
+# for a more mathematical description of the `predict_proba` method of the
+# `LogisticRegression` and the respective normalization functions.

From 4f4add90410b347ad581719431d194d33e94d6e4 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 10 Oct 2023 15:38:36 +0200
Subject: [PATCH 064/108] FIX variable name (#729)

Co-authored-by: ArturoAmorQ
---
 python_scripts/linear_models_sol_02.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py
index 170533dcf..03aa72005 100644
--- a/python_scripts/linear_models_sol_02.py
+++ b/python_scripts/linear_models_sol_02.py
@@ -138,9 +138,9 @@
 
 # %%
 # solution
-culmen_length_first_sample = 181.0
+flipper_length_first_sample = 181.0
 culmen_depth_first_sample = 18.7
-culmen_length_first_sample * culmen_depth_first_sample
+flipper_length_first_sample * culmen_depth_first_sample
 
 # %% [markdown]
 # Use the same cross-validation strategy as done previously to estimate the mean

From e4c76fb29d6ef2547d8a322ca245e58b8d6f9952 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 17 Oct 2023 14:18:17 +0200
Subject: [PATCH 065/108] ENH use probability estimates in
 DecisionBoundaryDisplay (#710)

Co-authored-by: Olivier Grisel
Co-authored-by: ArturoAmorQ
Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
---
 python_scripts/linear_models_sol_03.py | 254 ++++++++++++++++++++-----
 1 file changed, 206 insertions(+), 48 deletions(-)

diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py
index d789c8522..dc2a82f5c 100644
--- a/python_scripts/linear_models_sol_03.py
+++ b/python_scripts/linear_models_sol_03.py
@@ -8,14 +8,20 @@
 # %% [markdown]
 # # 📃 Solution for Exercise M4.03
 #
-# The parameter `penalty` can control the **type** of regularization to use,
-# whereas the regularization **strength** is set using the parameter `C`.
-# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. In
-# this exercise, we ask you to train a logistic regression classifier using the
-# `penalty="l2"` regularization (which happens to be the default in
-# scikit-learn) to find by yourself the effect of the parameter `C`.
+# In the previous Module we tuned the hyperparameter `C` of the logistic
+# regression without mentioning that it controls the regularization strength.
+# Later, on the slides on 🎥 **Intuitions on regularized linear models** we
+# mentioned that a small `C` provides a more regularized model, whereas a
+# non-regularized model is obtained with an infinitely large value of `C`.
+# Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
+# model.
 #
-# We start by loading the dataset.
+# In this exercise, we ask you to train a logistic regression classifier using
+# different values of the parameter `C` to find out its effects by yourself.
+#
+# We start by loading the dataset. We only keep the Adelie and Chinstrap classes
+# to keep the discussion simple.
+

 # %% [markdown]
 # ```{note}
@@ -27,7 +33,6 @@
 import pandas as pd
 
 penguins = pd.read_csv("../datasets/penguins_classification.csv")
-# only keep the Adelie and Chinstrap classes
 penguins = (
     penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index()
 )
@@ -38,7 +43,9 @@
 # %%
 from sklearn.model_selection import train_test_split
 
-penguins_train, penguins_test = train_test_split(penguins, random_state=0)
+penguins_train, penguins_test = train_test_split(
+    penguins, random_state=0, test_size=0.4
+)
 
 data_train = penguins_train[culmen_columns]
 data_test = penguins_test[culmen_columns]
@@ -47,76 +54,227 @@
 target_test = penguins_test[target_column]
 
 # %% [markdown]
-# First, let's create our predictive model.
+# We define a function to help us fit a given `model` and plot its decision
+# boundary. We recall that by using a `DecisionBoundaryDisplay` with a diverging
+# colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped
+# to the white color. Equivalently, the darker the color, the closer the
+# predicted probability is to 0 or 1 and the more confident the classifier is in
+# its predictions.
 
 # %%
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
-from sklearn.linear_model import LogisticRegression
-
-logistic_regression = make_pipeline(
-    StandardScaler(), LogisticRegression(penalty="l2")
-)
-
-# %% [markdown]
-# Given the following candidates for the `C` parameter, find out the impact of
-# `C` on the classifier decision boundary. You can use
-# `sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the
-# decision function boundary.
- -# %% -Cs = [0.01, 0.1, 1, 10] - -# solution import matplotlib.pyplot as plt import seaborn as sns from sklearn.inspection import DecisionBoundaryDisplay -for C in Cs: - logistic_regression.set_params(logisticregression__C=C) - logistic_regression.fit(data_train, target_train) - accuracy = logistic_regression.score(data_test, target_test) - DecisionBoundaryDisplay.from_estimator( - logistic_regression, - data_test, - response_method="predict", +def plot_decision_boundary(model): + model.fit(data_train, target_train) + accuracy = model.score(data_test, target_test) + + disp = DecisionBoundaryDisplay.from_estimator( + model, + data_train, + response_method="predict_proba", + plot_method="pcolormesh", cmap="RdBu_r", - alpha=0.5, + alpha=0.8, + vmin=0.0, + vmax=1.0, + ) + DecisionBoundaryDisplay.from_estimator( + model, + data_train, + response_method="predict_proba", + plot_method="contour", + linestyles="--", + linewidths=1, + alpha=0.8, + levels=[0.5], + ax=disp.ax_, ) sns.scatterplot( - data=penguins_test, + data=penguins_train, x=culmen_columns[0], y=culmen_columns[1], hue=target_column, - palette=["tab:red", "tab:blue"], + palette=["tab:blue", "tab:red"], + ax=disp.ax_, ) plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}") + +# %% [markdown] +# Let's now create our predictive model. + +# %% +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression + +logistic_regression = make_pipeline(StandardScaler(), LogisticRegression()) + +# %% [markdown] +# ## Influence of the parameter `C` on the decision boundary +# +# Given the following candidates for the `C` parameter and the +# `plot_decision_boundary` function, find out the impact of `C` on the +# classifier's decision boundary. +# +# - How does the value of `C` impact the confidence on the predictions? +# - How does it impact the underfit/overfit trade-off? +# - How does it impact the position and orientation of the decision boundary? +# +# Try to give an interpretation on the reason for such behavior. + +# %% +Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6] + +# solution +for C in Cs: + logistic_regression.set_params(logisticregression__C=C) + plot_decision_boundary(logistic_regression) + +# %% [markdown] tags=["solution"] +# +# On this series of plots we can observe several important points. Regarding the +# confidence on the predictions: +# +# - For low values of `C` (strong regularization), the classifier is less +# confident in its predictions. We are enforcing a **spread sigmoid**. +# - For high values of `C` (weak regularization), the classifier is more +# confident: the areas with dark blue (very confident in predicting "Adelie") +# and dark red (very confident in predicting "Chinstrap") nearly cover the +# entire feature space. We are enforcing a **steep sigmoid**. +# +# To answer the next question, think that misclassified data points are more +# costly when the classifier is more confident on the decision. Decision rules +# are mostly driven by avoiding such cost. From the previous observations we can +# then deduce that: +# +# - The smaller the `C` (the stronger the regularization), the lower the cost +# of a misclassification. As more data points lay in the low-confidence +# zone, the more the decision rules are influenced almost uniformly by all +# the data points. This leads to a less expressive model, which may underfit. 
+#   - The higher the value of `C` (the weaker the regularization), the more the
+#     decision is influenced by a few training points very close to the boundary,
+#     where decisions are costly. Remember that models may overfit if the number
+#     of samples in the training set is too small, as a minimum number of
+#     samples is needed to average the noise out.
+#
+# The orientation is the result of two factors: minimizing the number of
+# misclassified training points with high confidence and their distance to the
+# decision boundary (notice how the contour line tries to align with the most
+# misclassified data points in the dark-colored zone). This is closely related
+# to the value of the weights of the model, which is explained in the next part
+# of the exercise.
+#
+# Finally, for small values of `C` the position of the decision boundary is
+# affected by the class imbalance: when `C` is near zero, the model predicts the
+# majority class (as seen in the training set) everywhere in the feature space.
+# In our case, there are approximately two times more "Adelie" than "Chinstrap"
+# penguins. This explains why the decision boundary is shifted to the right when
+# `C` gets smaller. Indeed, the most regularized model predicts light blue
+# almost everywhere in the feature space.

 # %% [markdown]
+# ## Impact of the regularization on the weights
+#
 # Look at the impact of the `C` hyperparameter on the magnitude of the weights.
+# **Hint**: You can [access pipeline
+# steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)
+# by name or position. Then you can query the attributes of that step such as
+# `coef_`.

 # %%
 # solution
-weights_ridge = []
+lr_weights = []
 for C in Cs:
     logistic_regression.set_params(logisticregression__C=C)
     logistic_regression.fit(data_train, target_train)
     coefs = logistic_regression[-1].coef_[0]
-    weights_ridge.append(pd.Series(coefs, index=culmen_columns))
+    lr_weights.append(pd.Series(coefs, index=culmen_columns))

 # %% tags=["solution"]
-weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f"C: {C}" for C in Cs])
-weights_ridge.plot.barh()
+lr_weights = pd.concat(lr_weights, axis=1, keys=[f"C: {C}" for C in Cs])
+lr_weights.plot.barh()
 _ = plt.title("LogisticRegression weights depending of C")

 # %% [markdown] tags=["solution"]
-# We see that a small `C` will shrink the weights values toward zero. It means
-# that a small `C` provides a more regularized model. Thus, `C` is the inverse
-# of the `alpha` coefficient in the `Ridge` model.
-#
-# Besides, with a strong penalty (i.e. small `C` value), the weight of the
-# feature "Culmen Depth (mm)" is almost zero. It explains why the decision
+#
+# As small `C` provides a more regularized model, it shrinks the weight values
+# toward zero, as in the `Ridge` model.
+#
+# In particular, with a strong penalty (e.g. `C = 0.01`), the weight of the feature
+# named "Culmen Depth (mm)" is almost zero. It explains why the decision
 # separation in the plot is almost perpendicular to the "Culmen Length (mm)"
 # feature.
+#
+# For even stronger penalty strengths (e.g. `C = 1e-6`), the weights of both
+# features are almost zero. It explains why the decision separation in the plot
+# is almost constant in the feature space: the predicted probability is only
+# based on the intercept parameter of the model (which is never regularized).
+# The small added sketch below makes the link between these weights and the
+# orientation of the decision boundary explicit.
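+
+# %% tags=["solution"]
+# Small added sketch (not part of the original solution): the slope of a
+# linear decision boundary is -w0 / w1, so as the "Culmen Depth (mm)" weight
+# w1 shrinks towards zero the boundary becomes nearly vertical.
+w0, w1 = lr_weights["C: 0.01"]
+print(f"Slope of the decision boundary for C=0.01: {-w0 / w1:.1f}")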
+
+# %% [markdown]
+# ## Impact of the regularization with non-linear feature engineering
+#
+# Use the `plot_decision_boundary` function to repeat the experiment using a
+# non-linear feature engineering pipeline. For this purpose, insert
+# `Nystroem(kernel="rbf", gamma=1, n_components=100)` between the
+# `StandardScaler` and the `LogisticRegression` steps.
+#
+# - Does the value of `C` still impact the position of the decision boundary and
+#   the confidence of the model?
+# - What can you say about the impact of `C` on the underfitting vs overfitting
+#   trade-off?

+# %%
+from sklearn.kernel_approximation import Nystroem
+
+# solution
+classifier = make_pipeline(
+    StandardScaler(),
+    Nystroem(kernel="rbf", gamma=1.0, n_components=100, random_state=0),
+    LogisticRegression(penalty="l2", max_iter=1000),
+)
+
+for C in Cs:
+    classifier.set_params(logisticregression__C=C)
+    plot_decision_boundary(classifier)

+# %% [markdown] tags=["solution"]
+#
+# - For the lowest values of `C`, the overall pipeline underfits: it predicts
+#   the majority class everywhere, as previously.
+# - When `C` increases, the model starts to predict some data points from the
+#   "Chinstrap" class but the model is not very confident anywhere in the
+#   feature space.
+# - The decision boundary is no longer a straight line: the linear model is now
+#   classifying in the 100-dimensional feature space created by the `Nystroem`
+#   transformer. As a result, the decision boundary induced by the overall
+#   pipeline is now expressive enough to wrap around the minority class.
+# - For `C = 1` in particular, it finds a smooth red blob around most of the
+#   "Chinstrap" data points. When moving away from the data points, the model is
+#   less confident in its predictions and again tends to predict the majority
+#   class according to the proportion in the training set.
+# - For higher values of `C`, the model starts to overfit: it is very confident
+#   in its predictions almost everywhere, but it should not be trusted: the
+#   model also makes a larger number of mistakes on the test set (not shown in
+#   the plot) while adopting a very curvy decision boundary to attempt fitting
+#   all the training points, including the noisy ones at the frontier between
+#   the two classes. This makes the decision boundary very sensitive to the
+#   sampling of the training set and as a result, it does not generalize well in
+#   that region. This is confirmed by the (slightly) lower accuracy on the test
+#   set.
+#
+# Finally, we can also note that the linear model on the raw features was as
+# good or better than the best model using non-linear feature engineering. So in
+# this case, we did not really need this extra complexity in our pipeline.
+# **Simpler is better!**
+#
+# So to conclude, when using non-linear feature engineering, it is often
+# possible to make the pipeline overfit, even if the original feature space is
+# low-dimensional. As a result, it is important to tune the regularization
+# parameter in conjunction with the parameters of the transformers (e.g. tuning
+# `gamma` would be important here). This has a direct impact on the certainty of
From 4b006d5eacab7802e4f9e6d2b15e8b6886bd9a81 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 17 Oct 2023 15:43:40 +0200 Subject: [PATCH 066/108] MAINT Fix broken sphinx-book-theme reference in CI (#732) --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 77caf5786..89ad0ad4b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,7 +6,7 @@ plotly jupyter-book>=0.11 # Partial fix for the navbar scrollToActive behavior: # https://github.com/executablebooks/sphinx-book-theme/issues/541 -sphinx-book-theme @ git+https://github.com/ogrisel/sphinx-book-theme@fix-bd-docs-nav +sphinx-book-theme @ git+https://github.com/executablebooks/sphinx-book-theme.git@aca0f7fd39314ec3283f1ff798b4bf8ee9b49cad jupytext beautifulsoup4 IPython From f339cc7b53736d906f4e61fa611dc8a81ad29a99 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:38:43 +0200 Subject: [PATCH 067/108] ENH Add visualization of fold-to-fold comparison in Wrap-up Quiz 1 (#733) Co-authored-by: ArturoAmorQ --- ...merical_pipeline_wrap_up_quiz_comparison.png | Bin 0 -> 30556 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 figures/numerical_pipeline_wrap_up_quiz_comparison.png diff --git a/figures/numerical_pipeline_wrap_up_quiz_comparison.png b/figures/numerical_pipeline_wrap_up_quiz_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..d71aca77202f1726a573601c58ccd5fad69620d3 GIT binary patch literal 30556 zcmd432~>#h+BN<(XdoKVfHbJkoRmsZp^!?WQW`~}(nOk2DHIwA&4VV5nkUgDp;@Gv z(jbM>T;G2D&Uw#y=Xd@8-?!Flt+URrddB;?@9Wyv-ut=(wT>NHyP9(~K@e-z)l`oY z1f>r_Q1sE!;5X9EU0?8jdz=p#I_ub6f(NLEqb#b4D#tWmhWF8R;pg#fa$_pVJ=4s4^r2)@+fN@K zLiON5{i2?2NA8=oB&sthSH69__T9U80k>{lSB_J9cKWTu?4-G@oSeRZGe#zU2VeRI zC1YdbvZ zhDQ7Svu!ENMBhulHFCFZ;UX!*0`u$o&fdR&pN6mwp47`M^}SM|Ma2$Ck(3gTQ+X)JKw&Y)4QPE9YN?{yMCR@ zXXV1G^n{bi!p8R%WvIE;I=rABR)PpuhSndYAY}}Q<7QY;>e`7ZP(cr zYT|BWOq`r)|MFc@{QCOLF1fo~G{+xEdhh4w zw?BmcsY&UEod%h{KkMCPE_7A)y|}JzXc(aAIbYe>s9C4YiU&v)Oy9qC`}XaZh927J zu4HYBZkrByB}gL*u~l<18($tw-89*x3X8)@-yGZA)dBxv**I zT@-n6bxjQ(zOt^4HeEj>t>Stb@0{BNuV=#s1vy6+zY{0K)AZKp@86%zIL;Vq@s%8! zXs_JE#G2Ks9VU9pk2n1K_3MbR_tda<@o1F7Wl0ZD&mZjur}}4qd@k}_*sY+TfH%F# zs5nc!fB!x_xV(gI=u6*wn_qXA$Fh{XyaHWL9J({`t|cjZqJvRXYM*cOO5<5F$Y{!7 zO-DyZry1uB#=X7W6(#R>@awy%olN1&xWZQJH--vwmybKeEqWx{_uTGEH|Vdh-FDk~ z*FJI{>v8|42+J{!dAn17Zs$;QeQ1%iy_fVMTvoNDl3MG@&xx+>CF92HMm`nrtbFgb zy@|I9aQ35*J(^_SZIss7%R2a#&$HqEDE*&5_~Q?I<>I(>?D-j#`wYbyl=FxaQ|QZU~;MqZXOf_QyD z@R`)Nh2J6_yMHgu=bU0{*16rfkNFayPHdBNl#pVI_|Bn>?pJI>)AZ(hBm{GP5q@V;zLt+zm9Y2)=kdim=*Ne-3!-)g6h z#6%Dw_h+NM<-{q%_VBG)Q{*K{`);BvXiCKR$8a_M^6`WnRbMqagd>*&)3|znX)+4< zP!r;g7k8A@RGpgB_1huYS2-QJr+rhnhvR&5a#gFxSlGl|GA->@Chhz_pX-lqwC-y^ z=eK815%X^eQ5O$pw+$i(-!~_`3Jww}IbU*ynf#rMVzi&KpK@F+nPuNsMQu41|n#o2kKr%3|SDzc1{VWpos_>@Mgp+J_B zSyiAFe^-6Arag802djjl@yIa?HL7(CaV`hzwUWKvAN9_xFLxPTeD_Iu^aI1l zUNPElR6>?c1u;9N>V>|~w(olLaf`uQX1#ZEDY^A^uqS*?P10fJ$yDRY{QTE|wb z_L=z%B~v_%sPw8NioBnv)fSxI!SQmTLbC9f9gB*4V*>@N(!-Ldd)Gy@QwKXwAJJX? 
zoAXCj*0k6CdMMreB8)Tt*o-;v)_OphIjQq|lGu6v)?@eOnK4h?r_2is@Ja{isNw6b zGLGD*KXT+bi%(~@3VWv}K6A#tl+>g-+j$yTpnuWm4X%A?hi4?6F^Y(c3@$gCe!3%s zZ@1ATIDr%nU{8Pw8=Vi-G^`l-4C|x6UdgeCj~}<#397mEQLn{D6y3{i)TZi1d9_HP zO;*;Ac~1szja*6x;jXow554~(jeGg+UZWS!+3Phbh}B!{M=yR7qW&H^jB(R{j=t# z-87Za*;J%{%y7(oqZrSkTwIOf$t3-PX(h$;qlrW?VhB;D4CqIzxNVNJM;o^3Djq#2 zzOcgKm`E9Td3eBZ8ZpBB;+@>d@V|oML-7^>*+y(}1GEKBZ;R7#A4;R= zbcOI2wgzX)@&Fv|b8aE<@MY4G>qqUyet?>x58;|)_p$_`Y?LX!$Sj$ zQ>(7+V4wwfm0)T?`VF@9ql$JEK1wJDzoBVZFhSac^xE=%0g8X6gusR)?Htow;gqQ#z5HY zSQu&^i0MU?s~Q{GfGvk;lj@cSAbV8DZ1>Y zayt~tpMShyMNL8Q<@y|3hej0e>C_4PWE%2{hk9FM z#n>yYi-B_~2l4Rtzg^85Y&v{G=XZy(LqttpAd^aDArQ(QgBd}q#sv|MTm0%(OkqB$ z8gBCO<5f=%_ighv_ujVZcI|}rvkupKY8@EU zmJ(_MI^+irK%mMgkjo?R31Zy8ckiCg<3_$*Jq`4|YhIfnqoiCZ;zAcrQ-YjfuZ%YI zwvtA}b@=F_Ci@0fqGa(5pv|Sxlze9HaScFq|3QP!JeXax%4FlKxgxWYm|ng_*Y-Xt zpLfV>T{NB;`r}tbNTv%Ds!1~xTOQ}|%;dw>aM6}BoXP^=D zQ(`$WaZBxitol_xe)aQh6?1dljI6T=Y2pK=bIRU_h7|7uUP5uWAbH%y_+Vdg-sj(z zI$^!CVkNxH#{XvbiT?|w&R_q(f9fg+;s3;`n>x+AsaOAL$l7f@=~l6vGHY_2#iH&1 E1=&jLH~;_u literal 0 HcmV?d00001 From 3afba7eeece8f8331e2d2de605478d7a61bff379 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 20 Oct 2023 15:54:07 +0200 Subject: [PATCH 068/108] MAINT use actions/cache@v3 instead of v2 (#734) * MAINT use actions/cache@v3 instead of v2 * Attempt to fix the PR used for the jupyter preview --- .github/workflows/deploy-gh-pages.yml | 2 +- .github/workflows/jupyter-book-pr-preview.yml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index 236d4dacc..1642e7ffa 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -32,7 +32,7 @@ jobs: pip install -r requirements-dev.txt - name: Cache jupyter-cache folder - uses: actions/cache@v2 + uses: actions/cache@v3 env: cache-name: jupyter-cache with: diff --git a/.github/workflows/jupyter-book-pr-preview.yml b/.github/workflows/jupyter-book-pr-preview.yml index cabe43fe9..fbe6a8d13 100644 --- a/.github/workflows/jupyter-book-pr-preview.yml +++ b/.github/workflows/jupyter-book-pr-preview.yml @@ -19,6 +19,12 @@ jobs: sha: ${{ github.event.workflow_run.head_sha }} context: 'JupyterBook preview' + - name: Get pull request number + id: pull-request-number + run: | + export PULL_REQUEST_NUMBER=${{github.event.workflow_run.event.number}} + echo "result=${PULL_REQUEST_NUMBER}" >> $GITHUB_OUTPUT + - uses: dawidd6/action-download-artifact@v2 with: github_token: ${{secrets.GITHUB_TOKEN}} @@ -26,12 +32,6 @@ jobs: pr: ${{steps.pull-request-number.outputs.result}} name: jupyter-book - - name: Get pull request number - id: pull-request-number - run: | - export PULL_REQUEST_NUMBER=$(cat pull_request_number) - echo "result=${PULL_REQUEST_NUMBER}" >> $GITHUB_OUTPUT - - uses: actions/setup-node@v3 with: node-version: '16' From c9a7ad4d2a7c619573d4358219efadb408a52b36 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:21:24 +0200 Subject: [PATCH 069/108] ENH Emphasize discussion on multi-class classification in tree notebook (#730) Co-authored-by: ArturoAmorQ Co-authored-by: Olivier Grisel --- python_scripts/trees_classification.py | 131 +++++++++++++++---------- python_scripts/trees_sol_01.py | 126 +++++++++++++++++++++--- 2 files changed, 191 insertions(+), 66 deletions(-) diff --git a/python_scripts/trees_classification.py b/python_scripts/trees_classification.py index d83b5203e..3723109a5 100644 --- 
+++ b/python_scripts/trees_classification.py
@@ -8,8 +8,11 @@
 # %% [markdown]
 # # Build a classification decision tree
 #
-# We will illustrate how decision tree fit data with a simple classification
-# problem using the penguins dataset.
+# In this notebook we illustrate decision trees in a multiclass classification
+# problem by using the penguins dataset with 2 features and 3 classes.
+#
+# For the sake of simplicity, we focus the discussion on the hyperparameter
+# `max_depth`, which controls the maximal depth of the decision tree.

 # %% [markdown]
 # ```{note}
@@ -25,8 +28,8 @@ target_column = "Species"

 # %% [markdown]
-# Besides, we split the data into two subsets to investigate how trees will
-# predict values based on an out-of-samples dataset.
+# First, we split the data into two subsets to investigate how trees predict
+# values based on unseen data.

 # %%
 from sklearn.model_selection import train_test_split
@@ -37,16 +40,13 @@
 )

 # %% [markdown]
-# In a previous notebook, we learnt that a linear classifier will define a
-# linear separation to split classes using a linear combination of the input
-# features. In our 2-dimensional space, it means that a linear classifier will
-# define some oblique lines that best separate our classes. We define a function
-# below that, given a set of data points and a classifier, will plot the
-# decision boundaries learnt by the classifier.
-#
-# Thus, for a linear classifier, we will obtain the following decision
-# boundaries. These boundaries lines indicate where the model changes its
-# prediction from one class to another.
+# In a previous notebook, we learnt that linear classifiers define a linear
+# separation to split classes using a linear combination of the input features.
+# In our 2-dimensional feature space, it means that a linear classifier finds
+# the oblique lines that best separate the classes. This is still true for
+# multiclass problems, except that more than one line is fitted. We can use
+# `DecisionBoundaryDisplay` to plot the decision boundaries learnt by the
+# classifier.

 # %%
 from sklearn.linear_model import LogisticRegression
@@ -56,15 +56,22 @@

 # %%
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import seaborn as sns

 from sklearn.inspection import DecisionBoundaryDisplay

+tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)
 # create a palette to be used in the scatterplot
-palette = ["tab:red", "tab:blue", "black"]
-
-DecisionBoundaryDisplay.from_estimator(
-    linear_model, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+palette = ["tab:blue", "tab:green", "tab:orange"]
+
+dbd = DecisionBoundaryDisplay.from_estimator(
+    linear_model,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 sns.scatterplot(
     data=penguins,
@@ -80,7 +87,7 @@
 # %% [markdown]
 # We see that the lines are a combination of the input features since they are
 # not perpendicular to a specific axis. Indeed, this is due to the model
-# parametrization that we saw in the previous notebook, controlled by the
+# parametrization that we saw in some previous notebooks, i.e. controlled by the
 # model's weights and intercept.
 #
 # Besides, it seems that the linear model would be a good candidate for such
@@ -92,13 +99,27 @@
 print(f"Accuracy of the LogisticRegression: {test_score:.2f}")

 # %% [markdown]
-# Unlike linear models, decision trees are non-parametric models: they are not
-# controlled by a mathematical decision function and do not have weights or
-# intercept to be optimized.
+# Unlike linear models, the decision rule for the decision tree is not
+# controlled by a simple linear combination of weights and feature values.
+#
+# Instead, the decision rules of trees can be defined in terms of
+# - the feature index used at each split node of the tree,
+# - the threshold value used at each split node,
+# - the value to predict at each leaf node.
 #
-# Indeed, decision trees will partition the space by considering a single
-# feature at a time. Let's illustrate this behaviour by having a decision tree
-# make a single split to partition the feature space.
+# Decision trees partition the feature space by considering a single feature at
+# a time. The number of splits depends on both the hyperparameters and the
+# number of data points in the training set: the more flexible the
+# hyperparameters and the larger the training set, the more splits can be
+# considered by the model.
+#
+# As the number of adjustable components taking part in the decision rule
+# changes with the training size, we say that decision trees are non-parametric
+# models.
+#
+# Let's now visualize the shape of the decision boundary of a decision tree when
+# we set the `max_depth` hyperparameter to only allow for a single split to
+# partition the feature space.

 # %%
 from sklearn.tree import DecisionTreeClassifier
@@ -108,7 +129,12 @@

 # %%
 DecisionBoundaryDisplay.from_estimator(
-    tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+    tree,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 sns.scatterplot(
     data=penguins,
@@ -123,8 +149,8 @@
 # %% [markdown]
 # The partitions found by the algorithm separate the data along the axis
 # "Culmen Depth", discarding the feature "Culmen Length". Thus, it highlights
-# that a decision tree does not use a combination of feature when making a
-# split. We can look more in depth at the tree structure.
+# that a decision tree does not use a combination of features when making a
+# single split. We can look more in depth at the tree structure.

 # %%
 from sklearn.tree import plot_tree
@@ -150,36 +176,40 @@
 # dataset was subdivided into 2 sets based on the culmen depth (inferior or
 # superior to 16.45 mm).
 #
-# This partition of the dataset minimizes the class diversities in each
+# This partition of the dataset minimizes the class diversity in each
 # sub-partition. This measure is also known as a **criterion**, and is a
 # settable parameter.
 #
 # If we look more closely at the partition, we see that the sample superior to
-# 16.45 belongs mainly to the Adelie class. Looking at the values, we indeed
-# observe 103 Adelie individuals in this space. We also count 52 Chinstrap
-# samples and 6 Gentoo samples. We can make similar interpretation for the
+# 16.45 belongs mainly to the "Adelie" class. Looking at the values, we indeed
+# observe 103 "Adelie" individuals in this space. We also count 52 "Chinstrap"
+# samples and 6 "Gentoo" samples. We can make a similar interpretation for the
 # partition defined by a threshold inferior to 16.45 mm. In this case, the most
-# represented class is the Gentoo species.
+# represented class is the "Gentoo" species.
 #
 # Let's see how our tree would work as a predictor. Let's start with a case
 # where the culmen depth is inferior to the threshold.

 # %%
-sample_1 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]})
-tree.predict(sample_1)
+test_penguin_1 = pd.DataFrame(
+    {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]}
+)
+tree.predict(test_penguin_1)

 # %% [markdown]
-# The class predicted is the Gentoo. We can now check what happens if we pass a
+# The class predicted is the "Gentoo". We can now check what happens if we pass a
 # culmen depth superior to the threshold.

 # %%
-sample_2 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]})
-tree.predict(sample_2)
+test_penguin_2 = pd.DataFrame(
+    {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]}
+)
+tree.predict(test_penguin_2)

 # %% [markdown]
-# In this case, the tree predicts the Adelie specie.
+# In this case, the tree predicts the "Adelie" species.
 #
-# Thus, we can conclude that a decision tree classifier will predict the most
+# Thus, we can conclude that a decision tree classifier predicts the most
 # represented class within a partition.
 #
 # During the training, we have a count of samples in each partition, so we can
 # also compute the probability of belonging to a specific class within this
 # partition.

 # %%
-y_pred_proba = tree.predict_proba(sample_2)
+y_pred_proba = tree.predict_proba(test_penguin_2)
 y_proba_class_0 = pd.Series(y_pred_proba[0], index=tree.classes_)

 # %%
@@ -212,14 +242,14 @@

 # %% [markdown]
 # It is also important to note that the culmen length has been disregarded for
-# the moment. It means that whatever the value given, it will not be used during
-# the prediction.
+# the moment. It means that regardless of its value, it is not used during the
+# prediction.

 # %%
-sample_3 = pd.DataFrame(
+test_penguin_3 = pd.DataFrame(
     {"Culmen Length (mm)": [10_000], "Culmen Depth (mm)": [17]}
 )
-tree.predict_proba(sample_3)
+tree.predict_proba(test_penguin_3)

 # %% [markdown]
 # Going back to our classification problem, the split found with a maximum depth
@@ -232,9 +262,10 @@
 print(f"Accuracy of the DecisionTreeClassifier: {test_score:.2f}")

 # %% [markdown]
-# Indeed, it is not a surprise. We saw earlier that a single feature will not be
-# able to separate all three species. However, from the previous analysis we saw
-# that by using both features we should be able to get fairly good results.
+# Indeed, it is not a surprise. We saw earlier that a single feature is not able
+# to separate all three species: it underfits. However, from the previous
+# analysis we saw that by using both features we should be able to get fairly
+# good results.
 #
-# In the next exercise, you will increase the size of the tree depth. You will
-# get intuitions on how the space partitioning is repeated over time.
+# In the next exercise, you will increase the tree depth to get an intuition on
+# how such a parameter affects the space partitioning.
diff --git a/python_scripts/trees_sol_01.py b/python_scripts/trees_sol_01.py
index 34dcbf81c..e97b7e8b2 100644
--- a/python_scripts/trees_sol_01.py
+++ b/python_scripts/trees_sol_01.py
@@ -8,16 +8,13 @@
 # %% [markdown]
 # # ๐Ÿ“ƒ Solution for Exercise M5.01
 #
-# In the previous notebook, we showed how a tree with a depth of 1 level was
-# working. The aim of this exercise is to repeat part of the previous experiment
-# for a depth with 2 levels to show how the process of partitioning is repeated
-# over time.
+# In the previous notebook, we showed how a tree with a depth of 1 level works. The
+# aim of this exercise is to repeat part of the previous experiment for a tree
+# with a depth of 2 levels to show how such a parameter affects the feature space
+# partitioning.
 #
-# Before to start, we will:
-#
-# * load the dataset;
-# * split the dataset into training and testing dataset;
-# * define the function to show the classification decision function.
+# We first load the penguins dataset and split it into training and testing
+# sets:

 # %%
 import pandas as pd
@@ -42,10 +39,7 @@

 # %% [markdown]
 # Create a decision tree classifier with a maximum depth of 2 levels and fit the
-# training data. Once this classifier trained, plot the data and the decision
-# boundary to see the benefit of increasing the depth. To plot the decision
-# boundary, you should import the class `DecisionBoundaryDisplay` from the
-# module `sklearn.inspection` as shown in the previous course notebook.
+# training data.

 # %%
 # solution
@@ -54,15 +48,39 @@
 tree = DecisionTreeClassifier(max_depth=2)
 tree.fit(data_train, target_train)

-# %% tags=["solution"]
+# %% [markdown]
+# Now plot the data and the decision boundary of the trained classifier to see
+# the effect of increasing the depth of the tree.
+#
+# Hint: Use the class `DecisionBoundaryDisplay` from the module
+# `sklearn.inspection` as shown in previous course notebooks.
+#
+# ```{warning}
+# At this time, it is not possible to use `response_method="predict_proba"` for
+# multiclass problems. This is a planned feature for a future version of
+# scikit-learn. In the meantime, you can use `response_method="predict"`
+# instead.
+# ```
+
+# %%
+# solution
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import seaborn as sns

 from sklearn.inspection import DecisionBoundaryDisplay

-palette = ["tab:red", "tab:blue", "black"]
+
+tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)
+
+palette = ["tab:blue", "tab:green", "tab:orange"]
 DecisionBoundaryDisplay.from_estimator(
-    tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+    tree,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 ax = sns.scatterplot(
     data=penguins,
@@ -114,3 +132,79 @@
 # which is not surprising since this partition was almost pure. If the feature
 # value is above the threshold, we predict the Gentoo penguin, the class that is
 # most probable.
+#
+# ## (Estimated) predicted probabilities in multi-class problems
+#
+# For those interested, one can further try to visualize the output of
+# `predict_proba` for a multiclass problem using `DecisionBoundaryDisplay`,
+# except that for a K-class problem you have K probability outputs for each
+# data point. Visualizing all these on a single plot can quickly become tricky
+# to interpret. It is then common to instead produce K separate plots, one for
+# each class, in a one-vs-rest (or one-vs-all) fashion.
+#
+# For example, in the plot below, the first plot on the left shows in yellow the
+# certainty of classifying a data point as belonging to the "Adelie" class. In
+# the same plot, the spectrum from green to purple represents the certainty of
+# **not** belonging to the "Adelie" class. The same logic applies to the other
+# plots in the figure.
+ +# %% tags=["solution"] +import numpy as np + +xx = np.linspace(30, 60, 100) +yy = np.linspace(10, 23, 100) +xx, yy = np.meshgrid(xx, yy) +Xfull = pd.DataFrame( + {"Culmen Length (mm)": xx.ravel(), "Culmen Depth (mm)": yy.ravel()} +) + +probas = tree.predict_proba(Xfull) +n_classes = len(np.unique(tree.classes_)) + +_, axs = plt.subplots(ncols=3, nrows=1, sharey=True, figsize=(12, 5)) +plt.suptitle("Predicted probabilities for decision tree model", y=0.8) + +for class_of_interest in range(n_classes): + axs[class_of_interest].set_title( + f"Class {tree.classes_[class_of_interest]}" + ) + imshow_handle = axs[class_of_interest].imshow( + probas[:, class_of_interest].reshape((100, 100)), + extent=(30, 60, 10, 23), + vmin=0.0, + vmax=1.0, + origin="lower", + cmap="viridis", + ) + axs[class_of_interest].set_xlabel("Culmen Length (mm)") + if class_of_interest == 0: + axs[class_of_interest].set_ylabel("Culmen Depth (mm)") + idx = target_test == tree.classes_[class_of_interest] + axs[class_of_interest].scatter( + data_test["Culmen Length (mm)"].loc[idx], + data_test["Culmen Depth (mm)"].loc[idx], + marker="o", + c="w", + edgecolor="k", + ) + +ax = plt.axes([0.15, 0.04, 0.7, 0.05]) +plt.colorbar(imshow_handle, cax=ax, orientation="horizontal") +_ = plt.title("Probability") + +# %% [markdown] tags=["solution"] +# ```{note} +# You may have noticed that we are no longer using a diverging colormap. Indeed, +# the chance level for a one-vs-rest binarization of the multi-class +# classification problem is almost never at predicted probability of 0.5. So +# using a colormap with a neutral white at 0.5 might give a false impression on +# the certainty. +# ``` +# +# In future versions of scikit-learn `DecisionBoundaryDisplay` will support a +# `class_of_interest` parameter that will allow in particular for a +# visualization of `predict_proba` in multi-class settings. +# +# We also plan to make it possible to visualize the `predict_proba` values for +# the class with the maximum predicted probability (without having to pass a +# given a fixed `class_of_interest` value). 
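For reference, the `class_of_interest` parameter announced in the solution text above later landed in scikit-learn (in the 1.4 release, if memory serves). Assuming such a version is available, the same one-vs-rest probability maps can be drawn more directly. This is a minimal self-contained sketch, using the iris dataset as a stand-in for the penguins data:

```python
# Sketch assuming a scikit-learn version where `class_of_interest` is
# available in DecisionBoundaryDisplay.from_estimator (later than the
# release targeted by the patch above).
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier

# Iris stands in for the penguins data used in the notebook.
X, y = load_iris(return_X_y=True, as_frame=True)
X = X[["petal length (cm)", "petal width (cm)"]]
tree = DecisionTreeClassifier(max_depth=2).fit(X, y)

_, axs = plt.subplots(ncols=3, figsize=(12, 4))
for ax, class_of_interest in zip(axs, tree.classes_):
    # One plot per class, showing P(class_of_interest | x) over the grid.
    DecisionBoundaryDisplay.from_estimator(
        tree,
        X,
        response_method="predict_proba",
        class_of_interest=class_of_interest,
        cmap="viridis",
        ax=ax,
    )
    ax.set_title(f"Class {class_of_interest}")
plt.show()
```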

From 6ba12f20916e8e0886c0e0170bccfa94cdb1e58c Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Mon, 23 Oct 2023 12:23:13 +0200
Subject: [PATCH 070/108] Synchronize quizzes (#735)

Co-authored-by: ArturoAmorQ
---
 .../linear_models/linear_models_quiz_m4_03.md |   2 +-
 .../linear_models/linear_models_quiz_m4_05.md |   5 +-
 .../linear_models_wrap_up_quiz.md             | 143 +++++-------------
 .../wrap_up_quiz.md                           |  46 ++++--
 4 files changed, 76 insertions(+), 120 deletions(-)

diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_03.md b/jupyter-book/linear_models/linear_models_quiz_m4_03.md
index eb458ad46..2989e3e66 100644
--- a/jupyter-book/linear_models/linear_models_quiz_m4_03.md
+++ b/jupyter-book/linear_models/linear_models_quiz_m4_03.md
@@ -33,7 +33,7 @@
 Combining (one or more) feature engineering transformers in a single pipeline:

 - a) increases the expressivity of the model
-- b) ensures that models extrapolate accurately regardless of its distribution
+- b) ensures that models extrapolate accurately regardless of the distribution of the data
 - c) may require tuning additional hyperparameters
 - d) inherently prevents any underfitting

diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_05.md b/jupyter-book/linear_models/linear_models_quiz_m4_05.md
index 1fe12883c..45ce1f9b8 100644
--- a/jupyter-book/linear_models/linear_models_quiz_m4_05.md
+++ b/jupyter-book/linear_models/linear_models_quiz_m4_05.md
@@ -25,10 +25,11 @@ _Select a single answer_

 +++

 ```{admonition} Question
-In logistic regression, increasing the regularization strength makes the model:
+In logistic regression, increasing the regularization strength (by
+decreasing the value of `C`) makes the model:

 - a) more likely to overfit to the training data
-- b) more flexible, fitting closely to the training data
+- b) more confident: the values returned by `predict_proba` are closer to 0 or 1
 - c) less complex, potentially underfitting the training data

 _Select a single answer_
diff --git a/jupyter-book/linear_models/linear_models_wrap_up_quiz.md b/jupyter-book/linear_models/linear_models_wrap_up_quiz.md
index 3f8f18d72..e903e29b2 100644
--- a/jupyter-book/linear_models/linear_models_wrap_up_quiz.md
+++ b/jupyter-book/linear_models/linear_models_wrap_up_quiz.md
@@ -153,132 +153,59 @@ _Select a single answer_

 +++

-Now, we will tackle a classification problem instead of a regression problem.
-Load the Adult Census dataset with the following snippet of code and we will
-work only with **numerical features**.
+So far we only used the list of `numerical_features` to build the predictive
+model. Now create a preprocessor to deal separately with the numerical and
+categorical columns:

-```python
-adult_census = pd.read_csv("../datasets/adult-census.csv")
-target = adult_census["class"]
-data = adult_census.select_dtypes(["integer", "floating"])
-data = data.drop(columns=["education-num"])
-```
-
-```{admonition} Question
-How many numerical features are present in the dataset contained in the
-variable `data`?
+- categorical features can be selected if they have an `object` data type;
+- use a `OneHotEncoder` to encode the categorical features;
+- numerical features should correspond to the `numerical_features` as defined
+  above. This is a subset of the features that are not an `object` data type;
+- use a `StandardScaler` to scale the numerical features.
-- a) 3 -- b) 4 -- c) 5 - -_Select a single answer_ -``` - -+++ +The last step of the pipeline should be a `RidgeCV` with the same set of `alphas` +to evaluate as previously. ```{admonition} Question -Compare the generalization performance using the accuracy of the two following -predictive models using a 10-fold cross-validation: - -- a linear model composed of a `StandardScaler` and a `LogisticRegression` -- a `DummyClassifier` predicting the most frequent class - -By comparing the cross-validation test scores of both models fold-to-fold, count the number -of times the linear model has a better test score than the dummy classifier -Select the range which this number belongs to: +By comparing the cross-validation test scores fold-to-fold for the model with +`numerical_features` only and the model with both `numerical_features` and +`categorical_features`, count the number of times the simple model has a better +test score than the model with all features. Select the range which this number +belongs to: -- a) [0, 3]: the linear model is substantially worse than the dummy classifier +- a) [0, 3]: the simple model is consistently worse than the model with all features - b) [4, 6]: both models are almost equivalent -- c) [7, 10]: the linear model is substantially better than the dummy classifier +- c) [7, 10]: the simple model is consistently better than the model with all features _Select a single answer_ ``` +++ -```{admonition} Question -What is the most important feature seen by the logistic regression? - -- a) `"age"` -- b) `"capital-gain"` -- c) `"capital-loss"` -- d) `"hours-per-week"` - -_Select a single answer_ -``` - -+++ - -Now, we will work with **both numerical and categorical features**. You can -load Adult Census with the following snippet: - -```python -adult_census = pd.read_csv("../datasets/adult-census.csv") -target = adult_census["class"] -data = adult_census.drop(columns=["class", "education-num"]) -``` +In this Module we saw that non-linear feature engineering may yield a more +predictive pipeline, as long as we take care of adjusting the regularization to +avoid overfitting. -Create a predictive model where the categorical data must be one-hot encoded, -the numerical data must be scaled, and the predictor is a -logistic regression classifier. +Try this approach by building a new pipeline similar to the previous one but +replacing the `StandardScaler` by a `SplineTransformer` (with default +hyperparameter values) to better model the non-linear influence of the +numerical features. -Use the same 10-fold cross-validation strategy as above to evaluate this -complex pipeline. +Furthermore, let the new pipeline model feature interactions by adding a new +`Nystroem` step between the preprocessor and the `RidgeCV` estimator. Set +`kernel="poly"`, `degree=2` and `n_components=300` for this new feature +engineering step. ```{admonition} Question -Look at the cross-validation test scores for both models and count the number of -times the model using both numerical and categorical features has a better -test score than the model using only numerical features. -Select the range which this number belongs to: +By comparing the cross-validation test scores fold-to-fold for the model with +both `numerical_features` and `categorical_features`, and the model that +performs non-linear feature engineering; count the number of times the +non-linear pipeline has a better test score than the model with simpler +preprocessing. 
Select the range which this number belongs to: -- a) [0, 3]: the model using both numerical and categorical features is - substantially worse than the model using only numerical features +- a) [0, 3]: the new non-linear pipeline is consistently worse than the previous pipeline - b) [4, 6]: both models are almost equivalent -- c) [7, 10]: the model using both numerical and categorical features is - substantially better than the model using only numerical features +- c) [7, 10]: the new non-linear pipeline is consistently better than the previous pipeline _Select a single answer_ ``` - -+++ - -For the following questions, you can use the following snippet to get the -feature names after the preprocessing performed. - -```python -preprocessor.fit(data) -feature_names = (preprocessor.named_transformers_["onehotencoder"] - .get_feature_names_out(categorical_columns)).tolist() -feature_names += numerical_columns -feature_names -``` - -There is as many feature names as coefficients in the last step of your -predictive pipeline. - -```{admonition} Question -Which of the following pair of features is most impacting the -predictions of the logistic regression classifier based on -the relative magnitude of its coefficients? - -- a) `"hours-per-week"` and `"native-country_Columbia"` -- b) `"workclass_?"` and `"native_country_?"` -- c) `"capital-gain"` and `"education_Doctorate"` - -_Select a single answer_ -``` - -+++ - -```{admonition} Question -What is the effect of decreasing the `C` parameter on the coefficients? - -- a) shrinking the magnitude of the weights towards zeros -- b) increasing the magnitude of the weights -- c) reducing the weights' variance -- d) increasing the weights' variance -- e) it has no influence on the weights' variance - -_Select all answers that apply_ -``` diff --git a/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md b/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md index 179d9ed09..65e1fa6df 100644 --- a/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md +++ b/jupyter-book/predictive_modeling_pipeline/wrap_up_quiz.md @@ -127,15 +127,43 @@ can process both the numerical and categorical features together as follows: `OneHotEncoder`. ```{admonition} Question -One way to compare two models is by comparing the cross-validation test scores -of both models fold-to-fold, i.e. counting the number of folds where one model -has a better test score than the other. Let's compare the model using all -features with the model consisting of only numerical features. Select the range -of folds where the former has a better test score than the latter: - -- a) [0, 3]: the pipeline using all features is substantially worse than the pipeline using only numerical feature -- b) [4, 6]: both pipelines are almost equivalent -- c) [7, 10]: the pipeline using all features is substantially better than the pipeline using only numerical feature +What is the accuracy score obtained by 10-fold cross-validation of the pipeline +using both the numerical and categorical features? + +- a) ~0.7 +- b) ~0.9 +- c) ~1.0 + +_Select a single answer_ +``` + ++++ + +One way to compare two models is by comparing their means, but small differences +in performance measures might easily turn out to be merely by chance (e.g. +when using random resampling during cross-validation), and not because one +model predicts systematically better than the other. + +Another way is to compare cross-validation test scores of both models +fold-to-fold, i.e. 
counting the number of folds where one model has a better
+test score than the other. This provides some extra information: are some
+partitions of the data making the classification task particularly easy or hard
+for both models?
+
+Let's visualize the second approach.
+
+![Fold-to-fold comparison](../../figures/numerical_pipeline_wrap_up_quiz_comparison.png)
+
+```{admonition} Question
+Select the true statement.
+
+The number of folds where the model using all features performs better than the
+model using only numerical features lies in the range:
+
+- a) [0, 3]: the model using all features is consistently worse
+- b) [4, 6]: both models are almost equivalent
+- c) [7, 10]: the model using all features is consistently better
+
 _Select a single answer_
 ```

From 6af31c6cad224ed5bb78862cbb95287518bb7723 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Tue, 24 Oct 2023 17:37:49 +0200
Subject: [PATCH 071/108] MAINT Remove sparse_threshold (#736)

Co-authored-by: ArturoAmorQ
---
 python_scripts/parameter_tuning_ex_02.py             | 1 -
 python_scripts/parameter_tuning_grid_search.py       | 1 -
 python_scripts/parameter_tuning_nested.py            | 1 -
 python_scripts/parameter_tuning_randomized_search.py | 1 -
 python_scripts/parameter_tuning_sol_02.py            | 1 -
 5 files changed, 5 deletions(-)

diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py
index 2d99c4bfb..cd0e5e3f0 100644
--- a/python_scripts/parameter_tuning_ex_02.py
+++ b/python_scripts/parameter_tuning_ex_02.py
@@ -53,7 +53,6 @@
         )
     ],
     remainder="passthrough",
-    sparse_threshold=0,
 )

 from sklearn.ensemble import HistGradientBoostingClassifier
diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py
index 1cf89cdd9..5219d0b51 100644
--- a/python_scripts/parameter_tuning_grid_search.py
+++ b/python_scripts/parameter_tuning_grid_search.py
@@ -89,7 +89,6 @@
 preprocessor = ColumnTransformer(
     [("cat_preprocessor", categorical_preprocessor, categorical_columns)],
     remainder="passthrough",
-    sparse_threshold=0,
 )

 # %% [markdown]
diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py
index 137825a01..fb7406274 100644
--- a/python_scripts/parameter_tuning_nested.py
+++ b/python_scripts/parameter_tuning_nested.py
@@ -56,7 +56,6 @@
         ("cat_preprocessor", categorical_preprocessor, categorical_columns),
     ],
     remainder="passthrough",
-    sparse_threshold=0,
 )

 # %%
diff --git a/python_scripts/parameter_tuning_randomized_search.py b/python_scripts/parameter_tuning_randomized_search.py
index c69573aad..b146b832d 100644
--- a/python_scripts/parameter_tuning_randomized_search.py
+++ b/python_scripts/parameter_tuning_randomized_search.py
@@ -73,7 +73,6 @@
 preprocessor = ColumnTransformer(
     [("cat_preprocessor", categorical_preprocessor, categorical_columns)],
     remainder="passthrough",
-    sparse_threshold=0,
 )

 # %%
diff --git a/python_scripts/parameter_tuning_sol_02.py b/python_scripts/parameter_tuning_sol_02.py
index ed9aeaf3f..1ea4cf572 100644
--- a/python_scripts/parameter_tuning_sol_02.py
+++ b/python_scripts/parameter_tuning_sol_02.py
@@ -47,7 +47,6 @@
         )
     ],
     remainder="passthrough",
-    sparse_threshold=0,
 )

 from sklearn.ensemble import HistGradientBoostingClassifier

From e6af8a4c08621565812c8f6b297c2d40d2ec4df6 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Wed, 25 Oct 2023 16:11:35 +0200
Subject: [PATCH 072/108] Merge linear model quizzes (#737)

---
jupyter-book/_toc.yml | 8 +- .../linear_models/linear_models_quiz_m4_01.md | 67 ++++++++++- .../linear_models/linear_models_quiz_m4_02.md | 59 +++------- .../linear_models/linear_models_quiz_m4_03.md | 110 +++++++++++++++--- .../linear_models/linear_models_quiz_m4_04.md | 78 ------------- .../linear_models/linear_models_quiz_m4_05.md | 36 ------ 6 files changed, 179 insertions(+), 179 deletions(-) delete mode 100644 jupyter-book/linear_models/linear_models_quiz_m4_04.md delete mode 100644 jupyter-book/linear_models/linear_models_quiz_m4_05.md diff --git a/jupyter-book/_toc.yml b/jupyter-book/_toc.yml index ac643fa2f..277bc22eb 100644 --- a/jupyter-book/_toc.yml +++ b/jupyter-book/_toc.yml @@ -90,13 +90,12 @@ parts: - file: linear_models/linear_models_intuitions_index sections: - file: linear_models/linear_models_slides - - file: linear_models/linear_models_quiz_m4_01 - file: python_scripts/linear_regression_without_sklearn - file: python_scripts/linear_models_ex_01 - file: python_scripts/linear_models_sol_01 - file: python_scripts/linear_regression_in_sklearn - file: python_scripts/logistic_regression - - file: linear_models/linear_models_quiz_m4_02 + - file: linear_models/linear_models_quiz_m4_01 - file: linear_models/linear_models_non_linear_index sections: - file: python_scripts/linear_regression_non_linear_link @@ -104,15 +103,14 @@ parts: - file: python_scripts/linear_models_sol_02 - file: python_scripts/linear_models_feature_engineering_classification.py - file: python_scripts/logistic_regression_non_linear - - file: linear_models/linear_models_quiz_m4_03 + - file: linear_models/linear_models_quiz_m4_02 - file: linear_models/linear_models_regularization_index sections: - file: linear_models/regularized_linear_models_slides - file: python_scripts/linear_models_regularization - - file: linear_models/linear_models_quiz_m4_04 - file: python_scripts/linear_models_ex_03 - file: python_scripts/linear_models_sol_03 - - file: linear_models/linear_models_quiz_m4_05 + - file: linear_models/linear_models_quiz_m4_03 - file: linear_models/linear_models_wrap_up_quiz - file: linear_models/linear_models_module_take_away - caption: Decision tree models diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_01.md b/jupyter-book/linear_models/linear_models_quiz_m4_01.md index 3023edf04..fe87a06ef 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_01.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_01.md @@ -17,10 +17,75 @@ _Select a single answer_ ```{admonition} Question Is it possible to get a perfect fit (zero prediction error on the training set) -with a linear classifier by itself on a non-linearly separable dataset? +with a linear classifier **by itself** on a non-linearly separable dataset? - a) yes - b) no _Select a single answer_ ``` + ++++ + +```{admonition} Question +If we fit a linear regression where `X` is a single column vector, how many +parameters our model will be made of? 
+ +- a) 1 +- b) 2 +- c) 3 + +_Select a single answer_ +``` + ++++ + +```{admonition} Question +If we train a scikit-learn `LinearRegression` with `X` being a single column +vector and `y` a vector, `coef_` and `intercept_` will be respectively: + +- a) an array of shape (1, 1) and a number +- b) an array of shape (1,) and an array of shape (1,) +- c) an array of shape (1, 1) and an array of shape (1,) +- d) an array of shape (1,) and a number + +_Select a single answer_ +``` + ++++ + +```{admonition} Question +The decision boundaries of a logistic regression model: + +- a) split classes using only one of the input features +- b) split classes using a combination of the input features +- c) often have curved shapes + +_Select a single answer_ +``` + ++++ + +```{admonition} Question +For a binary classification task, what is the shape of the array returned by the +`predict_proba` method for 10 input samples? + +- a) (10,) +- b) (10, 2) +- c) (2, 10) + +_Select a single answer_ +``` + ++++ + +```{admonition} Question +In logistic regression's `predict_proba` method in scikit-learn, which of the +following statements is true regarding the predicted probabilities? + +- a) The sum of probabilities across different classes for a given sample is always equal to 1.0. +- b) The sum of probabilities across all samples for a given class is always equal to 1.0. +- c) The sum of probabilities across all features for a given class is always equal to 1.0. + +_Select a single answer_ +``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_02.md b/jupyter-book/linear_models/linear_models_quiz_m4_02.md index e0c5f6c9d..fc6d273f3 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_02.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_02.md @@ -1,38 +1,14 @@ # โœ… Quiz M4.02 ```{admonition} Question -If we fit a linear regression where `X` is a single column vector, how many -parameters our model will be made of? -- a) 1 -- b) 2 -- c) 3 +Let us consider a pipeline that combines a polynomial feature extraction of +degree 2 and a linear regression model. Let us assume that the linear regression +coefficients are all non-zero and that the dataset contains a single feature. +Is the prediction function of this pipeline a straight line? -_Select a single answer_ -``` - -+++ - -```{admonition} Question -If we train a scikit-learn `LinearRegression` with `X` being a single column -vector and `y` a vector, `coef_` and `intercept_` will be respectively: - -- a) an array of shape (1, 1) and a number -- b) an array of shape (1,) and an array of shape (1,) -- c) an array of shape (1, 1) and an array of shape (1,) -- d) an array of shape (1,) and a number - -_Select a single answer_ -``` - -+++ - -```{admonition} Question -The decision boundaries of a logistic regression model: - -- a) split classes using only one of the input features -- b) split classes using a combination of the input features -- c) often have curved shapes +- a) yes +- b) no _Select a single answer_ ``` @@ -40,12 +16,13 @@ _Select a single answer_ +++ ```{admonition} Question -For a binary classification task, what is the shape of the array returned by the -`predict_proba` method for 10 input samples? +Fitting a linear regression where `X` has `n_features` columns and the target +is a single continuous vector, what is the respective type/shape of `coef_` +and `intercept_`? 
-- a) (10,) -- b) (10, 2) -- c) (2, 10) +- a) it is not possible to fit a linear regression in dimension higher than 2 +- b) array of shape (`n_features`,) and a float +- c) array of shape (1, `n_features`) and an array of shape (1,) _Select a single answer_ ``` @@ -53,12 +30,12 @@ _Select a single answer_ +++ ```{admonition} Question -In logistic regression's `predict_proba` method in scikit-learn, which of the -following statements is true regarding the predicted probabilities? +Combining (one or more) feature engineering transformers in a single pipeline: -- a) The sum of probabilities across different classes for a given sample is always equal to 1.0. -- b) The sum of probabilities across all samples for a given class is always equal to 1.0. -- c) The sum of probabilities across all features for a given class is always equal to 1.0. +- a) increases the expressivity of the model +- b) ensures that models extrapolate accurately regardless of the distribution of the data +- c) may require tuning additional hyperparameters +- d) inherently prevents any underfitting -_Select a single answer_ +_Select all answers that apply_ ``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_03.md b/jupyter-book/linear_models/linear_models_quiz_m4_03.md index 2989e3e66..979238076 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_03.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_03.md @@ -1,14 +1,24 @@ # โœ… Quiz M4.03 ```{admonition} Question +Which of the following estimators can solve linear regression problems? -Let us consider a pipeline that combines a polynomial feature extraction of -degree 2 and a linear regression model. Let us assume that the linear regression -coefficients are all non-zero and that the dataset contains a single feature. -Is the prediction function of this pipeline a straight line? +- a) sklearn.linear_model.LinearRegression +- b) sklearn.linear_model.LogisticRegression +- c) sklearn.linear_model.Ridge -- a) yes -- b) no +_Select all answers that apply_ +``` + ++++ + +```{admonition} Question +Regularization allows: + +- a) to create a model robust to outliers (samples that differ widely from + other observations) +- b) to reduce overfitting by forcing the weights to stay close to zero +- c) to reduce underfitting by making the problem linearly separable _Select a single answer_ ``` @@ -16,26 +26,90 @@ _Select a single answer_ +++ ```{admonition} Question -Fitting a linear regression where `X` has `n_features` columns and the target -is a single continuous vector, what is the respective type/shape of `coef_` -and `intercept_`? +A ridge model is: -- a) it is not possible to fit a linear regression in dimension higher than 2 -- b) array of shape (`n_features`,) and a float -- c) array of shape (1, `n_features`) and an array of shape (1,) +- a) the same as linear regression with penalized weights +- b) the same as logistic regression with penalized weights +- c) a linear model +- d) a non linear model -_Select a single answer_ +_Select all answers that apply_ +``` + ++++ + +```{admonition} Question +Assume that a data scientist has prepared a train/test split and plans to use +the test for the final evaluation of a `Ridge` model. 
The parameter `alpha` of +the `Ridge` model: + +- a) is internally tuned when calling `fit` on the train set +- b) should be tuned by running cross-validation on a **train set** +- c) should be tuned by running cross-validation on a **test set** +- d) must be a positive number + +_Select all answers that apply_ ``` +++ ```{admonition} Question -Combining (one or more) feature engineering transformers in a single pipeline: +Scaling the data before fitting a model: -- a) increases the expressivity of the model -- b) ensures that models extrapolate accurately regardless of the distribution of the data -- c) may require tuning additional hyperparameters -- d) inherently prevents any underfitting +- a) is often useful for regularized linear models +- b) is always necessary for regularized linear models +- c) may speed-up fitting +- d) has no impact on the optimal choice of the value of a regularization parameter _Select all answers that apply_ ``` + ++++ + +```{admonition} Question +The effect of increasing the regularization strength in a ridge model is to: + +- a) shrink all weights towards zero +- b) make all weights equal +- c) set a subset of the weights to exactly zero +- d) constrain all the weights to be positive + +_Select all answers that apply_ +``` + ++++ + +```{admonition} Question +The parameter `C` in a logistic regression is: + +- a) similar to the parameter `alpha` in a ridge regressor +- b) similar to `1 / alpha` where `alpha` is the parameter of a ridge regressor +- c) not controlling the regularization + +_Select a single answer_ +``` + ++++ + +The [LogisticRegression documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) +says: + +> C : Inverse of regularization strength; smaller values specify stronger regularization. + +The [Ridge documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) +says: + +> alpha : Regularization strength; Larger values specify stronger regularization. + ++++ + +```{admonition} Question +In logistic regression, increasing the regularization strength (by +decreasing the value of `C`) makes the model: + +- a) more likely to overfit to the training data +- b) more confident: the values returned by `predict_proba` are closer to 0 or 1 +- c) less complex, potentially underfitting the training data + +_Select a single answer_ +``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_04.md b/jupyter-book/linear_models/linear_models_quiz_m4_04.md deleted file mode 100644 index 592dec6dd..000000000 --- a/jupyter-book/linear_models/linear_models_quiz_m4_04.md +++ /dev/null @@ -1,78 +0,0 @@ -# โœ… Quiz M4.04 - -```{admonition} Question -Which of the following estimators can solve linear regression problems? 
- -- a) sklearn.linear_model.LinearRegression -- b) sklearn.linear_model.LogisticRegression -- c) sklearn.linear_model.Ridge - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -Regularization allows: - -- a) to create a model robust to outliers (samples that differ widely from - other observations) -- b) to reduce overfitting by forcing the weights to stay close to zero -- c) to reduce underfitting by making the problem linearly separable - -_Select a single answer_ -``` - -+++ - -```{admonition} Question -A ridge model is: - -- a) the same as linear regression with penalized weights -- b) the same as logistic regression with penalized weights -- c) a linear model -- d) a non linear model - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -Assume that a data scientist has prepared a train/test split and plans to use -the test for the final evaluation of a `Ridge` model. The parameter `alpha` of -the `Ridge` model: - -- a) is internally tuned when calling `fit` on the train set -- b) should be tuned by running cross-validation on a **train set** -- c) should be tuned by running cross-validation on a **test set** -- d) must be a positive number - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -Scaling the data before fitting a model: - -- a) is often useful for regularized linear models -- b) is always necessary for regularized linear models -- c) may speed-up fitting -- d) has no impact on the optimal choice of the value of a regularization parameter - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -The effect of increasing the regularization strength in a ridge model is to: - -- a) shrink all weights towards zero -- b) make all weights equal -- c) set a subset of the weights to exactly zero -- d) constrain all the weights to be positive - -_Select all answers that apply_ -``` diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_05.md b/jupyter-book/linear_models/linear_models_quiz_m4_05.md deleted file mode 100644 index 45ce1f9b8..000000000 --- a/jupyter-book/linear_models/linear_models_quiz_m4_05.md +++ /dev/null @@ -1,36 +0,0 @@ -# โœ… Quiz M4.05 - -```{admonition} Question -By default, a [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) in scikit-learn applies: - -- a) no penalty -- b) a penalty that shrinks the magnitude of the weights towards zero (also called "l2 penalty") -- c) a penalty that ensures all weights are equal - -_Select a single answer_ -``` - -+++ - -```{admonition} Question -The parameter `C` in a logistic regression is: - -- a) similar to the parameter `alpha` in a ridge regressor -- b) similar to `1 / alpha` where `alpha` is the parameter of a ridge regressor -- c) not controlling the regularization - -_Select a single answer_ -``` - -+++ - -```{admonition} Question -In logistic regression, increasing the regularization strength (by -decreasing the value of `C`) makes the model: - -- a) more likely to overfit to the training data -- b) more confident: the values returned by `predict_proba` are closer to 0 or 1 -- c) less complex, potentially underfitting the training data - -_Select a single answer_ -``` From f0ca8ed9c7b36ae25988ad614ef033fa086e8eef Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Wed, 25 Oct 2023 16:41:41 +0200 Subject: [PATCH 073/108] FIX Remove solution being shown (#738) --- .../linear_models/linear_models_quiz_m4_03.md | 
12 ------------
 1 file changed, 12 deletions(-)

diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_03.md b/jupyter-book/linear_models/linear_models_quiz_m4_03.md
index 979238076..672f04e58 100644
--- a/jupyter-book/linear_models/linear_models_quiz_m4_03.md
+++ b/jupyter-book/linear_models/linear_models_quiz_m4_03.md
@@ -91,18 +91,6 @@ _Select a single answer_

 +++

-The [LogisticRegression documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
-says:
-
-> C : Inverse of regularization strength; smaller values specify stronger regularization.
-
-The [Ridge documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)
-says:
-
-> alpha : Regularization strength; Larger values specify stronger regularization.
-
-+++
-
 ```{admonition} Question
 In logistic regression, increasing the regularization strength (by
 decreasing the value of `C`) makes the model:

From d51a62b6954737380aa00b2b57219f1ef199030b Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Thu, 26 Oct 2023 12:02:42 +0200
Subject: [PATCH 074/108] Update notebooks (#740)

Co-authored-by: ArturoAmorQ
---
 notebooks/linear_models_sol_03.ipynb | 298 +++++++++++++++---
 notebooks/parameter_tuning_ex_02.ipynb | 1 -
 notebooks/parameter_tuning_grid_search.ipynb | 1 -
 notebooks/parameter_tuning_nested.ipynb | 1 -
 .../parameter_tuning_randomized_search.ipynb | 1 -
 notebooks/parameter_tuning_sol_02.ipynb | 1 -
 notebooks/trees_classification.ipynb | 129 +++++---
 notebooks/trees_sol_01.ipynb | 156 +++++++--
 8 files changed, 461 insertions(+), 127 deletions(-)

diff --git a/notebooks/linear_models_sol_03.ipynb b/notebooks/linear_models_sol_03.ipynb
index 0eabeeb54..178514087 100644
--- a/notebooks/linear_models_sol_03.ipynb
+++ b/notebooks/linear_models_sol_03.ipynb
@@ -2,18 +2,25 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
   "source": [
    "# \ud83d\udcc3 Solution for Exercise M4.03\n",
    "\n",
-    "The parameter `penalty` can control the **type** of regularization to use,\n",
-    "whereas the regularization **strength** is set using the parameter `C`.\n",
-    "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n",
-    "this exercise, we ask you to train a logistic regression classifier using the\n",
-    "`penalty=\"l2\"` regularization (which happens to be the default in\n",
-    "scikit-learn) to find by yourself the effect of the parameter `C`.\n",
+    "In the previous Module we tuned the hyperparameter `C` of the logistic\n",
+    "regression without mentioning that it controls the regularization strength.\n",
+    "Later, on the slides on \ud83c\udfa5 **Intuitions on regularized linear models** we\n",
+    "mentioned that a small `C` provides a more regularized model, whereas a\n",
+    "non-regularized model is obtained with an infinitely large value of `C`.\n",
+    "Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`\n",
+    "model.\n",
+    "\n",
+    "In this exercise, we ask you to train a logistic regression classifier using\n",
+    "different values of the parameter `C` to find its effects by yourself.\n",
     "\n",
-    "We start by loading the dataset."
+    "We start by loading the dataset. We only keep the Adelie and Chinstrap classes\n",
+    "to keep the discussion simple."
] }, { @@ -36,7 +43,6 @@ "import pandas as pd\n", "\n", "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", - "# only keep the Adelie and Chinstrap classes\n", "penguins = (\n", " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", ")\n", @@ -53,7 +59,9 @@ "source": [ "from sklearn.model_selection import train_test_split\n", "\n", - "penguins_train, penguins_test = train_test_split(penguins, random_state=0)\n", + "penguins_train, penguins_test = train_test_split(\n", + " penguins, random_state=0, test_size=0.4\n", + ")\n", "\n", "data_train = penguins_train[culmen_columns]\n", "data_test = penguins_test[culmen_columns]\n", @@ -66,7 +74,67 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's create our predictive model." + "We define a function to help us fit a given `model` and plot its decision\n", + "boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging\n", + "colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped\n", + "to the white color. Equivalently, the darker the color, the closer the\n", + "predicted probability is to 0 or 1 and the more confident the classifier is in\n", + "its predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.inspection import DecisionBoundaryDisplay\n", + "\n", + "\n", + "def plot_decision_boundary(model):\n", + " model.fit(data_train, target_train)\n", + " accuracy = model.score(data_test, target_test)\n", + "\n", + " disp = DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data_train,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"pcolormesh\",\n", + " cmap=\"RdBu_r\",\n", + " alpha=0.8,\n", + " vmin=0.0,\n", + " vmax=1.0,\n", + " )\n", + " DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data_train,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"contour\",\n", + " linestyles=\"--\",\n", + " linewidths=1,\n", + " alpha=0.8,\n", + " levels=[0.5],\n", + " ax=disp.ax_,\n", + " )\n", + " sns.scatterplot(\n", + " data=penguins_train,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=[\"tab:blue\", \"tab:red\"],\n", + " ax=disp.ax_,\n", + " )\n", + " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", + " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now create our predictive model." ] }, { @@ -79,19 +147,24 @@ "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "\n", - "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", - ")" + "logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Given the following candidates for the `C` parameter, find out the impact of\n", - "`C` on the classifier decision boundary. You can use\n", - "`sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the\n", - "decision function boundary." 
+ "## Influence of the parameter `C` on the decision boundary\n", + "\n", + "Given the following candidates for the `C` parameter and the\n", + "`plot_decision_boundary` function, find out the impact of `C` on the\n", + "classifier's decision boundary.\n", + "\n", + "- How does the value of `C` impact the confidence on the predictions?\n", + "- How does it impact the underfit/overfit trade-off?\n", + "- How does it impact the position and orientation of the decision boundary?\n", + "\n", + "Try to give an interpretation on the reason for such behavior." ] }, { @@ -100,41 +173,75 @@ "metadata": {}, "outputs": [], "source": [ - "Cs = [0.01, 0.1, 1, 10]\n", + "Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6]\n", "\n", "# solution\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.inspection import DecisionBoundaryDisplay\n", - "\n", "for C in Cs:\n", " logistic_regression.set_params(logisticregression__C=C)\n", - " logistic_regression.fit(data_train, target_train)\n", - " accuracy = logistic_regression.score(data_test, target_test)\n", + " plot_decision_boundary(logistic_regression)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ "\n", - " DecisionBoundaryDisplay.from_estimator(\n", - " logistic_regression,\n", - " data_test,\n", - " response_method=\"predict\",\n", - " cmap=\"RdBu_r\",\n", - " alpha=0.5,\n", - " )\n", - " sns.scatterplot(\n", - " data=penguins_test,\n", - " x=culmen_columns[0],\n", - " y=culmen_columns[1],\n", - " hue=target_column,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - " )\n", - " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", - " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" + "On this series of plots we can observe several important points. Regarding the\n", + "confidence on the predictions:\n", + "\n", + "- For low values of `C` (strong regularization), the classifier is less\n", + " confident in its predictions. We are enforcing a **spread sigmoid**.\n", + "- For high values of `C` (weak regularization), the classifier is more\n", + " confident: the areas with dark blue (very confident in predicting \"Adelie\")\n", + " and dark red (very confident in predicting \"Chinstrap\") nearly cover the\n", + " entire feature space. We are enforcing a **steep sigmoid**.\n", + "\n", + "To answer the next question, think that misclassified data points are more\n", + "costly when the classifier is more confident on the decision. Decision rules\n", + "are mostly driven by avoiding such cost. From the previous observations we can\n", + "then deduce that:\n", + "\n", + "- The smaller the `C` (the stronger the regularization), the lower the cost\n", + " of a misclassification. As more data points lay in the low-confidence\n", + " zone, the more the decision rules are influenced almost uniformly by all\n", + " the data points. This leads to a less expressive model, which may underfit.\n", + "- The higher the value of `C` (the weaker the regularization), the more the\n", + " decision is influenced by a few training points very close to the boundary,\n", + " where decisions are costly. 
Remember that models may overfit if the number\n", + " of samples in the training set is too small, as at least a minimum of\n", + " samples is needed to average the noise out.\n", + "\n", + "The orientation is the result of two factors: minimizing the number of\n", + "misclassified training points with high confidence and their distance to the\n", + "decision boundary (notice how the contour line tries to align with the most\n", + "misclassified data points in the dark-colored zone). This is closely related\n", + "to the value of the weights of the model, which is explained in the next part\n", + "of the exercise.\n", + "\n", + "Finally, for small values of `C` the position of the decision boundary is\n", + "affected by the class imbalance: when `C` is near zero, the model predicts the\n", + "majority class (as seen in the training set) everywhere in the feature space.\n", + "In our case, there are approximately two times more \"Adelie\" than \"Chinstrap\"\n", + "penguins. This explains why the decision boundary is shifted to the right when\n", + "`C` gets smaller. Indeed, the most regularized model predicts light blue\n", + "almost everywhere in the feature space." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Look at the impact of the `C` hyperparameter on the magnitude of the weights." + "## Impact of the regularization on the weights\n", + "\n", + "Look at the impact of the `C` hyperparameter on the magnitude of the weights.\n", + "**Hint**: You can [access pipeline\n", + "steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)\n", + "by name or position. Then you can query the attributes of that step such as\n", + "`coef_`." ] }, { @@ -144,12 +251,12 @@ "outputs": [], "source": [ "# solution\n", - "weights_ridge = []\n", + "lr_weights = []\n", "for C in Cs:\n", " logistic_regression.set_params(logisticregression__C=C)\n", " logistic_regression.fit(data_train, target_train)\n", " coefs = logistic_regression[-1].coef_[0]\n", - " weights_ridge.append(pd.Series(coefs, index=culmen_columns))" + " lr_weights.append(pd.Series(coefs, index=culmen_columns))" ] }, { @@ -162,8 +269,8 @@ }, "outputs": [], "source": [ - "weights_ridge = pd.concat(weights_ridge, axis=1, keys=[f\"C: {C}\" for C in Cs])\n", - "weights_ridge.plot.barh()\n", + "lr_weights = pd.concat(lr_weights, axis=1, keys=[f\"C: {C}\" for C in Cs])\n", + "lr_weights.plot.barh()\n", "_ = plt.title(\"LogisticRegression weights depending of C\")" ] }, @@ -175,14 +282,101 @@ ] }, "source": [ - "We see that a small `C` will shrink the weights values toward zero. It means\n", - "that a small `C` provides a more regularized model. Thus, `C` is the inverse\n", - "of the `alpha` coefficient in the `Ridge` model.\n", "\n", - "Besides, with a strong penalty (i.e. small `C` value), the weight of the\n", - "feature \"Culmen Depth (mm)\" is almost zero. It explains why the decision\n", + "As small `C` provides a more regularized model, it shrinks the weights values\n", + "toward zero, as in the `Ridge` model.\n", + "\n", + "In particular, with a strong penalty (e.g. `C = 0.01`), the weight of the feature\n", + "named \"Culmen Depth (mm)\" is almost zero. It explains why the decision\n", "separation in the plot is almost perpendicular to the \"Culmen Length (mm)\"\n", - "feature." + "feature.\n", + "\n", + "For even stronger penalty strengths (e.g. `C = 1e-6`), the weights of both\n", + "features are almost zero. 
It explains why the decision separation in the plot\n",
    "is almost constant in the feature space: the predicted probability is only\n",
    "based on the intercept parameter of the model (which is never regularized)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Impact of the regularization with non-linear feature engineering\n",
    "\n",
    "Use the `plot_decision_boundary` function to repeat the experiment using a\n",
    "non-linear feature engineering pipeline. For such purpose, insert\n",
    "`Nystroem(kernel=\"rbf\", gamma=1, n_components=100)` between the\n",
    "`StandardScaler` and the `LogisticRegression` steps.\n",
    "\n",
    "- Does the value of `C` still impact the position of the decision boundary and\n",
    "  the confidence of the model?\n",
    "- What can you say about the impact of `C` on the underfitting vs overfitting\n",
    "  trade-off?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.kernel_approximation import Nystroem\n",
    "\n",
    "# solution\n",
    "classifier = make_pipeline(\n",
    "    StandardScaler(),\n",
    "    Nystroem(kernel=\"rbf\", gamma=1.0, n_components=100, random_state=0),\n",
    "    LogisticRegression(penalty=\"l2\", max_iter=1000),\n",
    ")\n",
    "\n",
    "for C in Cs:\n",
    "    classifier.set_params(logisticregression__C=C)\n",
    "    plot_decision_boundary(classifier)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": [
     "solution"
    ]
   },
   "source": [
    "\n",
    "- For the lowest values of `C`, the overall pipeline underfits: it predicts\n",
    "  the majority class everywhere, as previously.\n",
    "- When `C` increases, the model starts to predict some data points from the\n",
    "  \"Chinstrap\" class but the model is not very confident anywhere in the\n",
    "  feature space.\n",
    "- The decision boundary is no longer a straight line: the linear model is now\n",
    "  classifying in the 100-dimensional feature space created by the `Nystroem`\n",
    "  transformer. As a result, the decision boundary induced by the overall\n",
    "  pipeline is now expressive enough to wrap around the minority class.\n",
    "- For `C = 1` in particular, it finds a smooth red blob around most of the\n",
    "  \"Chinstrap\" data points. When moving away from the data points, the model is\n",
    "  less confident in its predictions and again tends to predict the majority\n",
    "  class according to the proportion in the training set.\n",
    "- For higher values of `C`, the model starts to overfit: it is very confident\n",
    "  in its predictions almost everywhere, but it should not be trusted: the\n",
    "  model also makes a larger number of mistakes on the test set (not shown in\n",
    "  the plot) while adopting a very curvy decision boundary to attempt fitting\n",
    "  all the training points, including the noisy ones at the frontier between\n",
    "  the two classes. This makes the decision boundary very sensitive to the\n",
    "  sampling of the training set and as a result, it does not generalize well in\n",
    "  that region. This is confirmed by the (slightly) lower accuracy on the test\n",
    "  set.\n",
    "\n",
    "Finally, we can also note that the linear model on the raw features was as\n",
    "good or better than the best model using non-linear feature engineering. 
So in\n",
    "this case, we did not really need this extra complexity in our pipeline.\n",
    "**Simpler is better!**\n",
    "\n",
    "So to conclude, when using non-linear feature engineering, it is often\n",
    "possible to make the pipeline overfit, even if the original feature space is\n",
    "low-dimensional. As a result, it is important to tune the regularization\n",
    "parameter in conjunction with the parameters of the transformers (e.g. tuning\n",
    "`gamma` would be important here). This has a direct impact on the certainty of\n",
    "the predictions."
   ]
  }
 ],
diff --git a/notebooks/parameter_tuning_ex_02.ipynb b/notebooks/parameter_tuning_ex_02.ipynb
index 46345e86b..2aa096d5c 100644
--- a/notebooks/parameter_tuning_ex_02.ipynb
+++ b/notebooks/parameter_tuning_ex_02.ipynb
@@ -57,7 +57,6 @@
 "    )\n",
 "   ],\n",
 "   remainder=\"passthrough\",\n",
-"   sparse_threshold=0,\n",
 ")\n",
 "\n",
 "from sklearn.ensemble import HistGradientBoostingClassifier\n",
diff --git a/notebooks/parameter_tuning_grid_search.ipynb b/notebooks/parameter_tuning_grid_search.ipynb
index d26aff083..e0912cb54 100644
--- a/notebooks/parameter_tuning_grid_search.ipynb
+++ b/notebooks/parameter_tuning_grid_search.ipynb
@@ -157,7 +157,6 @@
 "preprocessor = ColumnTransformer(\n",
 "    [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
 "    remainder=\"passthrough\",\n",
-"    sparse_threshold=0,\n",
 ")"
diff --git a/notebooks/parameter_tuning_nested.ipynb b/notebooks/parameter_tuning_nested.ipynb
index b7c14a3bf..efc43173d 100644
--- a/notebooks/parameter_tuning_nested.ipynb
+++ b/notebooks/parameter_tuning_nested.ipynb
@@ -70,7 +70,6 @@
 "        (\"cat_preprocessor\", categorical_preprocessor, categorical_columns),\n",
 "    ],\n",
 "    remainder=\"passthrough\",\n",
-"    sparse_threshold=0,\n",
 ")"
diff --git a/notebooks/parameter_tuning_randomized_search.ipynb b/notebooks/parameter_tuning_randomized_search.ipynb
index 11bfac389..3189e9301 100644
--- a/notebooks/parameter_tuning_randomized_search.ipynb
+++ b/notebooks/parameter_tuning_randomized_search.ipynb
@@ -121,7 +121,6 @@
 "preprocessor = ColumnTransformer(\n",
 "    [(\"cat_preprocessor\", categorical_preprocessor, categorical_columns)],\n",
 "    remainder=\"passthrough\",\n",
-"    sparse_threshold=0,\n",
 ")"
diff --git a/notebooks/parameter_tuning_sol_02.ipynb b/notebooks/parameter_tuning_sol_02.ipynb
index bbcb42f88..58ef6a501 100644
--- a/notebooks/parameter_tuning_sol_02.ipynb
+++ b/notebooks/parameter_tuning_sol_02.ipynb
@@ -57,7 +57,6 @@
 "    )\n",
 "   ],\n",
 "   remainder=\"passthrough\",\n",
-"   sparse_threshold=0,\n",
 ")\n",
 "\n",
 "from sklearn.ensemble import HistGradientBoostingClassifier\n",
diff --git a/notebooks/trees_classification.ipynb b/notebooks/trees_classification.ipynb
index dfcae831c..22eae1fca 100644
--- a/notebooks/trees_classification.ipynb
+++ b/notebooks/trees_classification.ipynb
@@ -6,8 +6,11 @@
 "source": [
    "# Build a classification decision tree\n",
    "\n",
-    "We will illustrate how decision tree fit data with a simple classification\n",
-    "problem using the penguins dataset."
+    "In this notebook we illustrate decision trees in a multiclass classification\n",
+    "problem by using the penguins dataset with 2 features and 3 classes.\n",
+    "\n",
+    "For the sake of simplicity, we focus the discussion on the hyperparameter\n",
+    "`max_depth`, which controls the maximal depth of the decision tree."
]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Besides, we split the data into two subsets to investigate how trees will\n",
-    "predict values based on an out-of-samples dataset."
+    "First, we split the data into two subsets to investigate how trees predict\n",
+    "values based on unseen data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
+    "In a previous notebook, we learnt that linear classifiers define a linear\n",
+    "separation to split classes using a linear combination of the input features.\n",
+    "In our 2-dimensional feature space, it means that a linear classifier finds\n",
+    "the oblique lines that best separate the classes. This is still true for\n",
+    "multiclass problems, except that more than one line is fitted. We can use\n",
+    "`DecisionBoundaryDisplay` to plot the decision boundaries learnt by the\n",
+    "classifier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
+    "import matplotlib as mpl\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.inspection import DecisionBoundaryDisplay\n",
    "\n",
+    "tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)\n",
    "# create a palette to be used in the scatterplot\n",
-    "palette = [\"tab:red\", \"tab:blue\", \"black\"]\n",
+    "palette = [\"tab:blue\", \"tab:green\", \"tab:orange\"]\n",
    "\n",
-    "DecisionBoundaryDisplay.from_estimator(\n",
-    "    linear_model, data_train, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n",
+    "dbd = DecisionBoundaryDisplay.from_estimator(\n",
+    "    linear_model,\n",
+    "    data_train,\n",
+    "    response_method=\"predict\",\n",
+    "    cmap=\"tab10\",\n",
+    "    norm=tab10_norm,\n",
+    "    alpha=0.5,\n",
    ")\n",
    "sns.scatterplot(\n",
    "    data=penguins,\n",
@@ -119,7 +126,7 @@
 "source": [
    "We see that the lines are a combination of the input features since they are\n",
    "not perpendicular to a specific axis. Indeed, this is due to the model\n",
-    "parametrization that we saw in the previous notebook, controlled by the\n",
+    "parametrization that we saw in some previous notebooks, i.e. 
controlled by the\n",
    "model's weights and intercept.\n",
    "\n",
    "Besides, it seems that the linear model would be a good candidate for such\n",
    "problem as it gives good accuracy."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Unlike linear models, decision trees are non-parametric models: they are not\n",
-    "controlled by a mathematical decision function and do not have weights or\n",
-    "intercept to be optimized.\n",
-    "\n",
-    "Indeed, decision trees will partition the space by considering a single\n",
-    "feature at a time. Let's illustrate this behaviour by having a decision tree\n",
-    "make a single split to partition the feature space."
+    "Unlike linear models, the decision rule for the decision tree is not\n",
+    "controlled by a simple linear combination of weights and feature values.\n",
+    "\n",
+    "Instead, the decision rules of trees can be defined in terms of\n",
+    "- the feature index used at each split node of the tree,\n",
+    "- the threshold value used at each split node,\n",
+    "- the value to predict at each leaf node.\n",
+    "\n",
+    "Decision trees partition the feature space by considering a single feature at\n",
+    "a time. The number of splits depends on both the hyperparameters and the\n",
+    "number of data points in the training set: the more flexible the\n",
+    "hyperparameters and the larger the training set, the more splits can be\n",
+    "considered by the model.\n",
+    "\n",
+    "As the number of adjustable components taking part in the decision rule\n",
+    "changes with the training size, we say that decision trees are non-parametric\n",
+    "models.\n",
+    "\n",
+    "Let's now visualize the shape of the decision boundary of a decision tree when\n",
+    "we set the `max_depth` hyperparameter to only allow for a single split to\n",
+    "partition the feature space."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DecisionBoundaryDisplay.from_estimator(\n",
-    "    tree, data_train, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n",
+    "    tree,\n",
+    "    data_train,\n",
+    "    response_method=\"predict\",\n",
+    "    cmap=\"tab10\",\n",
+    "    norm=tab10_norm,\n",
+    "    alpha=0.5,\n",
    ")\n",
    "sns.scatterplot(\n",
    "    data=penguins,\n",
 "source": [
    "The partition found by the algorithm separates the data along the axis\n",
    "\"Culmen Depth\", discarding the feature \"Culmen Length\". Thus, it highlights\n",
-    "that a decision tree does not use a combination of feature when making a\n",
-    "split. We can look more in depth at the tree structure."
+    "that a decision tree does not use a combination of features when making a\n",
+    "single split. We can look more in depth at the tree structure."
 "dataset was subdivided into 2 sets based on the culmen depth (inferior or\n",
 "superior to 16.45 mm).\n",
 "\n",
-    "This partition of the dataset minimizes the class diversities in each\n",
+    "This partition of the dataset minimizes the class diversity in each\n",
 "sub-partition. This measure is also known as a **criterion**, and is a\n",
 "settable parameter.\n",
 "\n",
 "If we look more closely at the partition, we see that the sample superior to\n",
-    "16.45 belongs mainly to the Adelie class. Looking at the values, we indeed\n",
-    "observe 103 Adelie individuals in this space. We also count 52 Chinstrap\n",
-    "samples and 6 Gentoo samples. We can make similar interpretation for the\n",
+    "16.45 belongs mainly to the \"Adelie\" class. Looking at the values, we indeed\n",
+    "observe 103 \"Adelie\" individuals in this space. 
We also count 52 \"Chinstrap\"\n",
+    "samples and 6 \"Gentoo\" samples. We can make a similar interpretation for the\n",
 "partition defined by a threshold inferior to 16.45mm. In this case, the most\n",
-    "represented class is the Gentoo species.\n",
+    "represented class is the \"Gentoo\" species.\n",
 "\n",
 "Let's see how our tree would work as a predictor. Let's start with a case\n",
 "where the culmen depth is inferior to the threshold."
-    "sample_1 = pd.DataFrame({\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [15]})\n",
-    "tree.predict(sample_1)"
+    "test_penguin_1 = pd.DataFrame(\n",
+    "    {\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [15]}\n",
+    ")\n",
+    "tree.predict(test_penguin_1)"
-    "The class predicted is the Gentoo. We can now check what happens if we pass a\n",
+    "The class predicted is the \"Gentoo\". We can now check what happens if we pass a\n",
 "culmen depth superior to the threshold."
-    "sample_2 = pd.DataFrame({\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [17]})\n",
-    "tree.predict(sample_2)"
+    "test_penguin_2 = pd.DataFrame(\n",
+    "    {\"Culmen Length (mm)\": [0], \"Culmen Depth (mm)\": [17]}\n",
+    ")\n",
+    "tree.predict(test_penguin_2)"
-    "In this case, the tree predicts the Adelie specie.\n",
+    "In this case, the tree predicts the \"Adelie\" species.\n",
 "\n",
-    "Thus, we can conclude that a decision tree classifier will predict the most\n",
+    "Thus, we can conclude that a decision tree classifier predicts the most\n",
 "represented class within a partition.\n",
 "\n",
 "During the training, we have a count of samples in each partition, we can also\n",
 "compute the probability of belonging to a specific class within this\n",
 "partition."
-    "y_pred_proba = tree.predict_proba(sample_2)\n",
+    "y_pred_proba = tree.predict_proba(test_penguin_2)\n",
 "y_proba_class_0 = pd.Series(y_pred_proba[0], index=tree.classes_)"
 "It is also important to note that the culmen length has been disregarded for\n",
-    "the moment. It means that whatever the value given, it will not be used during\n",
-    "the prediction."
+    "the moment. It means that regardless of its value, it is not used during the\n",
+    "prediction."
-    "sample_3 = pd.DataFrame(\n",
+    "test_penguin_3 = pd.DataFrame(\n",
 "    {\"Culmen Length (mm)\": [10_000], \"Culmen Depth (mm)\": [17]}\n",
 ")\n",
-    "tree.predict_proba(sample_3)"
+    "tree.predict_proba(test_penguin_3)"
-    "Indeed, it is not a surprise. We saw earlier that a single feature will not be\n",
-    "able to separate all three species. However, from the previous analysis we saw\n",
-    "that by using both features we should be able to get fairly good results.\n",
+    "Indeed, it is not a surprise. We saw earlier that a single feature is not able\n",
+    "to separate all three species: it underfits. However, from the previous\n",
+    "analysis we saw that by using both features we should be able to get fairly\n",
+    "good results.\n",
 "\n",
-    "In the next exercise, you will increase the size of the tree depth. You will\n",
-    "get intuitions on how the space partitioning is repeated over time."
+ "In the next exercise, you will increase the tree depth to get an intuition on\n", + "how such a parameter affects the space partitioning." ] } ], diff --git a/notebooks/trees_sol_01.ipynb b/notebooks/trees_sol_01.ipynb index c126f23fa..2ce0c1b8b 100644 --- a/notebooks/trees_sol_01.ipynb +++ b/notebooks/trees_sol_01.ipynb @@ -6,16 +6,13 @@ "source": [ "# \ud83d\udcc3 Solution for Exercise M5.01\n", "\n", - "In the previous notebook, we showed how a tree with a depth of 1 level was\n", - "working. The aim of this exercise is to repeat part of the previous experiment\n", - "for a depth with 2 levels to show how the process of partitioning is repeated\n", - "over time.\n", + "In the previous notebook, we showed how a tree with 1 level depth works. The\n", + "aim of this exercise is to repeat part of the previous experiment for a tree\n", + "with 2 levels depth to show how such parameter affects the feature space\n", + "partitioning.\n", "\n", - "Before to start, we will:\n", - "\n", - "* load the dataset;\n", - "* split the dataset into training and testing dataset;\n", - "* define the function to show the classification decision function." + "We first load the penguins dataset and split it into a training and a testing\n", + "sets:" ] }, { @@ -61,10 +58,7 @@ "metadata": {}, "source": [ "Create a decision tree classifier with a maximum depth of 2 levels and fit the\n", - "training data. Once this classifier trained, plot the data and the decision\n", - "boundary to see the benefit of increasing the depth. To plot the decision\n", - "boundary, you should import the class `DecisionBoundaryDisplay` from the\n", - "module `sklearn.inspection` as shown in the previous course notebook." + "training data." ] }, { @@ -80,24 +74,49 @@ "tree.fit(data_train, target_train)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now plot the data and the decision boundary of the trained classifier to see\n", + "the effect of increasing the depth of the tree.\n", + "\n", + "Hint: Use the class `DecisionBoundaryDisplay` from the module\n", + "`sklearn.inspection` as shown in previous course notebooks.\n", + "\n", + "
\n", + "

Warning

\n", + "

At this time, it is not possible to use response_method=\"predict_proba\" for\n", + "multiclass problems. This is a planned feature for a future version of\n", + "scikit-learn. In the mean time, you can use response_method=\"predict\"\n", + "instead.

\n", + "
" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [ - "solution" - ] - }, + "metadata": {}, "outputs": [], "source": [ + "# solution\n", "import matplotlib.pyplot as plt\n", + "import matplotlib as mpl\n", "import seaborn as sns\n", "\n", "from sklearn.inspection import DecisionBoundaryDisplay\n", "\n", - "palette = [\"tab:red\", \"tab:blue\", \"black\"]\n", + "\n", + "tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)\n", + "\n", + "palette = [\"tab:blue\", \"tab:green\", \"tab:orange\"]\n", "DecisionBoundaryDisplay.from_estimator(\n", - " tree, data_train, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", + " tree,\n", + " data_train,\n", + " response_method=\"predict\",\n", + " cmap=\"tab10\",\n", + " norm=tab10_norm,\n", + " alpha=0.5,\n", ")\n", "ax = sns.scatterplot(\n", " data=penguins,\n", @@ -184,7 +203,102 @@ "We predict an Adelie penguin if the feature value is below the threshold,\n", "which is not surprising since this partition was almost pure. If the feature\n", "value is above the threshold, we predict the Gentoo penguin, the class that is\n", - "most probable." + "most probable.\n", + "\n", + "## (Estimated) predicted probabilities in multi-class problems\n", + "\n", + "For those interested, one can further try to visualize the output of\n", + "`predict_proba` for a multiclass problem using `DecisionBoundaryDisplay`,\n", + "except that for a K-class problem you have K probability outputs for each\n", + "data point. Visualizing all these on a single plot can quickly become tricky\n", + "to interpret. It is then common to instead produce K separate plots, one for\n", + "each class, in a one-vs-rest (or one-vs-all) fashion.\n", + "\n", + "For example, in the plot below, the first plot on the left shows in yellow the\n", + "certainty on classifying a data point as belonging to the \"Adelie\" class. In\n", + "the same plot, the spectre from green to purple represents the certainty of\n", + "**not** belonging to the \"Adelie\" class. The same logic applies to the other\n", + "plots in the figure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "xx = np.linspace(30, 60, 100)\n", + "yy = np.linspace(10, 23, 100)\n", + "xx, yy = np.meshgrid(xx, yy)\n", + "Xfull = pd.DataFrame(\n", + " {\"Culmen Length (mm)\": xx.ravel(), \"Culmen Depth (mm)\": yy.ravel()}\n", + ")\n", + "\n", + "probas = tree.predict_proba(Xfull)\n", + "n_classes = len(np.unique(tree.classes_))\n", + "\n", + "_, axs = plt.subplots(ncols=3, nrows=1, sharey=True, figsize=(12, 5))\n", + "plt.suptitle(\"Predicted probabilities for decision tree model\", y=0.8)\n", + "\n", + "for class_of_interest in range(n_classes):\n", + " axs[class_of_interest].set_title(\n", + " f\"Class {tree.classes_[class_of_interest]}\"\n", + " )\n", + " imshow_handle = axs[class_of_interest].imshow(\n", + " probas[:, class_of_interest].reshape((100, 100)),\n", + " extent=(30, 60, 10, 23),\n", + " vmin=0.0,\n", + " vmax=1.0,\n", + " origin=\"lower\",\n", + " cmap=\"viridis\",\n", + " )\n", + " axs[class_of_interest].set_xlabel(\"Culmen Length (mm)\")\n", + " if class_of_interest == 0:\n", + " axs[class_of_interest].set_ylabel(\"Culmen Depth (mm)\")\n", + " idx = target_test == tree.classes_[class_of_interest]\n", + " axs[class_of_interest].scatter(\n", + " data_test[\"Culmen Length (mm)\"].loc[idx],\n", + " data_test[\"Culmen Depth (mm)\"].loc[idx],\n", + " marker=\"o\",\n", + " c=\"w\",\n", + " edgecolor=\"k\",\n", + " )\n", + "\n", + "ax = plt.axes([0.15, 0.04, 0.7, 0.05])\n", + "plt.colorbar(imshow_handle, cax=ax, orientation=\"horizontal\")\n", + "_ = plt.title(\"Probability\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "
\n", + "

Note

\n", + "

You may have noticed that we are no longer using a diverging colormap. Indeed,\n", + "the chance level for a one-vs-rest binarization of the multi-class\n", + "classification problem is almost never at predicted probability of 0.5. So\n", + "using a colormap with a neutral white at 0.5 might give a false impression on\n", + "the certainty.

\n", + "
\n", + "\n", + "In future versions of scikit-learn `DecisionBoundaryDisplay` will support a\n", + "`class_of_interest` parameter that will allow in particular for a\n", + "visualization of `predict_proba` in multi-class settings.\n", + "\n", + "We also plan to make it possible to visualize the `predict_proba` values for\n", + "the class with the maximum predicted probability (without having to pass a\n", + "given a fixed `class_of_interest` value)." ] } ], From 767499b99c793ea8172d2357a7130bba1f68474c Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 26 Oct 2023 13:48:38 +0200 Subject: [PATCH 075/108] ENH Mention scaling behavior of binning and splines (#739) Co-authored-by: ArturoAmorQ Co-authored-by: Olivier Grisel --- ...dels_feature_engineering_classification.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/python_scripts/linear_models_feature_engineering_classification.py b/python_scripts/linear_models_feature_engineering_classification.py index 9fd203f34..12a2997da 100644 --- a/python_scripts/linear_models_feature_engineering_classification.py +++ b/python_scripts/linear_models_feature_engineering_classification.py @@ -235,7 +235,10 @@ def plot_decision_boundary(model, title=None): # %% from sklearn.preprocessing import KBinsDiscretizer -classifier = make_pipeline(KBinsDiscretizer(n_bins=5), LogisticRegression()) +classifier = make_pipeline( + KBinsDiscretizer(n_bins=5, encode="onehot"), # already the default params + LogisticRegression(), +) classifier # %% @@ -279,15 +282,20 @@ def plot_decision_boundary(model, title=None): # We can see that the decision boundary is now smooth, and while it favors # axis-aligned decision rules when extrapolating in low density regions, it can # adopt a more curvy decision boundary in the high density regions. -# -# Note however, that the number of knots is a hyperparameter that needs to be -# tuned. If we use too few knots, the model would underfit the data, as shown on -# the moons dataset. If we use too many knots, the model would overfit the data. -# # However, as for the binning transformation, the model still fails to separate # the data for the XOR dataset, irrespective of the number of knots, for the # same reasons: **the spline transformation is a feature-wise transformation** # and thus **cannot capture interactions** between features. +# +# Take into account that the number of knots is a hyperparameter that needs to be +# tuned. If we use too few knots, the model would underfit the data, as shown on +# the moons dataset. If we use too many knots, the model would overfit the data. +# +# ```{note} +# Notice that `KBinsDiscretizer(encode="onehot")` and `SplineTransformer` do not +# require additional scaling. Indeed, they can replace the scaling step for +# numerical features: they both create features with values in the [0, 1] range. 
+# ``` # %% [markdown] # From 008cff42df5cd63a9bac207f8fddc4b10a5599d5 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:22:13 +0200 Subject: [PATCH 076/108] ENH Convert some of the Wrap-up M4 content into exercise (#731) --- jupyter-book/_toc.yml | 7 +- python_scripts/linear_models_ex_03.py | 127 +++-- python_scripts/linear_models_ex_04.py | 170 +++++++ python_scripts/linear_models_sol_03.py | 438 +++++++++--------- python_scripts/linear_models_sol_04.py | 281 +++++++++++ .../logistic_regression_non_linear.py | 217 --------- 6 files changed, 759 insertions(+), 481 deletions(-) create mode 100644 python_scripts/linear_models_ex_04.py create mode 100644 python_scripts/linear_models_sol_04.py delete mode 100644 python_scripts/logistic_regression_non_linear.py diff --git a/jupyter-book/_toc.yml b/jupyter-book/_toc.yml index 277bc22eb..01d356e74 100644 --- a/jupyter-book/_toc.yml +++ b/jupyter-book/_toc.yml @@ -102,14 +102,15 @@ parts: - file: python_scripts/linear_models_ex_02 - file: python_scripts/linear_models_sol_02 - file: python_scripts/linear_models_feature_engineering_classification.py - - file: python_scripts/logistic_regression_non_linear + - file: python_scripts/linear_models_ex_03 + - file: python_scripts/linear_models_sol_03 - file: linear_models/linear_models_quiz_m4_02 - file: linear_models/linear_models_regularization_index sections: - file: linear_models/regularized_linear_models_slides - file: python_scripts/linear_models_regularization - - file: python_scripts/linear_models_ex_03 - - file: python_scripts/linear_models_sol_03 + - file: python_scripts/linear_models_ex_04 + - file: python_scripts/linear_models_sol_04 - file: linear_models/linear_models_quiz_m4_03 - file: linear_models/linear_models_wrap_up_quiz - file: linear_models/linear_models_module_take_away diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 9c311e817..50fe942cd 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -14,69 +14,118 @@ # %% [markdown] # # ๐Ÿ“ Exercise M4.03 # -# The parameter `penalty` can control the **type** of regularization to use, -# whereas the regularization **strength** is set using the parameter `C`. -# Setting`penalty="none"` is equivalent to an infinitely large value of `C`. In -# this exercise, we ask you to train a logistic regression classifier using the -# `penalty="l2"` regularization (which happens to be the default in -# scikit-learn) to find by yourself the effect of the parameter `C`. -# -# We start by loading the dataset. +# Now, we tackle a more realistic classification problem instead of making a +# synthetic dataset. We start by loading the Adult Census dataset with the +# following snippet. For the moment we retain only the **numerical features**. + +# %% +import pandas as pd + +adult_census = pd.read_csv("../datasets/adult-census.csv") +target = adult_census["class"] +data = adult_census.select_dtypes(["integer", "floating"]) +data = data.drop(columns=["education-num"]) +data # %% [markdown] -# ```{note} -# If you want a deeper overview regarding this dataset, you can refer to the -# Appendix - Datasets description section at the end of this MOOC. -# ``` +# We confirm that all the selected features are numerical. +# +# Compute the generalization performance in terms of accuracy of a linear model +# composed of a `StandardScaler` and a `LogisticRegression`. 
Use a 10-fold
+# cross-validation with `return_estimator=True` to be able to inspect the
+# trained estimators.
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# What is the most important feature seen by the logistic regression?
+#
+# You can use a boxplot to compare the absolute values of the coefficients while
+# also visualizing the variability induced by the cross-validation resampling.
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# Let's now work with **both numerical and categorical features**. You can
+# reload the Adult Census dataset with the following snippet:
+
+# %%
+adult_census = pd.read_csv("../datasets/adult-census.csv")
+target = adult_census["class"]
+data = adult_census.drop(columns=["class", "education-num"])
+
+# %% [markdown]
+# Create a predictive model where:
+# - The numerical data must be scaled.
+# - The categorical data must be one-hot encoded, set `min_frequency=0.01` to
+#   group categories representing less than 1% of the total samples.
+# - The predictor is a `LogisticRegression`. You may need to increase the number
+#   of `max_iter`, which is 100 by default.
+#
+# Use the same 10-fold cross-validation strategy with `return_estimator=True` as
+# above to evaluate this complex pipeline.
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# By comparing the cross-validation test scores of both models fold-to-fold,
+# count the number of times the model using both numerical and categorical
+# features has a better test score than the model using only numerical features.
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# For the following questions, you can copy and paste the following snippet to
+# get the feature names from the column transformer here named `preprocessor`.
+#
+# ```python
+# preprocessor.fit(data)
+# feature_names = (
+#     preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
+#         categorical_columns
+#     )
+# ).tolist()
+# feature_names += numerical_columns
+# feature_names
+# ```
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# Notice that there are as many feature names as coefficients in the last step
+# of your predictive pipeline.
+
+# %% [markdown]
+# Which of the following pairs of features has the strongest impact on the
+# predictions of the logistic regression classifier, based on the absolute
+# magnitude of its coefficients?
+
+# %%
+# Write your code here.
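# %% [markdown]
# As a point of reference, here is one minimal sketch of how the coefficients
# could be aggregated across folds to rank features by their mean absolute
# magnitude. It assumes `cv_results` holds the output of a `cross_validate`
# call with `return_estimator=True` on the full pipeline, and that
# `feature_names` was built with the snippet above; both names are placeholders
# standing in for your own variables.

# %%
import pandas as pd

# Each fitted pipeline ends with the LogisticRegression step, whose `coef_`
# stores one weight per entry of `feature_names`.
coefs = pd.DataFrame(
    [est[-1].coef_[0] for est in cv_results["estimator"]],
    columns=feature_names,
)
# Rank the features by the mean absolute value of their coefficient.
coefs.abs().mean().sort_values(ascending=False).head(10)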
+
+# %% [markdown]
+# Now create a similar pipeline consisting of the same preprocessor as above,
+# followed by a `PolynomialFeatures` and a logistic regression with `C=0.01`.
+# Pass `degree=2` and `interaction_only=True` to the feature engineering step.
+# Remember not to include a "bias" feature to avoid introducing a redundancy
+# with the intercept of the subsequent logistic regression.
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# By comparing the cross-validation test scores of both models fold-to-fold,
+# count the number of times the model using multiplicative interactions and both
+# numerical and categorical features has a better test score than the model
+# without interactions.
+
+# %%
+# Write your code here.
+
+# %%
+# Write your code here.
diff --git a/python_scripts/linear_models_ex_04.py b/python_scripts/linear_models_ex_04.py
new file mode 100644
index 000000000..dd9ae6bb1
--- /dev/null
+++ b/python_scripts/linear_models_ex_04.py
@@ -0,0 +1,170 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3
+#     name: python3
+# ---
+
+# %% [markdown]
+# # ๐Ÿ“ Exercise M4.04
+#
+# In the previous Module we tuned the hyperparameter `C` of the logistic
+# regression without mentioning that it controls the regularization strength.
+# Later, on the slides on ๐ŸŽฅ **Intuitions on regularized linear models** we
+# mentioned that a small `C` provides a more regularized model, whereas a
+# non-regularized model is obtained with an infinitely large value of `C`.
+# Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
+# model.
+#
+# In this exercise, we ask you to train a logistic regression classifier using
+# different values of the parameter `C` to find its effects by yourself.
+#
+# We start by loading the dataset. We only keep the Adelie and Chinstrap classes
+# to keep the discussion simple.
+
+
+# %% [markdown]
+# ```{note}
+# If you want a deeper overview regarding this dataset, you can refer to the
+# Appendix - Datasets description section at the end of this MOOC.
+# ```
+
+# %%
+import pandas as pd
+
+penguins = pd.read_csv("../datasets/penguins_classification.csv")
+penguins = (
+    penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index()
+)
+
+culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
+target_column = "Species"
+
+# %%
+from sklearn.model_selection import train_test_split
+
+penguins_train, penguins_test = train_test_split(
+    penguins, random_state=0, test_size=0.4
+)
+
+data_train = penguins_train[culmen_columns]
+data_test = penguins_test[culmen_columns]
+
+target_train = penguins_train[target_column]
+target_test = penguins_test[target_column]
+
+# %% [markdown]
+# We define a function to help us fit a given `model` and plot its decision
+# boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging
+# colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped
+# to the white color. Equivalently, the darker the color, the closer the
+# predicted probability is to 0 or 1 and the more confident the classifier is in
+# its predictions.
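# %% [markdown]
# As a quick illustration of the colormap convention described above (an
# illustrative aside, with no names beyond matplotlib itself): with the
# diverging "RdBu_r" colormap and the color range pinned with `vmin=0` and
# `vmax=1`, a predicted probability of 0.5 maps to (near) white, while
# probabilities of 0 and 1 map to saturated blue and red respectively.

# %%
import matplotlib.pyplot as plt

cmap = plt.get_cmap("RdBu_r")
for proba in [0.0, 0.5, 1.0]:
    # with vmin=0 and vmax=1, the probability is used directly as the
    # normalized colormap coordinate
    print(proba, cmap(proba))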
+
+# %%
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.inspection import DecisionBoundaryDisplay
+
+
+def plot_decision_boundary(model):
+    model.fit(data_train, target_train)
+    accuracy = model.score(data_test, target_test)
+    C = model.get_params()["logisticregression__C"]
+
+    disp = DecisionBoundaryDisplay.from_estimator(
+        model,
+        data_train,
+        response_method="predict_proba",
+        plot_method="pcolormesh",
+        cmap="RdBu_r",
+        alpha=0.8,
+        vmin=0.0,
+        vmax=1.0,
+    )
+    DecisionBoundaryDisplay.from_estimator(
+        model,
+        data_train,
+        response_method="predict_proba",
+        plot_method="contour",
+        linestyles="--",
+        linewidths=1,
+        alpha=0.8,
+        levels=[0.5],
+        ax=disp.ax_,
+    )
+    sns.scatterplot(
+        data=penguins_train,
+        x=culmen_columns[0],
+        y=culmen_columns[1],
+        hue=target_column,
+        palette=["tab:blue", "tab:red"],
+        ax=disp.ax_,
+    )
+    plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
+    plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}")
+
+
+# %% [markdown]
+# Let's now create our predictive model.
+
+# %%
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+
+logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())
+
+# %% [markdown]
+# ## Influence of the parameter `C` on the decision boundary
+#
+# Given the following candidates for the `C` parameter and the
+# `plot_decision_boundary` function, find out the impact of `C` on the
+# classifier's decision boundary.
+#
+# - How does the value of `C` impact the confidence on the predictions?
+# - How does it impact the underfit/overfit trade-off?
+# - How does it impact the position and orientation of the decision boundary?
+#
+# Try to give an interpretation on the reason for such behavior.
+
+# %%
+Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6]
+
+# Write your code here.
+
+# %% [markdown]
+# ## Impact of the regularization on the weights
+#
+# Look at the impact of the `C` hyperparameter on the magnitude of the weights.
+# **Hint**: You can [access pipeline
+# steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)
+# by name or position. Then you can query the attributes of that step such as
+# `coef_`.
+
+# %%
+# Write your code here.
+
+# %% [markdown]
+# ## Impact of the regularization with non-linear feature engineering
+#
+# Use the `plot_decision_boundary` function to repeat the experiment using a
+# non-linear feature engineering pipeline. For such purpose, insert
+# `Nystroem(kernel="rbf", gamma=1, n_components=100)` between the
+# `StandardScaler` and the `LogisticRegression` steps.
+#
+# - Does the value of `C` still impact the position of the decision boundary and
+#   the confidence of the model?
+# - What can you say about the impact of `C` on the underfitting vs overfitting
+#   trade-off?
+
+# %%
+from sklearn.kernel_approximation import Nystroem
+
+# Write your code here.
diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py
index dc2a82f5c..c76806a45 100644
--- a/python_scripts/linear_models_sol_03.py
+++ b/python_scripts/linear_models_sol_03.py
@@ -8,273 +8,267 @@
 # %% [markdown]
 # # 📃 Solution for Exercise M4.03
 #
-# In the previous Module we tuned the hyperparameter `C` of the logistic
-# regression without mentioning that it controls the regularization strength.
-# Later, on the slides on 🎥 **Intuitions on regularized linear models** we
-# metioned that a small `C` provides a more regularized model, whereas a
-# non-regularized model is obtained with an infinitely large value of `C`.
-# Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
-# model.
-#
-# In this exercise, we ask you to train a logistic regression classifier using
-# different values of the parameter `C` to find its effects by yourself.
-#
-# We start by loading the dataset. We only keep the Adelie and Chinstrap classes
-# to keep the discussion simple.
+# Now, we tackle a more realistic classification problem instead of making a
+# synthetic dataset. We start by loading the Adult Census dataset with the
+# following snippet. For the moment we retain only the **numerical features**.
+
+# %%
+import pandas as pd
+adult_census = pd.read_csv("../datasets/adult-census.csv")
+target = adult_census["class"]
+data = adult_census.select_dtypes(["integer", "floating"])
+data = data.drop(columns=["education-num"])
+data

 # %% [markdown]
-# ```{note}
-# If you want a deeper overview regarding this dataset, you can refer to the
-# Appendix - Datasets description section at the end of this MOOC.
-# ```
+# We confirm that all the selected features are numerical.
+#
+# Compute the generalization performance in terms of accuracy of a linear model
+# composed of a `StandardScaler` and a `LogisticRegression`. Use a 10-fold
+# cross-validation with `return_estimator=True` to be able to inspect the
+# trained estimators.

 # %%
-import pandas as pd
+# solution
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_validate

-penguins = pd.read_csv("../datasets/penguins_classification.csv")
-penguins = (
-    penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index()
+model = make_pipeline(StandardScaler(), LogisticRegression())
+cv_results_lr = cross_validate(
+    model, data, target, cv=10, return_estimator=True
 )
+test_score_lr = cv_results_lr["test_score"]
+test_score_lr

-culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
-target_column = "Species"
+# %% [markdown]
+# What is the most important feature seen by the logistic regression?
+#
+# You can use a boxplot to compare the absolute values of the coefficients while
+# also visualizing the variability induced by the cross-validation resampling.

 # %%
-from sklearn.model_selection import train_test_split
+# solution
+import matplotlib.pyplot as plt

-penguins_train, penguins_test = train_test_split(
-    penguins, random_state=0, test_size=0.4
-)
+coefs = [pipeline[-1].coef_[0] for pipeline in cv_results_lr["estimator"]]
+coefs = pd.DataFrame(coefs, columns=data.columns)

-data_train = penguins_train[culmen_columns]
-data_test = penguins_test[culmen_columns]
+color = {"whiskers": "black", "medians": "black", "caps": "black"}
+_, ax = plt.subplots()
+_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)

-target_train = penguins_train[target_column]
-target_test = penguins_test[target_column]
+# %% [markdown] tags=["solution"]
+# Since we scaled the features, the coefficients of the linear model can be
+# meaningfully compared directly. `"capital-gain"` is the most impacting feature.
+# Just be aware not to draw causal conclusions from the impact
+# of a feature.
Interested readers are referred to the [example on Common
+# pitfalls in the interpretation of coefficients of linear
+# models](https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html)
+# or the [example on Failure of Machine Learning to infer causal
+# effects](https://scikit-learn.org/stable/auto_examples/inspection/plot_causal_interpretation.html).

 # %% [markdown]
-# We define a function to help us fit a given `model` and plot its decision
-# boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging
-# colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped
-# to the white color. Equivalently, the darker the color, the closer the
-# predicted probability is to 0 or 1 and the more confident the classifier is in
-# its predictions.
+# Let's now work with **both numerical and categorical features**. You can
+# reload the Adult Census dataset with the following snippet:

 # %%
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.inspection import DecisionBoundaryDisplay
-
-
-def plot_decision_boundary(model):
-    model.fit(data_train, target_train)
-    accuracy = model.score(data_test, target_test)
-
-    disp = DecisionBoundaryDisplay.from_estimator(
-        model,
-        data_train,
-        response_method="predict_proba",
-        plot_method="pcolormesh",
-        cmap="RdBu_r",
-        alpha=0.8,
-        vmin=0.0,
-        vmax=1.0,
-    )
-    DecisionBoundaryDisplay.from_estimator(
-        model,
-        data_train,
-        response_method="predict_proba",
-        plot_method="contour",
-        linestyles="--",
-        linewidths=1,
-        alpha=0.8,
-        levels=[0.5],
-        ax=disp.ax_,
-    )
-    sns.scatterplot(
-        data=penguins_train,
-        x=culmen_columns[0],
-        y=culmen_columns[1],
-        hue=target_column,
-        palette=["tab:blue", "tab:red"],
-        ax=disp.ax_,
-    )
-    plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
-    plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}")
+adult_census = pd.read_csv("../datasets/adult-census.csv")
+target = adult_census["class"]
+data = adult_census.drop(columns=["class", "education-num"])

+# %% [markdown]
+# Create a predictive model where:
+# - The numerical data must be scaled.
+# - The categorical data must be one-hot encoded, set `min_frequency=0.01` to
+#   group categories concerning less than 1% of the total samples.
+# - The predictor is a `LogisticRegression`. You may need to increase the number
+#   of `max_iter`, which is 100 by default.
+#
+# Use the same 10-fold cross-validation strategy with `return_estimator=True` as
+# above to evaluate this complex pipeline.

+# %%
+# solution
+from sklearn.compose import make_column_selector as selector
+from sklearn.compose import make_column_transformer
+from sklearn.preprocessing import OneHotEncoder

+categorical_columns = selector(dtype_include=object)(data)
+numerical_columns = selector(dtype_exclude=object)(data)

+preprocessor = make_column_transformer(
+    (
+        OneHotEncoder(handle_unknown="ignore", min_frequency=0.01),
+        categorical_columns,
+    ),
+    (StandardScaler(), numerical_columns),
+)
+model = make_pipeline(preprocessor, LogisticRegression(max_iter=5_000))
+cv_results_complex_lr = cross_validate(
+    model, data, target, cv=10, return_estimator=True, n_jobs=2
+)
+test_score_complex_lr = cv_results_complex_lr["test_score"]
+test_score_complex_lr

 # %% [markdown]
-# Let's now create our predictive model.
+
+# By comparing the cross-validation test scores of both models fold-to-fold,
+# count the number of times the model using both numerical and categorical
+# features has a better test score than the model using only numerical features.

 # %%
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
-from sklearn.linear_model import LogisticRegression
+# solution
+import numpy as np
+import matplotlib.pyplot as plt

-logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())
+indices = np.arange(len(test_score_lr))
+plt.scatter(
+    indices, test_score_lr, color="tab:blue", label="numerical features only"
+)
+plt.scatter(
+    indices,
+    test_score_complex_lr,
+    color="tab:red",
+    label="all features",
+)
+plt.ylim((0, 1))
+plt.xlabel("Cross-validation iteration")
+plt.ylabel("Accuracy")
+_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
+
+print(
+    "A model using all features is better than a"
+    " model using only numerical features for"
+    f" {sum(test_score_complex_lr > test_score_lr)} CV iterations out of 10."
+)

 # %% [markdown]
-# ## Influence of the parameter `C` on the decision boundary
-#
-# Given the following candidates for the `C` parameter and the
-# `plot_decision_boundary` function, find out the impact of `C` on the
-# classifier's decision boundary.
-#
-# - How does the value of `C` impact the confidence on the predictions?
-# - How does it impact the underfit/overfit trade-off?
-# - How does it impact the position and orientation of the decision boundary?
+# For the following questions, you can copy and paste the following snippet to
+# get the feature names from the column transformer here named `preprocessor`.
 #
-# Try to give an interpretation on the reason for such behavior.
+# ```python
+# preprocessor.fit(data)
+# feature_names = (
+#     preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
+#         categorical_columns
+#     )
+# ).tolist()
+# feature_names += numerical_columns
+# feature_names
+# ```

 # %%
-Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6]
-
 # solution
-for C in Cs:
-    logistic_regression.set_params(logisticregression__C=C)
-    plot_decision_boundary(logistic_regression)
+preprocessor.fit(data)
+feature_names = (
+    preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
+        categorical_columns
+    )
+).tolist()
+feature_names += numerical_columns
+feature_names

-# %% [markdown] tags=["solution"]
-#
-# On this series of plots we can observe several important points. Regarding the
-# confidence on the predictions:
-#
-# - For low values of `C` (strong regularization), the classifier is less
-#   confident in its predictions. We are enforcing a **spread sigmoid**.
-# - For high values of `C` (weak regularization), the classifier is more
-#   confident: the areas with dark blue (very confident in predicting "Adelie")
-#   and dark red (very confident in predicting "Chinstrap") nearly cover the
-#   entire feature space. We are enforcing a **steep sigmoid**.
-#
-# To answer the next question, think that misclassified data points are more
-# costly when the classifier is more confident on the decision. Decision rules
-# are mostly driven by avoiding such cost. From the previous observations we can
-# then deduce that:
-#
-# - The smaller the `C` (the stronger the regularization), the lower the cost
-#   of a misclassification. As more data points lay in the low-confidence
-#   zone, the more the decision rules are influenced almost uniformly by all
-#   the data points.
This leads to a less expressive model, which may underfit. -# - The higher the value of `C` (the weaker the regularization), the more the -# decision is influenced by a few training points very close to the boundary, -# where decisions are costly. Remember that models may overfit if the number -# of samples in the training set is too small, as at least a minimum of -# samples is needed to average the noise out. -# -# The orientation is the result of two factors: minimizing the number of -# misclassified training points with high confidence and their distance to the -# decision boundary (notice how the contour line tries to align with the most -# misclassified data points in the dark-colored zone). This is closely related -# to the value of the weights of the model, which is explained in the next part -# of the exercise. -# -# Finally, for small values of `C` the position of the decision boundary is -# affected by the class imbalance: when `C` is near zero, the model predicts the -# majority class (as seen in the training set) everywhere in the feature space. -# In our case, there are approximately two times more "Adelie" than "Chinstrap" -# penguins. This explains why the decision boundary is shifted to the right when -# `C` gets smaller. Indeed, the most regularized model predicts light blue -# almost everywhere in the feature space. +# %% [markdown] +# Notice that there are as many feature names as coefficients in the last step +# of your predictive pipeline. # %% [markdown] -# ## Impact of the regularization on the weights -# -# Look at the impact of the `C` hyperparameter on the magnitude of the weights. -# **Hint**: You can [access pipeline -# steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps) -# by name or position. Then you can query the attributes of that step such as -# `coef_`. +# Which of the following pairs of features is most impacting the predictions of +# the logistic regression classifier based on the absolute magnitude of its +# coefficients? # %% # solution -lr_weights = [] -for C in Cs: - logistic_regression.set_params(logisticregression__C=C) - logistic_regression.fit(data_train, target_train) - coefs = logistic_regression[-1].coef_[0] - lr_weights.append(pd.Series(coefs, index=culmen_columns)) - -# %% tags=["solution"] -lr_weights = pd.concat(lr_weights, axis=1, keys=[f"C: {C}" for C in Cs]) -lr_weights.plot.barh() -_ = plt.title("LogisticRegression weights depending of C") +coefs = [ + pipeline[-1].coef_[0] for pipeline in cv_results_complex_lr["estimator"] +] +coefs = pd.DataFrame(coefs, columns=feature_names) + +_, ax = plt.subplots(figsize=(10, 35)) +_ = coefs.abs().plot.box(color=color, vert=False, ax=ax) # %% [markdown] tags=["solution"] -# -# As small `C` provides a more regularized model, it shrinks the weights values -# toward zero, as in the `Ridge` model. -# -# In particular, with a strong penalty (e.g. `C = 0.01`), the weight of the feature -# named "Culmen Depth (mm)" is almost zero. It explains why the decision -# separation in the plot is almost perpendicular to the "Culmen Length (mm)" -# feature. -# -# For even stronger penalty strengths (e.g. `C = 1e-6`), the weights of both -# features are almost zero. It explains why the decision separation in the plot -# is almost constant in the feature space: the predicted probability is only -# based on the intercept parameter of the model (which is never regularized). 
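+
+# %% [markdown] tags=["solution"]
+# With this many one-hot encoded columns, reading the boxplot can get tedious.
+# A minimal sketch of a programmatic alternative, reusing the `coefs` dataframe
+# built above (one possible convention, not the only one):
+#
+# ```python
+# # Median absolute coefficient across the 10 CV folds, largest first.
+# coefs.abs().median().sort_values(ascending=False).head(10)
+# ```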
+
+# We can visually inspect the coefficients and observe that `"capital-gain"` and
+# `"education_Doctorate"` are impacting the predictions the most.

 # %% [markdown]
-# ## Impact of the regularization on with non-linear feature engineering
-#
-# Use the `plot_decision_boundary` function to repeat the experiment using a
-# non-linear feature engineering pipeline. For such purpose, insert
-# `Nystroem(kernel="rbf", gamma=1, n_components=100)` between the
-# `StandardScaler` and the `LogisticRegression` steps.
-#
-# - Does the value of `C` still impact the position of the decision boundary and
-#   the confidence of the model?
-# - What can you say about the impact of `C` on the underfitting vs overfitting
-#   trade-off?
+# Now create a similar pipeline consisting of the same preprocessor as above,
+# followed by a `PolynomialFeatures` and a logistic regression with `C=0.01`.
+# Set `degree=2` and `interaction_only=True` in the feature engineering step.
+# Remember not to include a "bias" feature to avoid introducing a redundancy
+# with the intercept of the subsequent logistic regression.

 # %%
-from sklearn.kernel_approximation import Nystroem
+# solution
+from sklearn.preprocessing import PolynomialFeatures

+model_with_interaction = make_pipeline(
+    preprocessor,
+    PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
+    LogisticRegression(C=0.01, max_iter=5_000),
+)
+model_with_interaction
+
+# %% [markdown]
+# By comparing the cross-validation test scores of both models fold-to-fold,
+# count the number of times the model using multiplicative interactions and both
+# numerical and categorical features has a better test score than the model
+# without interactions.
+
+# %%
 # solution
-classifier = make_pipeline(
-    StandardScaler(),
-    Nystroem(kernel="rbf", gamma=1.0, n_components=100, random_state=0),
-    LogisticRegression(penalty="l2", max_iter=1000),
+cv_results_interactions = cross_validate(
+    model_with_interaction,
+    data,
+    target,
+    cv=10,
+    return_estimator=True,
+    n_jobs=2,
 )
+test_score_interactions = cv_results_interactions["test_score"]
+test_score_interactions

-for C in Cs:
-    classifier.set_params(logisticregression__C=C)
-    plot_decision_boundary(classifier)
+# %%
+# solution
+plt.scatter(
+    indices, test_score_lr, color="tab:blue", label="numerical features only"
+)
+plt.scatter(
+    indices,
+    test_score_complex_lr,
+    color="tab:red",
+    label="all features",
+)
+plt.scatter(
+    indices,
+    test_score_interactions,
+    color="black",
+    label="all features and interactions",
+)
+plt.xlabel("Cross-validation iteration")
+plt.ylabel("Accuracy")
+_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
+
+print(
+    "A model using all features and interactions is better than a model"
+    " without interactions for"
+    f" {sum(test_score_interactions > test_score_complex_lr)} CV iterations"
+    " out of 10."
+)

 # %% [markdown] tags=["solution"]
+# When you multiply two one-hot encoded categorical features, the resulting
+# interaction feature is mostly 0, with a 1 only when both original features are
+# active, acting as a logical `AND`. In this case it could mean we are creating
+# new rules such as "has a given education `AND` a given native country", which
+# we expect to be predictive. These new rules map the original feature space into
+# a higher dimension space, where the linear model can separate the data more
+# easily.
 #
-# - For the lowest values of `C`, the overall pipeline underfits: it predicts
-#   the majority class everywhere, as previously.
-
-# - When `C` increases, the models starts to predict some datapoints from the
-#   "Chinstrap" class but the model is not very confident anywhere in the
-#   feature space.
-# - The decision boundary is no longer a straight line: the linear model is now
-#   classifying in the 100-dimensional feature space created by the `Nystroem`
-#   transformer. As are result, the decision boundary induced by the overall
-#   pipeline is now expressive enough to wrap around the minority class.
-# - For `C = 1` in particular, it finds a smooth red blob around most of the
-#   "Chinstrap" data points. When moving away from the data points, the model is
-#   less confident in its predictions and again tends to predict the majority
-#   class according to the proportion in the training set.
-# - For higher values of `C`, the model starts to overfit: it is very confident
-#   in its predictions almost everywhere, but it should not be trusted: the
-#   model also makes a larger number of mistakes on the test set (not shown in
-#   the plot) while adopting a very curvy decision boundary to attempt fitting
-#   all the training points, including the noisy ones at the frontier between
-#   the two classes. This makes the decision boundary very sensitive to the
-#   sampling of the training set and as a result, it does not generalize well in
-#   that region. This is confirmed by the (slightly) lower accuracy on the test
-#   set.
-#
-# Finally, we can also note that the linear model on the raw features was as
-# good or better than the best model using non-linear feature engineering. So in
-# this case, we did not really need this extra complexity in our pipeline.
-# **Simpler is better!**
-#
-# So to conclude, when using non-linear feature engineering, it is often
-# possible to make the pipeline overfit, even if the original feature space is
-# low-dimensional. As a result, it is important to tune the regularization
-# parameter in conjunction with the parameters of the transformers (e.g. tuning
-# `gamma` would be important here). This has a direct impact on the certainty of
-# the predictions.
+
+# Keep in mind that multiplying all pairs of one-hot encoded features may
+# lead to a rapid increase in the number of features, especially if the original
+# categorical variables have many levels. This can increase the computational
+# cost of your model and promote overfitting, as we will see in a future
+# notebook.
diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py
new file mode 100644
index 000000000..942aed56d
--- /dev/null
+++ b/python_scripts/linear_models_sol_04.py
@@ -0,0 +1,281 @@
+# ---
+# jupyter:
+#   kernelspec:
+#     display_name: Python 3
+#     name: python3
+# ---
+
+# %% [markdown]
+# # 📃 Solution for Exercise M4.04
+#
+# In the previous Module we tuned the hyperparameter `C` of the logistic
+# regression without mentioning that it controls the regularization strength.
+# Later, on the slides on 🎥 **Intuitions on regularized linear models** we
+# mentioned that a small `C` provides a more regularized model, whereas a
+# non-regularized model is obtained with an infinitely large value of `C`.
+# Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`
+# model.
+#
+# In this exercise, we ask you to train a logistic regression classifier using
+# different values of the parameter `C` to find its effects by yourself.
+#
+# We start by loading the dataset. We only keep the Adelie and Chinstrap classes
+# to keep the discussion simple.
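+
+# %% [markdown]
+# As a rough rule of thumb (the exact correspondence depends on the loss and on
+# the number of samples), you can read `C` as `1 / alpha`. Two illustrative
+# values, not prescriptive ones:
+#
+# ```python
+# # strong regularization: C = 0.01  <->  alpha = 1 / 0.01 = 100
+# # weak regularization:   C = 100   <->  alpha = 1 / 100  = 0.01
+# ```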
+ + +# %% [markdown] +# ```{note} +# If you want a deeper overview regarding this dataset, you can refer to the +# Appendix - Datasets description section at the end of this MOOC. +# ``` + +# %% +import pandas as pd + +penguins = pd.read_csv("../datasets/penguins_classification.csv") +penguins = ( + penguins.set_index("Species").loc[["Adelie", "Chinstrap"]].reset_index() +) + +culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"] +target_column = "Species" + +# %% +from sklearn.model_selection import train_test_split + +penguins_train, penguins_test = train_test_split( + penguins, random_state=0, test_size=0.4 +) + +data_train = penguins_train[culmen_columns] +data_test = penguins_test[culmen_columns] + +target_train = penguins_train[target_column] +target_test = penguins_test[target_column] + +# %% [markdown] +# We define a function to help us fit a given `model` and plot its decision +# boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging +# colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped +# to the white color. Equivalently, the darker the color, the closer the +# predicted probability is to 0 or 1 and the more confident the classifier is in +# its predictions. + +# %% +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.inspection import DecisionBoundaryDisplay + + +def plot_decision_boundary(model): + model.fit(data_train, target_train) + accuracy = model.score(data_test, target_test) + C = model.get_params()["logisticregression__C"] + + disp = DecisionBoundaryDisplay.from_estimator( + model, + data_train, + response_method="predict_proba", + plot_method="pcolormesh", + cmap="RdBu_r", + alpha=0.8, + vmin=0.0, + vmax=1.0, + ) + DecisionBoundaryDisplay.from_estimator( + model, + data_train, + response_method="predict_proba", + plot_method="contour", + linestyles="--", + linewidths=1, + alpha=0.8, + levels=[0.5], + ax=disp.ax_, + ) + sns.scatterplot( + data=penguins_train, + x=culmen_columns[0], + y=culmen_columns[1], + hue=target_column, + palette=["tab:blue", "tab:red"], + ax=disp.ax_, + ) + plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") + plt.title(f"C: {C} \n Accuracy on the test set: {accuracy:.2f}") + + +# %% [markdown] +# Let's now create our predictive model. + +# %% +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression + +logistic_regression = make_pipeline(StandardScaler(), LogisticRegression()) + +# %% [markdown] +# ## Influence of the parameter `C` on the decision boundary +# +# Given the following candidates for the `C` parameter and the +# `plot_decision_boundary` function, find out the impact of `C` on the +# classifier's decision boundary. +# +# - How does the value of `C` impact the confidence on the predictions? +# - How does it impact the underfit/overfit trade-off? +# - How does it impact the position and orientation of the decision boundary? +# +# Try to give an interpretation on the reason for such behavior. + +# %% +Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6] + +# solution +for C in Cs: + logistic_regression.set_params(logisticregression__C=C) + plot_decision_boundary(logistic_regression) + +# %% [markdown] tags=["solution"] +# +# On this series of plots we can observe several important points. Regarding the +# confidence on the predictions: +# +# - For low values of `C` (strong regularization), the classifier is less +# confident in its predictions. We are enforcing a **spread sigmoid**. 
+
+# - For high values of `C` (weak regularization), the classifier is more
+#   confident: the areas with dark blue (very confident in predicting "Adelie")
+#   and dark red (very confident in predicting "Chinstrap") nearly cover the
+#   entire feature space. We are enforcing a **steep sigmoid**.
+#
+# To answer the next question, think that misclassified data points are more
+# costly when the classifier is more confident in the decision. Decision rules
+# are mostly driven by avoiding such cost. From the previous observations we can
+# then deduce that:
+#
+# - The smaller the `C` (the stronger the regularization), the lower the cost
+#   of a misclassification. As more data points lie in the low-confidence
+#   zone, the more the decision rules are influenced almost uniformly by all
+#   the data points. This leads to a less expressive model, which may underfit.
+# - The higher the value of `C` (the weaker the regularization), the more the
+#   decision is influenced by a few training points very close to the boundary,
+#   where decisions are costly. Remember that models may overfit if the number
+#   of samples in the training set is too small, as at least a minimum of
+#   samples is needed to average the noise out.
+#
+# The orientation is the result of two factors: minimizing the number of
+# misclassified training points with high confidence and their distance to the
+# decision boundary (notice how the contour line tries to align with the most
+# misclassified data points in the dark-colored zone). This is closely related
+# to the value of the weights of the model, which is explained in the next part
+# of the exercise.
+#
+# Finally, for small values of `C` the position of the decision boundary is
+# affected by the class imbalance: when `C` is near zero, the model predicts the
+# majority class (as seen in the training set) everywhere in the feature space.
+# In our case, there are approximately two times more "Adelie" than "Chinstrap"
+# penguins. This explains why the decision boundary is shifted to the right when
+# `C` gets smaller. Indeed, the most regularized model predicts light blue
+# almost everywhere in the feature space.
+
+# %% [markdown]
+# ## Impact of the regularization on the weights
+#
+# Look at the impact of the `C` hyperparameter on the magnitude of the weights.
+# **Hint**: You can [access pipeline
+# steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)
+# by name or position. Then you can query the attributes of that step such as
+# `coef_`.
+
+# %%
+# solution
+lr_weights = []
+for C in Cs:
+    logistic_regression.set_params(logisticregression__C=C)
+    logistic_regression.fit(data_train, target_train)
+    coefs = logistic_regression[-1].coef_[0]
+    lr_weights.append(pd.Series(coefs, index=culmen_columns))
+
+# %% tags=["solution"]
+lr_weights = pd.concat(lr_weights, axis=1, keys=[f"C: {C}" for C in Cs])
+lr_weights.plot.barh()
+_ = plt.title("LogisticRegression weights depending on C")
+
+# %% [markdown] tags=["solution"]
+#
+# As small `C` provides a more regularized model, it shrinks the weight values
+# toward zero, as in the `Ridge` model.
+#
+# In particular, with a strong penalty (e.g. `C = 0.01`), the weight of the feature
+# named "Culmen Depth (mm)" is almost zero. It explains why the decision
+# separation in the plot is almost perpendicular to the "Culmen Length (mm)"
+# feature.
+#
+# For even stronger penalty strengths (e.g. `C = 1e-6`), the weights of both
+# features are almost zero.
It explains why the decision separation in the plot
+# is almost constant in the feature space: the predicted probability is only
+# based on the intercept parameter of the model (which is never regularized).
+
+# %% [markdown]
+# ## Impact of the regularization with non-linear feature engineering
+#
+# Use the `plot_decision_boundary` function to repeat the experiment using a
+# non-linear feature engineering pipeline. For such purpose, insert
+# `Nystroem(kernel="rbf", gamma=1, n_components=100)` between the
+# `StandardScaler` and the `LogisticRegression` steps.
+#
+# - Does the value of `C` still impact the position of the decision boundary and
+#   the confidence of the model?
+# - What can you say about the impact of `C` on the underfitting vs overfitting
+#   trade-off?
+
+# %%
+from sklearn.kernel_approximation import Nystroem
+
+# solution
+classifier = make_pipeline(
+    StandardScaler(),
+    Nystroem(kernel="rbf", gamma=1.0, n_components=100, random_state=0),
+    LogisticRegression(max_iter=1000),
+)
+
+for C in Cs:
+    classifier.set_params(logisticregression__C=C)
+    plot_decision_boundary(classifier)
+
+# %% [markdown] tags=["solution"]
+#
+# - For the lowest values of `C`, the overall pipeline underfits: it predicts
+#   the majority class everywhere, as previously.
+# - When `C` increases, the model starts to predict some data points from the
+#   "Chinstrap" class but the model is not very confident anywhere in the
+#   feature space.
+# - The decision boundary is no longer a straight line: the linear model is now
+#   classifying in the 100-dimensional feature space created by the `Nystroem`
+#   transformer. As a result, the decision boundary induced by the overall
+#   pipeline is now expressive enough to wrap around the minority class.
+# - For `C = 1` in particular, it finds a smooth red blob around most of the
+#   "Chinstrap" data points. When moving away from the data points, the model is
+#   less confident in its predictions and again tends to predict the majority
+#   class according to the proportion in the training set.
+# - For higher values of `C`, the model starts to overfit: it is very confident
+#   in its predictions almost everywhere, but it should not be trusted: the
+#   model also makes a larger number of mistakes on the test set (not shown in
+#   the plot) while adopting a very curvy decision boundary to attempt fitting
+#   all the training points, including the noisy ones at the frontier between
+#   the two classes. This makes the decision boundary very sensitive to the
+#   sampling of the training set and as a result, it does not generalize well in
+#   that region. This is confirmed by the (slightly) lower accuracy on the test
+#   set.
+#
+# Finally, we can also note that the linear model on the raw features was as
+# good or better than the best model using non-linear feature engineering. So in
+# this case, we did not really need this extra complexity in our pipeline.
+# **Simpler is better!**
+#
+# So to conclude, when using non-linear feature engineering, it is often
+# possible to make the pipeline overfit, even if the original feature space is
+# low-dimensional. As a result, it is important to tune the regularization
+# parameter in conjunction with the parameters of the transformers (e.g. tuning
+# `gamma` would be important here). This has a direct impact on the certainty of
+# the predictions.
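+
+# %% [markdown] tags=["solution"]
+# A minimal sketch of what such a joint tuning could look like, reusing the
+# `classifier` pipeline defined above (the grids below are illustrative
+# choices, not part of the official solution):
+#
+# ```python
+# from sklearn.model_selection import GridSearchCV
+#
+# param_grid = {
+#     "nystroem__gamma": [0.01, 0.1, 1.0, 10.0],
+#     "logisticregression__C": [0.01, 0.1, 1.0, 10.0],
+# }
+# search = GridSearchCV(classifier, param_grid, cv=5)
+# search.fit(data_train, target_train)
+# search.best_params_
+# ```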
diff --git a/python_scripts/logistic_regression_non_linear.py b/python_scripts/logistic_regression_non_linear.py deleted file mode 100644 index d28a4a9e6..000000000 --- a/python_scripts/logistic_regression_non_linear.py +++ /dev/null @@ -1,217 +0,0 @@ -# --- -# jupyter: -# kernelspec: -# display_name: Python 3 -# name: python3 -# --- - -# %% [markdown] -# # Beyond linear separation in classification -# -# As we saw in the regression section, the linear classification model expects -# the data to be linearly separable. When this assumption does not hold, the -# model is not expressive enough to properly fit the data. Therefore, we need to -# apply the same tricks as in regression: feature augmentation (potentially -# using expert-knowledge) or using a kernel-based method. -# -# We will provide examples where we will use a kernel support vector machine to -# perform classification on some toy-datasets where it is impossible to find a -# perfect linear separation. -# -# We will generate a first dataset where the data are represented as two -# interlaced half circles. This dataset is generated using the function -# [`sklearn.datasets.make_moons`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html). - -# %% -import numpy as np -import pandas as pd -from sklearn.datasets import make_moons - -feature_names = ["Feature #0", "Features #1"] -target_name = "class" - -X, y = make_moons(n_samples=100, noise=0.13, random_state=42) - -# We store both the data and target in a dataframe to ease plotting -moons = pd.DataFrame( - np.concatenate([X, y[:, np.newaxis]], axis=1), - columns=feature_names + [target_name], -) -data_moons, target_moons = moons[feature_names], moons[target_name] - -# %% [markdown] -# Since the dataset contains only two features, we can make a scatter plot to -# have a look at it. - -# %% -import matplotlib.pyplot as plt -import seaborn as sns - -sns.scatterplot( - data=moons, - x=feature_names[0], - y=feature_names[1], - hue=target_moons, - palette=["tab:red", "tab:blue"], -) -_ = plt.title("Illustration of the moons dataset") - -# %% [markdown] -# From the intuitions that we got by studying linear model, it should be obvious -# that a linear classifier will not be able to find a perfect decision function -# to separate the two classes. -# -# Let's try to see what is the decision boundary of such a linear classifier. We -# will create a predictive model by standardizing the dataset followed by a -# linear support vector machine classifier. - -# %% -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC - -linear_model = make_pipeline(StandardScaler(), SVC(kernel="linear")) -linear_model.fit(data_moons, target_moons) - -# %% [markdown] -# ```{warning} -# Be aware that we fit and will check the boundary decision of the classifier on -# the same dataset without splitting the dataset into a training set and a -# testing set. While this is a bad practice, we use it for the sake of -# simplicity to depict the model behavior. Always use cross-validation when you -# want to assess the generalization performance of a machine-learning model. -# ``` - -# %% [markdown] -# Let's check the decision boundary of such a linear model on this dataset. 
- -# %% -from sklearn.inspection import DecisionBoundaryDisplay - -DecisionBoundaryDisplay.from_estimator( - linear_model, data_moons, response_method="predict", cmap="RdBu", alpha=0.5 -) -sns.scatterplot( - data=moons, - x=feature_names[0], - y=feature_names[1], - hue=target_moons, - palette=["tab:red", "tab:blue"], -) -_ = plt.title("Decision boundary of a linear model") - -# %% [markdown] -# As expected, a linear decision boundary is not enough flexible to split the -# two classes. -# -# To push this example to the limit, we will create another dataset where -# samples of a class will be surrounded by samples from the other class. - -# %% -from sklearn.datasets import make_gaussian_quantiles - -feature_names = ["Feature #0", "Features #1"] -target_name = "class" - -X, y = make_gaussian_quantiles( - n_samples=100, n_features=2, n_classes=2, random_state=42 -) -gauss = pd.DataFrame( - np.concatenate([X, y[:, np.newaxis]], axis=1), - columns=feature_names + [target_name], -) -data_gauss, target_gauss = gauss[feature_names], gauss[target_name] - -# %% -ax = sns.scatterplot( - data=gauss, - x=feature_names[0], - y=feature_names[1], - hue=target_gauss, - palette=["tab:red", "tab:blue"], -) -_ = plt.title("Illustration of the Gaussian quantiles dataset") - -# %% [markdown] -# Here, this is even more obvious that a linear decision function is not -# adapted. We can check what decision function, a linear support vector machine -# will find. - -# %% -linear_model.fit(data_gauss, target_gauss) -DecisionBoundaryDisplay.from_estimator( - linear_model, data_gauss, response_method="predict", cmap="RdBu", alpha=0.5 -) -sns.scatterplot( - data=gauss, - x=feature_names[0], - y=feature_names[1], - hue=target_gauss, - palette=["tab:red", "tab:blue"], -) -_ = plt.title("Decision boundary of a linear model") - -# %% [markdown] -# As expected, a linear separation cannot be used to separate the classes -# properly: the model will under-fit as it will make errors even on the training -# set. -# -# In the section about linear regression, we saw that we could use several -# tricks to make a linear model more flexible by augmenting features or using a -# kernel. Here, we will use the later solution by using a radial basis function -# (RBF) kernel together with a support vector machine classifier. -# -# We will repeat the two previous experiments and check the obtained decision -# function. - -# %% -kernel_model = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma=5)) - -# %% -kernel_model.fit(data_moons, target_moons) -DecisionBoundaryDisplay.from_estimator( - kernel_model, data_moons, response_method="predict", cmap="RdBu", alpha=0.5 -) -sns.scatterplot( - data=moons, - x=feature_names[0], - y=feature_names[1], - hue=target_moons, - palette=["tab:red", "tab:blue"], -) -_ = plt.title("Decision boundary with a model using an RBF kernel") - -# %% [markdown] -# We see that the decision boundary is not anymore a straight line. Indeed, an -# area is defined around the red samples and we could imagine that this -# classifier should be able to generalize on unseen data. -# -# Let's check the decision function on the second dataset. 
- -# %% -kernel_model.fit(data_gauss, target_gauss) -DecisionBoundaryDisplay.from_estimator( - kernel_model, data_gauss, response_method="predict", cmap="RdBu", alpha=0.5 -) -ax = sns.scatterplot( - data=gauss, - x=feature_names[0], - y=feature_names[1], - hue=target_gauss, - palette=["tab:red", "tab:blue"], -) -_ = plt.title("Decision boundary with a model using an RBF kernel") - -# %% [markdown] -# We observe something similar than in the previous case. The decision function -# is more flexible and does not underfit anymore. -# -# Thus, kernel trick or feature expansion are the tricks to make a linear -# classifier more expressive, exactly as we saw in regression. -# -# Keep in mind that adding flexibility to a model can also risk increasing -# overfitting by making the decision function to be sensitive to individual -# (possibly noisy) data points of the training set. Here we can observe that the -# decision functions remain smooth enough to preserve good generalization. If -# you are curious, you can try to repeat the above experiment with `gamma=100` -# and look at the decision functions. From 5cc989e98fb8251b2f3e7df08c783ff9f2e592e4 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:56:47 +0200 Subject: [PATCH 077/108] FIX Notebooks not updated by `make notebooks` (#743) --- notebooks/linear_models_ex_03.ipynb | 194 +++-- notebooks/linear_models_ex_04.ipynb | 244 +++++++ ...s_feature_engineering_classification.ipynb | 682 ++++++++++++++++++ notebooks/linear_models_sol_03.ipynb | 501 +++++++------ notebooks/linear_models_sol_04.ipynb | 395 ++++++++++ .../linear_regression_non_linear_link.ipynb | 312 ++++---- notebooks/logistic_regression.ipynb | 259 ++++++- .../logistic_regression_non_linear.ipynb | 327 --------- notebooks/trees_ex_01.ipynb | 48 +- python_scripts/trees_ex_01.py | 37 +- 10 files changed, 2191 insertions(+), 808 deletions(-) create mode 100644 notebooks/linear_models_ex_04.ipynb create mode 100644 notebooks/linear_models_feature_engineering_classification.ipynb create mode 100644 notebooks/linear_models_sol_04.ipynb delete mode 100644 notebooks/logistic_regression_non_linear.ipynb diff --git a/notebooks/linear_models_ex_03.ipynb b/notebooks/linear_models_ex_03.ipynb index 36b516f3c..7ada01f07 100644 --- a/notebooks/linear_models_ex_03.ipynb +++ b/notebooks/linear_models_ex_03.ipynb @@ -6,25 +6,36 @@ "source": [ "# \ud83d\udcdd Exercise M4.03\n", "\n", - "The parameter `penalty` can control the **type** of regularization to use,\n", - "whereas the regularization **strength** is set using the parameter `C`.\n", - "Setting`penalty=\"none\"` is equivalent to an infinitely large value of `C`. In\n", - "this exercise, we ask you to train a logistic regression classifier using the\n", - "`penalty=\"l2\"` regularization (which happens to be the default in\n", - "scikit-learn) to find by yourself the effect of the parameter `C`.\n", + "Now, we tackle a more realistic classification problem instead of making a\n", + "synthetic dataset. We start by loading the Adult Census dataset with the\n", + "following snippet. For the moment we retain only the **numerical features**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", "\n", - "We start by loading the dataset." 
+ "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", + "target = adult_census[\"class\"]\n", + "data = adult_census.select_dtypes([\"integer\", \"floating\"])\n", + "data = data.drop(columns=[\"education-num\"])\n", + "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
\n", - "

Note

\n", - "

If you want a deeper overview regarding this dataset, you can refer to the\n", - "Appendix - Datasets description section at the end of this MOOC.

\n", - "
" + "We confirm that all the selected features are numerical.\n", + "\n", + "Compute the generalization performance in terms of accuracy of a linear model\n", + "composed of a `StandardScaler` and a `LogisticRegression`. Use a 10-fold\n", + "cross-validation with `return_estimator=True` to be able to inspect the\n", + "trained estimators." ] }, { @@ -33,16 +44,17 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", - "# only keep the Adelie and Chinstrap classes\n", - "penguins = (\n", - " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", - ")\n", + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What is the most important feature seen by the logistic regression?\n", "\n", - "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", - "target_column = \"Species\"" + "You can use a boxplot to compare the absolute values of the coefficients while\n", + "also visualizing the variability induced by the cross-validation resampling." ] }, { @@ -51,22 +63,15 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "penguins_train, penguins_test = train_test_split(penguins, random_state=0)\n", - "\n", - "data_train = penguins_train[culmen_columns]\n", - "data_test = penguins_test[culmen_columns]\n", - "\n", - "target_train = penguins_train[target_column]\n", - "target_test = penguins_test[target_column]" + "# Write your code here." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's create our predictive model." + "Let's now work with **both numerical and categorical features**. You can\n", + "reload the Adult Census dataset with the following snippet:" ] }, { @@ -75,23 +80,42 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LogisticRegression\n", + "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", + "target = adult_census[\"class\"]\n", + "data = adult_census.drop(columns=[\"class\", \"education-num\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a predictive model where:\n", + "- The numerical data must be scaled.\n", + "- The categorical data must be one-hot encoded, set `min_frequency=0.01` to\n", + " group categories concerning less than 1% of the total samples.\n", + "- The predictor is a `LogisticRegression`. You may need to increase the number\n", + " of `max_iter`, which is 100 by default.\n", "\n", - "logistic_regression = make_pipeline(\n", - " StandardScaler(), LogisticRegression(penalty=\"l2\")\n", - ")" + "Use the same 10-fold cross-validation strategy with `return_estimator=True` as\n", + "above to evaluate this complex pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Given the following candidates for the `C` parameter, find out the impact of\n", - "`C` on the classifier decision boundary. You can use\n", - "`sklearn.inspection.DecisionBoundaryDisplay.from_estimator` to plot the\n", - "decision function boundary." 
+ "By comparing the cross-validation test scores of both models fold-to-fold,\n", + "count the number of times the model using both numerical and categorical\n", + "features has a better test score than the model using only numerical features." ] }, { @@ -100,8 +124,60 @@ "metadata": {}, "outputs": [], "source": [ - "Cs = [0.01, 0.1, 1, 10]\n", + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the following questions, you can copy adn paste the following snippet to\n", + "get the feature names from the column transformer here named `preprocessor`.\n", "\n", + "```python\n", + "preprocessor.fit(data)\n", + "feature_names = (\n", + " preprocessor.named_transformers_[\"onehotencoder\"].get_feature_names_out(\n", + " categorical_columns\n", + " )\n", + ").tolist()\n", + "feature_names += numerical_columns\n", + "feature_names\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that there are as many feature names as coefficients in the last step\n", + "of your predictive pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Which of the following pairs of features is most impacting the predictions of\n", + "the logistic regression classifier based on the absolute magnitude of its\n", + "coefficients?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Write your code here." ] }, @@ -109,7 +185,39 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Look at the impact of the `C` hyperparameter on the magnitude of the weights." + "Now create a similar pipeline consisting of the same preprocessor as above,\n", + "followed by a `PolynomialFeatures` and a logistic regression with `C=0.01`.\n", + "Set `degree=2` and `interaction_only=True` to the feature engineering step.\n", + "Remember not to include a \"bias\" feature to avoid introducing a redundancy\n", + "with the intercept of the subsequent logistic regression." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By comparing the cross-validation test scores of both models fold-to-fold,\n", + "count the number of times the model using multiplicative interactions and both\n", + "numerical and categorical features has a better test score than the model\n", + "without interactions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." 
] }, { diff --git a/notebooks/linear_models_ex_04.ipynb b/notebooks/linear_models_ex_04.ipynb new file mode 100644 index 000000000..5d40693d7 --- /dev/null +++ b/notebooks/linear_models_ex_04.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# \ud83d\udcdd Exercise M4.04\n", + "\n", + "In the previous Module we tuned the hyperparameter `C` of the logistic\n", + "regression without mentioning that it controls the regularization strength.\n", + "Later, on the slides on \ud83c\udfa5 **Intuitions on regularized linear models** we\n", + "metioned that a small `C` provides a more regularized model, whereas a\n", + "non-regularized model is obtained with an infinitely large value of `C`.\n", + "Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`\n", + "model.\n", + "\n", + "In this exercise, we ask you to train a logistic regression classifier using\n", + "different values of the parameter `C` to find its effects by yourself.\n", + "\n", + "We start by loading the dataset. We only keep the Adelie and Chinstrap classes\n", + "to keep the discussion simple." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "

Note

\n", + "

If you want a deeper overview regarding this dataset, you can refer to the\n", + "Appendix - Datasets description section at the end of this MOOC.

\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", + "\n", + "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", + "target_column = \"Species\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "penguins_train, penguins_test = train_test_split(\n", + " penguins, random_state=0, test_size=0.4\n", + ")\n", + "\n", + "data_train = penguins_train[culmen_columns]\n", + "data_test = penguins_test[culmen_columns]\n", + "\n", + "target_train = penguins_train[target_column]\n", + "target_test = penguins_test[target_column]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a function to help us fit a given `model` and plot its decision\n", + "boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging\n", + "colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped\n", + "to the white color. Equivalently, the darker the color, the closer the\n", + "predicted probability is to 0 or 1 and the more confident the classifier is in\n", + "its predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.inspection import DecisionBoundaryDisplay\n", + "\n", + "\n", + "def plot_decision_boundary(model):\n", + " model.fit(data_train, target_train)\n", + " accuracy = model.score(data_test, target_test)\n", + " C = model.get_params()[\"logisticregression__C\"]\n", + "\n", + " disp = DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data_train,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"pcolormesh\",\n", + " cmap=\"RdBu_r\",\n", + " alpha=0.8,\n", + " vmin=0.0,\n", + " vmax=1.0,\n", + " )\n", + " DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data_train,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"contour\",\n", + " linestyles=\"--\",\n", + " linewidths=1,\n", + " alpha=0.8,\n", + " levels=[0.5],\n", + " ax=disp.ax_,\n", + " )\n", + " sns.scatterplot(\n", + " data=penguins_train,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=[\"tab:blue\", \"tab:red\"],\n", + " ax=disp.ax_,\n", + " )\n", + " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", + " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now create our predictive model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Influence of the parameter `C` on the decision boundary\n", + "\n", + "Given the following candidates for the `C` parameter and the\n", + "`plot_decision_boundary` function, find out the impact of `C` on the\n", + "classifier's decision boundary.\n", + "\n", + "- How does the value of `C` impact the confidence on the predictions?\n", + "- How does it impact the underfit/overfit trade-off?\n", + "- How does it impact the position and orientation of the decision boundary?\n", + "\n", + "Try to give an interpretation on the reason for such behavior." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6]\n", + "\n", + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Impact of the regularization on the weights\n", + "\n", + "Look at the impact of the `C` hyperparameter on the magnitude of the weights.\n", + "**Hint**: You can [access pipeline\n", + "steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)\n", + "by name or position. Then you can query the attributes of that step such as\n", + "`coef_`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Impact of the regularization on with non-linear feature engineering\n", + "\n", + "Use the `plot_decision_boundary` function to repeat the experiment using a\n", + "non-linear feature engineering pipeline. For such purpose, insert\n", + "`Nystroem(kernel=\"rbf\", gamma=1, n_components=100)` between the\n", + "`StandardScaler` and the `LogisticRegression` steps.\n", + "\n", + "- Does the value of `C` still impact the position of the decision boundary and\n", + " the confidence of the model?\n", + "- What can you say about the impact of `C` on the underfitting vs overfitting\n", + " trade-off?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.kernel_approximation import Nystroem\n", + "\n", + "# Write your code here." 
+ ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/linear_models_feature_engineering_classification.ipynb b/notebooks/linear_models_feature_engineering_classification.ipynb new file mode 100644 index 000000000..87544be19 --- /dev/null +++ b/notebooks/linear_models_feature_engineering_classification.ipynb @@ -0,0 +1,682 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Non-linear feature engineering for Logistic Regression\n", + "\n", + "In the slides at the beginning of the module we mentioned that linear\n", + "classification models are not suited to non-linearly separable data.\n", + "Nevertheless, one can still use feature engineering as previously done for\n", + "regression models to overcome this issue. To do so, we use non-linear\n", + "transformations that typically map the original feature space into a higher\n", + "dimension space, where the linear model can separate the data more easily.\n", + "\n", + "Let us illustrate this on three synthetic datasets. Each dataset has two\n", + "original features and two classes to make it easy to visualize. The first\n", + "dataset is called the \"moons\" dataset as the data points from each class are\n", + "shaped as a crescent moon:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.datasets import make_moons\n", + "\n", + "feature_names = [\"Feature #0\", \"Feature #1\"]\n", + "target_name = \"class\"\n", + "\n", + "X, y = make_moons(n_samples=100, noise=0.13, random_state=42)\n", + "\n", + "# We store both the data and target in a dataframe to ease plotting\n", + "moons = pd.DataFrame(\n", + " np.concatenate([X, y[:, np.newaxis]], axis=1),\n", + " columns=feature_names + [target_name],\n", + ")\n", + "data_moons, target_moons = moons[feature_names], moons[target_name]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The second dataset is called the \"Gaussian quantiles\" dataset as all data\n", + "points are sampled from a 2D Gaussian distribution regardless of the class.\n", + "The points closest to the center are assigned to the class 1 while the points\n", + "in the outer edges are assigned to the class 0, resulting in concentric\n", + "circles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_gaussian_quantiles\n", + "\n", + "X, y = make_gaussian_quantiles(\n", + " n_samples=100, n_features=2, n_classes=2, random_state=42\n", + ")\n", + "gauss = pd.DataFrame(\n", + " np.concatenate([X, y[:, np.newaxis]], axis=1),\n", + " columns=feature_names + [target_name],\n", + ")\n", + "data_gauss, target_gauss = gauss[feature_names], gauss[target_name]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The third dataset is called the \"XOR\" dataset as the data points are sampled\n", + "from a uniform distribution in a 2D space and the class is defined by the\n", + "Exclusive OR (XOR) operation on the two features: the target class is 1 if\n", + "only one of the two features is greater than 0. The target class is 0\n", + "otherwise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xor = pd.DataFrame(\n", + " np.random.RandomState(0).uniform(low=-1, high=1, size=(200, 2)),\n", + " columns=feature_names,\n", + ")\n", + "target_xor = np.logical_xor(xor[\"Feature #0\"] > 0, xor[\"Feature #1\"] > 0)\n", + "target_xor = target_xor.astype(np.int32)\n", + "xor[\"class\"] = target_xor\n", + "data_xor = xor[feature_names]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We use matplotlib to visualize all the datasets at a glance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from matplotlib.colors import ListedColormap\n", + "\n", + "\n", + "_, axs = plt.subplots(ncols=3, figsize=(14, 4), constrained_layout=True)\n", + "\n", + "common_scatter_plot_params = dict(\n", + " cmap=ListedColormap([\"tab:red\", \"tab:blue\"]),\n", + " edgecolor=\"white\",\n", + " linewidth=1,\n", + ")\n", + "\n", + "axs[0].scatter(\n", + " data_moons[feature_names[0]],\n", + " data_moons[feature_names[1]],\n", + " c=target_moons,\n", + " **common_scatter_plot_params,\n", + ")\n", + "axs[1].scatter(\n", + " data_gauss[feature_names[0]],\n", + " data_gauss[feature_names[1]],\n", + " c=target_gauss,\n", + " **common_scatter_plot_params,\n", + ")\n", + "axs[2].scatter(\n", + " data_xor[feature_names[0]],\n", + " data_xor[feature_names[1]],\n", + " c=target_xor,\n", + " **common_scatter_plot_params,\n", + ")\n", + "axs[0].set(\n", + " title=\"The moons dataset\",\n", + " xlabel=feature_names[0],\n", + " ylabel=feature_names[1],\n", + ")\n", + "axs[1].set(\n", + " title=\"The Gaussian quantiles dataset\",\n", + " xlabel=feature_names[0],\n", + ")\n", + "axs[2].set(\n", + " title=\"The XOR dataset\",\n", + " xlabel=feature_names[0],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We intuitively observe that there is no (single) straight line that can\n", + "separate the two classes in any of the datasets. 
We can confirm this by\n", + "fitting a linear model, such as a logistic regression, to each dataset and\n", + "plot the decision boundary of the model.\n", + "\n", + "Let's first define a function to help us fit a given model and plot its\n", + "decision boundary on the previous datasets at a glance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.inspection import DecisionBoundaryDisplay\n", + "\n", + "\n", + "def plot_decision_boundary(model, title=None):\n", + " datasets = [\n", + " (data_moons, target_moons),\n", + " (data_gauss, target_gauss),\n", + " (data_xor, target_xor),\n", + " ]\n", + " fig, axs = plt.subplots(\n", + " ncols=3,\n", + " figsize=(14, 4),\n", + " constrained_layout=True,\n", + " )\n", + "\n", + " for i, ax, (data, target) in zip(\n", + " range(len(datasets)),\n", + " axs,\n", + " datasets,\n", + " ):\n", + " model.fit(data, target)\n", + " DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"pcolormesh\",\n", + " cmap=\"RdBu\",\n", + " alpha=0.8,\n", + " # Setting vmin and vmax to the extreme values of the probability to\n", + " # ensure that 0.5 is mapped to white (the middle) of the blue-red\n", + " # colormap.\n", + " vmin=0,\n", + " vmax=1,\n", + " ax=ax,\n", + " )\n", + " DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"contour\",\n", + " alpha=0.8,\n", + " levels=[0.5], # 0.5 probability contour line\n", + " linestyles=\"--\",\n", + " linewidths=2,\n", + " ax=ax,\n", + " )\n", + " ax.scatter(\n", + " data[feature_names[0]],\n", + " data[feature_names[1]],\n", + " c=target,\n", + " **common_scatter_plot_params,\n", + " )\n", + " if i > 0:\n", + " ax.set_ylabel(None)\n", + " if title is not None:\n", + " fig.suptitle(title)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Now let's define our logistic regression model and plot its decision boundary\n", + "on the three datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())\n", + "logistic_regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_decision_boundary(logistic_regression, title=\"Linear classifier\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "This confirms that it is not possible to separate the two classes with a\n", + "linear model. On each plot we see a **significant number of misclassified\n", + "samples on the training set**! The three plots show typical cases of\n", + "**underfitting** for linear models.\n", + "\n", + "Also, the last two plots show soft colors, meaning that the model is highly\n", + "unsure about which class to choose." 
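+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "As a rough sketch of this underfitting, we can also print the training\n",
+    "accuracy of the pipeline on each dataset. The exact values depend on the\n",
+    "random seeds used above, but they stay below a perfect score:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rough check (sketch): a linear model cannot fit these training sets well,\n",
+    "# so even the accuracy measured on the training data stays below 1.0.\n",
+    "for name, (dataset, labels) in [\n",
+    "    (\"moons\", (data_moons, target_moons)),\n",
+    "    (\"Gaussian quantiles\", (data_gauss, target_gauss)),\n",
+    "    (\"XOR\", (data_xor, target_xor)),\n",
+    "]:\n",
+    "    train_accuracy = logistic_regression.fit(dataset, labels).score(\n",
+    "        dataset, labels\n",
+    "    )\n",
+    "    print(f\"{name}: train accuracy = {train_accuracy:.2f}\")"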
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Engineering non-linear features\n", + "\n", + "As we did for the linear regression models, we now attempt to build a more\n", + "expressive machine learning pipeline by leveraging non-linear feature\n", + "engineering, with techniques such as binning, splines, polynomial features,\n", + "and kernel approximation.\n", + "\n", + "Let's start with the binning transformation of the features:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import KBinsDiscretizer\n", + "\n", + "classifier = make_pipeline(\n", + " KBinsDiscretizer(n_bins=5, encode=\"onehot\"), # already the default params\n", + " LogisticRegression(),\n", + ")\n", + "classifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_decision_boundary(classifier, title=\"Binning classifier\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We can see that the resulting decision boundary is constrained to follow\n", + "**axis-aligned segments**, which is very similar to what a decision tree would\n", + "do as we will see in the next Module. Furthermore, as for decision trees, the\n", + "model makes piecewise constant predictions within each rectangular region.\n", + "\n", + "This axis-aligned decision boundary is not necessarily the natural decision\n", + "boundary a human would have intuitively drawn for the moons dataset and the\n", + "Gaussian quantiles datasets. It still makes it possible for the model to\n", + "successfully separate the data. However, binning alone does not help the\n", + "classifier separate the data for the XOR dataset. This is because **the\n", + "binning transformation is a feature-wise transformation** and thus **cannot\n", + "capture interactions** between features that are necessary to separate the\n", + "XOR dataset.\n", + "\n", + "Let's now consider a **spline** transformation of the original features. This\n", + "transformation can be considered a **smooth version of the binning\n", + "transformation**. You can find more details in the [scikit-learn user guide](\n", + "https://scikit-learn.org/stable/modules/preprocessing.html#spline-transformer)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import SplineTransformer\n", + "\n", + "classifier = make_pipeline(\n", + " SplineTransformer(degree=3, n_knots=5),\n", + " LogisticRegression(),\n", + ")\n", + "classifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_decision_boundary(classifier, title=\"Spline classifier\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We can see that the decision boundary is now smooth, and while it favors\n", + "axis-aligned decision rules when extrapolating in low density regions, it can\n", + "adopt a more curvy decision boundary in the high density regions.\n", + "However, as for the binning transformation, the model still fails to separate\n", + "the data for the XOR dataset, irrespective of the number of knots, for the\n", + "same reasons: **the spline transformation is a feature-wise transformation**\n", + "and thus **cannot capture interactions** between features.\n", + "\n", + "Take into account that the number of knots is a hyperparameter that needs to be\n", + "tuned. If we use too few knots, the model would underfit the data, as shown on\n", + "the moons dataset. If we use too many knots, the model would overfit the data.\n", + "\n", + "
<div class=\"admonition note alert alert-info\">\n",
+    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
+    "<p class=\"last\">Notice that KBinsDiscretizer(encode=\"onehot\") and SplineTransformer do not\n",
+    "require additional scaling. Indeed, they can replace the scaling step for\n",
+    "numerical features: they both create features with values in the [0, 1] range.</p>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Modeling non-additive feature interactions\n",
+    "\n",
+    "We now consider feature engineering techniques that non-linearly combine the\n",
+    "original features in the hope of capturing interactions between them. We will\n",
+    "consider polynomial features and kernel approximation.\n",
+    "\n",
+    "Let's start with the polynomial features:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import PolynomialFeatures\n",
+    "\n",
+    "classifier = make_pipeline(\n",
+    "    StandardScaler(),\n",
+    "    PolynomialFeatures(degree=3, include_bias=False),\n",
+    "    LogisticRegression(C=10),\n",
+    ")\n",
+    "classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_decision_boundary(classifier, title=\"Polynomial classifier\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "We can see that the decision boundary of this polynomial classifier is\n",
+    "**smooth** and can successfully separate the data on all three datasets\n",
+    "(depending on how we set the values of the `degree` and `C`\n",
+    "hyperparameters).\n",
+    "\n",
+    "It is interesting to observe that this model extrapolates very differently\n",
+    "from the previous models: its decision boundary can take a diagonal\n",
+    "direction. Furthermore, we can observe that predictions are very confident in\n",
+    "the low density regions of the feature space, even very close to the decision\n",
+    "boundary.\n",
+    "\n",
+    "We can obtain very similar results by using a kernel approximation technique\n",
+    "such as the Nystr\u00f6m method with a polynomial kernel:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.kernel_approximation import Nystroem\n",
+    "\n",
+    "classifier = make_pipeline(\n",
+    "    StandardScaler(),\n",
+    "    Nystroem(kernel=\"poly\", degree=3, coef0=1, n_components=100),\n",
+    "    LogisticRegression(C=10),\n",
+    ")\n",
+    "classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_decision_boundary(classifier, title=\"Polynomial Nystroem classifier\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "The polynomial kernel approach would be interesting in cases where the\n",
+    "original feature space is already of high dimension: in these cases,\n",
+    "**computing the complete polynomial expansion** with `PolynomialFeatures`\n",
+    "could be **intractable**, while the Nystr\u00f6m method can control the output\n",
+    "dimensionality with the `n_components` parameter.\n",
+    "\n",
+    "Let's now explore the use of a radial basis function (RBF) kernel:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.kernel_approximation import Nystroem\n",
+    "\n",
+    "classifier = make_pipeline(\n",
+    "    StandardScaler(),\n",
+    "    Nystroem(kernel=\"rbf\", gamma=1, n_components=100),\n",
+    "    LogisticRegression(C=5),\n",
+    ")\n",
+    "classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_decision_boundary(classifier, title=\"RBF Nystroem classifier\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "The resulting decision boundary is **smooth** and can successfully separate\n",
+    "the classes for all three datasets. Furthermore, the model extrapolates very\n",
+    "differently: in particular, it tends to be **much less confident in its\n",
+    "predictions in the low density regions** of the feature space.\n",
+    "\n",
+    "As for the previous polynomial pipelines, this pipeline **does not favor\n",
+    "axis-aligned decision rules**. It can be shown mathematically that the\n",
+    "[inductive bias](https://en.wikipedia.org/wiki/Inductive_bias) of our RBF\n",
+    "pipeline is actually rotationally invariant."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Multi-step feature engineering\n",
+    "\n",
+    "It is possible to combine several feature engineering transformers in a\n",
+    "single pipeline to blend their respective inductive biases. For instance, we\n",
+    "can combine the binning transformation with a kernel approximation:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
+   "outputs": [],
+   "source": [
+    "classifier = make_pipeline(\n",
+    "    KBinsDiscretizer(n_bins=5),\n",
+    "    Nystroem(kernel=\"rbf\", gamma=1.0, n_components=100),\n",
+    "    LogisticRegression(),\n",
+    ")\n",
+    "classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_decision_boundary(classifier, title=\"Binning + Nystroem classifier\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "It is interesting to observe that this model is still piecewise constant with\n",
+    "axis-aligned decision boundaries everywhere, but it can now successfully deal\n",
+    "with the XOR problem thanks to the second step of the pipeline that can\n",
+    "model the interactions between the features transformed by the first step.\n",
+    "\n",
+    "We can also combine the spline transformation with a kernel approximation:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.kernel_approximation import Nystroem\n",
+    "\n",
+    "classifier = make_pipeline(\n",
+    "    SplineTransformer(n_knots=5),\n",
+    "    Nystroem(kernel=\"rbf\", gamma=1.0, n_components=100),\n",
+    "    LogisticRegression(),\n",
+    ")\n",
+    "classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_decision_boundary(classifier, title=\"Spline + RBF Nystroem classifier\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "The decision boundary of this pipeline is smooth, but with axis-aligned\n",
+    "extrapolation.\n",
+    "\n",
+    "Depending on the task, this can be considered an advantage or a drawback."
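+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "Before summarizing, here is one more minimal sketch of why interactions\n",
+    "matter for the XOR dataset: the class is 1 exactly when the two features\n",
+    "have opposite signs, so adding a single hand-crafted product feature (the\n",
+    "column name below is of our own choosing) is enough to make the problem\n",
+    "linearly separable:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: a single multiplicative interaction makes XOR linearly separable.\n",
+    "# The new column name is arbitrary and only used for this illustration.\n",
+    "data_xor_product = data_xor.copy()\n",
+    "data_xor_product[\"Feature #0 * Feature #1\"] = (\n",
+    "    data_xor[\"Feature #0\"] * data_xor[\"Feature #1\"]\n",
+    ")\n",
+    "xor_classifier = make_pipeline(StandardScaler(), LogisticRegression())\n",
+    "xor_classifier.fit(data_xor_product, target_xor)\n",
+    "train_accuracy = xor_classifier.score(data_xor_product, target_xor)\n",
+    "print(f\"Train accuracy: {train_accuracy:.2f}\")"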
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Summary and take-away messages\n",
+    "\n",
+    "- Linear models such as logistic regression can be used for classification on\n",
+    "  non-linearly separable datasets by leveraging non-linear feature\n",
+    "  engineering.\n",
+    "- Transformers such as `KBinsDiscretizer` and `SplineTransformer` can be used\n",
+    "  to engineer non-linear features independently for each original feature.\n",
+    "- As a result, these transformers cannot capture interactions between the\n",
+    "  original features (and thus would fail on the XOR classification task).\n",
+    "- Despite this limitation they already augment the expressivity of the\n",
+    "  pipeline, which can be sufficient for some datasets.\n",
+    "- They also favor axis-aligned decision boundaries, in particular in the low\n",
+    "  density regions of the feature space (axis-aligned extrapolation).\n",
+    "- Transformers such as `PolynomialFeatures` and `Nystroem` can be used to\n",
+    "  engineer non-linear features that capture interactions between the original\n",
+    "  features.\n",
+    "- It can be useful to combine several feature engineering transformers in a\n",
+    "  single pipeline to build a more expressive model, for instance to favor\n",
+    "  axis-aligned extrapolation while also capturing interactions.\n",
+    "- In particular, if the original dataset has both numerical and categorical\n",
+    "  features, it can be useful to apply binning or a spline transformation to the\n",
+    "  numerical features and one-hot encoding to the categorical features. Then,\n",
+    "  the resulting features can be combined with a kernel approximation to model\n",
+    "  interactions between numerical and categorical features. This can be\n",
+    "  achieved with the help of `ColumnTransformer`, as sketched in the example\n",
+    "  below.\n",
+    "\n",
+    "In subsequent notebooks and exercises, we will further explore the interplay\n",
+    "between regularization, feature engineering, and the under-fitting /\n",
+    "overfitting trade-off.\n",
+    "\n",
+    "But first we will do an exercise to illustrate the relationship between the\n",
+    "Nystr\u00f6m kernel approximation and support vector machines."
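+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "The following sketch illustrates this last idea. The column names are\n",
+    "hypothetical and would have to be adapted to the dataframe at hand:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.compose import make_column_transformer\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "\n",
+    "# Sketch only: \"num_feature_0\", \"num_feature_1\" and \"cat_feature\" are\n",
+    "# hypothetical column names of a mixed-type dataframe.\n",
+    "mixed_preprocessor = make_column_transformer(\n",
+    "    (SplineTransformer(n_knots=5), [\"num_feature_0\", \"num_feature_1\"]),\n",
+    "    (OneHotEncoder(handle_unknown=\"ignore\"), [\"cat_feature\"]),\n",
+    ")\n",
+    "mixed_model = make_pipeline(\n",
+    "    mixed_preprocessor,\n",
+    "    Nystroem(kernel=\"rbf\", gamma=1.0, n_components=100),\n",
+    "    LogisticRegression(),\n",
+    ")\n",
+    "mixed_model"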
+ ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/linear_models_sol_03.ipynb b/notebooks/linear_models_sol_03.ipynb index 178514087..20256e76b 100644 --- a/notebooks/linear_models_sol_03.ipynb +++ b/notebooks/linear_models_sol_03.ipynb @@ -2,36 +2,40 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "source": [ "# \ud83d\udcc3 Solution for Exercise M4.03\n", "\n", - "In the previous Module we tuned the hyperparameter `C` of the logistic\n", - "regression without mentioning that it controls the regularization strength.\n", - "Later, on the slides on \ud83c\udfa5 **Intuitions on regularized linear models** we\n", - "metioned that a small `C` provides a more regularized model, whereas a\n", - "non-regularized model is obtained with an infinitely large value of `C`.\n", - "Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`\n", - "model.\n", - "\n", - "In this exercise, we ask you to train a logistic regression classifier using\n", - "different values of the parameter `C` to find its effects by yourself.\n", + "Now, we tackle a more realistic classification problem instead of making a\n", + "synthetic dataset. We start by loading the Adult Census dataset with the\n", + "following snippet. For the moment we retain only the **numerical features**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", "\n", - "We start by loading the dataset. We only keep the Adelie and Chinstrap classes\n", - "to keep the discussion simple." + "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", + "target = adult_census[\"class\"]\n", + "data = adult_census.select_dtypes([\"integer\", \"floating\"])\n", + "data = data.drop(columns=[\"education-num\"])\n", + "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "
<div class=\"admonition note alert alert-info\">\n",
-    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
-    "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
-    "Appendix - Datasets description section at the end of this MOOC.</p>\n",
-    "</div>
" + "We confirm that all the selected features are numerical.\n", + "\n", + "Compute the generalization performance in terms of accuracy of a linear model\n", + "composed of a `StandardScaler` and a `LogisticRegression`. Use a 10-fold\n", + "cross-validation with `return_estimator=True` to be able to inspect the\n", + "trained estimators." ] }, { @@ -40,15 +44,28 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", + "# solution\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import cross_validate\n", "\n", - "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", - "penguins = (\n", - " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + "model = make_pipeline(StandardScaler(), LogisticRegression())\n", + "cv_results_lr = cross_validate(\n", + " model, data, target, cv=10, return_estimator=True\n", ")\n", + "test_score_lr = cv_results_lr[\"test_score\"]\n", + "test_score_lr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What is the most important feature seen by the logistic regression?\n", "\n", - "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", - "target_column = \"Species\"" + "You can use a boxplot to compare the absolute values of the coefficients while\n", + "also visualizing the variability induced by the cross-validation resampling." ] }, { @@ -57,29 +74,41 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "penguins_train, penguins_test = train_test_split(\n", - " penguins, random_state=0, test_size=0.4\n", - ")\n", + "# solution\n", + "import matplotlib.pyplot as plt\n", "\n", - "data_train = penguins_train[culmen_columns]\n", - "data_test = penguins_test[culmen_columns]\n", + "coefs = [pipeline[-1].coef_[0] for pipeline in cv_results_lr[\"estimator\"]]\n", + "coefs = pd.DataFrame(coefs, columns=data.columns)\n", "\n", - "target_train = penguins_train[target_column]\n", - "target_test = penguins_test[target_column]" + "color = {\"whiskers\": \"black\", \"medians\": \"black\", \"caps\": \"black\"}\n", + "_, ax = plt.subplots()\n", + "_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "Since we scaled the features, the coefficients of the linear model can be\n", + "meaningful compared directly. `\"capital-gain\"` is the most impacting feature.\n", + "Just be aware not to draw conclusions on the causal effect provided the impact\n", + "of a feature. Interested readers are refered to the [example on Common\n", + "pitfalls in the interpretation of coefficients of linear\n", + "models](https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html)\n", + "or the [example on Failure of Machine Learning to infer causal\n", + "effects](https://scikit-learn.org/stable/auto_examples/inspection/plot_causal_interpretation.html)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We define a function to help us fit a given `model` and plot its decision\n", - "boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging\n", - "colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped\n", - "to the white color. 
Equivalently, the darker the color, the closer the\n", - "predicted probability is to 0 or 1 and the more confident the classifier is in\n", - "its predictions." + "Let's now work with **both numerical and categorical features**. You can\n", + "reload the Adult Census dataset with the following snippet:" ] }, { @@ -88,53 +117,24 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn.inspection import DecisionBoundaryDisplay\n", - "\n", - "\n", - "def plot_decision_boundary(model):\n", - " model.fit(data_train, target_train)\n", - " accuracy = model.score(data_test, target_test)\n", - "\n", - " disp = DecisionBoundaryDisplay.from_estimator(\n", - " model,\n", - " data_train,\n", - " response_method=\"predict_proba\",\n", - " plot_method=\"pcolormesh\",\n", - " cmap=\"RdBu_r\",\n", - " alpha=0.8,\n", - " vmin=0.0,\n", - " vmax=1.0,\n", - " )\n", - " DecisionBoundaryDisplay.from_estimator(\n", - " model,\n", - " data_train,\n", - " response_method=\"predict_proba\",\n", - " plot_method=\"contour\",\n", - " linestyles=\"--\",\n", - " linewidths=1,\n", - " alpha=0.8,\n", - " levels=[0.5],\n", - " ax=disp.ax_,\n", - " )\n", - " sns.scatterplot(\n", - " data=penguins_train,\n", - " x=culmen_columns[0],\n", - " y=culmen_columns[1],\n", - " hue=target_column,\n", - " palette=[\"tab:blue\", \"tab:red\"],\n", - " ax=disp.ax_,\n", - " )\n", - " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", - " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" + "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", + "target = adult_census[\"class\"]\n", + "data = adult_census.drop(columns=[\"class\", \"education-num\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's now create our predictive model." + "Create a predictive model where:\n", + "- The numerical data must be scaled.\n", + "- The categorical data must be one-hot encoded, set `min_frequency=0.01` to\n", + " group categories concerning less than 1% of the total samples.\n", + "- The predictor is a `LogisticRegression`. You may need to increase the number\n", + " of `max_iter`, which is 100 by default.\n", + "\n", + "Use the same 10-fold cross-validation strategy with `return_estimator=True` as\n", + "above to evaluate this complex pipeline." 
] }, { @@ -143,28 +143,36 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())" + "# solution\n", + "from sklearn.compose import make_column_selector as selector\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "categorical_columns = selector(dtype_include=object)(data)\n", + "numerical_columns = selector(dtype_exclude=object)(data)\n", + "\n", + "preprocessor = make_column_transformer(\n", + " (\n", + " OneHotEncoder(handle_unknown=\"ignore\", min_frequency=0.01),\n", + " categorical_columns,\n", + " ),\n", + " (StandardScaler(), numerical_columns),\n", + ")\n", + "model = make_pipeline(preprocessor, LogisticRegression(max_iter=5_000))\n", + "cv_results_complex_lr = cross_validate(\n", + " model, data, target, cv=10, return_estimator=True, n_jobs=2\n", + ")\n", + "test_score_complex_lr = cv_results_complex_lr[\"test_score\"]\n", + "test_score_complex_lr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Influence of the parameter `C` on the decision boundary\n", - "\n", - "Given the following candidates for the `C` parameter and the\n", - "`plot_decision_boundary` function, find out the impact of `C` on the\n", - "classifier's decision boundary.\n", - "\n", - "- How does the value of `C` impact the confidence on the predictions?\n", - "- How does it impact the underfit/overfit trade-off?\n", - "- How does it impact the position and orientation of the decision boundary?\n", - "\n", - "Try to give an interpretation on the reason for such behavior." + "By comparing the cross-validation test scores of both models fold-to-fold,\n", + "count the number of times the model using both numerical and categorical\n", + "features has a better test score than the model using only numerical features." 
] }, { @@ -173,75 +181,83 @@ "metadata": {}, "outputs": [], "source": [ - "Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6]\n", + "# solution\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", "\n", + "indices = np.arange(len(test_score_lr))\n", + "plt.scatter(\n", + " indices, test_score_lr, color=\"tab:blue\", label=\"numerical features only\"\n", + ")\n", + "plt.scatter(\n", + " indices,\n", + " test_score_complex_lr,\n", + " color=\"tab:red\",\n", + " label=\"all features\",\n", + ")\n", + "plt.ylim((0, 1))\n", + "plt.xlabel(\"Cross-validation iteration\")\n", + "plt.ylabel(\"Accuracy\")\n", + "_ = plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", + "\n", + "print(\n", + " \"A model using both all features is better than a\"\n", + " \" model using only numerical features for\"\n", + " f\" {sum(test_score_complex_lr > test_score_lr)} CV iterations out of 10.\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the following questions, you can copy adn paste the following snippet to\n", + "get the feature names from the column transformer here named `preprocessor`.\n", + "\n", + "```python\n", + "preprocessor.fit(data)\n", + "feature_names = (\n", + " preprocessor.named_transformers_[\"onehotencoder\"].get_feature_names_out(\n", + " categorical_columns\n", + " )\n", + ").tolist()\n", + "feature_names += numerical_columns\n", + "feature_names\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# solution\n", - "for C in Cs:\n", - " logistic_regression.set_params(logisticregression__C=C)\n", - " plot_decision_boundary(logistic_regression)" + "preprocessor.fit(data)\n", + "feature_names = (\n", + " preprocessor.named_transformers_[\"onehotencoder\"].get_feature_names_out(\n", + " categorical_columns\n", + " )\n", + ").tolist()\n", + "feature_names += numerical_columns\n", + "feature_names" ] }, { "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, + "metadata": {}, "source": [ - "\n", - "On this series of plots we can observe several important points. Regarding the\n", - "confidence on the predictions:\n", - "\n", - "- For low values of `C` (strong regularization), the classifier is less\n", - " confident in its predictions. We are enforcing a **spread sigmoid**.\n", - "- For high values of `C` (weak regularization), the classifier is more\n", - " confident: the areas with dark blue (very confident in predicting \"Adelie\")\n", - " and dark red (very confident in predicting \"Chinstrap\") nearly cover the\n", - " entire feature space. We are enforcing a **steep sigmoid**.\n", - "\n", - "To answer the next question, think that misclassified data points are more\n", - "costly when the classifier is more confident on the decision. Decision rules\n", - "are mostly driven by avoiding such cost. From the previous observations we can\n", - "then deduce that:\n", - "\n", - "- The smaller the `C` (the stronger the regularization), the lower the cost\n", - " of a misclassification. As more data points lay in the low-confidence\n", - " zone, the more the decision rules are influenced almost uniformly by all\n", - " the data points. This leads to a less expressive model, which may underfit.\n", - "- The higher the value of `C` (the weaker the regularization), the more the\n", - " decision is influenced by a few training points very close to the boundary,\n", - " where decisions are costly. 
Remember that models may overfit if the number\n", - " of samples in the training set is too small, as at least a minimum of\n", - " samples is needed to average the noise out.\n", - "\n", - "The orientation is the result of two factors: minimizing the number of\n", - "misclassified training points with high confidence and their distance to the\n", - "decision boundary (notice how the contour line tries to align with the most\n", - "misclassified data points in the dark-colored zone). This is closely related\n", - "to the value of the weights of the model, which is explained in the next part\n", - "of the exercise.\n", - "\n", - "Finally, for small values of `C` the position of the decision boundary is\n", - "affected by the class imbalance: when `C` is near zero, the model predicts the\n", - "majority class (as seen in the training set) everywhere in the feature space.\n", - "In our case, there are approximately two times more \"Adelie\" than \"Chinstrap\"\n", - "penguins. This explains why the decision boundary is shifted to the right when\n", - "`C` gets smaller. Indeed, the most regularized model predicts light blue\n", - "almost everywhere in the feature space." + "Notice that there are as many feature names as coefficients in the last step\n", + "of your predictive pipeline." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Impact of the regularization on the weights\n", - "\n", - "Look at the impact of the `C` hyperparameter on the magnitude of the weights.\n", - "**Hint**: You can [access pipeline\n", - "steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)\n", - "by name or position. Then you can query the attributes of that step such as\n", - "`coef_`." + "Which of the following pairs of features is most impacting the predictions of\n", + "the logistic regression classifier based on the absolute magnitude of its\n", + "coefficients?" ] }, { @@ -251,67 +267,63 @@ "outputs": [], "source": [ "# solution\n", - "lr_weights = []\n", - "for C in Cs:\n", - " logistic_regression.set_params(logisticregression__C=C)\n", - " logistic_regression.fit(data_train, target_train)\n", - " coefs = logistic_regression[-1].coef_[0]\n", - " lr_weights.append(pd.Series(coefs, index=culmen_columns))" + "coefs = [\n", + " pipeline[-1].coef_[0] for pipeline in cv_results_complex_lr[\"estimator\"]\n", + "]\n", + "coefs = pd.DataFrame(coefs, columns=feature_names)\n", + "\n", + "_, ax = plt.subplots(figsize=(10, 35))\n", + "_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { "tags": [ "solution" ] }, - "outputs": [], "source": [ - "lr_weights = pd.concat(lr_weights, axis=1, keys=[f\"C: {C}\" for C in Cs])\n", - "lr_weights.plot.barh()\n", - "_ = plt.title(\"LogisticRegression weights depending of C\")" + "We can visually inspect the coefficients and observe that `\"capital-gain\"` and\n", + "`\"education_Doctorate\"` are impacting the predictions the most." ] }, { "cell_type": "markdown", - "metadata": { - "tags": [ - "solution" - ] - }, + "metadata": {}, "source": [ + "Now create a similar pipeline consisting of the same preprocessor as above,\n", + "followed by a `PolynomialFeatures` and a logistic regression with `C=0.01`.\n", + "Set `degree=2` and `interaction_only=True` to the feature engineering step.\n", + "Remember not to include a \"bias\" feature to avoid introducing a redundancy\n", + "with the intercept of the subsequent logistic regression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# solution\n", + "from sklearn.preprocessing import PolynomialFeatures\n", "\n", - "As small `C` provides a more regularized model, it shrinks the weights values\n", - "toward zero, as in the `Ridge` model.\n", - "\n", - "In particular, with a strong penalty (e.g. `C = 0.01`), the weight of the feature\n", - "named \"Culmen Depth (mm)\" is almost zero. It explains why the decision\n", - "separation in the plot is almost perpendicular to the \"Culmen Length (mm)\"\n", - "feature.\n", - "\n", - "For even stronger penalty strengths (e.g. `C = 1e-6`), the weights of both\n", - "features are almost zero. It explains why the decision separation in the plot\n", - "is almost constant in the feature space: the predicted probability is only\n", - "based on the intercept parameter of the model (which is never regularized)." + "model_with_interaction = make_pipeline(\n", + " preprocessor,\n", + " PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),\n", + " LogisticRegression(C=0.01, max_iter=5_000),\n", + ")\n", + "model_with_interaction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Impact of the regularization on with non-linear feature engineering\n", - "\n", - "Use the `plot_decision_boundary` function to repeat the experiment using a\n", - "non-linear feature engineering pipeline. For such purpose, insert\n", - "`Nystroem(kernel=\"rbf\", gamma=1, n_components=100)` between the\n", - "`StandardScaler` and the `LogisticRegression` steps.\n", - "\n", - "- Does the value of `C` still impact the position of the decision boundary and\n", - " the confidence of the model?\n", - "- What can you say about the impact of `C` on the underfitting vs overfitting\n", - " trade-off?" + "By comparing the cross-validation test scores of both models fold-to-fold,\n", + "count the number of times the model using multiplicative interactions and both\n", + "numerical and categorical features has a better test score than the model\n", + "without interactions." 
] }, { @@ -320,18 +332,51 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.kernel_approximation import Nystroem\n", - "\n", "# solution\n", - "classifier = make_pipeline(\n", - " StandardScaler(),\n", - " Nystroem(kernel=\"rbf\", gamma=1.0, n_components=100, random_state=0),\n", - " LogisticRegression(penalty=\"l2\", max_iter=1000),\n", + "cv_results_interactions = cross_validate(\n", + " model_with_interaction,\n", + " data,\n", + " target,\n", + " cv=10,\n", + " return_estimator=True,\n", + " n_jobs=2,\n", ")\n", - "\n", - "for C in Cs:\n", - " classifier.set_params(logisticregression__C=C)\n", - " plot_decision_boundary(classifier)" + "test_score_interactions = cv_results_interactions[\"test_score\"]\n", + "test_score_interactions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# solution\n", + "plt.scatter(\n", + " indices, test_score_lr, color=\"tab:blue\", label=\"numerical features only\"\n", + ")\n", + "plt.scatter(\n", + " indices,\n", + " test_score_complex_lr,\n", + " color=\"tab:red\",\n", + " label=\"all features\",\n", + ")\n", + "plt.scatter(\n", + " indices,\n", + " test_score_interactions,\n", + " color=\"black\",\n", + " label=\"all features and interactions\",\n", + ")\n", + "plt.xlabel(\"Cross-validation iteration\")\n", + "plt.ylabel(\"Accuracy\")\n", + "_ = plt.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", + "\n", + "print(\n", + " \"A model using all features and interactions is better than a model\"\n", + " \" without interactions for\"\n", + " f\" {sum(test_score_interactions > test_score_complex_lr)} CV iterations\"\n", + " \" out of 10.\"\n", + ")" ] }, { @@ -342,41 +387,19 @@ ] }, "source": [ - "\n", - "- For the lowest values of `C`, the overall pipeline underfits: it predicts\n", - " the majority class everywhere, as previously.\n", - "- When `C` increases, the models starts to predict some datapoints from the\n", - " \"Chinstrap\" class but the model is not very confident anywhere in the\n", - " feature space.\n", - "- The decision boundary is no longer a straight line: the linear model is now\n", - " classifying in the 100-dimensional feature space created by the `Nystroem`\n", - " transformer. As are result, the decision boundary induced by the overall\n", - " pipeline is now expressive enough to wrap around the minority class.\n", - "- For `C = 1` in particular, it finds a smooth red blob around most of the\n", - " \"Chinstrap\" data points. When moving away from the data points, the model is\n", - " less confident in its predictions and again tends to predict the majority\n", - " class according to the proportion in the training set.\n", - "- For higher values of `C`, the model starts to overfit: it is very confident\n", - " in its predictions almost everywhere, but it should not be trusted: the\n", - " model also makes a larger number of mistakes on the test set (not shown in\n", - " the plot) while adopting a very curvy decision boundary to attempt fitting\n", - " all the training points, including the noisy ones at the frontier between\n", - " the two classes. This makes the decision boundary very sensitive to the\n", - " sampling of the training set and as a result, it does not generalize well in\n", - " that region. This is confirmed by the (slightly) lower accuracy on the test\n", - " set.\n", - "\n", - "Finally, we can also note that the linear model on the raw features was as\n", - "good or better than the best model using non-linear feature engineering. 
So in\n",
-    "this case, we did not really need this extra complexity in our pipeline.\n",
-    "**Simpler is better!**\n",
-    "\n",
-    "So to conclude, when using non-linear feature engineering, it is often\n",
-    "possible to make the pipeline overfit, even if the original feature space is\n",
-    "low-dimensional. As a result, it is important to tune the regularization\n",
-    "parameter in conjunction with the parameters of the transformers (e.g. tuning\n",
-    "`gamma` would be important here). This has a direct impact on the certainty of\n",
-    "the predictions."
+    "When you multiply two one-hot encoded categorical features, the resulting\n",
+    "interaction feature is mostly 0, with a 1 only when both original features are\n",
+    "active, acting as a logical `AND`. In this case it could mean we are creating\n",
+    "new rules such as \"has a given education `AND` a given native country\", which\n",
+    "we expect to be predictive. These new rules map the original feature space into\n",
+    "a higher dimension space, where the linear model can separate the data more\n",
+    "easily.\n",
+    "\n",
+    "Keep in mind that multiplying all pairs of one-hot encoded features may\n",
+    "lead to a rapid increase in the number of features, especially if the original\n",
+    "categorical variables have many levels. This can increase the computational\n",
+    "cost of your model and promote overfitting, as we will see in a future\n",
+    "notebook."
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "main_language": "python"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
\ No newline at end of file
diff --git a/notebooks/linear_models_sol_04.ipynb b/notebooks/linear_models_sol_04.ipynb
new file mode 100644
index 000000000..54b7a613e
--- /dev/null
+++ b/notebooks/linear_models_sol_04.ipynb
@@ -0,0 +1,395 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "# \ud83d\udcc3 Solution for Exercise M4.04\n",
+    "\n",
+    "In the previous Module we tuned the hyperparameter `C` of the logistic\n",
+    "regression without mentioning that it controls the regularization strength.\n",
+    "Later, on the slides on \ud83c\udfa5 **Intuitions on regularized linear models** we\n",
+    "mentioned that a small `C` provides a more regularized model, whereas a\n",
+    "non-regularized model is obtained with an infinitely large value of `C`.\n",
+    "Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge`\n",
+    "model.\n",
+    "\n",
+    "In this exercise, we ask you to train a logistic regression classifier using\n",
+    "different values of the parameter `C` to find its effects by yourself.\n",
+    "\n",
+    "We start by loading the dataset. We only keep the Adelie and Chinstrap classes\n",
+    "to keep the discussion simple."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "
<div class=\"admonition note alert alert-info\">\n",
+    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
+    "<p class=\"last\">If you want a deeper overview regarding this dataset, you can refer to the\n",
+    "Appendix - Datasets description section at the end of this MOOC.</p>\n",
+    "</div>
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "penguins = pd.read_csv(\"../datasets/penguins_classification.csv\")\n", + "penguins = (\n", + " penguins.set_index(\"Species\").loc[[\"Adelie\", \"Chinstrap\"]].reset_index()\n", + ")\n", + "\n", + "culmen_columns = [\"Culmen Length (mm)\", \"Culmen Depth (mm)\"]\n", + "target_column = \"Species\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "penguins_train, penguins_test = train_test_split(\n", + " penguins, random_state=0, test_size=0.4\n", + ")\n", + "\n", + "data_train = penguins_train[culmen_columns]\n", + "data_test = penguins_test[culmen_columns]\n", + "\n", + "target_train = penguins_train[target_column]\n", + "target_test = penguins_test[target_column]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a function to help us fit a given `model` and plot its decision\n", + "boundary. We recall that by using a `DecisionBoundaryDisplay` with diverging\n", + "colormap, `vmin=0` and `vmax=1`, we ensure that the 0.5 probability is mapped\n", + "to the white color. Equivalently, the darker the color, the closer the\n", + "predicted probability is to 0 or 1 and the more confident the classifier is in\n", + "its predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.inspection import DecisionBoundaryDisplay\n", + "\n", + "\n", + "def plot_decision_boundary(model):\n", + " model.fit(data_train, target_train)\n", + " accuracy = model.score(data_test, target_test)\n", + " C = model.get_params()[\"logisticregression__C\"]\n", + "\n", + " disp = DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data_train,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"pcolormesh\",\n", + " cmap=\"RdBu_r\",\n", + " alpha=0.8,\n", + " vmin=0.0,\n", + " vmax=1.0,\n", + " )\n", + " DecisionBoundaryDisplay.from_estimator(\n", + " model,\n", + " data_train,\n", + " response_method=\"predict_proba\",\n", + " plot_method=\"contour\",\n", + " linestyles=\"--\",\n", + " linewidths=1,\n", + " alpha=0.8,\n", + " levels=[0.5],\n", + " ax=disp.ax_,\n", + " )\n", + " sns.scatterplot(\n", + " data=penguins_train,\n", + " x=culmen_columns[0],\n", + " y=culmen_columns[1],\n", + " hue=target_column,\n", + " palette=[\"tab:blue\", \"tab:red\"],\n", + " ax=disp.ax_,\n", + " )\n", + " plt.legend(bbox_to_anchor=(1.05, 0.8), loc=\"upper left\")\n", + " plt.title(f\"C: {C} \\n Accuracy on the test set: {accuracy:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now create our predictive model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Influence of the parameter `C` on the decision boundary\n", + "\n", + "Given the following candidates for the `C` parameter and the\n", + "`plot_decision_boundary` function, find out the impact of `C` on the\n", + "classifier's decision boundary.\n", + "\n", + "- How does the value of `C` impact the confidence on the predictions?\n", + "- How does it impact the underfit/overfit trade-off?\n", + "- How does it impact the position and orientation of the decision boundary?\n", + "\n", + "Try to give an interpretation on the reason for such behavior." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Cs = [1e-6, 0.01, 0.1, 1, 10, 100, 1e6]\n", + "\n", + "# solution\n", + "for C in Cs:\n", + " logistic_regression.set_params(logisticregression__C=C)\n", + " plot_decision_boundary(logistic_regression)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "\n", + "On this series of plots we can observe several important points. Regarding the\n", + "confidence on the predictions:\n", + "\n", + "- For low values of `C` (strong regularization), the classifier is less\n", + " confident in its predictions. We are enforcing a **spread sigmoid**.\n", + "- For high values of `C` (weak regularization), the classifier is more\n", + " confident: the areas with dark blue (very confident in predicting \"Adelie\")\n", + " and dark red (very confident in predicting \"Chinstrap\") nearly cover the\n", + " entire feature space. We are enforcing a **steep sigmoid**.\n", + "\n", + "To answer the next question, think that misclassified data points are more\n", + "costly when the classifier is more confident on the decision. Decision rules\n", + "are mostly driven by avoiding such cost. From the previous observations we can\n", + "then deduce that:\n", + "\n", + "- The smaller the `C` (the stronger the regularization), the lower the cost\n", + " of a misclassification. As more data points lay in the low-confidence\n", + " zone, the more the decision rules are influenced almost uniformly by all\n", + " the data points. This leads to a less expressive model, which may underfit.\n", + "- The higher the value of `C` (the weaker the regularization), the more the\n", + " decision is influenced by a few training points very close to the boundary,\n", + " where decisions are costly. Remember that models may overfit if the number\n", + " of samples in the training set is too small, as at least a minimum of\n", + " samples is needed to average the noise out.\n", + "\n", + "The orientation is the result of two factors: minimizing the number of\n", + "misclassified training points with high confidence and their distance to the\n", + "decision boundary (notice how the contour line tries to align with the most\n", + "misclassified data points in the dark-colored zone). 
This is closely related\n",
+    "to the value of the weights of the model, which is explained in the next part\n",
+    "of the exercise.\n",
+    "\n",
+    "Finally, for small values of `C` the position of the decision boundary is\n",
+    "affected by the class imbalance: when `C` is near zero, the model predicts the\n",
+    "majority class (as seen in the training set) everywhere in the feature space.\n",
+    "In our case, there are approximately two times more \"Adelie\" than \"Chinstrap\"\n",
+    "penguins. This explains why the decision boundary is shifted to the right when\n",
+    "`C` gets smaller. Indeed, the most regularized model predicts light blue\n",
+    "almost everywhere in the feature space."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Impact of the regularization on the weights\n",
+    "\n",
+    "Look at the impact of the `C` hyperparameter on the magnitude of the weights.\n",
+    "**Hint**: You can [access pipeline\n",
+    "steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)\n",
+    "by name or position. Then you can query the attributes of that step such as\n",
+    "`coef_`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# solution\n",
+    "lr_weights = []\n",
+    "for C in Cs:\n",
+    "    logistic_regression.set_params(logisticregression__C=C)\n",
+    "    logistic_regression.fit(data_train, target_train)\n",
+    "    coefs = logistic_regression[-1].coef_[0]\n",
+    "    lr_weights.append(pd.Series(coefs, index=culmen_columns))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "lr_weights = pd.concat(lr_weights, axis=1, keys=[f\"C: {C}\" for C in Cs])\n",
+    "lr_weights.plot.barh()\n",
+    "_ = plt.title(\"LogisticRegression weights depending on C\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": [
+     "solution"
+    ]
+   },
+   "source": [
+    "\n",
+    "As small `C` provides a more regularized model, it shrinks the weights values\n",
+    "toward zero, as in the `Ridge` model.\n",
+    "\n",
+    "In particular, with a strong penalty (e.g. `C = 0.01`), the weight of the feature\n",
+    "named \"Culmen Depth (mm)\" is almost zero. It explains why the decision\n",
+    "separation in the plot is almost perpendicular to the \"Culmen Length (mm)\"\n",
+    "feature.\n",
+    "\n",
+    "For even stronger penalty strengths (e.g. `C = 1e-6`), the weights of both\n",
+    "features are almost zero. It explains why the decision separation in the plot\n",
+    "is almost constant in the feature space: the predicted probability is only\n",
+    "based on the intercept parameter of the model (which is never regularized)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Impact of the regularization with non-linear feature engineering\n",
+    "\n",
+    "Use the `plot_decision_boundary` function to repeat the experiment using a\n",
+    "non-linear feature engineering pipeline. For such purpose, insert\n",
+    "`Nystroem(kernel=\"rbf\", gamma=1, n_components=100)` between the\n",
+    "`StandardScaler` and the `LogisticRegression` steps.\n",
+    "\n",
+    "- Does the value of `C` still impact the position of the decision boundary and\n",
+    "  the confidence of the model?\n",
+    "- What can you say about the impact of `C` on the underfitting vs overfitting\n",
+    "  trade-off?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.kernel_approximation import Nystroem\n",
+ "\n",
+ "# solution\n",
+ "classifier = make_pipeline(\n",
+ "    StandardScaler(),\n",
+ "    Nystroem(kernel=\"rbf\", gamma=1.0, n_components=100, random_state=0),\n",
+ "    LogisticRegression(max_iter=1000),\n",
+ ")\n",
+ "\n",
+ "for C in Cs:\n",
+ "    classifier.set_params(logisticregression__C=C)\n",
+ "    plot_decision_boundary(classifier)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": [
+ "solution"
+ ]
+ },
+ "source": [
+ "\n",
+ "- For the lowest values of `C`, the overall pipeline underfits: it predicts\n",
+ "  the majority class everywhere, as previously.\n",
+ "- When `C` increases, the model starts to predict some data points from the\n",
+ "  \"Chinstrap\" class but the model is not very confident anywhere in the\n",
+ "  feature space.\n",
+ "- The decision boundary is no longer a straight line: the linear model is now\n",
+ "  classifying in the 100-dimensional feature space created by the `Nystroem`\n",
+ "  transformer. As a result, the decision boundary induced by the overall\n",
+ "  pipeline is now expressive enough to wrap around the minority class.\n",
+ "- For `C = 1` in particular, it finds a smooth red blob around most of the\n",
+ "  \"Chinstrap\" data points. When moving away from the data points, the model is\n",
+ "  less confident in its predictions and again tends to predict the majority\n",
+ "  class according to the proportion in the training set.\n",
+ "- For higher values of `C`, the model starts to overfit: it is very confident\n",
+ "  in its predictions almost everywhere, but it should not be trusted: the\n",
+ "  model also makes a larger number of mistakes on the test set (not shown in\n",
+ "  the plot) while adopting a very curvy decision boundary to attempt fitting\n",
+ "  all the training points, including the noisy ones at the frontier between\n",
+ "  the two classes. This makes the decision boundary very sensitive to the\n",
+ "  sampling of the training set and as a result, it does not generalize well in\n",
+ "  that region. This is confirmed by the (slightly) lower accuracy on the test\n",
+ "  set.\n",
+ "\n",
+ "Finally, we can also note that the linear model on the raw features was as\n",
+ "good as or better than the best model using non-linear feature engineering. So\n",
+ "in this case, we did not really need this extra complexity in our pipeline.\n",
+ "**Simpler is better!**\n",
+ "\n",
+ "So to conclude, when using non-linear feature engineering, it is often\n",
+ "possible to make the pipeline overfit, even if the original feature space is\n",
+ "low-dimensional. As a result, it is important to tune the regularization\n",
+ "parameter in conjunction with the parameters of the transformers (e.g. tuning\n",
+ "`gamma` would be important here). This has a direct impact on the certainty of\n",
+ "the predictions.\n",
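+ "\n",
+ "As a minimal complementary sketch (not part of the original solution), one\n",
+ "can quantify this trade-off by scoring the pipeline on both splits for each\n",
+ "`C`. This assumes the train/test splits `data_train`, `target_train`,\n",
+ "`data_test` and `target_test` are defined earlier in the notebook:\n",
+ "\n",
+ "```python\n",
+ "# Illustrative sketch: compare train and test accuracy for each value of C,\n",
+ "# assuming the train/test splits defined earlier in this notebook.\n",
+ "for C in Cs:\n",
+ "    classifier.set_params(logisticregression__C=C)\n",
+ "    classifier.fit(data_train, target_train)\n",
+ "    train_acc = classifier.score(data_train, target_train)\n",
+ "    test_acc = classifier.score(data_test, target_test)\n",
+ "    print(f\"C={C:g}: train={train_acc:.3f}, test={test_acc:.3f}\")\n",
+ "```"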
+ ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/notebooks/linear_regression_non_linear_link.ipynb b/notebooks/linear_regression_non_linear_link.ipynb index d56505e65..33f6936cc 100644 --- a/notebooks/linear_regression_non_linear_link.ipynb +++ b/notebooks/linear_regression_non_linear_link.ipynb @@ -2,9 +2,10 @@ "cells": [ { "cell_type": "markdown", + "id": "14eec485", "metadata": {}, "source": [ - "# Linear regression for a non-linear features-target relationship\n", + "# Non-linear feature engineering for Linear Regression\n", "\n", "In this notebook, we show that even if linear models are not natively adapted\n", "to express a `target` that is not a linear function of the `data`, it is still\n", @@ -15,16 +16,16 @@ "step followed by a linear regression step can therefore be considered a\n", "non-linear regression model as a whole.\n", "\n", - "
\n", - "

Tip

\n", - "

np.random.RandomState allows to create a random number generator which can\n", - "be later used to get deterministic results.

\n", - "
" + "In this occasion we are not loading a dataset, but creating our own custom\n", + "data consisting of a single feature. The target is built as a cubic polynomial\n", + "on said feature. To make things a bit more challenging, we add some random\n", + "fluctuations to the target." ] }, { "cell_type": "code", "execution_count": null, + "id": "8f516165", "metadata": {}, "outputs": [], "source": [ @@ -43,18 +44,22 @@ }, { "cell_type": "markdown", + "id": "00fd3b4f", "metadata": {}, "source": [ - "
\n", - "

Note

\n", - "

To ease the plotting, we create a pandas dataframe containing the data and\n", - "target:

\n", - "
" + "```{tip}\n", + "`np.random.RandomState` allows to create a random number generator which can\n", + "be later used to get deterministic results.\n", + "```\n", + "\n", + "To ease the plotting, we create a pandas dataframe containing the data and\n", + "target:" ] }, { "cell_type": "code", "execution_count": null, + "id": "5459a97b", "metadata": {}, "outputs": [], "source": [ @@ -66,6 +71,7 @@ { "cell_type": "code", "execution_count": null, + "id": "8b1b2257", "metadata": {}, "outputs": [], "source": [ @@ -78,23 +84,22 @@ }, { "cell_type": "markdown", + "id": "be69fae1", "metadata": {}, "source": [ - "We now observe the limitations of fitting a linear regression model.\n", - "\n", - "
\n", - "

Warning

\n", - "

In scikit-learn, by convention data (also called X in the scikit-learn\n", - "documentation) should be a 2D matrix of shape (n_samples, n_features).\n", - "If data is a 1D vector, you need to reshape it into a matrix with a\n", + "```{warning}\n", + "In scikit-learn, by convention `data` (also called `X` in the scikit-learn\n", + "documentation) should be a 2D matrix of shape `(n_samples, n_features)`.\n", + "If `data` is a 1D vector, you need to reshape it into a matrix with a\n", "single column if the vector represents a feature or a single row if the\n", - "vector represents a sample.

\n", - "
" + "vector represents a sample.\n", + "```" ] }, { "cell_type": "code", "execution_count": null, + "id": "46804be9", "metadata": {}, "outputs": [], "source": [ @@ -103,47 +108,75 @@ "data.shape" ] }, + { + "cell_type": "markdown", + "id": "a4209f00", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "To avoid writing the same code in multiple places we define a helper function\n", + "that fits, scores and plots the different regression models." + ] + }, { "cell_type": "code", "execution_count": null, + "id": "a1bd392b", "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.metrics import mean_squared_error\n", - "\n", - "linear_regression = LinearRegression()\n", - "linear_regression.fit(data, target)\n", - "target_predicted = linear_regression.predict(data)" + "def fit_score_plot_regression(model, title=None):\n", + " model.fit(data, target)\n", + " target_predicted = model.predict(data)\n", + " mse = mean_squared_error(target, target_predicted)\n", + " ax = sns.scatterplot(\n", + " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + " )\n", + " ax.plot(data, target_predicted)\n", + " if title is not None:\n", + " _ = ax.set_title(title + f\" (MSE = {mse:.2f})\")\n", + " else:\n", + " _ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7bfcbeb8", + "metadata": {}, + "source": [ + "We now observe the limitations of fitting a linear regression model." ] }, { "cell_type": "code", "execution_count": null, + "id": "1545fec5", "metadata": {}, "outputs": [], "source": [ - "mse = mean_squared_error(target, target_predicted)" + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "linear_regression = LinearRegression()\n", + "linear_regression" ] }, { "cell_type": "code", "execution_count": null, + "id": "e8c79631", "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "fit_score_plot_regression(linear_regression, title=\"Simple linear regression\")" ] }, { "cell_type": "markdown", + "id": "545fc1f3", "metadata": {}, "source": [ - "\n", "Here the coefficient and intercept learnt by `LinearRegression` define the\n", "best \"straight line\" that fits the data. We can inspect the coefficients using\n", "the attributes of the model learnt as follows:" @@ -152,6 +185,7 @@ { "cell_type": "code", "execution_count": null, + "id": "0f95ceef", "metadata": {}, "outputs": [], "source": [ @@ -163,12 +197,11 @@ }, { "cell_type": "markdown", + "id": "1a34a48c", "metadata": {}, "source": [ - "It is important to note that the learnt model is not able to handle the\n", - "non-linear relationship between `data` and `target` since linear models assume\n", - "the relationship between `data` and `target` to be linear.\n", - "\n", + "Notice that the learnt model cannot handle the non-linear relationship between\n", + "`data` and `target` because linear models assume a linear relationship.\n", "Indeed, there are 3 possibilities to solve this issue:\n", "\n", "1. 
choose a model that can natively deal with non-linearity,\n", @@ -184,31 +217,29 @@ { "cell_type": "code", "execution_count": null, + "id": "e01b02d2", "metadata": {}, "outputs": [], "source": [ "from sklearn.tree import DecisionTreeRegressor\n", "\n", "tree = DecisionTreeRegressor(max_depth=3).fit(data, target)\n", - "target_predicted = tree.predict(data)\n", - "mse = mean_squared_error(target, target_predicted)" + "tree" ] }, { "cell_type": "code", "execution_count": null, + "id": "9a27773e", "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "fit_score_plot_regression(tree, title=\"Decision tree regression\")" ] }, { "cell_type": "markdown", + "id": "4d5070e3", "metadata": {}, "source": [ "Instead of having a model which can natively deal with non-linearity, we could\n", @@ -225,6 +256,7 @@ { "cell_type": "code", "execution_count": null, + "id": "28c13246", "metadata": {}, "outputs": [], "source": [ @@ -234,9 +266,8 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, + "id": "69d0ba50", + "metadata": {}, "outputs": [], "source": [ "data_expanded = np.concatenate([data, data**2, data**3], axis=1)\n", @@ -244,41 +275,46 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", + "id": "7925141e", "metadata": {}, - "outputs": [], "source": [ - "linear_regression.fit(data_expanded, target)\n", - "target_predicted = linear_regression.predict(data_expanded)\n", - "mse = mean_squared_error(target, target_predicted)" + "Instead of manually creating such polynomial features one could directly use\n", + "[sklearn.preprocessing.PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html)." ] }, { "cell_type": "code", "execution_count": null, + "id": "d31ed0f4", "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "from sklearn.preprocessing import PolynomialFeatures\n", + "\n", + "polynomial_expansion = PolynomialFeatures(degree=3, include_bias=False)" ] }, { "cell_type": "markdown", + "id": "6a7fe453", "metadata": {}, "source": [ - "We can see that even with a linear model, we can overcome the linearity\n", - "limitation of the model by adding the non-linear components in the design of\n", - "additional features. Here, we created new features by knowing the way the\n", - "target was generated.\n", - "\n", - "Instead of manually creating such polynomial features one could directly use\n", - "[sklearn.preprocessing.PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html).\n", + "In the previous cell we had to set `include_bias=False` as otherwise we would\n", + "create a constant feature perfectly correlated to the `intercept_` introduced\n", + "by the `LinearRegression`. 
We can verify that this procedure is equivalent to\n",
+ "creating the features by hand up to numerical error by computing the maximum\n",
+ "of the absolute values of the differences between the features generated by\n",
+ "both methods and checking that it is close to zero:\n",
+ "\n",
+ "    np.abs(polynomial_expansion.fit_transform(data) - data_expanded).max()"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "269fbe2b",
 "metadata": {},
 "source": [
 "To demonstrate the use of the `PolynomialFeatures` class, we use a\n",
 "scikit-learn pipeline which first transforms the features and then fits the\n",
 "regression model."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "38ba0c5c",
 "metadata": {},
 "outputs": [],
 "source": [
@@ -297,58 +334,29 @@
 " PolynomialFeatures(degree=3, include_bias=False),\n",
 " LinearRegression(),\n",
 ")\n",
- "polynomial_regression.fit(data, target)\n",
- "target_predicted = polynomial_regression.predict(data)\n",
- "mse = mean_squared_error(target, target_predicted)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the previous cell we had to set `include_bias=False` as otherwise we would\n",
- "create a column perfectly correlated to the `intercept_` introduced by the\n",
- "`LinearRegression`. We can verify that this procedure is equivalent to\n",
- "creating the features by hand up to numerical error by computing the maximum\n",
- "of the absolute values of the differences between the features generated by\n",
- "both methods and checking that it is close to zero:"
+ "polynomial_regression"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "5df7d4a4",
 "metadata": {},
 "outputs": [],
 "source": [
- "np.abs(polynomial_regression[0].fit_transform(data) - data_expanded).max()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Then it should not be surprising that the predictions of the\n",
- "`PolynomialFeatures` pipeline match the predictions of the linear model fit on\n",
- "manually engineered features."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ax = sns.scatterplot(\n",
- "    data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n",
- ")\n",
- "ax.plot(data, target_predicted)\n",
- "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")"
+ "fit_score_plot_regression(polynomial_regression, title=\"Polynomial regression\")"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "fe259d20",
 "metadata": {},
 "source": [
+ "We can see that even with a linear model, we can overcome the linearity\n",
+ "limitation of the model by adding the non-linear components in the design of\n",
+ "additional features. Here, we created new features by knowing the way the\n",
+ "target was generated.\n",
+ "\n",
 "The last possibility to make a linear model more expressive is to use a\n",
 "\"kernel\". Instead of learning one weight per feature as we previously did, a\n",
 "weight is assigned to each sample.
However, not all samples are used: some\n", @@ -371,32 +379,29 @@ { "cell_type": "code", "execution_count": null, + "id": "7d46da9b", "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", "\n", "svr = SVR(kernel=\"linear\")\n", - "svr.fit(data, target)\n", - "target_predicted = svr.predict(data)\n", - "mse = mean_squared_error(target, target_predicted)" + "svr" ] }, { "cell_type": "code", "execution_count": null, + "id": "9406b676", "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "fit_score_plot_regression(svr, title=\"Linear support vector machine\")" ] }, { "cell_type": "markdown", + "id": "fd29730e", "metadata": {}, "source": [ "The predictions of our SVR with a linear kernel are all aligned on a straight\n", @@ -414,30 +419,27 @@ { "cell_type": "code", "execution_count": null, + "id": "ae1550fa", "metadata": {}, "outputs": [], "source": [ "svr = SVR(kernel=\"poly\", degree=3)\n", - "svr.fit(data, target)\n", - "target_predicted = svr.predict(data)\n", - "mse = mean_squared_error(target, target_predicted)" + "svr" ] }, { "cell_type": "code", "execution_count": null, + "id": "c4670a4e", "metadata": {}, "outputs": [], "source": [ - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "fit_score_plot_regression(svr, title=\"Polynomial support vector machine\")" ] }, { "cell_type": "markdown", + "id": "732b2b0f", "metadata": {}, "source": [ "Kernel methods such as SVR are very efficient for small to medium datasets.\n", @@ -448,7 +450,7 @@ "as\n", "[KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)\n", "or\n", - "[Nystroem](https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html).\n", + "[SplineTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.SplineTransformer.html).\n", "\n", "Here again we refer the interested reader to the documentation to get a proper\n", "definition of those methods. 
The following just gives an intuitive overview of\n", @@ -458,6 +460,7 @@ { "cell_type": "code", "execution_count": null, + "id": "e30e6b37", "metadata": {}, "outputs": [], "source": [ @@ -467,19 +470,48 @@ " KBinsDiscretizer(n_bins=8),\n", " LinearRegression(),\n", ")\n", - "binned_regression.fit(data, target)\n", - "target_predicted = binned_regression.predict(data)\n", - "mse = mean_squared_error(target, target_predicted)\n", + "binned_regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b46eb0ef", + "metadata": {}, + "outputs": [], + "source": [ + "fit_score_plot_regression(binned_regression, title=\"Binned regression\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5403e6b1", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import SplineTransformer\n", "\n", - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", + "spline_regression = make_pipeline(\n", + " SplineTransformer(degree=3, include_bias=False),\n", + " LinearRegression(),\n", ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "spline_regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dcdfe92", + "metadata": {}, + "outputs": [], + "source": [ + "fit_score_plot_regression(spline_regression, title=\"Spline regression\")" ] }, { "cell_type": "markdown", + "id": "4b4f0560", "metadata": {}, "source": [ "`Nystroem` is a nice alternative to `PolynomialFeatures` that makes it\n", @@ -491,6 +523,7 @@ { "cell_type": "code", "execution_count": null, + "id": "41d6abd8", "metadata": {}, "outputs": [], "source": [ @@ -500,19 +533,24 @@ " Nystroem(kernel=\"poly\", degree=3, n_components=5, random_state=0),\n", " LinearRegression(),\n", ")\n", - "nystroem_regression.fit(data, target)\n", - "target_predicted = nystroem_regression.predict(data)\n", - "mse = mean_squared_error(target, target_predicted)\n", - "\n", - "ax = sns.scatterplot(\n", - " data=full_data, x=\"input_feature\", y=\"target\", color=\"black\", alpha=0.5\n", - ")\n", - "ax.plot(data, target_predicted)\n", - "_ = ax.set_title(f\"Mean squared error = {mse:.2f}\")" + "nystroem_regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be6a232c", + "metadata": {}, + "outputs": [], + "source": [ + "fit_score_plot_regression(\n", + " nystroem_regression, title=\"Polynomial Nystroem regression\"\n", + ")" ] }, { "cell_type": "markdown", + "id": "7860e12d", "metadata": {}, "source": [ "## Notebook Recap\n", @@ -541,4 +579,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/notebooks/logistic_regression.ipynb b/notebooks/logistic_regression.ipynb index 4c4cf0de7..691283b02 100644 --- a/notebooks/logistic_regression.ipynb +++ b/notebooks/logistic_regression.ipynb @@ -2,9 +2,10 @@ "cells": [ { "cell_type": "markdown", + "id": "b0e67575", "metadata": {}, "source": [ - "# Linear model for classification\n", + "# Linear models for classification\n", "\n", "In regression, we saw that the target to be predicted is a continuous\n", "variable. In classification, the target is discrete (e.g. categorical).\n", @@ -17,18 +18,19 @@ }, { "cell_type": "markdown", + "id": "ac574018", "metadata": {}, "source": [ - "
\n", - "

Note

\n", - "

If you want a deeper overview regarding this dataset, you can refer to the\n", - "Appendix - Datasets description section at the end of this MOOC.

\n", - "
" + "```{note}\n", + "If you want a deeper overview regarding this dataset, you can refer to the\n", + "Appendix - Datasets description section at the end of this MOOC.\n", + "```" ] }, { "cell_type": "code", "execution_count": null, + "id": "a47d670a", "metadata": {}, "outputs": [], "source": [ @@ -46,6 +48,7 @@ }, { "cell_type": "markdown", + "id": "2165fcfc", "metadata": {}, "source": [ "We can quickly start by visualizing the feature distribution by class:" @@ -54,6 +57,7 @@ { "cell_type": "code", "execution_count": null, + "id": "9ac5a70c", "metadata": {}, "outputs": [], "source": [ @@ -68,6 +72,7 @@ }, { "cell_type": "markdown", + "id": "cab96de7", "metadata": {}, "source": [ "We can observe that we have quite a simple problem. When the culmen length\n", @@ -81,6 +86,7 @@ { "cell_type": "code", "execution_count": null, + "id": "b6a3b04c", "metadata": {}, "outputs": [], "source": [ @@ -97,6 +103,7 @@ }, { "cell_type": "markdown", + "id": "4964b148", "metadata": {}, "source": [ "The linear regression that we previously saw predicts a continuous output.\n", @@ -110,6 +117,7 @@ { "cell_type": "code", "execution_count": null, + "id": "47347104", "metadata": {}, "outputs": [], "source": [ @@ -125,6 +133,7 @@ }, { "cell_type": "markdown", + "id": "bafd8265", "metadata": {}, "source": [ "Since we are dealing with a classification problem containing only 2 features,\n", @@ -132,21 +141,22 @@ "the rule used by our predictive model to affect a class label given the\n", "feature values of the sample.\n", "\n", - "
\n", - "

Note

\n", - "

Here, we use the class DecisionBoundaryDisplay. This educational tool allows\n", + "```{note}\n", + "Here, we use the class `DecisionBoundaryDisplay`. This educational tool allows\n", "us to gain some insights by plotting the decision function boundary learned by\n", - "the classifier in a 2 dimensional feature space.

\n", - "

Notice however that in more realistic machine learning contexts, one would\n", + "the classifier in a 2 dimensional feature space.\n", + "\n", + "Notice however that in more realistic machine learning contexts, one would\n", "typically fit on more than two features at once and therefore it would not be\n", "possible to display such a visualization of the decision boundary in\n", - "general.

\n", - "
" + "general.\n", + "```" ] }, { "cell_type": "code", "execution_count": null, + "id": "dd628d44", "metadata": {}, "outputs": [], "source": [ @@ -172,28 +182,61 @@ }, { "cell_type": "markdown", + "id": "dbd93bf3", "metadata": {}, "source": [ - "Thus, we see that our decision function is represented by a line separating\n", - "the 2 classes.\n", + "Thus, we see that our decision function is represented by a straight line\n", + "separating the 2 classes.\n", + "\n", + "For the mathematically inclined reader, the equation of the decision boundary\n", + "is:\n", + "\n", + " coef0 * x0 + coef1 * x1 + intercept = 0\n", + "\n", + "where `x0` is `\"Culmen Length (mm)\"` and `x1` is `\"Culmen Depth (mm)\"`.\n", "\n", - "Since the line is oblique, it means that we used a combination of both\n", - "features:" + "This equation is equivalent to (assuming that `coef1` is non-zero):\n", + "\n", + " x1 = coef0 / coef1 * x0 - intercept / coef1\n", + "\n", + "which is the equation of a straight line.\n", + "\n", + "Since the line is oblique, it means that both coefficients (also called\n", + "weights) are non-null:" ] }, { "cell_type": "code", "execution_count": null, + "id": "8c76e56c", "metadata": {}, "outputs": [], "source": [ - "coefs = logistic_regression[-1].coef_[0] # the coefficients is a 2d array\n", - "weights = pd.Series(coefs, index=culmen_columns)" + "coefs = logistic_regression[-1].coef_[0]\n", + "weights = pd.Series(coefs, index=[f\"Weight for '{c}'\" for c in culmen_columns])\n", + "weights" + ] + }, + { + "cell_type": "markdown", + "id": "416a9aff", + "metadata": {}, + "source": [ + "You can [access pipeline\n", + "steps](https://scikit-learn.org/stable/modules/compose.html#access-pipeline-steps)\n", + "by name or position. In the code above `logistic_regression[-1]` means the\n", + "last step of the pipeline. Then you can access the attributes of that step such\n", + "as `coef_`. Notice also that the `coef_` attribute is an array of shape (1,\n", + "`n_features`) an then we access it via its first entry. Alternatively one\n", + "could use `coef_.ravel()`.\n", + "\n", + "We are now ready to visualize the weight values as a barplot:" ] }, { "cell_type": "code", "execution_count": null, + "id": "8c9b19ae", "metadata": {}, "outputs": [], "source": [ @@ -203,34 +246,178 @@ }, { "cell_type": "markdown", + "id": "083d61ff", "metadata": {}, "source": [ - "Indeed, both coefficients are non-null. If one of them had been zero, the\n", - "decision boundary would have been either horizontal or vertical.\n", + "If one of the weights had been zero, the decision boundary would have been\n", + "either horizontal or vertical.\n", "\n", "Furthermore the intercept is also non-zero, which means that the decision does\n", "not go through the point with (0, 0) coordinates.\n", "\n", - "For the mathematically inclined reader, the equation of the decision boundary\n", - "is:\n", + "## (Estimated) predicted probabilities\n", "\n", - " coef0 * x0 + coef1 * x1 + intercept = 0\n", + "The `predict` method in classification models returns what we call a \"hard\n", + "class prediction\", i.e. the most likely class a given data point would belong\n", + "to. 
We can confirm the intuition given by the `DecisionBoundaryDisplay` by\n",
+ "testing on a hypothetical sample:"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "d30ac7e5",
 "metadata": {},
 "outputs": [],
 "source": [
+ "test_penguin = pd.DataFrame(\n",
+ "    {\"Culmen Length (mm)\": [45], \"Culmen Depth (mm)\": [17]}\n",
+ ")\n",
+ "logistic_regression.predict(test_penguin)"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "6e7141da",
 "metadata": {},
 "source": [
+ "In this case, our logistic regression classifier predicts the Chinstrap\n",
+ "species. Note that this agrees with the decision boundary plot above: the\n",
+ "coordinates of this test data point match a location close to the decision\n",
+ "boundary, in the red region.\n",
+ "\n",
+ "As mentioned in the introductory slides 🎥 **Intuitions on linear models**,\n",
+ "one can alternatively use the `predict_proba` method to compute continuous\n",
+ "values (\"soft predictions\") that correspond to an estimation of the confidence\n",
+ "of the target belonging to each class.\n",
+ "\n",
+ "For a binary classification scenario, the logistic regression makes both hard\n",
+ "and soft predictions based on the [logistic\n",
+ "function](https://en.wikipedia.org/wiki/Logistic_function) (also called\n",
+ "sigmoid function), which is S-shaped and maps any input into a value between 0\n",
+ "and 1."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "f03d6062",
 "metadata": {},
 "outputs": [],
 "source": [
+ "y_pred_proba = logistic_regression.predict_proba(test_penguin)\n",
+ "y_pred_proba"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "bd3a7c7f",
 "metadata": {},
 "source": [
+ "More generally, the output of `predict_proba` is an array of shape\n",
+ "(`n_samples`, `n_classes`)."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "e12bb08c",
 "metadata": {},
 "outputs": [],
 "source": [
+ "y_pred_proba.shape"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "67f73ae8",
 "metadata": {},
 "source": [
+ "Also notice that the sum of (estimated) predicted probabilities across classes\n",
+ "is 1.0 for each given sample. We can visualize them for our `test_penguin` as\n",
+ "follows:"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "427587b6",
 "metadata": {},
 "outputs": [],
 "source": [
+ "y_proba_sample = pd.Series(\n",
+ "    y_pred_proba.ravel(), index=logistic_regression.classes_\n",
+ ")\n",
+ "y_proba_sample.plot.bar()\n",
+ "plt.ylabel(\"Estimated probability\")\n",
+ "_ = plt.title(\"Probability of the sample belonging to a penguin class\")"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "053ad22c",
 "metadata": {},
 "source": [
+ "```{warning}\n",
+ "We stress that the outputs of `predict_proba` are just estimations. Their\n",
+ "reliability as a good estimate of the true conditional class-assignment\n",
+ "probabilities depends on the quality of the model.
Even classifiers with a\n",
+ "high accuracy on a test set may be overconfident for some individuals and\n",
+ "underconfident for others.\n",
+ "```\n",
 "\n",
+ "Similarly to the hard decision boundary shown above, one can set the\n",
+ "`response_method` to `\"predict_proba\"` in the `DecisionBoundaryDisplay` to\n",
+ "show instead the confidence in individual classifications. In such a case, the\n",
+ "boundaries encode the estimated probabilities by color. In particular, when\n",
+ "using [matplotlib diverging\n",
+ "colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging)\n",
+ "such as `\"RdBu_r\"`, the softer the color, the more unsure the classifier is\n",
+ "about which class to choose (the probability of 0.5 is mapped to white color).\n",
 "\n",
- "    x1 = coef0 / coef1 * x0 - intercept / coef1\n",
 "\n",
- "which is the equation of a straight line.\n",
 "\n",
+ "Conversely, towards the tails of the curve the sigmoid function approaches\n",
+ "its asymptotic values of 0 or 1, which are mapped to darker colors. Indeed,\n",
+ "the closer the predicted probability is to 0 or 1, the more confident the\n",
+ "classifier is in its predictions."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+ "id": "fbcece8a",
 "metadata": {},
 "outputs": [],
 "source": [
+ "DecisionBoundaryDisplay.from_estimator(\n",
+ "    logistic_regression,\n",
+ "    data_test,\n",
+ "    response_method=\"predict_proba\",\n",
+ "    cmap=\"RdBu_r\",\n",
+ "    alpha=0.5,\n",
+ ")\n",
+ "sns.scatterplot(\n",
+ "    data=penguins_test,\n",
+ "    x=culmen_columns[0],\n",
+ "    y=culmen_columns[1],\n",
+ "    hue=target_column,\n",
+ "    palette=[\"tab:red\", \"tab:blue\"],\n",
+ ")\n",
+ "_ = plt.title(\"Predicted probability of the trained\\n LogisticRegression\")"
 ]
 },
 {
 "cell_type": "markdown",
+ "id": "54133c3a",
 "metadata": {},
 "source": [
+ "For multi-class classification the logistic regression uses the [softmax\n",
+ "function](https://en.wikipedia.org/wiki/Softmax_function) to make predictions.\n",
+ "Giving more details on that scenario is beyond the scope of this MOOC.\n",
 "\n",
- "<div class=\"admonition note alert alert-info\">\n",
\n", - "

Note

\n", - "

If you want to go further, try changing the response_method to\n", - "\"predict_proba\" in the DecisionBoundaryDisplay above. Now the boundaries\n", - "encode by color the estimated probability of belonging to either class, as\n", - "mentioned in the introductory slides \ud83c\udfa5 Intuitions on linear models.

\n", - "
" + "In any case, interested users are refered to the [scikit-learn user guide](\n", + "https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression)\n", + "for a more mathematical description of the `predict_proba` method of the\n", + "`LogisticRegression` and the respective normalization functions." ] } ], @@ -245,4 +432,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/notebooks/logistic_regression_non_linear.ipynb b/notebooks/logistic_regression_non_linear.ipynb deleted file mode 100644 index ccc05be33..000000000 --- a/notebooks/logistic_regression_non_linear.ipynb +++ /dev/null @@ -1,327 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Beyond linear separation in classification\n", - "\n", - "As we saw in the regression section, the linear classification model expects\n", - "the data to be linearly separable. When this assumption does not hold, the\n", - "model is not expressive enough to properly fit the data. Therefore, we need to\n", - "apply the same tricks as in regression: feature augmentation (potentially\n", - "using expert-knowledge) or using a kernel-based method.\n", - "\n", - "We will provide examples where we will use a kernel support vector machine to\n", - "perform classification on some toy-datasets where it is impossible to find a\n", - "perfect linear separation.\n", - "\n", - "We will generate a first dataset where the data are represented as two\n", - "interlaced half circles. This dataset is generated using the function\n", - "[`sklearn.datasets.make_moons`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.datasets import make_moons\n", - "\n", - "feature_names = [\"Feature #0\", \"Features #1\"]\n", - "target_name = \"class\"\n", - "\n", - "X, y = make_moons(n_samples=100, noise=0.13, random_state=42)\n", - "\n", - "# We store both the data and target in a dataframe to ease plotting\n", - "moons = pd.DataFrame(\n", - " np.concatenate([X, y[:, np.newaxis]], axis=1),\n", - " columns=feature_names + [target_name],\n", - ")\n", - "data_moons, target_moons = moons[feature_names], moons[target_name]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the dataset contains only two features, we can make a scatter plot to\n", - "have a look at it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "sns.scatterplot(\n", - " data=moons,\n", - " x=feature_names[0],\n", - " y=feature_names[1],\n", - " hue=target_moons,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - ")\n", - "_ = plt.title(\"Illustration of the moons dataset\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the intuitions that we got by studying linear model, it should be obvious\n", - "that a linear classifier will not be able to find a perfect decision function\n", - "to separate the two classes.\n", - "\n", - "Let's try to see what is the decision boundary of such a linear classifier. We\n", - "will create a predictive model by standardizing the dataset followed by a\n", - "linear support vector machine classifier." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.pipeline import make_pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.svm import SVC\n", - "\n", - "linear_model = make_pipeline(StandardScaler(), SVC(kernel=\"linear\"))\n", - "linear_model.fit(data_moons, target_moons)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "

Warning

\n", - "

Be aware that we fit and will check the boundary decision of the classifier on\n", - "the same dataset without splitting the dataset into a training set and a\n", - "testing set. While this is a bad practice, we use it for the sake of\n", - "simplicity to depict the model behavior. Always use cross-validation when you\n", - "want to assess the generalization performance of a machine-learning model.

\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check the decision boundary of such a linear model on this dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.inspection import DecisionBoundaryDisplay\n", - "\n", - "DecisionBoundaryDisplay.from_estimator(\n", - " linear_model, data_moons, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", - ")\n", - "sns.scatterplot(\n", - " data=moons,\n", - " x=feature_names[0],\n", - " y=feature_names[1],\n", - " hue=target_moons,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - ")\n", - "_ = plt.title(\"Decision boundary of a linear model\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, a linear decision boundary is not enough flexible to split the\n", - "two classes.\n", - "\n", - "To push this example to the limit, we will create another dataset where\n", - "samples of a class will be surrounded by samples from the other class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import make_gaussian_quantiles\n", - "\n", - "feature_names = [\"Feature #0\", \"Features #1\"]\n", - "target_name = \"class\"\n", - "\n", - "X, y = make_gaussian_quantiles(\n", - " n_samples=100, n_features=2, n_classes=2, random_state=42\n", - ")\n", - "gauss = pd.DataFrame(\n", - " np.concatenate([X, y[:, np.newaxis]], axis=1),\n", - " columns=feature_names + [target_name],\n", - ")\n", - "data_gauss, target_gauss = gauss[feature_names], gauss[target_name]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ax = sns.scatterplot(\n", - " data=gauss,\n", - " x=feature_names[0],\n", - " y=feature_names[1],\n", - " hue=target_gauss,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - ")\n", - "_ = plt.title(\"Illustration of the Gaussian quantiles dataset\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, this is even more obvious that a linear decision function is not\n", - "adapted. We can check what decision function, a linear support vector machine\n", - "will find." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "linear_model.fit(data_gauss, target_gauss)\n", - "DecisionBoundaryDisplay.from_estimator(\n", - " linear_model, data_gauss, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", - ")\n", - "sns.scatterplot(\n", - " data=gauss,\n", - " x=feature_names[0],\n", - " y=feature_names[1],\n", - " hue=target_gauss,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - ")\n", - "_ = plt.title(\"Decision boundary of a linear model\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, a linear separation cannot be used to separate the classes\n", - "properly: the model will under-fit as it will make errors even on the training\n", - "set.\n", - "\n", - "In the section about linear regression, we saw that we could use several\n", - "tricks to make a linear model more flexible by augmenting features or using a\n", - "kernel. Here, we will use the later solution by using a radial basis function\n", - "(RBF) kernel together with a support vector machine classifier.\n", - "\n", - "We will repeat the two previous experiments and check the obtained decision\n", - "function." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kernel_model = make_pipeline(StandardScaler(), SVC(kernel=\"rbf\", gamma=5))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kernel_model.fit(data_moons, target_moons)\n", - "DecisionBoundaryDisplay.from_estimator(\n", - " kernel_model, data_moons, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", - ")\n", - "sns.scatterplot(\n", - " data=moons,\n", - " x=feature_names[0],\n", - " y=feature_names[1],\n", - " hue=target_moons,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - ")\n", - "_ = plt.title(\"Decision boundary with a model using an RBF kernel\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We see that the decision boundary is not anymore a straight line. Indeed, an\n", - "area is defined around the red samples and we could imagine that this\n", - "classifier should be able to generalize on unseen data.\n", - "\n", - "Let's check the decision function on the second dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kernel_model.fit(data_gauss, target_gauss)\n", - "DecisionBoundaryDisplay.from_estimator(\n", - " kernel_model, data_gauss, response_method=\"predict\", cmap=\"RdBu\", alpha=0.5\n", - ")\n", - "ax = sns.scatterplot(\n", - " data=gauss,\n", - " x=feature_names[0],\n", - " y=feature_names[1],\n", - " hue=target_gauss,\n", - " palette=[\"tab:red\", \"tab:blue\"],\n", - ")\n", - "_ = plt.title(\"Decision boundary with a model using an RBF kernel\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We observe something similar than in the previous case. The decision function\n", - "is more flexible and does not underfit anymore.\n", - "\n", - "Thus, kernel trick or feature expansion are the tricks to make a linear\n", - "classifier more expressive, exactly as we saw in regression.\n", - "\n", - "Keep in mind that adding flexibility to a model can also risk increasing\n", - "overfitting by making the decision function to be sensitive to individual\n", - "(possibly noisy) data points of the training set. Here we can observe that the\n", - "decision functions remain smooth enough to preserve good generalization. If\n", - "you are curious, you can try to repeat the above experiment with `gamma=100`\n", - "and look at the decision functions." - ] - } - ], - "metadata": { - "jupytext": { - "main_language": "python" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/notebooks/trees_ex_01.ipynb b/notebooks/trees_ex_01.ipynb index 99858920b..a2abdea01 100644 --- a/notebooks/trees_ex_01.ipynb +++ b/notebooks/trees_ex_01.ipynb @@ -6,16 +6,13 @@ "source": [ "# \ud83d\udcdd Exercise M5.01\n", "\n", - "In the previous notebook, we showed how a tree with a depth of 1 level was\n", - "working. The aim of this exercise is to repeat part of the previous experiment\n", - "for a depth with 2 levels to show how the process of partitioning is repeated\n", - "over time.\n", + "In the previous notebook, we showed how a tree with 1 level depth works. 
The\n",
+ "aim of this exercise is to repeat part of the previous experiment for a tree\n",
+ "with a depth of 2 levels to show how such a parameter affects the feature\n",
+ "space partitioning.\n",
 "\n",
- "Before to start, we will:\n",
- "\n",
- "* load the dataset;\n",
- "* split the dataset into training and testing dataset;\n",
- "* define the function to show the classification decision function."
+ "We first load the penguins dataset and split it into a training and a testing\n",
+ "set:"
 ]
 },
 {
@@ -61,10 +58,35 @@
 "metadata": {},
 "source": [
 "Create a decision tree classifier with a maximum depth of 2 levels and fit the\n",
- "training data. Once this classifier trained, plot the data and the decision\n",
- "boundary to see the benefit of increasing the depth. To plot the decision\n",
- "boundary, you should import the class `DecisionBoundaryDisplay` from the\n",
- "module `sklearn.inspection` as shown in the previous course notebook."
+ "training data."
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write your code here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now plot the data and the decision boundary of the trained classifier to see\n",
+ "the effect of increasing the depth of the tree.\n",
+ "\n",
+ "Hint: Use the class `DecisionBoundaryDisplay` from the module\n",
+ "`sklearn.inspection` as shown in previous course notebooks.\n",
+ "\n",
+ "<div class=\"admonition warning alert alert-danger\">\n",
\n", + "

Warning

\n", + "

At this time, it is not possible to use response_method=\"predict_proba\" for\n", + "multiclass problems. This is a planned feature for a future version of\n", + "scikit-learn. In the mean time, you can use response_method=\"predict\"\n", + "instead.

\n", + "
" ] }, { diff --git a/python_scripts/trees_ex_01.py b/python_scripts/trees_ex_01.py index ecfd6bf55..2d7b1d40b 100644 --- a/python_scripts/trees_ex_01.py +++ b/python_scripts/trees_ex_01.py @@ -14,16 +14,13 @@ # %% [markdown] # # ๐Ÿ“ Exercise M5.01 # -# In the previous notebook, we showed how a tree with a depth of 1 level was -# working. The aim of this exercise is to repeat part of the previous experiment -# for a depth with 2 levels to show how the process of partitioning is repeated -# over time. +# In the previous notebook, we showed how a tree with 1 level depth works. The +# aim of this exercise is to repeat part of the previous experiment for a tree +# with 2 levels depth to show how such parameter affects the feature space +# partitioning. # -# Before to start, we will: -# -# * load the dataset; -# * split the dataset into training and testing dataset; -# * define the function to show the classification decision function. +# We first load the penguins dataset and split it into a training and a testing +# sets: # %% import pandas as pd @@ -48,10 +45,24 @@ # %% [markdown] # Create a decision tree classifier with a maximum depth of 2 levels and fit the -# training data. Once this classifier trained, plot the data and the decision -# boundary to see the benefit of increasing the depth. To plot the decision -# boundary, you should import the class `DecisionBoundaryDisplay` from the -# module `sklearn.inspection` as shown in the previous course notebook. +# training data. + +# %% +# Write your code here. + +# %% [markdown] +# Now plot the data and the decision boundary of the trained classifier to see +# the effect of increasing the depth of the tree. +# +# Hint: Use the class `DecisionBoundaryDisplay` from the module +# `sklearn.inspection` as shown in previous course notebooks. +# +# ```{warning} +# At this time, it is not possible to use `response_method="predict_proba"` for +# multiclass problems. This is a planned feature for a future version of +# scikit-learn. In the mean time, you can use `response_method="predict"` +# instead. +# ``` # %% # Write your code here. 
From 0bf183058da76d6e22bf9cf4afacaead334f1a88 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 29 Oct 2023 09:45:50 +0100 Subject: [PATCH 078/108] FIX jupyter book preview in PRs (#741) --- .github/workflows/deploy-gh-pages.yml | 10 +++++++--- .github/workflows/jupyter-book-pr-preview.yml | 7 +++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index 1642e7ffa..36e39c3e1 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -44,13 +44,17 @@ jobs: v2-refs/heads/main - name: Build the JupyterBook + run: | + bash build_tools/build_jupyter_book.sh + + - name: Save the PR number env: GITHUB_PULL_REQUEST_NUMBER: ${{github.event.number}} run: | - bash build_tools/build_jupyter_book.sh + echo "Storing PR number ${{github.event.number}} to 'pull_request_number' file" echo ${{github.event.number}} > pull_request_number - - name: Upload jupyter-book artifact for PRs + - name: Upload jupyter-book artifact for preview in PRs if: ${{ github.event_name == 'pull_request' }} uses: actions/upload-artifact@v3 with: @@ -59,7 +63,7 @@ jobs: jupyter-book/_build/html pull_request_number - - name: GitHub Pages action + - name: Update the main gh-page website if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} uses: peaceiris/actions-gh-pages@v3.6.1 with: diff --git a/.github/workflows/jupyter-book-pr-preview.yml b/.github/workflows/jupyter-book-pr-preview.yml index fbe6a8d13..1b444907d 100644 --- a/.github/workflows/jupyter-book-pr-preview.yml +++ b/.github/workflows/jupyter-book-pr-preview.yml @@ -22,7 +22,8 @@ jobs: - name: Get pull request number id: pull-request-number run: | - export PULL_REQUEST_NUMBER=${{github.event.workflow_run.event.number}} + export PULL_REQUEST_NUMBER=${{github.event.workflow_run.pull_requests[0].number}} + echo "PULL_REQUEST_NUMBER=$PULL_REQUEST_NUMBER" echo "result=${PULL_REQUEST_NUMBER}" >> $GITHUB_OUTPUT - uses: dawidd6/action-download-artifact@v2 @@ -40,7 +41,9 @@ jobs: env: NETLIFY_AUTH_TOKEN: ${{secrets.NETLIFY_AUTH_TOKEN}} NETLIFY_SITE_ID: ${{secrets.NETLIFY_SITE_ID}} - run: netlify deploy --dir=jupyter-book/_build/html --alias=pull-request-${{steps.pull-request-number.outputs.result}} + run: | + echo "Deploying PR ${{steps.pull-request-number.outputs.result}} to Netlify" + netlify deploy --dir=jupyter-book/_build/html --alias=pull-request-${{steps.pull-request-number.outputs.result}} - name: 'Commit Status: Update deployment status' uses: myrotvorets/set-commit-status-action@1.1.6 From c35fcaf753ac8d0316f79605b395374c8ec20f00 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 30 Oct 2023 10:28:23 +0100 Subject: [PATCH 079/108] FIX (attempt) jupyter book preview in PRs (#745) --- .github/workflows/jupyter-book-pr-preview.yml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/jupyter-book-pr-preview.yml b/.github/workflows/jupyter-book-pr-preview.yml index 1b444907d..5a4ca191d 100644 --- a/.github/workflows/jupyter-book-pr-preview.yml +++ b/.github/workflows/jupyter-book-pr-preview.yml @@ -19,20 +19,18 @@ jobs: sha: ${{ github.event.workflow_run.head_sha }} context: 'JupyterBook preview' + - name: Download artifacts + run: gh run download --name jupyter-book ${{ github.event.workflow_run.id }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Get pull request number id: pull-request-number run: | - export 
PULL_REQUEST_NUMBER=${{github.event.workflow_run.pull_requests[0].number}}
+          export PULL_REQUEST_NUMBER=`cat pull_request_number`
           echo "PULL_REQUEST_NUMBER=$PULL_REQUEST_NUMBER"
           echo "result=${PULL_REQUEST_NUMBER}" >> $GITHUB_OUTPUT

-      - uses: dawidd6/action-download-artifact@v2
-        with:
-          github_token: ${{secrets.GITHUB_TOKEN}}
-          workflow: deploy-gh-pages.yml
-          pr: ${{steps.pull-request-number.outputs.result}}
-          name: jupyter-book
-
       - uses: actions/setup-node@v3
         with:
           node-version: '16'

From 1fe4a1d7950dc2c174233048493baae13979f349 Mon Sep 17 00:00:00 2001
From: PatriOr <147734035+PatriOr@users.noreply.github.com>
Date: Mon, 30 Oct 2023 10:34:57 +0100
Subject: [PATCH 080/108] Remove redundant text (#744)

---
 python_scripts/trees_hyperparameters.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py
index 007998851..dc8594968 100644
--- a/python_scripts/trees_hyperparameters.py
+++ b/python_scripts/trees_hyperparameters.py
@@ -227,14 +227,9 @@ def fit_and_plot_regression(model, data, feature_names, target_names):
 _ = plt.title(f"Decision tree with max-depth of {max_depth}")

 # %% [markdown]
-# As expected, we see that the blue blob on the right and the red blob on the
-# top are easily separated. However, more splits will be required to better
+# As expected, we see that the blue blob in the lower right and the red blob on
+# the top are easily separated. However, more splits will be required to better
 # split the blob where both blue and red data points are mixed.
-#
-# Indeed, we see that red blob on the top and the blue blob on the right of the
-# plot are perfectly separated. However, the tree is still making mistakes in
-# the area where the blobs are mixed together. Let's check the tree
-# representation.
# %% from sklearn.tree import plot_tree From 08f5b1c70fa46d2d0dba73598d5d8ea45f7e1d72 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 30 Oct 2023 10:41:55 +0100 Subject: [PATCH 081/108] Use dawidd6/action-download-artifact@v2 (with run_id) instead of gh CLI --- .github/workflows/jupyter-book-pr-preview.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/jupyter-book-pr-preview.yml b/.github/workflows/jupyter-book-pr-preview.yml index 5a4ca191d..b423f4168 100644 --- a/.github/workflows/jupyter-book-pr-preview.yml +++ b/.github/workflows/jupyter-book-pr-preview.yml @@ -19,10 +19,12 @@ jobs: sha: ${{ github.event.workflow_run.head_sha }} context: 'JupyterBook preview' - - name: Download artifacts - run: gh run download --name jupyter-book ${{ github.event.workflow_run.id }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: dawidd6/action-download-artifact@v2 + with: + github_token: ${{secrets.GITHUB_TOKEN}} + workflow: deploy-gh-pages.yml + run_id: ${{ github.event.workflow_run.id }} + name: jupyter-book - name: Get pull request number id: pull-request-number From f39b464d3128a7cb43000e2544d11cb7d549fff6 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 30 Oct 2023 11:15:03 +0100 Subject: [PATCH 082/108] Fix _changed.html interplay with GITHUB_PULL_REQUEST_NUMBER from workflow --- .github/workflows/deploy-gh-pages.yml | 2 ++ build_tools/build_jupyter_book.sh | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml index 36e39c3e1..e12f9f7c6 100644 --- a/.github/workflows/deploy-gh-pages.yml +++ b/.github/workflows/deploy-gh-pages.yml @@ -44,6 +44,8 @@ jobs: v2-refs/heads/main - name: Build the JupyterBook + env: + GITHUB_PULL_REQUEST_NUMBER: ${{github.event.number}} run: | bash build_tools/build_jupyter_book.sh diff --git a/build_tools/build_jupyter_book.sh b/build_tools/build_jupyter_book.sh index e051a8d30..d791f7baa 100755 --- a/build_tools/build_jupyter_book.sh +++ b/build_tools/build_jupyter_book.sh @@ -30,16 +30,18 @@ write_changed_html() { if [ -n "$GITHUB_PULL_REQUEST_NUMBER" ] then GITHUB_PULL_REQUEST_URL="https://github.com/inria/scikit-learn-mooc/pull/$GITHUB_PULL_REQUEST_NUMBER" - echo "The following files may have been changed by PR $GITHUB_PR_NUMBER:" + echo "The following files may have been changed by PR $GITHUB_PULL_REQUEST_NUMBER:" echo "$affected" ( echo '' - echo 'Files changed by PR '"$GITHUB_PR_URL" + echo "Files changed by PR $GITHUB_PULL_REQUEST_URL" echo '
<ul>'
            echo "$affected" | sed 's|.*|<li>• <a href="&">&</a> [<a href="https://inria.github.io/scikit-learn-mooc/&">main</a>]</li>|'
            echo '</ul>'
            echo '<a href="index.html">
This PR JupyterBook index' echo '' ) > "$jupyter_book_build_dir/_changed.html" + else + echo "The variable 'GITHUB_PULL_REQUEST_NUMBER' is not defined: not writing the '_changed.html' file." fi } From 3aa71911d83afc3cdb68168eeb76d7e6033d0fef Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 3 Nov 2023 15:17:48 +0100 Subject: [PATCH 083/108] MAINT Revert fix pandas version (#748) Co-authored-by: ArturoAmorQ --- environment-dev.yml | 4 ++-- environment.yml | 4 ++-- local-install-instructions.md | 2 +- requirements-dev.txt | 4 ++-- requirements.txt | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 260ae54a8..7836f9a59 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -3,9 +3,9 @@ channels: - conda-forge dependencies: - scikit-learn >= 1.3 - - pandas == 2.0 # avoid seaborn warning + - pandas >= 1 - matplotlib-base - - seaborn + - seaborn >= 0.13 - plotly >= 5.10 - jupytext - beautifulsoup4 diff --git a/environment.yml b/environment.yml index 6b9c4fcfa..0cad5e6fa 100644 --- a/environment.yml +++ b/environment.yml @@ -5,9 +5,9 @@ channels: dependencies: - scikit-learn >= 1.3 - - pandas == 2.0 # avoid seaborn warning + - pandas >= 1 - matplotlib-base - - seaborn + - seaborn >= 0.13 - jupyterlab - notebook - plotly >= 5.10 diff --git a/local-install-instructions.md b/local-install-instructions.md index 4cd7cffd1..e934dd882 100644 --- a/local-install-instructions.md +++ b/local-install-instructions.md @@ -48,7 +48,7 @@ Using python in /home/lesteve/miniconda3/envs/scikit-learn-course [ OK ] matplotlib version 3.3.3 [ OK ] sklearn version 1.3 [ OK ] pandas version 2.0 -[ OK ] seaborn version 0.11.1 +[ OK ] seaborn version 0.13 [ OK ] notebook version 6.2.0 [ OK ] plotly version 5.10.0 ``` diff --git a/requirements-dev.txt b/requirements-dev.txt index 89ad0ad4b..f7f00917d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ scikit-learn>=1.3 -pandas==2.0 # avoid seaborn warning +pandas >= 1 matplotlib -seaborn +seaborn >= 0.13 plotly jupyter-book>=0.11 # Partial fix for the navbar scrollToActive behavior: diff --git a/requirements.txt b/requirements.txt index 6e006d442..1333ae488 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ scikit-learn>=1.3 -pandas==2.0 # avoid seaborn warning +pandas >= 1 matplotlib -seaborn +seaborn >= 0.13 plotly jupyterlab notebook From 4851394fd6248f20b664d54e7eba7d1762e87508 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 3 Nov 2023 15:43:17 +0100 Subject: [PATCH 084/108] ENH Add Adult census dataset description (#747) --- jupyter-book/_toc.yml | 2 +- .../appendix/adult_census_description.md | 11 - notebooks/datasets_adult_census.ipynb | 203 ++++++++++++++++++ python_scripts/datasets_adult_census.py | 160 ++++++++++++++ python_scripts/linear_models_ex_03.py | 2 +- python_scripts/linear_models_sol_03.py | 2 +- 6 files changed, 366 insertions(+), 14 deletions(-) delete mode 100644 jupyter-book/appendix/adult_census_description.md create mode 100644 notebooks/datasets_adult_census.ipynb create mode 100644 python_scripts/datasets_adult_census.py diff --git a/jupyter-book/_toc.yml b/jupyter-book/_toc.yml index 01d356e74..1907f35a5 100644 --- a/jupyter-book/_toc.yml +++ b/jupyter-book/_toc.yml @@ -213,7 +213,7 @@ parts: - file: appendix/datasets_intro sections: - file: python_scripts/trees_dataset - - file: appendix/adult_census_description + - file: 
python_scripts/datasets_adult_census - file: python_scripts/datasets_california_housing - file: python_scripts/datasets_ames_housing - file: python_scripts/datasets_blood_transfusion diff --git a/jupyter-book/appendix/adult_census_description.md b/jupyter-book/appendix/adult_census_description.md deleted file mode 100644 index d7775cf7d..000000000 --- a/jupyter-book/appendix/adult_census_description.md +++ /dev/null @@ -1,11 +0,0 @@ -# The adult census dataset - -This dataset is a collection of information related to a person. The prediction -task is to predict whether a person is earning a salary above or below 50 k$. - -We extensively exploring this dataset in the first module "The predictive -modeling pipeline", in the first sequence "Tabular data exploration", in the -first notebook "First look at our dataset". - -To avoid repeating the same information, we redirect the reader to this -particular notebook. diff --git a/notebooks/datasets_adult_census.ipynb b/notebooks/datasets_adult_census.ipynb new file mode 100644 index 000000000..371525cc2 --- /dev/null +++ b/notebooks/datasets_adult_census.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Adult census dataset\n", + "\n", + "[This dataset](http://www.openml.org/d/1590) is a collection of demographic\n", + "information for the adult population as of 1994 in the USA. The prediction\n", + "task is to predict whether a person is earning a high or low revenue in\n", + "USD/year.\n", + "\n", + "The column named **class** is the target variable (i.e., the variable which we\n", + "want to predict). The two possible classes are `\" <=50K\"` (low-revenue) and\n", + "`\" >50K\"` (high-revenue).\n", + "\n", + "Before drawing any conclusions based on its statistics or the predictions of\n", + "models trained on it, remember that this dataset is not only outdated, but is\n", + "also not representative of the US population. In fact, the original data\n", + "contains a feature named `fnlwgt` that encodes the number of units in the\n", + "target population that the responding unit represents.\n", + "\n", + "First we load the dataset. We keep only some columns of interest to ease the\n", + "plotting." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", + "columns_to_plot = [\n", + " \"age\",\n", + " \"education-num\",\n", + " \"capital-loss\",\n", + " \"capital-gain\",\n", + " \"hours-per-week\",\n", + " \"relationship\",\n", + " \"class\",\n", + "]\n", + "target_name = \"class\"\n", + "target = adult_census[target_name]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We explore this dataset in the first module's notebook \"First look at our\n", + "dataset\", where we provide a first intuition on how the data is structured.\n", + "There, we use a seaborn pairplot to visualize pairwise relationships between\n", + "the numerical variables in the dataset. 
This tool aligns scatter plots for every pair\n", + "of variables and histograms for the plots in the\n", + "diagonal of the array.\n", + "\n", + "This approach is limited:\n", + "- Pair plots can only deal with numerical features and;\n", + "- by observing pairwise interactions we end up with a two-dimensional\n", + " projection of a multi-dimensional feature space, which can lead to a wrong\n", + " interpretation of the individual impact of a feature.\n", + "\n", + "Here we explore with some more detail the relation between features using\n", + "plotly `Parcoords`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.graph_objects as go\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "le = LabelEncoder()\n", + "\n", + "\n", + "def generate_dict(col):\n", + " \"\"\"Check if column is categorical and generate the appropriate dict\"\"\"\n", + " if adult_census[col].dtype == \"object\": # Categorical column\n", + " encoded = le.fit_transform(adult_census[col])\n", + " return {\n", + " \"tickvals\": list(range(len(le.classes_))),\n", + " \"ticktext\": list(le.classes_),\n", + " \"label\": col,\n", + " \"values\": encoded,\n", + " }\n", + " else: # Numerical column\n", + " return {\"label\": col, \"values\": adult_census[col]}\n", + "\n", + "\n", + "plot_list = [generate_dict(col) for col in columns_to_plot]\n", + "\n", + "fig = go.Figure(\n", + " data=go.Parcoords(\n", + " line=dict(\n", + " color=le.fit_transform(target),\n", + " colorscale=\"Viridis\",\n", + " ),\n", + " dimensions=plot_list,\n", + " )\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Parcoords` plot is quite similar to the parallel coordinates plot that we\n", + "present in the module on hyperparameters tuning in this mooc. It display the\n", + "values of the features on different columns while the target class is color\n", + "coded. Thus, we are able to quickly inspect if there is a range of values for\n", + "a certain feature which is leading to a particular result.\n", + "\n", + "As in the parallel coordinates plot, it is possible to select one or more\n", + "ranges of values by clicking and holding on any axis of the plot. You can then\n", + "slide (move) the range selection and cross two selections to see the\n", + "intersections. You can undo a selection by clicking once again on the same\n", + "axis.\n", + "\n", + "In particular for this dataset we observe that values of `\"age\"` lower to 20\n", + "years are quite predictive of low-income, regardless of the value of other\n", + "features. Similarly, a `\"capital-loss\"` above `4000` seems to lead to\n", + "low-income.\n", + "\n", + "Even if it is beyond the scope of the present MOOC, one can additionally\n", + "explore correlations between features, for example, using Spearman's rank\n", + "correlation, as the more popular Pearson's correlation is only appropriate for\n", + "continuous data that is normally distributed and linearly related. Spearman's\n", + "correlation is more versatile in dealing with non-linear relationships and\n", + "ordinal data, but it is not meant for nominal categorical data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "from scipy.cluster import hierarchy\n", + "from scipy.spatial.distance import squareform\n", + "from scipy.stats import spearmanr\n", + "\n", + "# Keep numerical features only\n", + "X = adult_census[columns_to_plot].drop(columns=[\"class\", \"relationship\"])\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))\n", + "corr = spearmanr(X).correlation\n", + "\n", + "# Ensure the correlation matrix is symmetric\n", + "corr = (corr + corr.T) / 2\n", + "np.fill_diagonal(corr, 1)\n", + "\n", + "# We convert the correlation matrix to a distance matrix before performing\n", + "# hierarchical clustering using Ward's linkage.\n", + "distance_matrix = 1 - np.abs(corr)\n", + "dist_linkage = hierarchy.ward(squareform(distance_matrix))\n", + "dendro = hierarchy.dendrogram(\n", + " dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90\n", + ")\n", + "dendro_idx = np.arange(0, len(dendro[\"ivl\"]))\n", + "\n", + "ax2.imshow(corr[dendro[\"leaves\"], :][:, dendro[\"leaves\"]], cmap=\"coolwarm\")\n", + "ax2.set_xticks(dendro_idx)\n", + "ax2.set_yticks(dendro_idx)\n", + "ax2.set_xticklabels(dendro[\"ivl\"], rotation=\"vertical\")\n", + "ax2.set_yticklabels(dendro[\"ivl\"])\n", + "_ = fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using a [diverging\n", + "colormap](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging)\n", + "such as \"coolwarm\", the softer the color, the less (anti)correlation between\n", + "features (no correlation is mapped to white color). In this case dark blue\n", + "represents strong negative correlations and dark red means strong positive\n", + "correlations. Indeed, any feature is perfectly correlated to itself." + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/python_scripts/datasets_adult_census.py b/python_scripts/datasets_adult_census.py new file mode 100644 index 000000000..f86bf40ef --- /dev/null +++ b/python_scripts/datasets_adult_census.py @@ -0,0 +1,160 @@ +# --- +# jupyter: +# kernelspec: +# display_name: Python 3 +# name: python3 +# --- + +# %% [markdown] +# # The adult census dataset +# +# [This dataset](http://www.openml.org/d/1590) is a collection of demographic +# information for the adult population as of 1994 in the USA. The prediction +# task is to predict whether a person is earning a high or low revenue in +# USD/year. +# +# The column named **class** is the target variable (i.e., the variable which we +# want to predict). The two possible classes are `" <=50K"` (low-revenue) and +# `" >50K"` (high-revenue). +# +# Before drawing any conclusions based on its statistics or the predictions of +# models trained on it, remember that this dataset is not only outdated, but is +# also not representative of the US population. In fact, the original data +# contains a feature named `fnlwgt` that encodes the number of units in the +# target population that the responding unit represents. +# +# First we load the dataset. We keep only some columns of interest to ease the +# plotting. 
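+#
+# The description above links to the dataset's OpenML page. To inspect the
+# original data, including the `fnlwgt` column, one can presumably fetch it
+# with scikit-learn (here we assume that the `"adult"` dataset, version 2,
+# corresponds to the OpenML id 1590 linked above):
+#
+# ```python
+# from sklearn.datasets import fetch_openml
+#
+# # `as_frame=True` returns the original columns in a pandas dataframe
+# adult_full = fetch_openml("adult", version=2, as_frame=True).frame
+# adult_full["fnlwgt"].head()
+# ```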
+ +# %% +import pandas as pd + +adult_census = pd.read_csv("../datasets/adult-census.csv") +columns_to_plot = [ + "age", + "education-num", + "capital-loss", + "capital-gain", + "hours-per-week", + "relationship", + "class", +] +target_name = "class" +target = adult_census[target_name] + +# %% [markdown] +# We explore this dataset in the first module's notebook "First look at our +# dataset", where we provide a first intuition on how the data is structured. +# There, we use a seaborn pairplot to visualize pairwise relationships between +# the numerical variables in the dataset. This tool aligns scatter plots for every pair +# of variables and histograms for the plots in the +# diagonal of the array. +# +# This approach is limited: +# - Pair plots can only deal with numerical features and; +# - by observing pairwise interactions we end up with a two-dimensional +# projection of a multi-dimensional feature space, which can lead to a wrong +# interpretation of the individual impact of a feature. +# +# Here we explore with some more detail the relation between features using +# plotly `Parcoords`. + +# %% +import plotly.graph_objects as go +from sklearn.preprocessing import LabelEncoder + +le = LabelEncoder() + + +def generate_dict(col): + """Check if column is categorical and generate the appropriate dict""" + if adult_census[col].dtype == "object": # Categorical column + encoded = le.fit_transform(adult_census[col]) + return { + "tickvals": list(range(len(le.classes_))), + "ticktext": list(le.classes_), + "label": col, + "values": encoded, + } + else: # Numerical column + return {"label": col, "values": adult_census[col]} + + +plot_list = [generate_dict(col) for col in columns_to_plot] + +fig = go.Figure( + data=go.Parcoords( + line=dict( + color=le.fit_transform(target), + colorscale="Viridis", + ), + dimensions=plot_list, + ) +) +fig.show() + +# %% [markdown] +# The `Parcoords` plot is quite similar to the parallel coordinates plot that we +# present in the module on hyperparameters tuning in this mooc. It display the +# values of the features on different columns while the target class is color +# coded. Thus, we are able to quickly inspect if there is a range of values for +# a certain feature which is leading to a particular result. +# +# As in the parallel coordinates plot, it is possible to select one or more +# ranges of values by clicking and holding on any axis of the plot. You can then +# slide (move) the range selection and cross two selections to see the +# intersections. You can undo a selection by clicking once again on the same +# axis. +# +# In particular for this dataset we observe that values of `"age"` lower to 20 +# years are quite predictive of low-income, regardless of the value of other +# features. Similarly, a `"capital-loss"` above `4000` seems to lead to +# low-income. +# +# Even if it is beyond the scope of the present MOOC, one can additionally +# explore correlations between features, for example, using Spearman's rank +# correlation, as the more popular Pearson's correlation is only appropriate for +# continuous data that is normally distributed and linearly related. Spearman's +# correlation is more versatile in dealing with non-linear relationships and +# ordinal data, but it is not meant for nominal categorical data. 
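+#
+# As a minimal illustration of this difference, one could compare both
+# coefficients on a single pair of numerical columns (the pair `"age"` /
+# `"hours-per-week"` is an arbitrary choice):
+#
+# ```python
+# from scipy.stats import pearsonr, spearmanr
+#
+# # Pearson quantifies linear association, while Spearman quantifies monotonic
+# # association by correlating the ranks of the observations
+# print(pearsonr(adult_census["age"], adult_census["hours-per-week"]))
+# print(spearmanr(adult_census["age"], adult_census["hours-per-week"]))
+# ```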
+ +# %% +import matplotlib.pyplot as plt +import numpy as np + +from scipy.cluster import hierarchy +from scipy.spatial.distance import squareform +from scipy.stats import spearmanr + +# Keep numerical features only +X = adult_census[columns_to_plot].drop(columns=["class", "relationship"]) +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) +corr = spearmanr(X).correlation + +# Ensure the correlation matrix is symmetric +corr = (corr + corr.T) / 2 +np.fill_diagonal(corr, 1) + +# We convert the correlation matrix to a distance matrix before performing +# hierarchical clustering using Ward's linkage. +distance_matrix = 1 - np.abs(corr) +dist_linkage = hierarchy.ward(squareform(distance_matrix)) +dendro = hierarchy.dendrogram( + dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90 +) +dendro_idx = np.arange(0, len(dendro["ivl"])) + +ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]], cmap="coolwarm") +ax2.set_xticks(dendro_idx) +ax2.set_yticks(dendro_idx) +ax2.set_xticklabels(dendro["ivl"], rotation="vertical") +ax2.set_yticklabels(dendro["ivl"]) +_ = fig.tight_layout() + +# %% [markdown] +# Using a [diverging +# colormap](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging) +# such as "coolwarm", the softer the color, the less (anti)correlation between +# features (no correlation is mapped to white color). In this case dark blue +# represents strong negative correlations and dark red means strong positive +# correlations. Indeed, any feature is perfectly correlated to itself. diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 50fe942cd..bede1ad85 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -79,7 +79,7 @@ # Write your code here. # %% [markdown] -# For the following questions, you can copy adn paste the following snippet to +# For the following questions, you can copy and paste the following snippet to # get the feature names from the column transformer here named `preprocessor`. # # ```python diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py index c76806a45..a37bbf2fb 100644 --- a/python_scripts/linear_models_sol_03.py +++ b/python_scripts/linear_models_sol_03.py @@ -145,7 +145,7 @@ ) # %% [markdown] -# For the following questions, you can copy adn paste the following snippet to +# For the following questions, you can copy and paste the following snippet to # get the feature names from the column transformer here named `preprocessor`. 
# # ```python From 26ad6d282bdfa786f8da77b8a945229f8c51b662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 6 Nov 2023 10:35:58 +0100 Subject: [PATCH 085/108] Generate notebooks --- notebooks/datasets_adult_census.ipynb | 2 +- notebooks/linear_models_ex_03.ipynb | 2 +- notebooks/linear_models_sol_02.ipynb | 4 +- notebooks/linear_models_sol_03.ipynb | 2 +- .../linear_regression_non_linear_link.ipynb | 65 ++++--------------- .../linear_regression_without_sklearn.ipynb | 22 +++---- notebooks/logistic_regression.ipynb | 62 ++++++------------ notebooks/trees_hyperparameters.ipynb | 11 +--- 8 files changed, 51 insertions(+), 119 deletions(-) diff --git a/notebooks/datasets_adult_census.ipynb b/notebooks/datasets_adult_census.ipynb index 371525cc2..139287829 100644 --- a/notebooks/datasets_adult_census.ipynb +++ b/notebooks/datasets_adult_census.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# The Adult census dataset\n", + "# The adult census dataset\n", "\n", "[This dataset](http://www.openml.org/d/1590) is a collection of demographic\n", "information for the adult population as of 1994 in the USA. The prediction\n", diff --git a/notebooks/linear_models_ex_03.ipynb b/notebooks/linear_models_ex_03.ipynb index 7ada01f07..29db0d1d4 100644 --- a/notebooks/linear_models_ex_03.ipynb +++ b/notebooks/linear_models_ex_03.ipynb @@ -131,7 +131,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For the following questions, you can copy adn paste the following snippet to\n", + "For the following questions, you can copy and paste the following snippet to\n", "get the feature names from the column transformer here named `preprocessor`.\n", "\n", "```python\n", diff --git a/notebooks/linear_models_sol_02.ipynb b/notebooks/linear_models_sol_02.ipynb index 38ac00ef6..e124537d1 100644 --- a/notebooks/linear_models_sol_02.ipynb +++ b/notebooks/linear_models_sol_02.ipynb @@ -223,9 +223,9 @@ "outputs": [], "source": [ "# solution\n", - "culmen_length_first_sample = 181.0\n", + "flipper_length_first_sample = 181.0\n", "culmen_depth_first_sample = 18.7\n", - "culmen_length_first_sample * culmen_depth_first_sample" + "flipper_length_first_sample * culmen_depth_first_sample" ] }, { diff --git a/notebooks/linear_models_sol_03.ipynb b/notebooks/linear_models_sol_03.ipynb index 20256e76b..ce7e5ace7 100644 --- a/notebooks/linear_models_sol_03.ipynb +++ b/notebooks/linear_models_sol_03.ipynb @@ -211,7 +211,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For the following questions, you can copy adn paste the following snippet to\n", + "For the following questions, you can copy and paste the following snippet to\n", "get the feature names from the column transformer here named `preprocessor`.\n", "\n", "```python\n", diff --git a/notebooks/linear_regression_non_linear_link.ipynb b/notebooks/linear_regression_non_linear_link.ipynb index 33f6936cc..9a060a2c0 100644 --- a/notebooks/linear_regression_non_linear_link.ipynb +++ b/notebooks/linear_regression_non_linear_link.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "14eec485", "metadata": {}, "source": [ "# Non-linear feature engineering for Linear Regression\n", @@ -25,7 +24,6 @@ { "cell_type": "code", "execution_count": null, - "id": "8f516165", "metadata": {}, "outputs": [], "source": [ @@ -44,13 +42,13 @@ }, { "cell_type": "markdown", - "id": "00fd3b4f", "metadata": {}, "source": [ - "```{tip}\n", - "`np.random.RandomState` allows to create a random number generator 
which can\n",
-    "be later used to get deterministic results.\n",
-    "```\n",
+    "<div class=\"admonition tip alert alert-warning\">\n",
+    "<p class=\"first admonition-title\" style=\"font-weight: bold;\">Tip</p>\n",
+    "<p class=\"last\"><tt class=\"docutils literal\">np.random.RandomState</tt> allows to create a random number generator which can\n",
+    "be later used to get deterministic results.</p>\n",
+    "</div>
\n", "\n", "To ease the plotting, we create a pandas dataframe containing the data and\n", "target:" @@ -59,7 +57,6 @@ { "cell_type": "code", "execution_count": null, - "id": "5459a97b", "metadata": {}, "outputs": [], "source": [ @@ -71,7 +68,6 @@ { "cell_type": "code", "execution_count": null, - "id": "8b1b2257", "metadata": {}, "outputs": [], "source": [ @@ -84,22 +80,21 @@ }, { "cell_type": "markdown", - "id": "be69fae1", "metadata": {}, "source": [ - "```{warning}\n", - "In scikit-learn, by convention `data` (also called `X` in the scikit-learn\n", - "documentation) should be a 2D matrix of shape `(n_samples, n_features)`.\n", - "If `data` is a 1D vector, you need to reshape it into a matrix with a\n", + "
\n", + "

Warning

\n", + "

In scikit-learn, by convention data (also called X in the scikit-learn\n", + "documentation) should be a 2D matrix of shape (n_samples, n_features).\n", + "If data is a 1D vector, you need to reshape it into a matrix with a\n", "single column if the vector represents a feature or a single row if the\n", - "vector represents a sample.\n", - "```" + "vector represents a sample.

\n", + "
" ] }, { "cell_type": "code", "execution_count": null, - "id": "46804be9", "metadata": {}, "outputs": [], "source": [ @@ -110,7 +105,6 @@ }, { "cell_type": "markdown", - "id": "a4209f00", "metadata": { "lines_to_next_cell": 2 }, @@ -122,7 +116,6 @@ { "cell_type": "code", "execution_count": null, - "id": "a1bd392b", "metadata": {}, "outputs": [], "source": [ @@ -142,7 +135,6 @@ }, { "cell_type": "markdown", - "id": "7bfcbeb8", "metadata": {}, "source": [ "We now observe the limitations of fitting a linear regression model." @@ -151,7 +143,6 @@ { "cell_type": "code", "execution_count": null, - "id": "1545fec5", "metadata": {}, "outputs": [], "source": [ @@ -165,7 +156,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e8c79631", "metadata": {}, "outputs": [], "source": [ @@ -174,7 +164,6 @@ }, { "cell_type": "markdown", - "id": "545fc1f3", "metadata": {}, "source": [ "Here the coefficient and intercept learnt by `LinearRegression` define the\n", @@ -185,7 +174,6 @@ { "cell_type": "code", "execution_count": null, - "id": "0f95ceef", "metadata": {}, "outputs": [], "source": [ @@ -197,7 +185,6 @@ }, { "cell_type": "markdown", - "id": "1a34a48c", "metadata": {}, "source": [ "Notice that the learnt model cannot handle the non-linear relationship between\n", @@ -217,7 +204,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e01b02d2", "metadata": {}, "outputs": [], "source": [ @@ -230,7 +216,6 @@ { "cell_type": "code", "execution_count": null, - "id": "9a27773e", "metadata": {}, "outputs": [], "source": [ @@ -239,7 +224,6 @@ }, { "cell_type": "markdown", - "id": "4d5070e3", "metadata": {}, "source": [ "Instead of having a model which can natively deal with non-linearity, we could\n", @@ -256,7 +240,6 @@ { "cell_type": "code", "execution_count": null, - "id": "28c13246", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +249,6 @@ { "cell_type": "code", "execution_count": null, - "id": "69d0ba50", "metadata": {}, "outputs": [], "source": [ @@ -276,7 +258,6 @@ }, { "cell_type": "markdown", - "id": "7925141e", "metadata": {}, "source": [ "Instead of manually creating such polynomial features one could directly use\n", @@ -286,7 +267,6 @@ { "cell_type": "code", "execution_count": null, - "id": "d31ed0f4", "metadata": {}, "outputs": [], "source": [ @@ -297,7 +277,6 @@ }, { "cell_type": "markdown", - "id": "6a7fe453", "metadata": {}, "source": [ "In the previous cell we had to set `include_bias=False` as otherwise we would\n", @@ -312,7 +291,6 @@ }, { "cell_type": "markdown", - "id": "269fbe2b", "metadata": {}, "source": [ "To demonstrate the use of the `PolynomialFeatures` class, we use a\n", @@ -323,7 +301,6 @@ { "cell_type": "code", "execution_count": null, - "id": "38ba0c5c", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +317,6 @@ { "cell_type": "code", "execution_count": null, - "id": "5df7d4a4", "metadata": {}, "outputs": [], "source": [ @@ -349,7 +325,6 @@ }, { "cell_type": "markdown", - "id": "fe259d20", "metadata": {}, "source": [ "We can see that even with a linear model, we can overcome the linearity\n", @@ -379,7 +354,6 @@ { "cell_type": "code", "execution_count": null, - "id": "7d46da9b", "metadata": {}, "outputs": [], "source": [ @@ -392,7 +366,6 @@ { "cell_type": "code", "execution_count": null, - "id": "9406b676", "metadata": {}, "outputs": [], "source": [ @@ -401,7 +374,6 @@ }, { "cell_type": "markdown", - "id": "fd29730e", "metadata": {}, "source": [ "The predictions of our SVR with a linear kernel are all aligned on a straight\n", @@ -419,7 +391,6 @@ { 
"cell_type": "code", "execution_count": null, - "id": "ae1550fa", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +401,6 @@ { "cell_type": "code", "execution_count": null, - "id": "c4670a4e", "metadata": {}, "outputs": [], "source": [ @@ -439,7 +409,6 @@ }, { "cell_type": "markdown", - "id": "732b2b0f", "metadata": {}, "source": [ "Kernel methods such as SVR are very efficient for small to medium datasets.\n", @@ -460,7 +429,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e30e6b37", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +444,6 @@ { "cell_type": "code", "execution_count": null, - "id": "b46eb0ef", "metadata": {}, "outputs": [], "source": [ @@ -486,7 +453,6 @@ { "cell_type": "code", "execution_count": null, - "id": "5403e6b1", "metadata": {}, "outputs": [], "source": [ @@ -502,7 +468,6 @@ { "cell_type": "code", "execution_count": null, - "id": "0dcdfe92", "metadata": {}, "outputs": [], "source": [ @@ -511,7 +476,6 @@ }, { "cell_type": "markdown", - "id": "4b4f0560", "metadata": {}, "source": [ "`Nystroem` is a nice alternative to `PolynomialFeatures` that makes it\n", @@ -523,7 +487,6 @@ { "cell_type": "code", "execution_count": null, - "id": "41d6abd8", "metadata": {}, "outputs": [], "source": [ @@ -539,7 +502,6 @@ { "cell_type": "code", "execution_count": null, - "id": "be6a232c", "metadata": {}, "outputs": [], "source": [ @@ -550,7 +512,6 @@ }, { "cell_type": "markdown", - "id": "7860e12d", "metadata": {}, "source": [ "## Notebook Recap\n", @@ -579,4 +540,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/notebooks/linear_regression_without_sklearn.ipynb b/notebooks/linear_regression_without_sklearn.ipynb index 22707379c..039e1014a 100644 --- a/notebooks/linear_regression_without_sklearn.ipynb +++ b/notebooks/linear_regression_without_sklearn.ipynb @@ -7,8 +7,8 @@ "# Linear regression without scikit-learn\n", "\n", "In this notebook, we introduce linear regression. Before presenting the\n", - "available scikit-learn classes, we will provide some insights with a simple\n", - "example. We will use a dataset that contains measurements taken on penguins." + "available scikit-learn classes, here we provide some insights with a simple\n", + "example. We use a dataset that contains measurements taken on penguins." ] }, { @@ -38,8 +38,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will formulate the following problem: using the flipper length of a\n", - "penguin, we would like to infer its mass." + "We aim to solve the following problem: using the flipper length of a penguin,\n", + "we would like to infer its mass." ] }, { @@ -110,8 +110,8 @@ "metadata": {}, "source": [ "Using the model we defined above, we can check the body mass values predicted\n", - "for a range of flipper lengths. We will set `weight_flipper_length` to be 45\n", - "and `intercept_body_mass` to be -5000." + "for a range of flipper lengths. We set `weight_flipper_length` and\n", + "`intercept_body_mass` to arbitrary values of 45 and -5000, respectively." ] }, { @@ -159,7 +159,7 @@ "source": [ "The variable `weight_flipper_length` is a weight applied to the feature\n", "`flipper_length` in order to make the inference. When this coefficient is\n", - "positive, it means that penguins with longer flipper lengths will have larger\n", + "positive, it means that penguins with longer flipper lengths have larger\n", "body masses. If the coefficient is negative, it means that penguins with\n", "shorter flipper lengths have larger body masses. 
Graphically, this coefficient\n", "is represented by the slope of the curve in the plot. Below we show what the\n", @@ -207,7 +207,7 @@ "source": [ "In our case, this coefficient has a meaningful unit: g/mm. For instance, a\n", "coefficient of 40 g/mm, means that for each additional millimeter in flipper\n", - "length, the body weight predicted will increase by 40 g." + "length, the body weight predicted increases by 40 g." ] }, { @@ -238,8 +238,8 @@ "This parameter corresponds to the value on the y-axis if `flipper_length=0`\n", "(which in our case is only a mathematical consideration, as in our data, the\n", " value of `flipper_length` only goes from 170mm to 230mm). This y-value when\n", - "x=0 is called the y-intercept. If `intercept_body_mass` is 0, the curve will\n", - "pass through the origin:" + "x=0 is called the y-intercept. If `intercept_body_mass` is 0, the curve passes\n", + "through the origin:" ] }, { @@ -275,7 +275,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Otherwise, it will pass through the `intercept_body_mass` value:" + "Otherwise, it passes through the `intercept_body_mass` value:" ] }, { diff --git a/notebooks/logistic_regression.ipynb b/notebooks/logistic_regression.ipynb index 691283b02..0efd4e0dc 100644 --- a/notebooks/logistic_regression.ipynb +++ b/notebooks/logistic_regression.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "b0e67575", "metadata": {}, "source": [ "# Linear models for classification\n", @@ -18,19 +17,18 @@ }, { "cell_type": "markdown", - "id": "ac574018", "metadata": {}, "source": [ - "```{note}\n", - "If you want a deeper overview regarding this dataset, you can refer to the\n", - "Appendix - Datasets description section at the end of this MOOC.\n", - "```" + "
\n", + "

Note

\n", + "

If you want a deeper overview regarding this dataset, you can refer to the\n", + "Appendix - Datasets description section at the end of this MOOC.

\n", + "
" ] }, { "cell_type": "code", "execution_count": null, - "id": "a47d670a", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +46,6 @@ }, { "cell_type": "markdown", - "id": "2165fcfc", "metadata": {}, "source": [ "We can quickly start by visualizing the feature distribution by class:" @@ -57,7 +54,6 @@ { "cell_type": "code", "execution_count": null, - "id": "9ac5a70c", "metadata": {}, "outputs": [], "source": [ @@ -72,7 +68,6 @@ }, { "cell_type": "markdown", - "id": "cab96de7", "metadata": {}, "source": [ "We can observe that we have quite a simple problem. When the culmen length\n", @@ -86,7 +81,6 @@ { "cell_type": "code", "execution_count": null, - "id": "b6a3b04c", "metadata": {}, "outputs": [], "source": [ @@ -103,7 +97,6 @@ }, { "cell_type": "markdown", - "id": "4964b148", "metadata": {}, "source": [ "The linear regression that we previously saw predicts a continuous output.\n", @@ -117,7 +110,6 @@ { "cell_type": "code", "execution_count": null, - "id": "47347104", "metadata": {}, "outputs": [], "source": [ @@ -133,7 +125,6 @@ }, { "cell_type": "markdown", - "id": "bafd8265", "metadata": {}, "source": [ "Since we are dealing with a classification problem containing only 2 features,\n", @@ -141,22 +132,21 @@ "the rule used by our predictive model to affect a class label given the\n", "feature values of the sample.\n", "\n", - "```{note}\n", - "Here, we use the class `DecisionBoundaryDisplay`. This educational tool allows\n", + "
\n", + "

Note

\n", + "

Here, we use the class DecisionBoundaryDisplay. This educational tool allows\n", "us to gain some insights by plotting the decision function boundary learned by\n", - "the classifier in a 2 dimensional feature space.\n", - "\n", - "Notice however that in more realistic machine learning contexts, one would\n", + "the classifier in a 2 dimensional feature space.

\n", + "

Notice however that in more realistic machine learning contexts, one would\n", "typically fit on more than two features at once and therefore it would not be\n", "possible to display such a visualization of the decision boundary in\n", - "general.\n", - "```" + "general.

\n", + "
" ] }, { "cell_type": "code", "execution_count": null, - "id": "dd628d44", "metadata": {}, "outputs": [], "source": [ @@ -182,7 +172,6 @@ }, { "cell_type": "markdown", - "id": "dbd93bf3", "metadata": {}, "source": [ "Thus, we see that our decision function is represented by a straight line\n", @@ -208,7 +197,6 @@ { "cell_type": "code", "execution_count": null, - "id": "8c76e56c", "metadata": {}, "outputs": [], "source": [ @@ -219,7 +207,6 @@ }, { "cell_type": "markdown", - "id": "416a9aff", "metadata": {}, "source": [ "You can [access pipeline\n", @@ -236,7 +223,6 @@ { "cell_type": "code", "execution_count": null, - "id": "8c9b19ae", "metadata": {}, "outputs": [], "source": [ @@ -246,7 +232,6 @@ }, { "cell_type": "markdown", - "id": "083d61ff", "metadata": {}, "source": [ "If one of the weights had been zero, the decision boundary would have been\n", @@ -266,7 +251,6 @@ { "cell_type": "code", "execution_count": null, - "id": "d30ac7e5", "metadata": {}, "outputs": [], "source": [ @@ -278,7 +262,6 @@ }, { "cell_type": "markdown", - "id": "6e7141da", "metadata": {}, "source": [ "In this case, our logistic regression classifier predicts the Chinstrap\n", @@ -286,7 +269,7 @@ "coordinates of this test data point match a location close to the decision\n", "boundary, in the red region.\n", "\n", - "As mentioned in the introductory slides ๐ŸŽฅ **Intuitions on linear models**,\n", + "As mentioned in the introductory slides \ud83c\udfa5 **Intuitions on linear models**,\n", "one can alternatively use the `predict_proba` method to compute continuous\n", "values (\"soft predictions\") that correspond to an estimation of the confidence\n", "of the target belonging to each class.\n", @@ -301,7 +284,6 @@ { "cell_type": "code", "execution_count": null, - "id": "f03d6062", "metadata": {}, "outputs": [], "source": [ @@ -311,7 +293,6 @@ }, { "cell_type": "markdown", - "id": "bd3a7c7f", "metadata": {}, "source": [ "More in general, the output of `predict_proba` is an array of shape\n", @@ -321,7 +302,6 @@ { "cell_type": "code", "execution_count": null, - "id": "e12bb08c", "metadata": {}, "outputs": [], "source": [ @@ -330,7 +310,6 @@ }, { "cell_type": "markdown", - "id": "67f73ae8", "metadata": {}, "source": [ "Also notice that the sum of (estimated) predicted probabilities across classes\n", @@ -341,7 +320,6 @@ { "cell_type": "code", "execution_count": null, - "id": "427587b6", "metadata": {}, "outputs": [], "source": [ @@ -355,16 +333,16 @@ }, { "cell_type": "markdown", - "id": "053ad22c", "metadata": {}, "source": [ - "```{warning}\n", - "We insist that the output of `predict_proba` are just estimations. Their\n", + "
\n", + "

Warning

\n", + "

We insist that the output of predict_proba are just estimations. Their\n", "reliability on being a good estimate of the true conditional class-assignment\n", "probabilities depends on the quality of the model. Even classifiers with a\n", "high accuracy on a test set may be overconfident for some individuals and\n", - "underconfident for others.\n", - "```\n", + "underconfident for others.

\n", + "
\n", "\n", "Similarly to the hard decision boundary shown above, one can set the\n", "`response_method` to `\"predict_proba\"` in the `DecisionBoundaryDisplay` to\n", @@ -384,7 +362,6 @@ { "cell_type": "code", "execution_count": null, - "id": "fbcece8a", "metadata": {}, "outputs": [], "source": [ @@ -407,7 +384,6 @@ }, { "cell_type": "markdown", - "id": "54133c3a", "metadata": {}, "source": [ "For multi-class classification the logistic regression uses the [softmax\n", @@ -432,4 +408,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/notebooks/trees_hyperparameters.ipynb b/notebooks/trees_hyperparameters.ipynb index e60248e94..b9de0ac27 100644 --- a/notebooks/trees_hyperparameters.ipynb +++ b/notebooks/trees_hyperparameters.ipynb @@ -347,14 +347,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As expected, we see that the blue blob on the right and the red blob on the\n", - "top are easily separated. However, more splits will be required to better\n", - "split the blob were both blue and red data points are mixed.\n", - "\n", - "Indeed, we see that red blob on the top and the blue blob on the right of the\n", - "plot are perfectly separated. However, the tree is still making mistakes in\n", - "the area where the blobs are mixed together. Let's check the tree\n", - "representation." + "As expected, we see that the blue blob in the lower right and the red blob on\n", + "the top are easily separated. However, more splits will be required to better\n", + "split the blob were both blue and red data points are mixed." ] }, { From 3a5b94a74c6a76b506185fb3382f576d71bd0f05 Mon Sep 17 00:00:00 2001 From: Patricia Ortega <147734035+PatriOr@users.noreply.github.com> Date: Mon, 6 Nov 2023 18:22:26 +0100 Subject: [PATCH 086/108] Change verbs to present mode in M3 and M5 (#749) --- python_scripts/parameter_tuning_ex_02.py | 8 +-- python_scripts/parameter_tuning_ex_03.py | 2 +- .../parameter_tuning_grid_search.py | 38 +++++++------- python_scripts/parameter_tuning_nested.py | 12 ++--- .../parameter_tuning_parallel_plot.py | 4 +- python_scripts/parameter_tuning_sol_02.py | 8 +-- python_scripts/parameter_tuning_sol_03.py | 10 ++-- python_scripts/trees_dataset.py | 16 +++--- python_scripts/trees_ex_02.py | 10 ++-- python_scripts/trees_hyperparameters.py | 49 +++++++++---------- python_scripts/trees_regression.py | 21 ++++---- python_scripts/trees_sol_02.py | 16 +++--- 12 files changed, 96 insertions(+), 98 deletions(-) diff --git a/python_scripts/parameter_tuning_ex_02.py b/python_scripts/parameter_tuning_ex_02.py index cd0e5e3f0..ff119eb58 100644 --- a/python_scripts/parameter_tuning_ex_02.py +++ b/python_scripts/parameter_tuning_ex_02.py @@ -68,10 +68,10 @@ # %% [markdown] # Use the previously defined model (called `model`) and using two nested `for` # loops, make a search of the best combinations of the `learning_rate` and -# `max_leaf_nodes` parameters. In this regard, you will need to train and test -# the model by setting the parameters. The evaluation of the model should be -# performed using `cross_val_score` on the training set. We will use the -# following parameters search: +# `max_leaf_nodes` parameters. In this regard, you have to train and test the +# model by setting the parameters. The evaluation of the model should be +# performed using `cross_val_score` on the training set. Use the following +# parameters search: # - `learning_rate` for the values 0.01, 0.1, 1 and 10. 
This parameter controls # the ability of a new tree to correct the error of the previous sequence of # trees diff --git a/python_scripts/parameter_tuning_ex_03.py b/python_scripts/parameter_tuning_ex_03.py index 48c9a5c41..85dfda6db 100644 --- a/python_scripts/parameter_tuning_ex_03.py +++ b/python_scripts/parameter_tuning_ex_03.py @@ -29,7 +29,7 @@ ) # %% [markdown] -# In this exercise, we will progressively define the regression pipeline and +# In this exercise, we progressively define the regression pipeline and # later tune its hyperparameters. # # Start by defining a pipeline that: diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py index 5219d0b51..12bbffb57 100644 --- a/python_scripts/parameter_tuning_grid_search.py +++ b/python_scripts/parameter_tuning_grid_search.py @@ -9,7 +9,7 @@ # # Hyperparameter tuning by grid-search # # In the previous notebook, we saw that hyperparameters can affect the -# generalization performance of a model. In this notebook, we will show how to +# generalization performance of a model. In this notebook, we show how to # optimize hyperparameters using a grid-search approach. # %% [markdown] @@ -49,8 +49,8 @@ ) # %% [markdown] -# We will define a pipeline as seen in the first module. It will handle both -# numerical and categorical features. +# We define a pipeline as seen in the first module, to handle both numerical and +# categorical features. # # The first step is to select all the categorical columns. @@ -61,7 +61,7 @@ categorical_columns = categorical_columns_selector(data) # %% [markdown] -# Here we will use a tree-based model as a classifier (i.e. +# Here we use a tree-based model as a classifier (i.e. # `HistGradientBoostingClassifier`). That means: # # * Numerical variables don't need scaling; @@ -119,8 +119,8 @@ # code. # # Let's see how to use the `GridSearchCV` estimator for doing such search. Since -# the grid-search will be costly, we will only explore the combination -# learning-rate and the maximum number of nodes. +# the grid-search is costly, we only explore the combination learning-rate and +# the maximum number of nodes. # %% # %%time @@ -134,7 +134,7 @@ model_grid_search.fit(data_train, target_train) # %% [markdown] -# Finally, we will check the accuracy of our model using the test set. +# Finally, we check the accuracy of our model using the test set. # %% accuracy = model_grid_search.score(data_test, target_test) @@ -155,17 +155,17 @@ # %% [markdown] # The `GridSearchCV` estimator takes a `param_grid` parameter which defines all -# hyperparameters and their associated values. The grid-search will be in charge +# hyperparameters and their associated values. The grid-search is in charge # of creating all possible combinations and test them. # -# The number of combinations will be equal to the product of the number of -# values to explore for each parameter (e.g. in our example 4 x 3 combinations). -# Thus, adding new parameters with their associated values to be explored become +# The number of combinations are equal to the product of the number of values to +# explore for each parameter (e.g. in our example 4 x 3 combinations). Thus, +# adding new parameters with their associated values to be explored become # rapidly computationally expensive. # # Once the grid-search is fitted, it can be used as any other predictor by -# calling `predict` and `predict_proba`. Internally, it will use the model with -# the best parameters found during `fit`. 
+# calling `predict` and `predict_proba`. Internally, it uses the model with the +# best parameters found during `fit`. # # Get predictions for the 5 first samples using the estimator with the best # parameters. @@ -186,8 +186,8 @@ # parameters "by hand" through a double for loop. # # In addition, we can inspect all results which are stored in the attribute -# `cv_results_` of the grid-search. We will filter some specific columns from -# these results. +# `cv_results_` of the grid-search. We filter some specific columns from these +# results. # %% cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values( @@ -220,9 +220,9 @@ def shorten_param(param_name): # With only 2 parameters, we might want to visualize the grid-search as a # heatmap. We need to transform our `cv_results` into a dataframe where: # -# - the rows will correspond to the learning-rate values; -# - the columns will correspond to the maximum number of leaf; -# - the content of the dataframe will be the mean test scores. +# - the rows correspond to the learning-rate values; +# - the columns correspond to the maximum number of leaf; +# - the content of the dataframe is the mean test scores. # %% pivoted_cv_results = cv_results.pivot_table( @@ -259,7 +259,7 @@ def shorten_param(param_name): # # The precise meaning of those two parameters will be explained later. # -# For now we will note that, in general, **there is no unique optimal parameter +# For now we note that, in general, **there is no unique optimal parameter # setting**: 4 models out of the 12 parameter configurations reach the maximal # accuracy (up to small random fluctuations caused by the sampling of the # training set). diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py index fb7406274..e4ccc7102 100644 --- a/python_scripts/parameter_tuning_nested.py +++ b/python_scripts/parameter_tuning_nested.py @@ -12,12 +12,12 @@ # However, we did not present a proper framework to evaluate the tuned models. # Instead, we focused on the mechanism used to find the best set of parameters. # -# In this notebook, we will reuse some knowledge presented in the module -# "Selecting the best model" to show how to evaluate models where -# hyperparameters need to be tuned. +# In this notebook, we reuse some knowledge presented in the module "Selecting +# the best model" to show how to evaluate models where hyperparameters need to +# be tuned. # -# Thus, we will first load the dataset and create the predictive model that we -# want to optimize and later on, evaluate. +# Thus, we first load the dataset and create the predictive model that we want +# to optimize and later on, evaluate. # # ## Loading the dataset # @@ -111,7 +111,7 @@ # ### With hyperparameter tuning # # As shown in the previous notebook, one can use a search strategy that uses -# cross-validation to find the best set of parameters. Here, we will use a +# cross-validation to find the best set of parameters. Here, we use a # grid-search strategy and reproduce the steps done in the previous notebook. # # First, we have to embed our model into a grid-search and specify the diff --git a/python_scripts/parameter_tuning_parallel_plot.py b/python_scripts/parameter_tuning_parallel_plot.py index 304585cb0..340e75dd0 100644 --- a/python_scripts/parameter_tuning_parallel_plot.py +++ b/python_scripts/parameter_tuning_parallel_plot.py @@ -110,8 +110,8 @@ def shorten_param(param_name): # spread the active ranges and improve the readability of the plot. 
# ``` # -# The parallel coordinates plot will display the values of the hyperparameters -# on different columns while the performance metric is color coded. Thus, we are +# The parallel coordinates plot displays the values of the hyperparameters on +# different columns while the performance metric is color coded. Thus, we are # able to quickly inspect if there is a range of hyperparameters which is # working or not. # diff --git a/python_scripts/parameter_tuning_sol_02.py b/python_scripts/parameter_tuning_sol_02.py index 1ea4cf572..9c5ceaa2c 100644 --- a/python_scripts/parameter_tuning_sol_02.py +++ b/python_scripts/parameter_tuning_sol_02.py @@ -62,10 +62,10 @@ # %% [markdown] # Use the previously defined model (called `model`) and using two nested `for` # loops, make a search of the best combinations of the `learning_rate` and -# `max_leaf_nodes` parameters. In this regard, you will need to train and test -# the model by setting the parameters. The evaluation of the model should be -# performed using `cross_val_score` on the training set. We will use the -# following parameters search: +# `max_leaf_nodes` parameters. In this regard, you need to train and test the +# model by setting the parameters. The evaluation of the model should be +# performed using `cross_val_score` on the training set. Use the following +# parameters search: # - `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls # the ability of a new tree to correct the error of the previous sequence of # trees diff --git a/python_scripts/parameter_tuning_sol_03.py b/python_scripts/parameter_tuning_sol_03.py index 149cc0de1..55773ebd3 100644 --- a/python_scripts/parameter_tuning_sol_03.py +++ b/python_scripts/parameter_tuning_sol_03.py @@ -23,8 +23,8 @@ ) # %% [markdown] -# In this exercise, we will progressively define the regression pipeline and -# later tune its hyperparameters. +# In this exercise, we progressively define the regression pipeline and later +# tune its hyperparameters. # # Start by defining a pipeline that: # * uses a `StandardScaler` to normalize the numerical data; @@ -108,8 +108,8 @@ cv_results = pd.DataFrame(model_random_search.cv_results_) # %% [markdown] tags=["solution"] -# To simplify the axis of the plot, we will rename the column of the dataframe -# and only select the mean test score and the value of the hyperparameters. +# To simplify the axis of the plot, we rename the column of the dataframe and +# only select the mean test score and the value of the hyperparameters. # %% tags=["solution"] column_name_mapping = { @@ -170,7 +170,7 @@ # vary between 0 and 10,000 (e.g. the variable `"Population"`) and B is a # feature that varies between 1 and 10 (e.g. the variable `"AveRooms"`), then # distances between samples (rows of the dataframe) are mostly impacted by -# differences in values of the column A, while values of the column B will be +# differences in values of the column A, while values of the column B are # comparatively ignored. If one applies StandardScaler to such a database, both # the values of A and B will be approximately between -3 and 3 and the neighbor # structure will be impacted more or less equivalently by both variables. 
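The scaling discussion above can be checked empirically. A minimal sketch,
assuming the California housing data and the `KNeighborsRegressor` model used
in this exercise, compares the cross-validated score of the same model with
and without a `StandardScaler`:

```python
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

data, target = fetch_california_housing(return_X_y=True, as_frame=True)

# Distances computed on the raw features are dominated by large-range columns
# such as "Population"; after scaling, all features contribute comparably.
for model in (
    KNeighborsRegressor(),
    make_pipeline(StandardScaler(), KNeighborsRegressor()),
):
    scores = cross_val_score(model, data, target, cv=5)
    print(f"{model}: {scores.mean():.3f} +/- {scores.std():.3f}")
```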
diff --git a/python_scripts/trees_dataset.py b/python_scripts/trees_dataset.py index 708c61b29..888eee5a7 100644 --- a/python_scripts/trees_dataset.py +++ b/python_scripts/trees_dataset.py @@ -15,7 +15,7 @@ # # ## Classification dataset # -# We will use this dataset in classification setting to predict the penguins' +# We use this dataset in classification setting to predict the penguins' # species from anatomical information. # # Each penguin is from one of the three following species: Adelie, Gentoo, and @@ -26,15 +26,15 @@ # penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png) # # This problem is a classification problem since the target is categorical. We -# will limit our input data to a subset of the original features to simplify our -# explanations when presenting the decision tree algorithm. Indeed, we will use +# limit our input data to a subset of the original features to simplify our +# explanations when presenting the decision tree algorithm. Indeed, we use # features based on penguins' culmen measurement. You can learn more about the # penguins' culmen with the illustration below: # # ![Image of # culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png) # -# We will start by loading this subset of the dataset. +# We start by loading this subset of the dataset. # %% import pandas as pd @@ -73,11 +73,11 @@ # # In a regression setting, the target is a continuous variable instead of # categories. Here, we use two features of the dataset to make such a problem: -# the flipper length will be used as data and the body mass will be the target. -# In short, we want to predict the body mass using the flipper length. +# the flipper length is used as data and the body mass as the target. In short, +# we want to predict the body mass using the flipper length. # -# We will load the dataset and visualize the relationship between the flipper -# length and the body mass of penguins. +# We load the dataset and visualize the relationship between the flipper length +# and the body mass of penguins. # %% penguins = pd.read_csv("../datasets/penguins_regression.csv") diff --git a/python_scripts/trees_ex_02.py b/python_scripts/trees_ex_02.py index 6c7d3b1b1..f53fb7566 100644 --- a/python_scripts/trees_ex_02.py +++ b/python_scripts/trees_ex_02.py @@ -20,7 +20,7 @@ # By extrapolation, we refer to values predicted by a model outside of the range # of feature values seen during the training. # -# We will first load the regression data. +# We first load the regression data. # %% import pandas as pd @@ -61,10 +61,10 @@ # Write your code here. # %% [markdown] -# Now, we will check the extrapolation capabilities of each model. Create a -# dataset containing a broader range of values than your previous dataset, in -# other words, add values below and above the minimum and the maximum of the -# flipper length seen during training. +# Now, we check the extrapolation capabilities of each model. Create a dataset +# containing a broader range of values than your previous dataset, in other +# words, add values below and above the minimum and the maximum of the flipper +# length seen during training. # %% # Write your code here. 
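The exercise above asks for a wider test range. A minimal sketch, assuming the
`data_train` dataframe and the `feature_name` variable defined at the start of
this exercise, and an arbitrary margin of 30 mm below and above the observed
flipper lengths:

```python
import numpy as np
import pandas as pd

offset = 30  # arbitrary margin around the training range
data_test_wide = pd.DataFrame(
    np.arange(
        data_train[feature_name].min() - offset,
        data_train[feature_name].max() + offset,
    ),
    columns=[feature_name],
)
```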
diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py index dc8594968..218e9b38b 100644 --- a/python_scripts/trees_hyperparameters.py +++ b/python_scripts/trees_hyperparameters.py @@ -8,11 +8,11 @@ # %% [markdown] # # Importance of decision tree hyperparameters on generalization # -# In this notebook, we will illustrate the importance of some key -# hyperparameters on the decision tree; we will demonstrate their effects on the -# classification and regression problems we saw previously. +# In this notebook, we illustrate the importance of some key hyperparameters on +# the decision tree; we demonstrate their effects on the classification and +# regression problems we saw previously. # -# First, we will load the classification and regression datasets. +# First, we load the classification and regression datasets. # %% import pandas as pd @@ -35,7 +35,7 @@ # %% [markdown] # ## Create helper functions # -# We will create some helper functions to plot the data samples as well as the +# We create some helper functions to plot the data samples as well as the # decision boundary for classification and the regression line for regression. # %% @@ -135,10 +135,10 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # For both classification and regression setting, we observe that increasing the -# depth will make the tree model more expressive. However, a tree that is too -# deep will overfit the training data, creating partitions which are only -# correct for "outliers" (noisy samples). The `max_depth` is one of the -# hyperparameters that one should optimize via cross-validation and grid-search. +# depth makes the tree model more expressive. However, a tree that is too deep +# may overfit the training data, creating partitions which are only correct for +# "outliers" (noisy samples). The `max_depth` is one of the hyperparameters that +# one should optimize via cross-validation and grid-search. # %% from sklearn.model_selection import GridSearchCV @@ -172,15 +172,15 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # # The `max_depth` hyperparameter controls the overall complexity of the tree. # This parameter is adequate under the assumption that a tree is built -# symmetrically. However, there is no guarantee that a tree will be symmetrical. +# symmetrically. However, there is no reason why a tree should be symmetrical. # Indeed, optimal generalization performance could be reached by growing some of # the branches deeper than some others. # -# We will build a dataset where we will illustrate this asymmetry. We will -# generate a dataset composed of 2 subsets: one subset where a clear separation -# should be found by the tree and another subset where samples from both classes -# will be mixed. It implies that a decision tree will need more splits to -# classify properly samples from the second subset than from the first subset. +# We build a dataset where we illustrate this asymmetry. We generate a dataset +# composed of 2 subsets: one subset where a clear separation should be found by +# the tree and another subset where samples from both classes are mixed. It +# implies that a decision tree needs more splits to classify properly samples +# from the second subset than from the first subset. 
# %% from sklearn.datasets import make_blobs @@ -188,11 +188,11 @@ def fit_and_plot_regression(model, data, feature_names, target_names): data_clf_columns = ["Feature #0", "Feature #1"] target_clf_column = "Class" -# Blobs that will be interlaced +# Blobs that are interlaced X_1, y_1 = make_blobs( n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0 ) -# Blobs that will be easily separated +# Blobs that can be easily separated X_2, y_2 = make_blobs(n_samples=300, centers=[[3, 6], [7, 0]], random_state=0) X = np.concatenate([X_1, X_2], axis=0) @@ -214,9 +214,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names): _ = plt.title("Synthetic dataset") # %% [markdown] -# We will first train a shallow decision tree with `max_depth=2`. We would -# expect this depth to be enough to separate the blobs that are easy to -# separate. +# We first train a shallow decision tree with `max_depth=2`. We would expect +# this depth to be enough to separate the blobs that are easy to separate. # %% max_depth = 2 @@ -228,7 +227,7 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # As expected, we see that the blue blob in the lower right and the red blob on -# the top are easily separated. However, more splits will be required to better +# the top are easily separated. However, more splits are required to better # split the blob were both blue and red data points are mixed. # %% @@ -239,7 +238,7 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # We see that the right branch achieves perfect classification. Now, we increase -# the depth to check how the tree will grow. +# the depth to check how the tree grows. # %% max_depth = 6 @@ -260,8 +259,8 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # beneficial that a branch continue growing. # # The hyperparameters `min_samples_leaf`, `min_samples_split`, `max_leaf_nodes`, -# or `min_impurity_decrease` allows growing asymmetric trees and apply a -# constraint at the leaves or nodes level. We will check the effect of +# or `min_impurity_decrease` allow growing asymmetric trees and apply a +# constraint at the leaves or nodes level. We check the effect of # `min_samples_leaf`. # %% @@ -280,5 +279,5 @@ def fit_and_plot_regression(model, data, feature_names, target_names): # %% [markdown] # This hyperparameter allows to have leaves with a minimum number of samples and -# no further splits will be searched otherwise. Therefore, these hyperparameters +# no further splits are searched otherwise. Therefore, these hyperparameters # could be an alternative to fix the `max_depth` hyperparameter. diff --git a/python_scripts/trees_regression.py b/python_scripts/trees_regression.py index 8431c025c..f354bee6f 100644 --- a/python_scripts/trees_regression.py +++ b/python_scripts/trees_regression.py @@ -31,9 +31,9 @@ data_train, target_train = penguins[[feature_name]], penguins[target_name] # %% [markdown] -# To illustrate how decision trees are predicting in a regression setting, we -# will create a synthetic dataset containing all possible flipper length from -# the minimum to the maximum of the original data. +# To illustrate how decision trees predict in a regression setting, we create a +# synthetic dataset containing some of the possible flipper length values +# between the minimum and the maximum of the original data. 
# %% import numpy as np @@ -53,9 +53,9 @@ # some intuitive understanding on the shape of the decision function of the # learned decision trees. # -# However computing an evaluation metric on such a synthetic test set would be +# However, computing an evaluation metric on such a synthetic test set would be # meaningless since the synthetic dataset does not follow the same distribution -# as the real world data on which the model will be deployed. +# as the real world data on which the model would be deployed. # %% import matplotlib.pyplot as plt @@ -67,7 +67,7 @@ _ = plt.title("Illustration of the regression dataset used") # %% [markdown] -# We will first illustrate the difference between a linear model and a decision +# We first illustrate the difference between a linear model and a decision # tree. # %% @@ -112,9 +112,8 @@ # %% [markdown] # Contrary to linear models, decision trees are non-parametric models: they do -# not make assumptions about the way data is distributed. This will affect the -# prediction scheme. Repeating the above experiment will highlight the -# differences. +# not make assumptions about the way data is distributed. This affects the +# prediction scheme. Repeating the above experiment highlights the differences. # %% from sklearn.tree import DecisionTreeRegressor @@ -170,8 +169,8 @@ _ = plt.title("Prediction function using a DecisionTreeRegressor") # %% [markdown] -# Increasing the depth of the tree will increase the number of partition and -# thus the number of constant values that the tree is capable of predicting. +# Increasing the depth of the tree increases the number of partitions and thus +# the number of constant values that the tree is capable of predicting. # # In this notebook, we highlighted the differences in behavior of a decision # tree used in a classification problem in contrast to a regression problem. diff --git a/python_scripts/trees_sol_02.py b/python_scripts/trees_sol_02.py index cc7d5dbce..2235ddaf1 100644 --- a/python_scripts/trees_sol_02.py +++ b/python_scripts/trees_sol_02.py @@ -14,7 +14,7 @@ # By extrapolation, we refer to values predicted by a model outside of the range # of feature values seen during the training. # -# We will first load the regression data. +# We first load the regression data. # %% import pandas as pd @@ -92,10 +92,10 @@ # interpolate. # %% [markdown] -# Now, we will check the extrapolation capabilities of each model. Create a -# dataset containing a broader range of values than your previous dataset, in -# other words, add values below and above the minimum and the maximum of the -# flipper length seen during training. +# Now, we check the extrapolation capabilities of each model. Create a dataset +# containing a broader range of values than your previous dataset, in other +# words, add values below and above the minimum and the maximum of the flipper +# length seen during training. # %% # solution @@ -131,9 +131,9 @@ _ = plt.title("Prediction of linear model and a decision tree") # %% [markdown] tags=["solution"] -# The linear model will extrapolate using the fitted model for flipper lengths < -# 175 mm and > 235 mm. In fact, we are using the model parametrization to make -# this predictions. +# The linear model extrapolates using the fitted model for flipper lengths < 175 +# mm and > 235 mm. In fact, we are using the model parametrization to make these +# predictions. # # As mentioned, decision trees are non-parametric models and we observe that # they cannot extrapolate. 
For flipper lengths below the minimum, the mass of From d3a4ee7d97bc5986eb097d8d5a4e3ca0e9464101 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 7 Nov 2023 16:18:26 +0100 Subject: [PATCH 087/108] Generate notebooks (#750) Co-authored-by: ArturoAmorQ --- .../linear_models/linear_models_quiz_m4_03.md | 12 +++++ notebooks/01_tabular_data_exploration.ipynb | 34 ++++++------- .../01_tabular_data_exploration_ex_01.ipynb | 2 +- .../01_tabular_data_exploration_sol_01.ipynb | 2 +- ..._numerical_pipeline_cross_validation.ipynb | 24 ++++----- notebooks/02_numerical_pipeline_ex_00.ipynb | 7 +-- notebooks/02_numerical_pipeline_ex_01.ipynb | 8 +-- .../02_numerical_pipeline_hands_on.ipynb | 23 +++++---- notebooks/02_numerical_pipeline_scaling.ipynb | 31 ++++++------ notebooks/02_numerical_pipeline_sol_00.ipynb | 7 +-- notebooks/02_numerical_pipeline_sol_01.ipynb | 14 +++--- notebooks/03_categorical_pipeline.ipynb | 46 +++++++++-------- ...egorical_pipeline_column_transformer.ipynb | 40 +++++++-------- notebooks/03_categorical_pipeline_ex_01.ipynb | 15 +++--- .../03_categorical_pipeline_sol_01.ipynb | 17 +++---- .../03_categorical_pipeline_sol_02.ipynb | 2 +- ...3_categorical_pipeline_visualization.ipynb | 4 +- notebooks/parameter_tuning_ex_02.ipynb | 8 +-- notebooks/parameter_tuning_ex_03.ipynb | 2 +- notebooks/parameter_tuning_grid_search.ipynb | 38 +++++++------- notebooks/parameter_tuning_nested.ipynb | 12 ++--- .../parameter_tuning_parallel_plot.ipynb | 4 +- notebooks/parameter_tuning_sol_02.ipynb | 8 +-- notebooks/parameter_tuning_sol_03.ipynb | 10 ++-- notebooks/trees_dataset.ipynb | 16 +++--- notebooks/trees_ex_02.ipynb | 10 ++-- notebooks/trees_hyperparameters.ipynb | 49 +++++++++---------- notebooks/trees_regression.ipynb | 21 ++++---- notebooks/trees_sol_02.ipynb | 16 +++--- python_scripts/01_tabular_data_exploration.py | 34 ++++++------- .../01_tabular_data_exploration_ex_01.py | 4 +- .../01_tabular_data_exploration_sol_01.py | 2 +- .../02_numerical_pipeline_cross_validation.py | 24 ++++----- python_scripts/02_numerical_pipeline_ex_00.py | 9 ++-- python_scripts/02_numerical_pipeline_ex_01.py | 10 ++-- .../02_numerical_pipeline_hands_on.py | 23 +++++---- .../02_numerical_pipeline_scaling.py | 31 ++++++------ .../02_numerical_pipeline_sol_00.py | 7 +-- .../02_numerical_pipeline_sol_01.py | 14 +++--- python_scripts/03_categorical_pipeline.py | 48 +++++++++--------- ...categorical_pipeline_column_transformer.py | 40 +++++++-------- .../03_categorical_pipeline_ex_01.py | 17 +++---- .../03_categorical_pipeline_ex_02.py | 2 +- .../03_categorical_pipeline_sol_01.py | 17 +++---- .../03_categorical_pipeline_sol_02.py | 2 +- .../03_categorical_pipeline_visualization.py | 4 +- 46 files changed, 386 insertions(+), 384 deletions(-) diff --git a/jupyter-book/linear_models/linear_models_quiz_m4_03.md b/jupyter-book/linear_models/linear_models_quiz_m4_03.md index 672f04e58..1a6cb1b1e 100644 --- a/jupyter-book/linear_models/linear_models_quiz_m4_03.md +++ b/jupyter-book/linear_models/linear_models_quiz_m4_03.md @@ -79,6 +79,18 @@ _Select all answers that apply_ +++ +```{admonition} Question +By default, a [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) in scikit-learn applies: + +- a) no penalty +- b) a penalty that shrinks the magnitude of the weights towards zero (also called "l2 penalty") +- c) a penalty that ensures all weights are equal + +_Select a single answer_ +``` + ++++ + 
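As a quick check alongside this new quiz question, scikit-learn's default can be read off the estimator itself (the `"l2"` default is long-standing, though worth confirming against the installed version):

```python
from sklearn.linear_model import LogisticRegression

# Prints "l2": by default the weights are shrunk towards zero.
print(LogisticRegression().get_params()["penalty"])
```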
```{admonition} Question The parameter `C` in a logistic regression is: diff --git a/notebooks/01_tabular_data_exploration.ipynb b/notebooks/01_tabular_data_exploration.ipynb index 2f47aadde..6e11251e6 100644 --- a/notebooks/01_tabular_data_exploration.ipynb +++ b/notebooks/01_tabular_data_exploration.ipynb @@ -6,8 +6,8 @@ "source": [ "# First look at our dataset\n", "\n", - "In this notebook, we will look at the necessary steps required before any\n", - " machine learning takes place. It involves:\n", + "In this notebook, we look at the necessary steps required before any machine\n", + " learning takes place. It involves:\n", "\n", "* loading the data;\n", "* looking at the variables in the dataset, in particular, differentiate\n", @@ -23,14 +23,14 @@ "source": [ "## Loading the adult census dataset\n", "\n", - "We will use data from the 1994 US census that we downloaded from\n", + "We use data from the 1994 US census that we downloaded from\n", "[OpenML](http://openml.org/).\n", "\n", "You can look at the OpenML webpage to learn more about this dataset:\n", "\n", "\n", - "The dataset is available as a CSV (Comma-Separated Values) file and we will\n", - "use `pandas` to read it.\n", + "The dataset is available as a CSV (Comma-Separated Values) file and we use\n", + "`pandas` to read it.\n", "\n", "
\n", "

Note

\n", @@ -105,9 +105,9 @@ "The column named **class** is our target variable (i.e., the variable which we\n", "want to predict). The two possible classes are `<=50K` (low-revenue) and\n", "`>50K` (high-revenue). The resulting prediction problem is therefore a binary\n", - "classification problem as `class` has only two possible values. We will use\n", - "the left-over columns (any column other than `class`) as input variables for\n", - "our model." + "classification problem as `class` has only two possible values. We use the\n", + "left-over columns (any column other than `class`) as input variables for our\n", + "model." ] }, { @@ -131,7 +131,7 @@ "with \" <=50K\" than with \" >50K\". Class imbalance happens often in practice\n", "and may need special techniques when building a predictive model.

\n", "

For example in a medical setting, if we are trying to predict whether subjects\n", - "will develop a rare disease, there will be a lot more healthy subjects than\n", + "may develop a rare disease, there would be a lot more healthy subjects than\n", "ill subjects in the dataset.

\n", "
" ] @@ -389,8 +389,8 @@ "source": [ "import seaborn as sns\n", "\n", - "# We will plot a subset of the data to keep the plot readable and make the\n", - "# plotting faster\n", + "# We plot a subset of the data to keep the plot readable and make the plotting\n", + "# faster\n", "n_samples_to_plot = 5000\n", "columns = [\"age\", \"education-num\", \"hours-per-week\"]\n", "_ = sns.pairplot(\n", @@ -486,12 +486,12 @@ " a mix of blue points and orange points. It seems complicated to choose which\n", " class we should predict in this region.\n", "\n", - "It is interesting to note that some machine learning models will work\n", - "similarly to what we did: they are known as decision tree models. The two\n", - "thresholds that we chose (27 years and 40 hours) are somewhat arbitrary, i.e.\n", - "we chose them by only looking at the pairplot. In contrast, a decision tree\n", - "will choose the \"best\" splits based on data without human intervention or\n", - "inspection. Decision trees will be covered more in detail in a future module.\n", + "It is interesting to note that some machine learning models work similarly to\n", + "what we did: they are known as decision tree models. The two thresholds that\n", + "we chose (27 years and 40 hours) are somewhat arbitrary, i.e. we chose them by\n", + "only looking at the pairplot. In contrast, a decision tree chooses the \"best\"\n", + "splits based on data without human intervention or inspection. Decision trees\n", + "will be covered more in detail in a future module.\n", "\n", "Note that machine learning is often used when creating rules by hand is not\n", "straightforward. For example because we are in high dimension (many features\n", diff --git a/notebooks/01_tabular_data_exploration_ex_01.ipynb b/notebooks/01_tabular_data_exploration_ex_01.ipynb index 040c50c82..373db2d55 100644 --- a/notebooks/01_tabular_data_exploration_ex_01.ipynb +++ b/notebooks/01_tabular_data_exploration_ex_01.ipynb @@ -109,7 +109,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Looking at these distributions, how hard do you think it will be to classify\n", + "Looking at these distributions, how hard do you think it would be to classify\n", "the penguins only using `\"culmen depth\"` and `\"culmen length\"`?" ] } diff --git a/notebooks/01_tabular_data_exploration_sol_01.ipynb b/notebooks/01_tabular_data_exploration_sol_01.ipynb index 3cd2ae2c0..d8bd25e63 100644 --- a/notebooks/01_tabular_data_exploration_sol_01.ipynb +++ b/notebooks/01_tabular_data_exploration_sol_01.ipynb @@ -168,7 +168,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Looking at these distributions, how hard do you think it will be to classify\n", + "Looking at these distributions, how hard do you think it would be to classify\n", "the penguins only using `\"culmen depth\"` and `\"culmen length\"`?" 
] }, diff --git a/notebooks/02_numerical_pipeline_cross_validation.ipynb b/notebooks/02_numerical_pipeline_cross_validation.ipynb index c7422f698..82b8ac2eb 100644 --- a/notebooks/02_numerical_pipeline_cross_validation.ipynb +++ b/notebooks/02_numerical_pipeline_cross_validation.ipynb @@ -6,9 +6,9 @@ "source": [ "# Model evaluation using cross-validation\n", "\n", - "In this notebook, we will still use only numerical features.\n", + "In this notebook, we still use numerical features only.\n", "\n", - "We will discuss the practical aspects of assessing the generalization\n", + "Here we discuss the practical aspects of assessing the generalization\n", "performance of our model via **cross-validation** instead of a single\n", "train-test split.\n", "\n", @@ -32,8 +32,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will now drop the target from the data we will use to train our\n", - "predictive model." + "We now drop the target from the data we will use to train our predictive\n", + "model." ] }, { @@ -94,11 +94,11 @@ "## The need for cross-validation\n", "\n", "In the previous notebook, we split the original data into a training set and a\n", - "testing set. The score of a model will in general depend on the way we make\n", - "such a split. One downside of doing a single split is that it does not give\n", - "any information about this variability. Another downside, in a setting where\n", - "the amount of data is small, is that the data available for training and\n", - "testing will be even smaller after splitting.\n", + "testing set. The score of a model in general depends on the way we make such a\n", + "split. One downside of doing a single split is that it does not give any\n", + "information about this variability. Another downside, in a setting where the\n", + "amount of data is small, is that the data available for training and testing\n", + "would be even smaller after splitting.\n", "\n", "Instead, we can use cross-validation. Cross-validation consists of repeating\n", "the procedure such that the training and testing sets are different each time.\n", @@ -107,8 +107,8 @@ "model's generalization performance.\n", "\n", "Note that there exists several cross-validation strategies, each of them\n", - "defines how to repeat the `fit`/`score` procedure. In this section, we will\n", - "use the K-fold strategy: the entire dataset is split into `K` partitions. The\n", + "defines how to repeat the `fit`/`score` procedure. In this section, we use the\n", + "K-fold strategy: the entire dataset is split into `K` partitions. The\n", "`fit`/`score` procedure is repeated `K` times where at each iteration `K - 1`\n", "partitions are used to fit the model and `1` partition is used to score. The\n", "figure below illustrates this K-fold strategy.\n", @@ -178,7 +178,7 @@ "[`sklearn.model_selection.cross_validate`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)\n", "to collect additional information, such as the training scores of the models\n", "obtained on each round or even return the models themselves instead of\n", - "discarding them. These features will be covered in a future notebook.\n", + "discarding them. 
These features will be covered in a future notebook.\n", "\n", "Let's extract the scores computed on the test fold of each cross-validation\n", "round from the `cv_result` dictionary and compute the mean accuracy and the\n", diff --git a/notebooks/02_numerical_pipeline_ex_00.ipynb b/notebooks/02_numerical_pipeline_ex_00.ipynb index ef7d6b923..4c09e2233 100644 --- a/notebooks/02_numerical_pipeline_ex_00.ipynb +++ b/notebooks/02_numerical_pipeline_ex_00.ipynb @@ -44,11 +44,12 @@ "number of neighbors we are going to use to make a prediction for a new data\n", "point.\n", "\n", - "What is the default value of the `n_neighbors` parameter? Hint: Look at the\n", - "documentation on the [scikit-learn\n", + "What is the default value of the `n_neighbors` parameter?\n", + "\n", + "**Hint**: Look at the documentation on the [scikit-learn\n", "website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)\n", "or directly access the description inside your notebook by running the\n", - "following cell. This will open a pager pointing to the documentation." + "following cell. This opens a pager pointing to the documentation." ] }, { diff --git a/notebooks/02_numerical_pipeline_ex_01.ipynb b/notebooks/02_numerical_pipeline_ex_01.ipynb index 688f435e6..08c008f6b 100644 --- a/notebooks/02_numerical_pipeline_ex_01.ipynb +++ b/notebooks/02_numerical_pipeline_ex_01.ipynb @@ -37,8 +37,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will first split our dataset to have the target separated from the data\n", - "used to train our predictive model." + "We first split our dataset to have the target separated from the data used to\n", + "train our predictive model." ] }, { @@ -93,8 +93,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use a `DummyClassifier` such that the resulting classifier will always predict\n", - "the class `' >50K'`. What is the accuracy score on the test set? Repeat the\n", + "Use a `DummyClassifier` such that the resulting classifier always predict the\n", + "class `' >50K'`. What is the accuracy score on the test set? Repeat the\n", "experiment by always predicting the class `' <=50K'`.\n", "\n", "Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve\n", diff --git a/notebooks/02_numerical_pipeline_hands_on.ipynb b/notebooks/02_numerical_pipeline_hands_on.ipynb index 326bc0aad..fff46e8cc 100644 --- a/notebooks/02_numerical_pipeline_hands_on.ipynb +++ b/notebooks/02_numerical_pipeline_hands_on.ipynb @@ -19,8 +19,7 @@ "* using a scikit-learn helper to separate data into train-test sets;\n", "* training and evaluating a more complex scikit-learn model.\n", "\n", - "We will start by loading the adult census dataset used during the data\n", - "exploration.\n", + "We start by loading the adult census dataset used during the data exploration.\n", "\n", "## Loading the entire dataset\n", "\n", @@ -105,13 +104,13 @@ "numerical data usually requires very little work before getting started with\n", "training.\n", "\n", - "The first task here will be to identify numerical data in our dataset.\n", + "The first task here is to identify numerical data in our dataset.\n", "\n", "
\n", "

Caution!

\n", - "

Numerical data are represented with numbers, but numbers are not always\n", - "representing numerical data. Categories could already be encoded with\n", - "numbers and you will need to identify these features.

\n", + "

Numerical data are represented with numbers, but numbers do not always\n", + "represent numerical data. Categories could already be encoded with\n", + "numbers and you may need to identify these features.

\n", "
\n", "\n", "Thus, we can check the data type for each of the column in the dataset." @@ -209,7 +208,7 @@ "source": [ "We can see the age varies between 17 and 90 years.\n", "\n", - "We could extend our analysis and we will find that `\"capital-gain\"`,\n", + "We could extend our analysis and we would find that `\"capital-gain\"`,\n", "`\"capital-loss\"`, and `\"hours-per-week\"` are also representing quantitative\n", "data.\n", "\n", @@ -273,7 +272,7 @@ "source": [ "When calling the function `train_test_split`, we specified that we would like\n", "to have 25% of samples in the testing set while the remaining samples (75%)\n", - "will be available in the training set. We can check quickly if we got what we\n", + "are assigned to the training set. We can check quickly if we got what we\n", "expected." ] }, @@ -309,8 +308,8 @@ "source": [ "In the previous notebook, we used a k-nearest neighbors model. While this\n", "model is intuitive to understand, it is not widely used in practice. Now, we\n", - "will use a more useful model, called a logistic regression, which belongs to\n", - "the linear models family.\n", + "use a more useful model, called a logistic regression, which belongs to the\n", + "linear models family.\n", "\n", "
\n", "

Note

\n", @@ -321,8 +320,8 @@ "
  • if 0.1 * age + 3.3 * hours-per-week - 15.1 > 0, predict high-income
  • \n", "
  • otherwise predict low-income
  • \n", "\n", - "

    Linear models, and in particular the logistic regression, will be covered in\n", - "more details in the \"Linear models\" module later in this course. For now the\n", + "

Linear models, and in particular the logistic regression, will be covered\n", "in more detail in the \"Linear models\" module later in this course. For now the\n", "focus is to use this logistic regression model in scikit-learn rather than\n", "understand how it works in detail.

    \n", "
    \n", diff --git a/notebooks/02_numerical_pipeline_scaling.ipynb b/notebooks/02_numerical_pipeline_scaling.ipynb index c7bd8d751..4fe003f24 100644 --- a/notebooks/02_numerical_pipeline_scaling.ipynb +++ b/notebooks/02_numerical_pipeline_scaling.ipynb @@ -6,9 +6,9 @@ "source": [ "# Preprocessing for numerical features\n", "\n", - "In this notebook, we will still use only numerical features.\n", + "In this notebook, we still use numerical features only.\n", "\n", - "We will introduce these new aspects:\n", + "Here we introduce these new aspects:\n", "\n", "* an example of preprocessing, namely **scaling numerical variables**;\n", "* using a scikit-learn **pipeline** to chain preprocessing and model training.\n", @@ -33,8 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will now drop the target from the data we will use to train our predictive\n", - "model." + "We now drop the target from the data we use to train our predictive model." ] }, { @@ -115,7 +114,7 @@ "source": [ "We see that the dataset's features span across different ranges. Some\n", "algorithms make some assumptions regarding the feature distributions and\n", - "usually normalizing features will be helpful to address these assumptions.\n", + "normalizing features is usually helpful to address such assumptions.\n", "\n", "
    \n", "

    Tip

    \n", @@ -133,13 +132,13 @@ "Whether or not a machine learning model requires scaling the features depends\n", "on the model family. Linear models such as logistic regression generally\n", "benefit from scaling the features while other models such as decision trees do\n", - "not need such preprocessing (but will not suffer from it).\n", + "not need such preprocessing (but would not suffer from it).\n", "\n", "We show how to apply such normalization using a scikit-learn transformer\n", "called `StandardScaler`. This transformer shifts and scales each feature\n", "individually so that they all have a 0-mean and a unit standard deviation.\n", "\n", - "We will investigate different steps used in scikit-learn to achieve such a\n", + "We now investigate different steps used in scikit-learn to achieve such a\n", "transformation of the data.\n", "\n", "First, one needs to call the method `fit` in order to learn the scaling from\n", @@ -175,10 +174,10 @@ "\n", "
    \n", "

    Note

    \n", - "

    The fact that the model states of this scaler are arrays of means and\n", - "standard deviations is specific to the StandardScaler. Other\n", - "scikit-learn transformers will compute different statistics and store them\n", - "as model states, in the same fashion.

    \n", + "

    The fact that the model states of this scaler are arrays of means and standard\n", + "deviations is specific to the StandardScaler. Other scikit-learn\n", + "transformers may compute different statistics and store them as model states,\n", + "in a similar fashion.

    \n", "
    \n", "\n", "We can inspect the computed means and standard deviations." @@ -353,7 +352,7 @@ "source": [ "We can easily combine sequential operations with a scikit-learn `Pipeline`,\n", "which chains together operations and is used as any other classifier or\n", - "regressor. The helper function `make_pipeline` will create a `Pipeline`: it\n", + "regressor. The helper function `make_pipeline` creates a `Pipeline`: it\n", "takes as arguments the successive transformations to perform, followed by the\n", "classifier or regressor model." ] @@ -378,8 +377,8 @@ "source": [ "The `make_pipeline` function did not require us to give a name to each step.\n", "Indeed, it was automatically assigned based on the name of the classes\n", - "provided; a `StandardScaler` will be a step named `\"standardscaler\"` in the\n", - "resulting pipeline. We can check the name of each steps of our model:" + "provided; a `StandardScaler` step is named `\"standardscaler\"` in the resulting\n", + "pipeline. We can check the name of each steps of our model:" ] }, { @@ -421,7 +420,7 @@ "![pipeline fit diagram](../figures/api_diagram-pipeline.fit.svg)\n", "\n", "When calling `model.fit`, the method `fit_transform` from each underlying\n", - "transformer (here a single transformer) in the pipeline will be called to:\n", + "transformer (here a single transformer) in the pipeline is called to:\n", "\n", "- learn their internal model states\n", "- transform the training data. Finally, the preprocessed data are provided to\n", @@ -452,7 +451,7 @@ "called to preprocess the data. Note that there is no need to call the `fit`\n", "method for these transformers because we are using the internal model states\n", "computed when calling `model.fit`. The preprocessed data is then provided to\n", - "the predictor that will output the predicted target by calling its method\n", + "the predictor that outputs the predicted target by calling its method\n", "`predict`.\n", "\n", "As a shorthand, we can check the score of the full predictive pipeline calling\n", diff --git a/notebooks/02_numerical_pipeline_sol_00.ipynb b/notebooks/02_numerical_pipeline_sol_00.ipynb index ff144d5c0..e5be6f7e2 100644 --- a/notebooks/02_numerical_pipeline_sol_00.ipynb +++ b/notebooks/02_numerical_pipeline_sol_00.ipynb @@ -44,11 +44,12 @@ "number of neighbors we are going to use to make a prediction for a new data\n", "point.\n", "\n", - "What is the default value of the `n_neighbors` parameter? Hint: Look at the\n", - "documentation on the [scikit-learn\n", + "What is the default value of the `n_neighbors` parameter?\n", + "\n", + "**Hint**: Look at the documentation on the [scikit-learn\n", "website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)\n", "or directly access the description inside your notebook by running the\n", - "following cell. This will open a pager pointing to the documentation." + "following cell. This opens a pager pointing to the documentation." ] }, { diff --git a/notebooks/02_numerical_pipeline_sol_01.ipynb b/notebooks/02_numerical_pipeline_sol_01.ipynb index 2198c76b8..352cf234f 100644 --- a/notebooks/02_numerical_pipeline_sol_01.ipynb +++ b/notebooks/02_numerical_pipeline_sol_01.ipynb @@ -37,8 +37,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will first split our dataset to have the target separated from the data\n", - "used to train our predictive model." 
+ "We first split our dataset to have the target separated from the data used to\n", + "train our predictive model." ] }, { @@ -96,8 +96,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use a `DummyClassifier` such that the resulting classifier will always predict\n", - "the class `' >50K'`. What is the accuracy score on the test set? Repeat the\n", + "Use a `DummyClassifier` such that the resulting classifier always predict the\n", + "class `' >50K'`. What is the accuracy score on the test set? Repeat the\n", "experiment by always predicting the class `' <=50K'`.\n", "\n", "Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve\n", @@ -131,8 +131,8 @@ }, "source": [ "We clearly see that the score is below 0.5 which might be surprising at first.\n", - "We will now check the generalization performance of a model which always\n", - "predict the low revenue class, i.e. `\" <=50K\"`." + "We now check the generalization performance of a model which always predict\n", + "the low revenue class, i.e. `\" <=50K\"`." ] }, { @@ -175,7 +175,7 @@ }, "source": [ "Therefore, any predictive model giving results below this dummy classifier\n", - "will not be helpful." + "would not be helpful." ] }, { diff --git a/notebooks/03_categorical_pipeline.ipynb b/notebooks/03_categorical_pipeline.ipynb index 3972842a5..575268c9f 100644 --- a/notebooks/03_categorical_pipeline.ipynb +++ b/notebooks/03_categorical_pipeline.ipynb @@ -6,9 +6,9 @@ "source": [ "# Encoding of categorical variables\n", "\n", - "In this notebook, we will present typical ways of dealing with\n", - "**categorical variables** by encoding them, namely **ordinal encoding** and\n", - "**one-hot encoding**." + "In this notebook, we present some typical ways of dealing with **categorical\n", + "variables** by encoding them, namely **ordinal encoding** and **one-hot\n", + "encoding**." ] }, { @@ -94,9 +94,9 @@ "## Select features based on their data type\n", "\n", "In the previous notebook, we manually defined the numerical columns. We could\n", - "do a similar approach. Instead, we will use the scikit-learn helper function\n", - "`make_column_selector`, which allows us to select columns based on\n", - "their data type. We will illustrate how to use this helper." + "do a similar approach. Instead, we can use the scikit-learn helper function\n", + "`make_column_selector`, which allows us to select columns based on their data\n", + "type. We now illustrate how to use this helper." ] }, { @@ -159,9 +159,8 @@ "### Encoding ordinal categories\n", "\n", "The most intuitive strategy is to encode each category with a different\n", - "number. The `OrdinalEncoder` will transform the data in such manner.\n", - "We will start by encoding a single column to understand how the encoding\n", - "works." + "number. The `OrdinalEncoder` transforms the data in such manner. We start by\n", + "encoding a single column to understand how the encoding works." ] }, { @@ -258,13 +257,13 @@ "\n", "`OneHotEncoder` is an alternative encoder that prevents the downstream\n", "models to make a false assumption about the ordering of categories. For a\n", - "given feature, it will create as many new columns as there are possible\n", + "given feature, it creates as many new columns as there are possible\n", "categories. 
For a given sample, the value of the column corresponding to the\n", - "category will be set to `1` while all the columns of the other categories\n", - "will be set to `0`.\n", + "category is set to `1` while all the columns of the other categories\n", + "are set to `0`.\n", "\n", - "We will start by encoding a single feature (e.g. `\"education\"`) to illustrate\n", - "how the encoding works." + "We can encode a single feature (e.g. `\"education\"`) to illustrate how the\n", + "encoding works." ] }, { @@ -299,7 +298,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that encoding a single feature will give a dataframe full of zeros\n", + "We see that encoding a single feature gives a dataframe full of zeros\n", "and ones. Each category (unique value) became a column; the encoding\n", "returned, for each sample, a 1 to specify which category it belongs to.\n", "\n", @@ -353,8 +352,8 @@ "source": [ "### Choosing an encoding strategy\n", "\n", - "Choosing an encoding strategy will depend on the underlying models and the\n", - "type of categories (i.e. ordinal vs. nominal)." + "Choosing an encoding strategy depends on the underlying models and the type of\n", + "categories (i.e. ordinal vs. nominal)." ] }, { @@ -373,12 +372,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Using an `OrdinalEncoder` will output ordinal categories. This means\n", + "Using an `OrdinalEncoder` outputs ordinal categories. This means\n", "that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The\n", "impact of violating this ordering assumption is really dependent on the\n", - "downstream models. Linear models will be impacted by misordered categories\n", - "while tree-based models will not.\n", + "downstream models. Linear models would be impacted by misordered categories\n", + "while tree-based models would not.\n", "\n", "You can still use an `OrdinalEncoder` with linear models but you need to be\n", "sure that:\n", @@ -426,7 +424,7 @@ "We see that the `\"Holand-Netherlands\"` category is occurring rarely. This will\n", "be a problem during cross-validation: if the sample ends up in the test set\n", "during splitting then the classifier would not have seen the category during\n", - "training and will not be able to encode it.\n", + "training and would not be able to encode it.\n", "\n", "In scikit-learn, there are some possible solutions to bypass this issue:\n", "\n", @@ -455,8 +453,8 @@ "

    Tip

    \n", "

Be aware that the OrdinalEncoder exposes a parameter also named handle_unknown.\n", "It can be set to use_encoded_value. If that option is chosen, you can define\n", "a fixed value that is assigned to all unknown categories during transform.\n", "For example, OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1) would set to -1 all values encountered during transform\n", "that are not part of the data seen during the fit call. You are\n", "going to use these parameters in the next exercise.

    \n", "
    " diff --git a/notebooks/03_categorical_pipeline_column_transformer.ipynb b/notebooks/03_categorical_pipeline_column_transformer.ipynb index aca827f4c..f9f3d5293 100644 --- a/notebooks/03_categorical_pipeline_column_transformer.ipynb +++ b/notebooks/03_categorical_pipeline_column_transformer.ipynb @@ -6,12 +6,12 @@ "source": [ "# Using numerical and categorical variables together\n", "\n", - "In the previous notebooks, we showed the required preprocessing to apply\n", - "when dealing with numerical and categorical variables. However, we decoupled\n", - "the process to treat each type individually. In this notebook, we will show\n", - "how to combine these preprocessing steps.\n", + "In the previous notebooks, we showed the required preprocessing to apply when\n", + "dealing with numerical and categorical variables. However, we decoupled the\n", + "process to treat each type individually. In this notebook, we show how to\n", + "combine these preprocessing steps.\n", "\n", - "We will first load the entire adult census dataset." + "We first load the entire adult census dataset." ] }, { @@ -38,10 +38,10 @@ "source": [ "## Selection based on data types\n", "\n", - "We will separate categorical and numerical variables using their data\n", - "types to identify them, as we saw previously that `object` corresponds\n", - "to categorical columns (strings). We make use of `make_column_selector`\n", - "helper to select the corresponding columns." + "We separate categorical and numerical variables using their data types to\n", + "identify them, as we saw previously that `object` corresponds to categorical\n", + "columns (strings). We make use of `make_column_selector` helper to select the\n", + "corresponding columns." ] }, { @@ -84,14 +84,14 @@ "In the previous sections, we saw that we need to treat data differently\n", "depending on their nature (i.e. numerical or categorical).\n", "\n", - "Scikit-learn provides a `ColumnTransformer` class which will send specific\n", + "Scikit-learn provides a `ColumnTransformer` class which sends specific\n", "columns to a specific transformer, making it easy to fit a single predictive\n", "model on a dataset that combines both kinds of variables together\n", "(heterogeneously typed tabular data).\n", "\n", "We first define the columns depending on their data type:\n", "\n", - "* **one-hot encoding** will be applied to categorical columns. Besides, we use\n", + "* **one-hot encoding** is applied to categorical columns. Besides, we use\n", " `handle_unknown=\"ignore\"` to solve the potential issues due to rare\n", " categories.\n", "* **numerical scaling** numerical features which will be standardized.\n", @@ -149,11 +149,11 @@ "A `ColumnTransformer` does the following:\n", "\n", "* It **splits the columns** of the original dataset based on the column names\n", - " or indices provided. We will obtain as many subsets as the number of\n", - " transformers passed into the `ColumnTransformer`.\n", + " or indices provided. We obtain as many subsets as the number of transformers\n", + " passed into the `ColumnTransformer`.\n", "* It **transforms each subsets**. A specific transformer is applied to each\n", - " subset: it will internally call `fit_transform` or `transform`. The output\n", - " of this step is a set of transformed datasets.\n", + " subset: it internally calls `fit_transform` or `transform`. 
The output of\n", + " this step is a set of transformed datasets.\n", "* It then **concatenates the transformed datasets** into a single dataset.\n", "\n", "The important thing is that `ColumnTransformer` is like any other scikit-learn\n", @@ -234,7 +234,7 @@ "source": [ "Then, we can send the raw dataset straight to the pipeline. Indeed, we do not\n", "need to make any manual preprocessing (calling the `transform` or\n", - "`fit_transform` methods) as it will be handled when calling the `predict`\n", + "`fit_transform` methods) as it is already handled when calling the `predict`\n", "method. As an example, we predict on the five first samples from the test set." ] }, @@ -337,10 +337,10 @@ "\n", "However, it is often useful to check whether more complex models such as an\n", "ensemble of decision trees can lead to higher predictive performance. In this\n", - "section we will use such a model called **gradient-boosting trees** and\n", - "evaluate its generalization performance. More precisely, the scikit-learn\n", - "model we will use is called `HistGradientBoostingClassifier`. Note that\n", - "boosting models will be covered in more detail in a future module.\n", + "section we use such a model called **gradient-boosting trees** and evaluate\n", + "its generalization performance. More precisely, the scikit-learn model we use\n", + "is called `HistGradientBoostingClassifier`. Note that boosting models will be\n", + "covered in more detail in a future module.\n", "\n", "For tree-based models, the handling of numerical and categorical variables is\n", "simpler than for linear models:\n", diff --git a/notebooks/03_categorical_pipeline_ex_01.ipynb b/notebooks/03_categorical_pipeline_ex_01.ipynb index 1f7ab830e..d77bbef38 100644 --- a/notebooks/03_categorical_pipeline_ex_01.ipynb +++ b/notebooks/03_categorical_pipeline_ex_01.ipynb @@ -47,9 +47,8 @@ "source": [ "In the previous notebook, we used `sklearn.compose.make_column_selector` to\n", "automatically select columns with a specific data type (also called `dtype`).\n", - "Here, we will use this selector to get only the columns containing strings\n", - "(column with `object` dtype) that correspond to categorical features in our\n", - "dataset." + "Here, we use this selector to get only the columns containing strings (column\n", + "with `object` dtype) that correspond to categorical features in our dataset." ] }, { @@ -102,11 +101,11 @@ "
    \n", "

    Note

    \n", "

Be aware that if an error happens during the cross-validation,\n", "cross_validate would raise a warning and return NaN (Not a Number) as scores.\n", "To make it raise a standard Python exception with a traceback, you can pass\n", "the error_score=\"raise\" argument in the call to cross_validate. An\n", "exception would be raised instead of a warning at the first problem encountered\n", "and cross_validate would stop right away instead of returning NaN values.\n", "This is particularly handy when developing complex machine learning pipelines.

    \n", "
    " ] @@ -127,8 +126,8 @@ "metadata": {}, "source": [ "Now, we would like to compare the generalization performance of our previous\n", - "model with a new model where instead of using an `OrdinalEncoder`, we will use\n", - "a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare\n", + "model with a new model where instead of using an `OrdinalEncoder`, we use a\n", + "`OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare\n", "the score of both models and conclude on the impact of choosing a specific\n", "encoding strategy when using a linear model." ] diff --git a/notebooks/03_categorical_pipeline_sol_01.ipynb b/notebooks/03_categorical_pipeline_sol_01.ipynb index 206a36f4c..916e2be5f 100644 --- a/notebooks/03_categorical_pipeline_sol_01.ipynb +++ b/notebooks/03_categorical_pipeline_sol_01.ipynb @@ -47,9 +47,8 @@ "source": [ "In the previous notebook, we used `sklearn.compose.make_column_selector` to\n", "automatically select columns with a specific data type (also called `dtype`).\n", - "Here, we will use this selector to get only the columns containing strings\n", - "(column with `object` dtype) that correspond to categorical features in our\n", - "dataset." + "Here, we use this selector to get only the columns containing strings (column\n", + "with `object` dtype) that correspond to categorical features in our dataset." ] }, { @@ -106,11 +105,11 @@ "
    \n", "

    Note

    \n", "

Be aware that if an error happens during the cross-validation,\n", "cross_validate would raise a warning and return NaN (Not a Number) as scores.\n", "To make it raise a standard Python exception with a traceback, you can pass\n", "the error_score=\"raise\" argument in the call to cross_validate. An\n", "exception would be raised instead of a warning at the first problem encountered\n", "and cross_validate would stop right away instead of returning NaN values.\n", "This is particularly handy when developing complex machine learning pipelines.

    \n", "
    " ] @@ -177,8 +176,8 @@ "metadata": {}, "source": [ "Now, we would like to compare the generalization performance of our previous\n", - "model with a new model where instead of using an `OrdinalEncoder`, we will use\n", - "a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare\n", + "model with a new model where instead of using an `OrdinalEncoder`, we use a\n", + "`OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare\n", "the score of both models and conclude on the impact of choosing a specific\n", "encoding strategy when using a linear model." ] @@ -216,7 +215,7 @@ "\n", "The important message here is: linear model and `OrdinalEncoder` are used\n", "together only for ordinal categorical features, i.e. features that have a\n", - "specific ordering. Otherwise, your model will perform poorly." + "specific ordering. Otherwise, your model would perform poorly." ] } ], diff --git a/notebooks/03_categorical_pipeline_sol_02.ipynb b/notebooks/03_categorical_pipeline_sol_02.ipynb index 725a86cdd..161d0cbdd 100644 --- a/notebooks/03_categorical_pipeline_sol_02.ipynb +++ b/notebooks/03_categorical_pipeline_sol_02.ipynb @@ -287,7 +287,7 @@ "\n", "\n", "
      \n", - "
    • OneHotEncoder: will always do something meaningful, but can be unnecessary\n", + "
• OneHotEncoder: always does something meaningful, but can be unnecessarily\n", "slow with trees.
    • \n", "
    • OrdinalEncoder: can be detrimental for linear models unless your category\n", "has a meaningful order and you make sure that OrdinalEncoder respects this\n", diff --git a/notebooks/03_categorical_pipeline_visualization.ipynb b/notebooks/03_categorical_pipeline_visualization.ipynb index dd16ea0b3..48110a944 100644 --- a/notebooks/03_categorical_pipeline_visualization.ipynb +++ b/notebooks/03_categorical_pipeline_visualization.ipynb @@ -29,8 +29,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We need to define our data and target. In this case we will build a\n", - "classification model" + "We need to define our data and target. In this case we build a classification\n", + "model" ] }, { diff --git a/notebooks/parameter_tuning_ex_02.ipynb b/notebooks/parameter_tuning_ex_02.ipynb index 2aa096d5c..026e37fd8 100644 --- a/notebooks/parameter_tuning_ex_02.ipynb +++ b/notebooks/parameter_tuning_ex_02.ipynb @@ -76,10 +76,10 @@ "source": [ "Use the previously defined model (called `model`) and using two nested `for`\n", "loops, make a search of the best combinations of the `learning_rate` and\n", - "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n", - "the model by setting the parameters. The evaluation of the model should be\n", - "performed using `cross_val_score` on the training set. We will use the\n", - "following parameters search:\n", + "`max_leaf_nodes` parameters. In this regard, you have to train and test the\n", + "model by setting the parameters. The evaluation of the model should be\n", + "performed using `cross_val_score` on the training set. Use the following\n", + "parameters search:\n", "- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls\n", " the ability of a new tree to correct the error of the previous sequence of\n", " trees\n", diff --git a/notebooks/parameter_tuning_ex_03.ipynb b/notebooks/parameter_tuning_ex_03.ipynb index ee40ef916..e26aa4150 100644 --- a/notebooks/parameter_tuning_ex_03.ipynb +++ b/notebooks/parameter_tuning_ex_03.ipynb @@ -31,7 +31,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this exercise, we will progressively define the regression pipeline and\n", + "In this exercise, we progressively define the regression pipeline and\n", "later tune its hyperparameters.\n", "\n", "Start by defining a pipeline that:\n", diff --git a/notebooks/parameter_tuning_grid_search.ipynb b/notebooks/parameter_tuning_grid_search.ipynb index e0912cb54..cdf8117cc 100644 --- a/notebooks/parameter_tuning_grid_search.ipynb +++ b/notebooks/parameter_tuning_grid_search.ipynb @@ -7,7 +7,7 @@ "# Hyperparameter tuning by grid-search\n", "\n", "In the previous notebook, we saw that hyperparameters can affect the\n", - "generalization performance of a model. In this notebook, we will show how to\n", + "generalization performance of a model. In this notebook, we show how to\n", "optimize hyperparameters using a grid-search approach." ] }, @@ -91,8 +91,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will define a pipeline as seen in the first module. It will handle both\n", - "numerical and categorical features.\n", + "We define a pipeline as seen in the first module, to handle both numerical and\n", + "categorical features.\n", "\n", "The first step is to select all the categorical columns." 
] @@ -113,7 +113,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we will use a tree-based model as a classifier (i.e.\n", + "Here we use a tree-based model as a classifier (i.e.\n", "`HistGradientBoostingClassifier`). That means:\n", "\n", "* Numerical variables don't need scaling;\n", @@ -201,8 +201,8 @@ "code.\n", "\n", "Let's see how to use the `GridSearchCV` estimator for doing such search. Since\n", - "the grid-search will be costly, we will only explore the combination\n", - "learning-rate and the maximum number of nodes." + "the grid-search is costly, we only explore the combination learning-rate and\n", + "the maximum number of nodes." ] }, { @@ -226,7 +226,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we will check the accuracy of our model using the test set." + "Finally, we check the accuracy of our model using the test set." ] }, { @@ -261,17 +261,17 @@ "metadata": {}, "source": [ "The `GridSearchCV` estimator takes a `param_grid` parameter which defines all\n", - "hyperparameters and their associated values. The grid-search will be in charge\n", + "hyperparameters and their associated values. The grid-search is in charge\n", "of creating all possible combinations and test them.\n", "\n", - "The number of combinations will be equal to the product of the number of\n", - "values to explore for each parameter (e.g. in our example 4 x 3 combinations).\n", - "Thus, adding new parameters with their associated values to be explored become\n", + "The number of combinations are equal to the product of the number of values to\n", + "explore for each parameter (e.g. in our example 4 x 3 combinations). Thus,\n", + "adding new parameters with their associated values to be explored become\n", "rapidly computationally expensive.\n", "\n", "Once the grid-search is fitted, it can be used as any other predictor by\n", - "calling `predict` and `predict_proba`. Internally, it will use the model with\n", - "the best parameters found during `fit`.\n", + "calling `predict` and `predict_proba`. Internally, it uses the model with the\n", + "best parameters found during `fit`.\n", "\n", "Get predictions for the 5 first samples using the estimator with the best\n", "parameters." @@ -312,8 +312,8 @@ "parameters \"by hand\" through a double for loop.\n", "\n", "In addition, we can inspect all results which are stored in the attribute\n", - "`cv_results_` of the grid-search. We will filter some specific columns from\n", - "these results." + "`cv_results_` of the grid-search. We filter some specific columns from these\n", + "results." ] }, { @@ -371,9 +371,9 @@ "With only 2 parameters, we might want to visualize the grid-search as a\n", "heatmap. We need to transform our `cv_results` into a dataframe where:\n", "\n", - "- the rows will correspond to the learning-rate values;\n", - "- the columns will correspond to the maximum number of leaf;\n", - "- the content of the dataframe will be the mean test scores." + "- the rows correspond to the learning-rate values;\n", + "- the columns correspond to the maximum number of leaf;\n", + "- the content of the dataframe is the mean test scores." 
] }, { @@ -430,7 +430,7 @@ "\n", "The precise meaning of those two parameters will be explained later.\n", "\n", - "For now we will note that, in general, **there is no unique optimal parameter\n", + "For now we note that, in general, **there is no unique optimal parameter\n", "setting**: 4 models out of the 12 parameter configurations reach the maximal\n", "accuracy (up to small random fluctuations caused by the sampling of the\n", "training set)." diff --git a/notebooks/parameter_tuning_nested.ipynb b/notebooks/parameter_tuning_nested.ipynb index efc43173d..f632d16f4 100644 --- a/notebooks/parameter_tuning_nested.ipynb +++ b/notebooks/parameter_tuning_nested.ipynb @@ -10,12 +10,12 @@ "However, we did not present a proper framework to evaluate the tuned models.\n", "Instead, we focused on the mechanism used to find the best set of parameters.\n", "\n", - "In this notebook, we will reuse some knowledge presented in the module\n", - "\"Selecting the best model\" to show how to evaluate models where\n", - "hyperparameters need to be tuned.\n", + "In this notebook, we reuse some knowledge presented in the module \"Selecting\n", + "the best model\" to show how to evaluate models where hyperparameters need to\n", + "be tuned.\n", "\n", - "Thus, we will first load the dataset and create the predictive model that we\n", - "want to optimize and later on, evaluate.\n", + "Thus, we first load the dataset and create the predictive model that we want\n", + "to optimize and later on, evaluate.\n", "\n", "## Loading the dataset\n", "\n", @@ -155,7 +155,7 @@ "### With hyperparameter tuning\n", "\n", "As shown in the previous notebook, one can use a search strategy that uses\n", - "cross-validation to find the best set of parameters. Here, we will use a\n", + "cross-validation to find the best set of parameters. Here, we use a\n", "grid-search strategy and reproduce the steps done in the previous notebook.\n", "\n", "First, we have to embed our model into a grid-search and specify the\n", diff --git a/notebooks/parameter_tuning_parallel_plot.ipynb b/notebooks/parameter_tuning_parallel_plot.ipynb index 6b2cbe200..32f411b35 100644 --- a/notebooks/parameter_tuning_parallel_plot.ipynb +++ b/notebooks/parameter_tuning_parallel_plot.ipynb @@ -158,8 +158,8 @@ "spread the active ranges and improve the readability of the plot.

      \n", "
    \n", "\n", - "The parallel coordinates plot will display the values of the hyperparameters\n", - "on different columns while the performance metric is color coded. Thus, we are\n", + "The parallel coordinates plot displays the values of the hyperparameters on\n", + "different columns while the performance metric is color coded. Thus, we are\n", "able to quickly inspect if there is a range of hyperparameters which is\n", "working or not.\n", "\n", diff --git a/notebooks/parameter_tuning_sol_02.ipynb b/notebooks/parameter_tuning_sol_02.ipynb index 58ef6a501..4035e5717 100644 --- a/notebooks/parameter_tuning_sol_02.ipynb +++ b/notebooks/parameter_tuning_sol_02.ipynb @@ -76,10 +76,10 @@ "source": [ "Use the previously defined model (called `model`) and using two nested `for`\n", "loops, make a search of the best combinations of the `learning_rate` and\n", - "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n", - "the model by setting the parameters. The evaluation of the model should be\n", - "performed using `cross_val_score` on the training set. We will use the\n", - "following parameters search:\n", + "`max_leaf_nodes` parameters. In this regard, you need to train and test the\n", + "model by setting the parameters. The evaluation of the model should be\n", + "performed using `cross_val_score` on the training set. Use the following\n", + "parameters search:\n", "- `learning_rate` for the values 0.01, 0.1, 1 and 10. This parameter controls\n", " the ability of a new tree to correct the error of the previous sequence of\n", " trees\n", diff --git a/notebooks/parameter_tuning_sol_03.ipynb b/notebooks/parameter_tuning_sol_03.ipynb index c7e032fce..c7eb4a778 100644 --- a/notebooks/parameter_tuning_sol_03.ipynb +++ b/notebooks/parameter_tuning_sol_03.ipynb @@ -31,8 +31,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this exercise, we will progressively define the regression pipeline and\n", - "later tune its hyperparameters.\n", + "In this exercise, we progressively define the regression pipeline and later\n", + "tune its hyperparameters.\n", "\n", "Start by defining a pipeline that:\n", "* uses a `StandardScaler` to normalize the numerical data;\n", @@ -158,8 +158,8 @@ ] }, "source": [ - "To simplify the axis of the plot, we will rename the column of the dataframe\n", - "and only select the mean test score and the value of the hyperparameters." + "To simplify the axis of the plot, we rename the column of the dataframe and\n", + "only select the mean test score and the value of the hyperparameters." ] }, { @@ -266,7 +266,7 @@ "vary between 0 and 10,000 (e.g. the variable `\"Population\"`) and B is a\n", "feature that varies between 1 and 10 (e.g. the variable `\"AveRooms\"`), then\n", "distances between samples (rows of the dataframe) are mostly impacted by\n", - "differences in values of the column A, while values of the column B will be\n", + "differences in values of the column A, while values of the column B are\n", "comparatively ignored. 
If one applies StandardScaler to such a database, both\n", "the values of A and B will be approximately between -3 and 3 and the neighbor\n", "structure will be impacted more or less equivalently by both variables.\n", diff --git a/notebooks/trees_dataset.ipynb b/notebooks/trees_dataset.ipynb index 7202c1073..c2509a248 100644 --- a/notebooks/trees_dataset.ipynb +++ b/notebooks/trees_dataset.ipynb @@ -13,7 +13,7 @@ "\n", "## Classification dataset\n", "\n", - "We will use this dataset in classification setting to predict the penguins'\n", + "We use this dataset in classification setting to predict the penguins'\n", "species from anatomical information.\n", "\n", "Each penguin is from one of the three following species: Adelie, Gentoo, and\n", @@ -24,15 +24,15 @@ "penguins](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/lter_penguins.png)\n", "\n", "This problem is a classification problem since the target is categorical. We\n", - "will limit our input data to a subset of the original features to simplify our\n", - "explanations when presenting the decision tree algorithm. Indeed, we will use\n", + "limit our input data to a subset of the original features to simplify our\n", + "explanations when presenting the decision tree algorithm. Indeed, we use\n", "features based on penguins' culmen measurement. You can learn more about the\n", "penguins' culmen with the illustration below:\n", "\n", "![Image of\n", "culmen](https://github.com/allisonhorst/palmerpenguins/raw/master/man/figures/culmen_depth.png)\n", "\n", - "We will start by loading this subset of the dataset." + "We start by loading this subset of the dataset." ] }, { @@ -101,11 +101,11 @@ "\n", "In a regression setting, the target is a continuous variable instead of\n", "categories. Here, we use two features of the dataset to make such a problem:\n", - "the flipper length will be used as data and the body mass will be the target.\n", - "In short, we want to predict the body mass using the flipper length.\n", + "the flipper length is used as data and the body mass as the target. In short,\n", + "we want to predict the body mass using the flipper length.\n", "\n", - "We will load the dataset and visualize the relationship between the flipper\n", - "length and the body mass of penguins." + "We load the dataset and visualize the relationship between the flipper length\n", + "and the body mass of penguins." ] }, { diff --git a/notebooks/trees_ex_02.ipynb b/notebooks/trees_ex_02.ipynb index 3b1c0e141..0d35b25be 100644 --- a/notebooks/trees_ex_02.ipynb +++ b/notebooks/trees_ex_02.ipynb @@ -12,7 +12,7 @@ "By extrapolation, we refer to values predicted by a model outside of the range\n", "of feature values seen during the training.\n", "\n", - "We will first load the regression data." + "We first load the regression data." ] }, { @@ -98,10 +98,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we will check the extrapolation capabilities of each model. Create a\n", - "dataset containing a broader range of values than your previous dataset, in\n", - "other words, add values below and above the minimum and the maximum of the\n", - "flipper length seen during training." + "Now, we check the extrapolation capabilities of each model. Create a dataset\n", + "containing a broader range of values than your previous dataset, in other\n", + "words, add values below and above the minimum and the maximum of the flipper\n", + "length seen during training." 
] }, { diff --git a/notebooks/trees_hyperparameters.ipynb b/notebooks/trees_hyperparameters.ipynb index b9de0ac27..1e271f491 100644 --- a/notebooks/trees_hyperparameters.ipynb +++ b/notebooks/trees_hyperparameters.ipynb @@ -6,11 +6,11 @@ "source": [ "# Importance of decision tree hyperparameters on generalization\n", "\n", - "In this notebook, we will illustrate the importance of some key\n", - "hyperparameters on the decision tree; we will demonstrate their effects on the\n", - "classification and regression problems we saw previously.\n", + "In this notebook, we illustrate the importance of some key hyperparameters on\n", + "the decision tree; we demonstrate their effects on the classification and\n", + "regression problems we saw previously.\n", "\n", - "First, we will load the classification and regression datasets." + "First, we load the classification and regression datasets." ] }, { @@ -54,7 +54,7 @@ "source": [ "## Create helper functions\n", "\n", - "We will create some helper functions to plot the data samples as well as the\n", + "We create some helper functions to plot the data samples as well as the\n", "decision boundary for classification and the regression line for regression." ] }, @@ -207,10 +207,10 @@ "metadata": {}, "source": [ "For both classification and regression setting, we observe that increasing the\n", - "depth will make the tree model more expressive. However, a tree that is too\n", - "deep will overfit the training data, creating partitions which are only\n", - "correct for \"outliers\" (noisy samples). The `max_depth` is one of the\n", - "hyperparameters that one should optimize via cross-validation and grid-search." + "depth makes the tree model more expressive. However, a tree that is too deep\n", + "may overfit the training data, creating partitions which are only correct for\n", + "\"outliers\" (noisy samples). The `max_depth` is one of the hyperparameters that\n", + "one should optimize via cross-validation and grid-search." ] }, { @@ -266,15 +266,15 @@ "\n", "The `max_depth` hyperparameter controls the overall complexity of the tree.\n", "This parameter is adequate under the assumption that a tree is built\n", - "symmetrically. However, there is no guarantee that a tree will be symmetrical.\n", + "symmetrically. However, there is no reason why a tree should be symmetrical.\n", "Indeed, optimal generalization performance could be reached by growing some of\n", "the branches deeper than some others.\n", "\n", - "We will build a dataset where we will illustrate this asymmetry. We will\n", - "generate a dataset composed of 2 subsets: one subset where a clear separation\n", - "should be found by the tree and another subset where samples from both classes\n", - "will be mixed. It implies that a decision tree will need more splits to\n", - "classify properly samples from the second subset than from the first subset." + "We build a dataset where we illustrate this asymmetry. We generate a dataset\n", + "composed of 2 subsets: one subset where a clear separation should be found by\n", + "the tree and another subset where samples from both classes are mixed. It\n", + "implies that a decision tree needs more splits to classify properly samples\n", + "from the second subset than from the first subset." 
] }, { @@ -288,11 +288,11 @@ "data_clf_columns = [\"Feature #0\", \"Feature #1\"]\n", "target_clf_column = \"Class\"\n", "\n", - "# Blobs that will be interlaced\n", + "# Blobs that are interlaced\n", "X_1, y_1 = make_blobs(\n", " n_samples=300, centers=[[0, 0], [-1, -1]], random_state=0\n", ")\n", - "# Blobs that will be easily separated\n", + "# Blobs that can be easily separated\n", "X_2, y_2 = make_blobs(n_samples=300, centers=[[3, 6], [7, 0]], random_state=0)\n", "\n", "X = np.concatenate([X_1, X_2], axis=0)\n", @@ -324,9 +324,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will first train a shallow decision tree with `max_depth=2`. We would\n", - "expect this depth to be enough to separate the blobs that are easy to\n", - "separate." + "We first train a shallow decision tree with `max_depth=2`. We would expect\n", + "this depth to be enough to separate the blobs that are easy to separate." ] }, { @@ -348,7 +347,7 @@ "metadata": {}, "source": [ "As expected, we see that the blue blob in the lower right and the red blob on\n", - "the top are easily separated. However, more splits will be required to better\n", + "the top are easily separated. However, more splits are required to better\n", "split the blob were both blue and red data points are mixed." ] }, @@ -369,7 +368,7 @@ "metadata": {}, "source": [ "We see that the right branch achieves perfect classification. Now, we increase\n", - "the depth to check how the tree will grow." + "the depth to check how the tree grows." ] }, { @@ -406,8 +405,8 @@ "beneficial that a branch continue growing.\n", "\n", "The hyperparameters `min_samples_leaf`, `min_samples_split`, `max_leaf_nodes`,\n", - "or `min_impurity_decrease` allows growing asymmetric trees and apply a\n", - "constraint at the leaves or nodes level. We will check the effect of\n", + "or `min_impurity_decrease` allow growing asymmetric trees and apply a\n", + "constraint at the leaves or nodes level. We check the effect of\n", "`min_samples_leaf`." ] }, @@ -442,7 +441,7 @@ "metadata": {}, "source": [ "This hyperparameter allows to have leaves with a minimum number of samples and\n", - "no further splits will be searched otherwise. Therefore, these hyperparameters\n", + "no further splits are searched otherwise. Therefore, these hyperparameters\n", "could be an alternative to fix the `max_depth` hyperparameter." ] } diff --git a/notebooks/trees_regression.ipynb b/notebooks/trees_regression.ipynb index 5e137e01e..217d2e165 100644 --- a/notebooks/trees_regression.ipynb +++ b/notebooks/trees_regression.ipynb @@ -44,9 +44,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To illustrate how decision trees are predicting in a regression setting, we\n", - "will create a synthetic dataset containing all possible flipper length from\n", - "the minimum to the maximum of the original data." + "To illustrate how decision trees predict in a regression setting, we create a\n", + "synthetic dataset containing some of the possible flipper length values\n", + "between the minimum and the maximum of the original data." ] }, { @@ -76,9 +76,9 @@ "some intuitive understanding on the shape of the decision function of the\n", "learned decision trees.\n", "\n", - "However computing an evaluation metric on such a synthetic test set would be\n", + "However, computing an evaluation metric on such a synthetic test set would be\n", "meaningless since the synthetic dataset does not follow the same distribution\n", - "as the real world data on which the model will be deployed." 
+ "as the real world data on which the model would be deployed." ] }, { @@ -100,7 +100,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will first illustrate the difference between a linear model and a decision\n", + "We first illustrate the difference between a linear model and a decision\n", "tree." ] }, @@ -172,9 +172,8 @@ "metadata": {}, "source": [ "Contrary to linear models, decision trees are non-parametric models: they do\n", - "not make assumptions about the way data is distributed. This will affect the\n", - "prediction scheme. Repeating the above experiment will highlight the\n", - "differences." + "not make assumptions about the way data is distributed. This affects the\n", + "prediction scheme. Repeating the above experiment highlights the differences." ] }, { @@ -272,8 +271,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Increasing the depth of the tree will increase the number of partition and\n", - "thus the number of constant values that the tree is capable of predicting.\n", + "Increasing the depth of the tree increases the number of partitions and thus\n", + "the number of constant values that the tree is capable of predicting.\n", "\n", "In this notebook, we highlighted the differences in behavior of a decision\n", "tree used in a classification problem in contrast to a regression problem." diff --git a/notebooks/trees_sol_02.ipynb b/notebooks/trees_sol_02.ipynb index cd7de2cff..64010ef3e 100644 --- a/notebooks/trees_sol_02.ipynb +++ b/notebooks/trees_sol_02.ipynb @@ -12,7 +12,7 @@ "By extrapolation, we refer to values predicted by a model outside of the range\n", "of feature values seen during the training.\n", "\n", - "We will first load the regression data." + "We first load the regression data." ] }, { @@ -153,10 +153,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we will check the extrapolation capabilities of each model. Create a\n", - "dataset containing a broader range of values than your previous dataset, in\n", - "other words, add values below and above the minimum and the maximum of the\n", - "flipper length seen during training." + "Now, we check the extrapolation capabilities of each model. Create a dataset\n", + "containing a broader range of values than your previous dataset, in other\n", + "words, add values below and above the minimum and the maximum of the flipper\n", + "length seen during training." ] }, { @@ -226,9 +226,9 @@ ] }, "source": [ - "The linear model will extrapolate using the fitted model for flipper lengths <\n", - "175 mm and > 235 mm. In fact, we are using the model parametrization to make\n", - "this predictions.\n", + "The linear model extrapolates using the fitted model for flipper lengths < 175\n", + "mm and > 235 mm. In fact, we are using the model parametrization to make these\n", + "predictions.\n", "\n", "As mentioned, decision trees are non-parametric models and we observe that\n", "they cannot extrapolate. For flipper lengths below the minimum, the mass of\n", diff --git a/python_scripts/01_tabular_data_exploration.py b/python_scripts/01_tabular_data_exploration.py index 69427f8d0..4b07c4add 100644 --- a/python_scripts/01_tabular_data_exploration.py +++ b/python_scripts/01_tabular_data_exploration.py @@ -8,8 +8,8 @@ # %% [markdown] # # First look at our dataset # -# In this notebook, we will look at the necessary steps required before any -# machine learning takes place. 
It involves: +# In this notebook, we look at the necessary steps required before any machine +# learning takes place. It involves: # # * loading the data; # * looking at the variables in the dataset, in particular, differentiate @@ -21,14 +21,14 @@ # %% [markdown] # ## Loading the adult census dataset # -# We will use data from the 1994 US census that we downloaded from +# We use data from the 1994 US census that we downloaded from # [OpenML](http://openml.org/). # # You can look at the OpenML webpage to learn more about this dataset: # # -# The dataset is available as a CSV (Comma-Separated Values) file and we will -# use `pandas` to read it. +# The dataset is available as a CSV (Comma-Separated Values) file and we use +# `pandas` to read it. # # ```{note} # [Pandas](https://pandas.pydata.org/) is a Python library used for @@ -74,9 +74,9 @@ # The column named **class** is our target variable (i.e., the variable which we # want to predict). The two possible classes are `<=50K` (low-revenue) and # `>50K` (high-revenue). The resulting prediction problem is therefore a binary -# classification problem as `class` has only two possible values. We will use -# the left-over columns (any column other than `class`) as input variables for -# our model. +# classification problem as `class` has only two possible values. We use the +# left-over columns (any column other than `class`) as input variables for our +# model. # %% target_column = "class" @@ -90,7 +90,7 @@ # and may need special techniques when building a predictive model. # # For example in a medical setting, if we are trying to predict whether subjects -# will develop a rare disease, there will be a lot more healthy subjects than +# may develop a rare disease, there would be a lot more healthy subjects than # ill subjects in the dataset. # ``` @@ -247,8 +247,8 @@ # %% import seaborn as sns -# We will plot a subset of the data to keep the plot readable and make the -# plotting faster +# We plot a subset of the data to keep the plot readable and make the plotting +# faster n_samples_to_plot = 5000 columns = ["age", "education-num", "hours-per-week"] _ = sns.pairplot( @@ -320,12 +320,12 @@ # a mix of blue points and orange points. It seems complicated to choose which # class we should predict in this region. # -# It is interesting to note that some machine learning models will work -# similarly to what we did: they are known as decision tree models. The two -# thresholds that we chose (27 years and 40 hours) are somewhat arbitrary, i.e. -# we chose them by only looking at the pairplot. In contrast, a decision tree -# will choose the "best" splits based on data without human intervention or -# inspection. Decision trees will be covered more in detail in a future module. +# It is interesting to note that some machine learning models work similarly to +# what we did: they are known as decision tree models. The two thresholds that +# we chose (27 years and 40 hours) are somewhat arbitrary, i.e. we chose them by +# only looking at the pairplot. In contrast, a decision tree chooses the "best" +# splits based on data without human intervention or inspection. Decision trees +# will be covered more in detail in a future module. # # Note that machine learning is often used when creating rules by hand is not # straightforward. 
For example because we are in high dimension (many features diff --git a/python_scripts/01_tabular_data_exploration_ex_01.py b/python_scripts/01_tabular_data_exploration_ex_01.py index b09b00dc3..37548c006 100644 --- a/python_scripts/01_tabular_data_exploration_ex_01.py +++ b/python_scripts/01_tabular_data_exploration_ex_01.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # name: python3 @@ -59,5 +59,5 @@ # Write your code here. # %% [markdown] -# Looking at these distributions, how hard do you think it will be to classify +# Looking at these distributions, how hard do you think it would be to classify # the penguins only using `"culmen depth"` and `"culmen length"`? diff --git a/python_scripts/01_tabular_data_exploration_sol_01.py b/python_scripts/01_tabular_data_exploration_sol_01.py index 95d89b203..b3ef6a88d 100644 --- a/python_scripts/01_tabular_data_exploration_sol_01.py +++ b/python_scripts/01_tabular_data_exploration_sol_01.py @@ -78,7 +78,7 @@ pairplot_figure = seaborn.pairplot(penguins, hue="Species", height=4) # %% [markdown] -# Looking at these distributions, how hard do you think it will be to classify +# Looking at these distributions, how hard do you think it would be to classify # the penguins only using `"culmen depth"` and `"culmen length"`? # %% [markdown] tags=["solution"] diff --git a/python_scripts/02_numerical_pipeline_cross_validation.py b/python_scripts/02_numerical_pipeline_cross_validation.py index 0edbd1cf8..e93868352 100644 --- a/python_scripts/02_numerical_pipeline_cross_validation.py +++ b/python_scripts/02_numerical_pipeline_cross_validation.py @@ -8,9 +8,9 @@ # %% [markdown] # # Model evaluation using cross-validation # -# In this notebook, we will still use only numerical features. +# In this notebook, we still use numerical features only. # -# We will discuss the practical aspects of assessing the generalization +# Here we discuss the practical aspects of assessing the generalization # performance of our model via **cross-validation** instead of a single # train-test split. # @@ -24,8 +24,8 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # %% [markdown] -# We will now drop the target from the data we will use to train our -# predictive model. +# We now drop the target from the data we will use to train our predictive +# model. # %% target_name = "class" @@ -56,11 +56,11 @@ # ## The need for cross-validation # # In the previous notebook, we split the original data into a training set and a -# testing set. The score of a model will in general depend on the way we make -# such a split. One downside of doing a single split is that it does not give -# any information about this variability. Another downside, in a setting where -# the amount of data is small, is that the data available for training and -# testing will be even smaller after splitting. +# testing set. The score of a model in general depends on the way we make such a +# split. One downside of doing a single split is that it does not give any +# information about this variability. Another downside, in a setting where the +# amount of data is small, is that the data available for training and testing +# would be even smaller after splitting. # # Instead, we can use cross-validation. Cross-validation consists of repeating # the procedure such that the training and testing sets are different each time. @@ -69,8 +69,8 @@ # model's generalization performance. 
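#
# To make the cross-validation idea above concrete before detailing the K-fold
# strategy, here is a minimal sketch of the procedure with scikit-learn's
# `cross_validate`; it assumes the adult census CSV path and the numerical
# column names used throughout these notebooks, and it is an editorial
# illustration rather than part of the patched material.
#
# ```python
# import pandas as pd
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_validate
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
#
# # Load the data and keep the numerical features only (assumed column names).
# adult_census = pd.read_csv("../datasets/adult-census.csv")
# target = adult_census["class"]
# data = adult_census[["age", "capital-gain", "capital-loss", "hours-per-week"]]
#
# # Each of the 5 rounds fits on 4/5 of the data and scores on the held-out 1/5.
# model = make_pipeline(StandardScaler(), LogisticRegression())
# cv_result = cross_validate(model, data, target, cv=5)
# scores = cv_result["test_score"]
# print(f"Mean accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")
# ```
#
# Reporting the standard deviation alongside the mean is exactly the
# variability estimate that a single train-test split cannot provide.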
# # Note that there exists several cross-validation strategies, each of them -# defines how to repeat the `fit`/`score` procedure. In this section, we will -# use the K-fold strategy: the entire dataset is split into `K` partitions. The +# defines how to repeat the `fit`/`score` procedure. In this section, we use the +# K-fold strategy: the entire dataset is split into `K` partitions. The # `fit`/`score` procedure is repeated `K` times where at each iteration `K - 1` # partitions are used to fit the model and `1` partition is used to score. The # figure below illustrates this K-fold strategy. @@ -129,7 +129,7 @@ # [`sklearn.model_selection.cross_validate`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html) # to collect additional information, such as the training scores of the models # obtained on each round or even return the models themselves instead of -# discarding them. These features will be covered in a future notebook. +# discarding them. These features will be covered in a future notebook. # # Let's extract the scores computed on the test fold of each cross-validation # round from the `cv_result` dictionary and compute the mean accuracy and the diff --git a/python_scripts/02_numerical_pipeline_ex_00.py b/python_scripts/02_numerical_pipeline_ex_00.py index 0436dfc50..f251ca7f9 100644 --- a/python_scripts/02_numerical_pipeline_ex_00.py +++ b/python_scripts/02_numerical_pipeline_ex_00.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # name: python3 @@ -38,11 +38,12 @@ # number of neighbors we are going to use to make a prediction for a new data # point. # -# What is the default value of the `n_neighbors` parameter? Hint: Look at the -# documentation on the [scikit-learn +# What is the default value of the `n_neighbors` parameter? +# +# **Hint**: Look at the documentation on the [scikit-learn # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) # or directly access the description inside your notebook by running the -# following cell. This will open a pager pointing to the documentation. +# following cell. This opens a pager pointing to the documentation. # %% from sklearn.neighbors import KNeighborsClassifier diff --git a/python_scripts/02_numerical_pipeline_ex_01.py b/python_scripts/02_numerical_pipeline_ex_01.py index 7654753d4..2f9c5c240 100644 --- a/python_scripts/02_numerical_pipeline_ex_01.py +++ b/python_scripts/02_numerical_pipeline_ex_01.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # name: python3 @@ -35,8 +35,8 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # %% [markdown] -# We will first split our dataset to have the target separated from the data -# used to train our predictive model. +# We first split our dataset to have the target separated from the data used to +# train our predictive model. # %% target_name = "class" @@ -61,8 +61,8 @@ # Write your code here. # %% [markdown] -# Use a `DummyClassifier` such that the resulting classifier will always predict -# the class `' >50K'`. What is the accuracy score on the test set? Repeat the +# Use a `DummyClassifier` such that the resulting classifier always predict the +# class `' >50K'`. What is the accuracy score on the test set? 
Repeat the # experiment by always predicting the class `' <=50K'`. # # Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve diff --git a/python_scripts/02_numerical_pipeline_hands_on.py b/python_scripts/02_numerical_pipeline_hands_on.py index 83f4346ed..913b78105 100644 --- a/python_scripts/02_numerical_pipeline_hands_on.py +++ b/python_scripts/02_numerical_pipeline_hands_on.py @@ -21,8 +21,7 @@ # * using a scikit-learn helper to separate data into train-test sets; # * training and evaluating a more complex scikit-learn model. # -# We will start by loading the adult census dataset used during the data -# exploration. +# We start by loading the adult census dataset used during the data exploration. # # ## Loading the entire dataset # @@ -70,12 +69,12 @@ # numerical data usually requires very little work before getting started with # training. # -# The first task here will be to identify numerical data in our dataset. +# The first task here is to identify numerical data in our dataset. # # ```{caution} -# Numerical data are represented with numbers, but numbers are not always -# representing numerical data. Categories could already be encoded with -# numbers and you will need to identify these features. +# Numerical data are represented with numbers, but numbers do not always +# represent numerical data. Categories could already be encoded with +# numbers and you may need to identify these features. # ``` # # Thus, we can check the data type for each of the column in the dataset. @@ -123,7 +122,7 @@ # %% [markdown] # We can see the age varies between 17 and 90 years. # -# We could extend our analysis and we will find that `"capital-gain"`, +# We could extend our analysis and we would find that `"capital-gain"`, # `"capital-loss"`, and `"hours-per-week"` are also representing quantitative # data. # @@ -162,7 +161,7 @@ # %% [markdown] # When calling the function `train_test_split`, we specified that we would like # to have 25% of samples in the testing set while the remaining samples (75%) -# will be available in the training set. We can check quickly if we got what we +# are assigned to the training set. We can check quickly if we got what we # expected. # %% @@ -182,8 +181,8 @@ # %% [markdown] # In the previous notebook, we used a k-nearest neighbors model. While this # model is intuitive to understand, it is not widely used in practice. Now, we -# will use a more useful model, called a logistic regression, which belongs to -# the linear models family. +# use a more useful model, called a logistic regression, which belongs to the +# linear models family. # # ```{note} # In short, linear models find a set of weights to combine features linearly @@ -192,8 +191,8 @@ # * if `0.1 * age + 3.3 * hours-per-week - 15.1 > 0`, predict `high-income` # * otherwise predict `low-income` # -# Linear models, and in particular the logistic regression, will be covered in -# more details in the "Linear models" module later in this course. For now the +# Linear models, and in particular the logistic regression, will be covered +# more in detail in the "Linear models" module later in this course. For now the # focus is to use this logistic regression model in scikit-learn rather than # understand how it works in details. 
# ``` diff --git a/python_scripts/02_numerical_pipeline_scaling.py b/python_scripts/02_numerical_pipeline_scaling.py index 66370921d..4a0025f5d 100644 --- a/python_scripts/02_numerical_pipeline_scaling.py +++ b/python_scripts/02_numerical_pipeline_scaling.py @@ -8,9 +8,9 @@ # %% [markdown] # # Preprocessing for numerical features # -# In this notebook, we will still use only numerical features. +# In this notebook, we still use numerical features only. # -# We will introduce these new aspects: +# Here we introduce these new aspects: # # * an example of preprocessing, namely **scaling numerical variables**; # * using a scikit-learn **pipeline** to chain preprocessing and model training. @@ -25,8 +25,7 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # %% [markdown] -# We will now drop the target from the data we will use to train our predictive -# model. +# We now drop the target from the data we use to train our predictive model. # %% target_name = "class" @@ -67,7 +66,7 @@ # %% [markdown] # We see that the dataset's features span across different ranges. Some # algorithms make some assumptions regarding the feature distributions and -# usually normalizing features will be helpful to address these assumptions. +# normalizing features is usually helpful to address such assumptions. # # ```{tip} # Here are some reasons for scaling features: @@ -84,13 +83,13 @@ # Whether or not a machine learning model requires scaling the features depends # on the model family. Linear models such as logistic regression generally # benefit from scaling the features while other models such as decision trees do -# not need such preprocessing (but will not suffer from it). +# not need such preprocessing (but would not suffer from it). # # We show how to apply such normalization using a scikit-learn transformer # called `StandardScaler`. This transformer shifts and scales each feature # individually so that they all have a 0-mean and a unit standard deviation. # -# We will investigate different steps used in scikit-learn to achieve such a +# We now investigate different steps used in scikit-learn to achieve such a # transformation of the data. # # First, one needs to call the method `fit` in order to learn the scaling from @@ -115,10 +114,10 @@ # are the model states. # # ```{note} -# The fact that the model states of this scaler are arrays of means and -# standard deviations is specific to the `StandardScaler`. Other -# scikit-learn transformers will compute different statistics and store them -# as model states, in the same fashion. +# The fact that the model states of this scaler are arrays of means and standard +# deviations is specific to the `StandardScaler`. Other scikit-learn +# transformers may compute different statistics and store them as model states, +# in a similar fashion. # ``` # # We can inspect the computed means and standard deviations. @@ -225,7 +224,7 @@ # %% [markdown] # We can easily combine sequential operations with a scikit-learn `Pipeline`, # which chains together operations and is used as any other classifier or -# regressor. The helper function `make_pipeline` will create a `Pipeline`: it +# regressor. The helper function `make_pipeline` creates a `Pipeline`: it # takes as arguments the successive transformations to perform, followed by the # classifier or regressor model. @@ -240,8 +239,8 @@ # %% [markdown] # The `make_pipeline` function did not require us to give a name to each step. 
# Indeed, it was automatically assigned based on the name of the classes -# provided; a `StandardScaler` will be a step named `"standardscaler"` in the -# resulting pipeline. We can check the name of each steps of our model: +# provided; a `StandardScaler` step is named `"standardscaler"` in the resulting +# pipeline. We can check the name of each steps of our model: # %% model.named_steps @@ -263,7 +262,7 @@ # ![pipeline fit diagram](../figures/api_diagram-pipeline.fit.svg) # # When calling `model.fit`, the method `fit_transform` from each underlying -# transformer (here a single transformer) in the pipeline will be called to: +# transformer (here a single transformer) in the pipeline is called to: # # - learn their internal model states # - transform the training data. Finally, the preprocessed data are provided to @@ -284,7 +283,7 @@ # called to preprocess the data. Note that there is no need to call the `fit` # method for these transformers because we are using the internal model states # computed when calling `model.fit`. The preprocessed data is then provided to -# the predictor that will output the predicted target by calling its method +# the predictor that outputs the predicted target by calling its method # `predict`. # # As a shorthand, we can check the score of the full predictive pipeline calling diff --git a/python_scripts/02_numerical_pipeline_sol_00.py b/python_scripts/02_numerical_pipeline_sol_00.py index 7ac9a5496..a10f8555d 100644 --- a/python_scripts/02_numerical_pipeline_sol_00.py +++ b/python_scripts/02_numerical_pipeline_sol_00.py @@ -32,11 +32,12 @@ # number of neighbors we are going to use to make a prediction for a new data # point. # -# What is the default value of the `n_neighbors` parameter? Hint: Look at the -# documentation on the [scikit-learn +# What is the default value of the `n_neighbors` parameter? +# +# **Hint**: Look at the documentation on the [scikit-learn # website](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) # or directly access the description inside your notebook by running the -# following cell. This will open a pager pointing to the documentation. +# following cell. This opens a pager pointing to the documentation. # %% from sklearn.neighbors import KNeighborsClassifier diff --git a/python_scripts/02_numerical_pipeline_sol_01.py b/python_scripts/02_numerical_pipeline_sol_01.py index 70a21c31d..3e77f6372 100644 --- a/python_scripts/02_numerical_pipeline_sol_01.py +++ b/python_scripts/02_numerical_pipeline_sol_01.py @@ -29,8 +29,8 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # %% [markdown] -# We will first split our dataset to have the target separated from the data -# used to train our predictive model. +# We first split our dataset to have the target separated from the data used to +# train our predictive model. # %% target_name = "class" @@ -58,8 +58,8 @@ ) # %% [markdown] -# Use a `DummyClassifier` such that the resulting classifier will always predict -# the class `' >50K'`. What is the accuracy score on the test set? Repeat the +# Use a `DummyClassifier` such that the resulting classifier always predict the +# class `' >50K'`. What is the accuracy score on the test set? Repeat the # experiment by always predicting the class `' <=50K'`. # # Hint: you can set the `strategy` parameter of the `DummyClassifier` to achieve @@ -79,8 +79,8 @@ # %% [markdown] tags=["solution"] # We clearly see that the score is below 0.5 which might be surprising at first. 
-# We will now check the generalization performance of a model which always -# predict the low revenue class, i.e. `" <=50K"`. +# We now check the generalization performance of a model which always predict +# the low revenue class, i.e. `" <=50K"`. # %% tags=["solution"] class_to_predict = " <=50K" @@ -97,7 +97,7 @@ # %% [markdown] tags=["solution"] # Therefore, any predictive model giving results below this dummy classifier -# will not be helpful. +# would not be helpful. # %% tags=["solution"] adult_census["class"].value_counts() diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index 5acdefc82..62cd9be98 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -8,9 +8,9 @@ # %% [markdown] # # Encoding of categorical variables # -# In this notebook, we will present typical ways of dealing with -# **categorical variables** by encoding them, namely **ordinal encoding** and -# **one-hot encoding**. +# In this notebook, we present some typical ways of dealing with **categorical +# variables** by encoding them, namely **ordinal encoding** and **one-hot +# encoding**. # %% [markdown] # Let's first load the entire adult dataset containing both numerical and @@ -62,9 +62,9 @@ # ## Select features based on their data type # # In the previous notebook, we manually defined the numerical columns. We could -# do a similar approach. Instead, we will use the scikit-learn helper function -# `make_column_selector`, which allows us to select columns based on -# their data type. We will illustrate how to use this helper. +# do a similar approach. Instead, we can use the scikit-learn helper function +# `make_column_selector`, which allows us to select columns based on their data +# type. We now illustrate how to use this helper. # %% from sklearn.compose import make_column_selector as selector @@ -97,9 +97,8 @@ # ### Encoding ordinal categories # # The most intuitive strategy is to encode each category with a different -# number. The `OrdinalEncoder` will transform the data in such manner. -# We will start by encoding a single column to understand how the encoding -# works. +# number. The `OrdinalEncoder` transforms the data in such manner. We start by +# encoding a single column to understand how the encoding works. # %% from sklearn.preprocessing import OrdinalEncoder @@ -160,13 +159,13 @@ # # `OneHotEncoder` is an alternative encoder that prevents the downstream # models to make a false assumption about the ordering of categories. For a -# given feature, it will create as many new columns as there are possible +# given feature, it creates as many new columns as there are possible # categories. For a given sample, the value of the column corresponding to the -# category will be set to `1` while all the columns of the other categories -# will be set to `0`. +# category is set to `1` while all the columns of the other categories +# are set to `0`. # -# We will start by encoding a single feature (e.g. `"education"`) to illustrate -# how the encoding works. +# We can encode a single feature (e.g. `"education"`) to illustrate how the +# encoding works. # %% from sklearn.preprocessing import OneHotEncoder @@ -187,7 +186,7 @@ # ``` # %% [markdown] -# We see that encoding a single feature will give a dataframe full of zeros +# We see that encoding a single feature gives a dataframe full of zeros # and ones. 
Each category (unique value) became a column; the encoding # returned, for each sample, a 1 to specify which category it belongs to. # @@ -215,8 +214,8 @@ # %% [markdown] # ### Choosing an encoding strategy # -# Choosing an encoding strategy will depend on the underlying models and the -# type of categories (i.e. ordinal vs. nominal). +# Choosing an encoding strategy depends on the underlying models and the type of +# categories (i.e. ordinal vs. nominal). # %% [markdown] # ```{note} @@ -226,12 +225,11 @@ # ``` # %% [markdown] -# -# Using an `OrdinalEncoder` will output ordinal categories. This means +# Using an `OrdinalEncoder` outputs ordinal categories. This means # that there is an order in the resulting categories (e.g. `0 < 1 < 2`). The # impact of violating this ordering assumption is really dependent on the -# downstream models. Linear models will be impacted by misordered categories -# while tree-based models will not. +# downstream models. Linear models would be impacted by misordered categories +# while tree-based models would not. # # You can still use an `OrdinalEncoder` with linear models but you need to be # sure that: @@ -265,7 +263,7 @@ # We see that the `"Holand-Netherlands"` category is occurring rarely. This will # be a problem during cross-validation: if the sample ends up in the test set # during splitting then the classifier would not have seen the category during -# training and will not be able to encode it. +# training and would not be able to encode it. # # In scikit-learn, there are some possible solutions to bypass this issue: # @@ -289,9 +287,9 @@ # ```{tip} # Be aware the `OrdinalEncoder` exposes a parameter also named `handle_unknown`. # It can be set to `use_encoded_value`. If that option is chosen, you can define -# a fixed value to which all unknowns will be set to during `transform`. For -# example, `OrdinalEncoder(handle_unknown='use_encoded_value', -# unknown_value=42)` will set all values encountered during `transform` to `42` +# a fixed value that is assigned to all unknown categories during `transform`. +# For example, `OrdinalEncoder(handle_unknown='use_encoded_value', +# unknown_value=-1)` would set all values encountered during `transform` to `-1` # which are not part of the data encountered during the `fit` call. You are # going to use these parameters in the next exercise. # ``` diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index 002889af3..fd429749e 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -8,12 +8,12 @@ # %% [markdown] # # Using numerical and categorical variables together # -# In the previous notebooks, we showed the required preprocessing to apply -# when dealing with numerical and categorical variables. However, we decoupled -# the process to treat each type individually. In this notebook, we will show -# how to combine these preprocessing steps. +# In the previous notebooks, we showed the required preprocessing to apply when +# dealing with numerical and categorical variables. However, we decoupled the +# process to treat each type individually. In this notebook, we show how to +# combine these preprocessing steps. # -# We will first load the entire adult census dataset. +# We first load the entire adult census dataset. 
# %% import pandas as pd @@ -30,10 +30,10 @@ # %% [markdown] # ## Selection based on data types # -# We will separate categorical and numerical variables using their data -# types to identify them, as we saw previously that `object` corresponds -# to categorical columns (strings). We make use of `make_column_selector` -# helper to select the corresponding columns. +# We separate categorical and numerical variables using their data types to +# identify them, as we saw previously that `object` corresponds to categorical +# columns (strings). We make use of `make_column_selector` helper to select the +# corresponding columns. # %% from sklearn.compose import make_column_selector as selector @@ -62,14 +62,14 @@ # In the previous sections, we saw that we need to treat data differently # depending on their nature (i.e. numerical or categorical). # -# Scikit-learn provides a `ColumnTransformer` class which will send specific +# Scikit-learn provides a `ColumnTransformer` class which sends specific # columns to a specific transformer, making it easy to fit a single predictive # model on a dataset that combines both kinds of variables together # (heterogeneously typed tabular data). # # We first define the columns depending on their data type: # -# * **one-hot encoding** will be applied to categorical columns. Besides, we use +# * **one-hot encoding** is applied to categorical columns. Besides, we use # `handle_unknown="ignore"` to solve the potential issues due to rare # categories. # * **numerical scaling** numerical features which will be standardized. @@ -107,11 +107,11 @@ # A `ColumnTransformer` does the following: # # * It **splits the columns** of the original dataset based on the column names -# or indices provided. We will obtain as many subsets as the number of -# transformers passed into the `ColumnTransformer`. +# or indices provided. We obtain as many subsets as the number of transformers +# passed into the `ColumnTransformer`. # * It **transforms each subsets**. A specific transformer is applied to each -# subset: it will internally call `fit_transform` or `transform`. The output -# of this step is a set of transformed datasets. +# subset: it internally calls `fit_transform` or `transform`. The output of +# this step is a set of transformed datasets. # * It then **concatenates the transformed datasets** into a single dataset. # The important thing is that `ColumnTransformer` is like any other scikit-learn @@ -161,7 +161,7 @@ # %% [markdown] # Then, we can send the raw dataset straight to the pipeline. Indeed, we do not # need to make any manual preprocessing (calling the `transform` or -# `fit_transform` methods) as it will be handled when calling the `predict` +# `fit_transform` methods) as it is already handled when calling the `predict` # method. As an example, we predict on the five first samples from the test set. # %% @@ -212,10 +212,10 @@ # # However, it is often useful to check whether more complex models such as an # ensemble of decision trees can lead to higher predictive performance. In this -# section we will use such a model called **gradient-boosting trees** and -# evaluate its generalization performance. More precisely, the scikit-learn -# model we will use is called `HistGradientBoostingClassifier`. Note that -# boosting models will be covered in more detail in a future module. +# section we use such a model called **gradient-boosting trees** and evaluate +# its generalization performance. 
More precisely, the scikit-learn model we use +# is called `HistGradientBoostingClassifier`. Note that boosting models will be +# covered in more detail in a future module. # # For tree-based models, the handling of numerical and categorical variables is # simpler than for linear models: diff --git a/python_scripts/03_categorical_pipeline_ex_01.py b/python_scripts/03_categorical_pipeline_ex_01.py index ae19eab2f..4f2054867 100644 --- a/python_scripts/03_categorical_pipeline_ex_01.py +++ b/python_scripts/03_categorical_pipeline_ex_01.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # name: python3 @@ -39,9 +39,8 @@ # %% [markdown] # In the previous notebook, we used `sklearn.compose.make_column_selector` to # automatically select columns with a specific data type (also called `dtype`). -# Here, we will use this selector to get only the columns containing strings -# (column with `object` dtype) that correspond to categorical features in our -# dataset. +# Here, we use this selector to get only the columns containing strings (column +# with `object` dtype) that correspond to categorical features in our dataset. # %% from sklearn.compose import make_column_selector as selector @@ -73,11 +72,11 @@ # # ```{note} # Be aware that if an error happened during the cross-validation, -# `cross_validate` will raise a warning and return NaN (Not a Number) as scores. +# `cross_validate` would raise a warning and return NaN (Not a Number) as scores. # To make it raise a standard Python exception with a traceback, you can pass # the `error_score="raise"` argument in the call to `cross_validate`. An -# exception will be raised instead of a warning at the first encountered problem -# and `cross_validate` will stop right away instead of returning NaN values. +# exception would be raised instead of a warning at the first encountered problem +# and `cross_validate` would stop right away instead of returning NaN values. # This is particularly handy when developing complex machine learning pipelines. # ``` @@ -88,8 +87,8 @@ # %% [markdown] # Now, we would like to compare the generalization performance of our previous -# model with a new model where instead of using an `OrdinalEncoder`, we will use -# a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare +# model with a new model where instead of using an `OrdinalEncoder`, we use a +# `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare # the score of both models and conclude on the impact of choosing a specific # encoding strategy when using a linear model. 
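
[Editor's note] As a complement to the exercise above, one possible shape for the requested
comparison is sketched below; it is not the official solution, and it assumes the dataset
path and column conventions used elsewhere in the course.

```python
import pandas as pd
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
data = adult_census.drop(columns=["class", "education-num"])

# Keep only the string (object) columns, i.e. the categorical features.
categorical_columns = selector(dtype_include=object)(data)
data_categorical = data[categorical_columns]

encoders = {
    "ordinal": OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1
    ),
    "one-hot": OneHotEncoder(handle_unknown="ignore"),
}
for name, encoder in encoders.items():
    model = make_pipeline(encoder, LogisticRegression(max_iter=500))
    scores = cross_validate(model, data_categorical, target)["test_score"]
    print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")
```

On nominal categories such as these, the one-hot variant typically scores higher
because the linear model is not misled by an arbitrary category order.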
diff --git a/python_scripts/03_categorical_pipeline_ex_02.py b/python_scripts/03_categorical_pipeline_ex_02.py index 7daacfbd4..979b8b0b9 100644 --- a/python_scripts/03_categorical_pipeline_ex_02.py +++ b/python_scripts/03_categorical_pipeline_ex_02.py @@ -5,7 +5,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.5 +# jupytext_version: 1.15.2 # kernelspec: # display_name: Python 3 # name: python3 diff --git a/python_scripts/03_categorical_pipeline_sol_01.py b/python_scripts/03_categorical_pipeline_sol_01.py index 0847e7e30..e7b30598c 100644 --- a/python_scripts/03_categorical_pipeline_sol_01.py +++ b/python_scripts/03_categorical_pipeline_sol_01.py @@ -33,9 +33,8 @@ # %% [markdown] # In the previous notebook, we used `sklearn.compose.make_column_selector` to # automatically select columns with a specific data type (also called `dtype`). -# Here, we will use this selector to get only the columns containing strings -# (column with `object` dtype) that correspond to categorical features in our -# dataset. +# Here, we use this selector to get only the columns containing strings (column +# with `object` dtype) that correspond to categorical features in our dataset. # %% from sklearn.compose import make_column_selector as selector @@ -71,11 +70,11 @@ # # ```{note} # Be aware that if an error happened during the cross-validation, -# `cross_validate` will raise a warning and return NaN (Not a Number) as scores. +# `cross_validate` would raise a warning and return NaN (Not a Number) as scores. # To make it raise a standard Python exception with a traceback, you can pass # the `error_score="raise"` argument in the call to `cross_validate`. An -# exception will be raised instead of a warning at the first encountered problem -# and `cross_validate` will stop right away instead of returning NaN values. +# exception would be raised instead of a warning at the first encountered problem +# and `cross_validate` would stop right away instead of returning NaN values. # This is particularly handy when developing complex machine learning pipelines. # ``` @@ -114,8 +113,8 @@ # %% [markdown] # Now, we would like to compare the generalization performance of our previous -# model with a new model where instead of using an `OrdinalEncoder`, we will use -# a `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare +# model with a new model where instead of using an `OrdinalEncoder`, we use a +# `OneHotEncoder`. Repeat the model evaluation using cross-validation. Compare # the score of both models and conclude on the impact of choosing a specific # encoding strategy when using a linear model. @@ -139,4 +138,4 @@ # # The important message here is: linear model and `OrdinalEncoder` are used # together only for ordinal categorical features, i.e. features that have a -# specific ordering. Otherwise, your model will perform poorly. +# specific ordering. Otherwise, your model would perform poorly. 
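
[Editor's note] To illustrate the "used together only for ordinal categorical features"
caveat from the solution above, the safe pattern is to spell out the category order
explicitly instead of relying on the default alphabetical ordering; the education levels
below are an illustrative assumption.

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Illustrative ordinal feature; the chosen order is an assumption.
X = pd.DataFrame({"education": ["HS-grad", "Masters", "Bachelors", "HS-grad"]})

# Without `categories`, levels would be sorted alphabetically, wrongly
# placing "Bachelors" before "HS-grad".
encoder = OrdinalEncoder(categories=[["HS-grad", "Bachelors", "Masters"]])
print(encoder.fit_transform(X))  # [[0.], [2.], [1.], [0.]]
```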
diff --git a/python_scripts/03_categorical_pipeline_sol_02.py b/python_scripts/03_categorical_pipeline_sol_02.py index f73671fe4..36e8ddcdc 100644 --- a/python_scripts/03_categorical_pipeline_sol_02.py +++ b/python_scripts/03_categorical_pipeline_sol_02.py @@ -199,7 +199,7 @@ # | Tree-based model | `OrdinalEncoder` | `OrdinalEncoder` | # | Linear model | `OrdinalEncoder` with caution | `OneHotEncoder` | # -# - `OneHotEncoder`: will always do something meaningful, but can be unnecessary +# - `OneHotEncoder`: always does something meaningful, but can be unnecessary # slow with trees. # - `OrdinalEncoder`: can be detrimental for linear models unless your category # has a meaningful order and you make sure that `OrdinalEncoder` respects this diff --git a/python_scripts/03_categorical_pipeline_visualization.py b/python_scripts/03_categorical_pipeline_visualization.py index 0b10a6f42..ad22e5ee3 100644 --- a/python_scripts/03_categorical_pipeline_visualization.py +++ b/python_scripts/03_categorical_pipeline_visualization.py @@ -19,8 +19,8 @@ # ## First we load the dataset # %% [markdown] -# We need to define our data and target. In this case we will build a -# classification model +# We need to define our data and target. In this case we build a classification +# model # %% import pandas as pd From 84877f23e2222bfb685927f19a9efd51880ae607 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 7 Nov 2023 18:05:10 +0100 Subject: [PATCH 088/108] MAINT Update wrap-up quiz M5 (#751) --- jupyter-book/trees/trees_wrap_up_quiz.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jupyter-book/trees/trees_wrap_up_quiz.md b/jupyter-book/trees/trees_wrap_up_quiz.md index 3cdc6f4c4..71b4a6cf8 100644 --- a/jupyter-book/trees/trees_wrap_up_quiz.md +++ b/jupyter-book/trees/trees_wrap_up_quiz.md @@ -7,7 +7,10 @@ Open the dataset `ames_housing_no_missing.csv` with the following command: ```python import pandas as pd -ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv") +ames_housing = pd.read_csv( + "../datasets/ames_housing_no_missing.csv", + na_filter=False, # required for pandas>2.0 +) target_name = "SalePrice" data = ames_housing.drop(columns=target_name) target = ames_housing[target_name] From 8e8ee506bf8742a30367e065f12444dfd5541c56 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 20 Nov 2023 10:10:28 +0100 Subject: [PATCH 089/108] FIX fix description to be aligned with figure legend (#753) --- python_scripts/parameter_tuning_nested.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py index e4ccc7102..4a20f9285 100644 --- a/python_scripts/parameter_tuning_nested.py +++ b/python_scripts/parameter_tuning_nested.py @@ -259,11 +259,11 @@ # This figure illustrates the nested cross-validation strategy using # `cv_inner = KFold(n_splits=4)` and `cv_outer = KFold(n_splits=5)`. # -# For each inner cross-validation split (indexed on the left-hand side), +# For each inner cross-validation split (indexed on the right-hand side), # the procedure trains a model on all the red samples and evaluate the quality # of the hyperparameters on the green samples. 
# -# For each outer cross-validation split (indexed on the right-hand side), +# For each outer cross-validation split (indexed on the left-hand side), # the best hyper-parameters are selected based on the validation scores # (computed on the greed samples) and a model is refitted on the concatenation # of the red and green samples for that outer CV iteration. From 7b5aa266489bd6eb0e00aafc315e46452e5f62df Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Mon, 27 Nov 2023 10:49:02 +0100 Subject: [PATCH 090/108] Fix typo (#754) Co-authored-by: ArturoAmorQ --- python_scripts/metrics_sol_02.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/metrics_sol_02.py b/python_scripts/metrics_sol_02.py index bbd5118ee..f6695e74c 100644 --- a/python_scripts/metrics_sol_02.py +++ b/python_scripts/metrics_sol_02.py @@ -124,8 +124,8 @@ # %% [markdown] tags=["solution"] # Even if the score distributions overlap due to the presence of outliers in the -# dataset, it is true that the average MSE is lower when `loss="squared_error`, -# whereas the average MAE is lower when `loss="absolute_error` as expected. +# dataset, it is true that the average MSE is lower when `loss="squared_error"`, +# whereas the average MAE is lower when `loss="absolute_error"` as expected. # Indeed, the choice of a loss function is made depending on the evaluation # metric that we want to optimize for a given use case. # From 80ecf2eebe9dda66a08dee178a7d191689fa724d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 1 Dec 2023 11:12:09 +0100 Subject: [PATCH 091/108] FIX typo in description (#756) --- python_scripts/parameter_tuning_nested.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py index 4a20f9285..37fe73a9c 100644 --- a/python_scripts/parameter_tuning_nested.py +++ b/python_scripts/parameter_tuning_nested.py @@ -265,7 +265,7 @@ # # For each outer cross-validation split (indexed on the left-hand side), # the best hyper-parameters are selected based on the validation scores -# (computed on the greed samples) and a model is refitted on the concatenation +# (computed on the green samples) and a model is refitted on the concatenation # of the red and green samples for that outer CV iteration. 
# # The generalization performance of the 5 refitted models from the outer CV From fa8225c17b06624d3da18a76f35eeb545e56e23f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 1 Dec 2023 16:02:00 +0100 Subject: [PATCH 092/108] MAINT Update full index (#757) --- Makefile | 2 +- build_tools/generate-index.py | 10 ++++++++-- full-index.ipynb | 24 +++++++----------------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 4356313e2..e3fc7ef67 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ quizzes: python build_tools/generate-quizzes.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(JUPYTER_BOOK_DIR) full-index: - python build_tools/generate-md-index.py + python build_tools/generate-index.py $(JUPYTER_BOOK_DIR): jupyter-book build $(JUPYTER_BOOK_DIR) diff --git a/build_tools/generate-index.py b/build_tools/generate-index.py index 3894b245b..c41dfda28 100644 --- a/build_tools/generate-index.py +++ b/build_tools/generate-index.py @@ -9,14 +9,20 @@ from sphinx_external_toc.parsing import parse_toc_yaml -from myst_parser.main import to_tokens +from markdown_it.renderer import RendererHTML + +from myst_parser.config.main import MdParserConfig +from myst_parser.parsers.mdit import create_md_parser + # This hard-code the git repo root directory relative to this script root_dir = Path(__file__).parents[1] def get_first_title_from_md_str(md_str): - tokens = to_tokens(md_str) + parser = create_md_parser(MdParserConfig(), RendererHTML) + tokens = parser.parse(md_str) + is_title_token = False for t in tokens: if is_title_token: diff --git a/full-index.ipynb b/full-index.ipynb index d5ab13817..86945c85d 100644 --- a/full-index.ipynb +++ b/full-index.ipynb @@ -111,24 +111,22 @@ "### Intuitions on linear models\n", "\n", "* [๐ŸŽฅ Intuitions on linear models](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_slides.html)\n", - "* [โœ… Quiz M4.01](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_01.html)\n", - "\n", - "### Linear regression\n", - "\n", "* [Linear regression without scikit-learn](notebooks/linear_regression_without_sklearn.ipynb)\n", "* [๐Ÿ“ Exercise M4.01](notebooks/linear_models_ex_01.ipynb)\n", "* [๐Ÿ“ƒ Solution for Exercise M4.01](notebooks/linear_models_sol_01.ipynb)\n", "* [Linear regression using scikit-learn](notebooks/linear_regression_in_sklearn.ipynb)\n", - "* [โœ… Quiz M4.02](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_02.html)\n", + "* [Linear models for classification](notebooks/logistic_regression.ipynb)\n", + "* [โœ… Quiz M4.01](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_01.html)\n", "\n", - "### Modelling non-linear features-target relationships\n", + "### Non-linear feature engineering for linear models\n", "\n", + "* [Non-linear feature engineering for Linear Regression](notebooks/linear_regression_non_linear_link.ipynb)\n", "* [๐Ÿ“ Exercise M4.02](notebooks/linear_models_ex_02.ipynb)\n", "* [๐Ÿ“ƒ Solution for Exercise M4.02](notebooks/linear_models_sol_02.ipynb)\n", - "* [Linear regression for a non-linear features-target relationship](notebooks/linear_regression_non_linear_link.ipynb)\n", + "* [Non-linear feature engineering for Logistic Regression](notebooks/linear_models_feature_engineering_classification.ipynb)\n", "* [๐Ÿ“ Exercise M4.03](notebooks/linear_models_ex_03.ipynb)\n", "* [๐Ÿ“ƒ Solution for Exercise M4.03](notebooks/linear_models_sol_03.ipynb)\n", - "* [โœ… Quiz 
M4.03](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_03.html)\n", + "* [โœ… Quiz M4.02](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_02.html)\n", "\n", "### Regularization in linear model\n", "\n", @@ -136,15 +134,7 @@ "* [Regularization of linear regression model](notebooks/linear_models_regularization.ipynb)\n", "* [๐Ÿ“ Exercise M4.04](notebooks/linear_models_ex_04.ipynb)\n", "* [๐Ÿ“ƒ Solution for Exercise M4.04](notebooks/linear_models_sol_04.ipynb)\n", - "* [โœ… Quiz M4.04](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_04.html)\n", - "\n", - "### Linear model for classification\n", - "\n", - "* [Linear model for classification](notebooks/logistic_regression.ipynb)\n", - "* [๐Ÿ“ Exercise M4.05](notebooks/linear_models_ex_05.ipynb)\n", - "* [๐Ÿ“ƒ Solution for Exercise M4.05](notebooks/linear_models_sol_05.ipynb)\n", - "* [Beyond linear separation in classification](notebooks/logistic_regression_non_linear.ipynb)\n", - "* [โœ… Quiz M4.05](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_05.html)\n", + "* [โœ… Quiz M4.03](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_quiz_m4_03.html)\n", "\n", "[๐Ÿ Wrap-up quiz 4](https://inria.github.io/scikit-learn-mooc/linear_models/linear_models_wrap_up_quiz.html)\n", "\n", From cdc4f4133d84af15ed145d1d4ca42bddbc7b6e22 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 11 Dec 2023 18:47:03 +0100 Subject: [PATCH 093/108] SYNC all notebooks (#758) --- notebooks/metrics_sol_02.ipynb | 4 ++-- notebooks/parameter_tuning_nested.ipynb | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/metrics_sol_02.ipynb b/notebooks/metrics_sol_02.ipynb index 2efef3f68..63b3cb5ca 100644 --- a/notebooks/metrics_sol_02.ipynb +++ b/notebooks/metrics_sol_02.ipynb @@ -214,8 +214,8 @@ }, "source": [ "Even if the score distributions overlap due to the presence of outliers in the\n", - "dataset, it is true that the average MSE is lower when `loss=\"squared_error`,\n", - "whereas the average MAE is lower when `loss=\"absolute_error` as expected.\n", + "dataset, it is true that the average MSE is lower when `loss=\"squared_error\"`,\n", + "whereas the average MAE is lower when `loss=\"absolute_error\"` as expected.\n", "Indeed, the choice of a loss function is made depending on the evaluation\n", "metric that we want to optimize for a given use case.\n", "\n", diff --git a/notebooks/parameter_tuning_nested.ipynb b/notebooks/parameter_tuning_nested.ipynb index f632d16f4..6fe297cdb 100644 --- a/notebooks/parameter_tuning_nested.ipynb +++ b/notebooks/parameter_tuning_nested.ipynb @@ -354,12 +354,12 @@ "
<div class=\"admonition note alert alert-info\">\n",
"<p class=\"first admonition-title\" style=\"font-weight: bold;\">Note</p>\n",
"<p>This figure illustrates the nested cross-validation strategy using\n",
"<tt class=\"docutils literal\">cv_inner = KFold(n_splits=4)</tt> and <tt class=\"docutils literal\">cv_outer = KFold(n_splits=5)</tt>.</p>\n",
- "<p>For each inner cross-validation split (indexed on the left-hand side),\n",
+ "<p>For each inner cross-validation split (indexed on the right-hand side),\n",
"the procedure trains a model on all the red samples and evaluate the quality\n",
"of the hyperparameters on the green samples.</p>\n",
- "<p>For each outer cross-validation split (indexed on the right-hand side),\n",
+ "<p>For each outer cross-validation split (indexed on the left-hand side),\n",
"the best hyper-parameters are selected based on the validation scores\n",
- "(computed on the greed samples) and a model is refitted on the concatenation\n",
+ "(computed on the green samples) and a model is refitted on the concatenation\n",
"of the red and green samples for that outer CV iteration.</p>\n",
"<p>The generalization performance of the 5 refitted models from the outer CV\n",
"loop are then evaluated on the blue samples to get the final scores.</p>
    \n", From 946b6a3ea33f4a380498d72f9c772c1341745750 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jan 2024 16:47:21 +0100 Subject: [PATCH 094/108] FIX fix some typos (#759) --- .../linear_models_feature_engineering_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/linear_models_feature_engineering_classification.py b/python_scripts/linear_models_feature_engineering_classification.py index 12a2997da..153afea21 100644 --- a/python_scripts/linear_models_feature_engineering_classification.py +++ b/python_scripts/linear_models_feature_engineering_classification.py @@ -350,10 +350,10 @@ def plot_decision_boundary(model, title=None): # %% [markdown] # -# The polynomial kernel approach would be interesting in cases were the +# The polynomial kernel approach would be interesting in cases where the # original feature space is already of high dimension: in these cases, # **computing the complete polynomial expansion** with `PolynomialFeatures` -# could be **intractable**, while Nystrรถm method can control the output +# could be **intractable**, while the Nystrรถm method can control the output # dimensionality with the `n_components` parameter. # # Let's now explore the use of a radial basis function (RBF) kernel: From 8124c5bf26e4ff8ef067f3f8da603126c2d9232b Mon Sep 17 00:00:00 2001 From: Daniel Campos <12107659+daniel-m-campos@users.noreply.github.com> Date: Mon, 19 Feb 2024 07:13:01 -0600 Subject: [PATCH 095/108] Fix formatting (#762) --- notebooks/metrics_classification.ipynb | 14 +++++++------- python_scripts/metrics_classification.py | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/notebooks/metrics_classification.ipynb b/notebooks/metrics_classification.ipynb index 9191bae79..5ecbabc0f 100644 --- a/notebooks/metrics_classification.ipynb +++ b/notebooks/metrics_classification.ipynb @@ -311,13 +311,13 @@ "blood when the classifier predicted so or the fraction of people predicted to\n", "have given blood out of the total population that actually did so.\n", "\n", - "The former metric, known as the precision, is defined as TP / (TP + FP) and\n", + "The former metric, known as the precision, is defined as `TP / (TP + FP)` and\n", "represents how likely the person actually gave blood when the classifier\n", - "predicted that they did. The latter, known as the recall, defined as TP / (TP\n", - "+ FN) and assesses how well the classifier is able to correctly identify\n", - "people who did give blood. We could, similarly to accuracy, manually compute\n", - "these values, however scikit-learn provides functions to compute these\n", - "statistics." + "predicted that they did. The latter, known as the recall, defined as\n", + "`TP / (TP + FN)` and assesses how well the classifier is able to correctly\n", + "identify people who did give blood. We could, similarly to accuracy,\n", + "manually compute these values, however scikit-learn provides functions to\n", + "compute these statistics." ] }, { @@ -664,4 +664,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/python_scripts/metrics_classification.py b/python_scripts/metrics_classification.py index a620c3612..e6304c851 100644 --- a/python_scripts/metrics_classification.py +++ b/python_scripts/metrics_classification.py @@ -188,13 +188,13 @@ # blood when the classifier predicted so or the fraction of people predicted to # have given blood out of the total population that actually did so. 
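#
# As a purely illustrative aside, both fractions can be computed on a handful
# of made-up predictions with standard scikit-learn helpers (they are named
# and defined more carefully just below):

# %%
from sklearn.metrics import precision_score, recall_score

y_true = ["donated", "donated", "not donated", "donated", "not donated"]
y_pred = ["donated", "not donated", "donated", "donated", "not donated"]

# precision = TP / (TP + FP): here TP = 2 and FP = 1, hence 2 / 3
print(precision_score(y_true, y_pred, pos_label="donated"))
# recall = TP / (TP + FN): here TP = 2 and FN = 1, hence 2 / 3
print(recall_score(y_true, y_pred, pos_label="donated"))

# %% [markdown]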
# -# The former metric, known as the precision, is defined as TP / (TP + FP) and +# The former metric, known as the precision, is defined as `TP / (TP + FP)` and # represents how likely the person actually gave blood when the classifier -# predicted that they did. The latter, known as the recall, defined as TP / (TP -# + FN) and assesses how well the classifier is able to correctly identify -# people who did give blood. We could, similarly to accuracy, manually compute -# these values, however scikit-learn provides functions to compute these -# statistics. +# predicted that they did. The latter, known as the recall, defined as +# `TP / (TP + FN)` and assesses how well the classifier is able to correctly +# identify people who did give blood. We could, similarly to accuracy, +# manually compute these values, however scikit-learn provides functions to +# compute these statistics. # %% from sklearn.metrics import precision_score, recall_score From e897c9e34601f619242d78afafea6c77ab821013 Mon Sep 17 00:00:00 2001 From: ACCakut <7684542+ACCakut@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:23:20 +0100 Subject: [PATCH 096/108] Fix typo in cross_validation_stratification.py (#770) --- python_scripts/cross_validation_stratification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_scripts/cross_validation_stratification.py b/python_scripts/cross_validation_stratification.py index 4acad6a36..ad39bdb3f 100644 --- a/python_scripts/cross_validation_stratification.py +++ b/python_scripts/cross_validation_stratification.py @@ -54,7 +54,7 @@ # By defining three splits, we will use three samples for testing and six for # training each time. `KFold` does not shuffle by default. It means that it will # select the three first samples for the testing set at the first split, then -# the three next three samples for the second split, and the three next for the +# the next three samples for the second split, and the three next for the # last split. In the end, all samples have been used in testing at least once # among the different splits. # From 6205ca532daa21c251220015f78ddd05b0301f33 Mon Sep 17 00:00:00 2001 From: ACCakut <7684542+ACCakut@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:25:54 +0100 Subject: [PATCH 097/108] =?UTF-8?q?Change=20c.c.=20to=20cm=C2=B3=20(#771)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python_scripts/datasets_blood_transfusion.py | 2 +- python_scripts/metrics_classification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python_scripts/datasets_blood_transfusion.py b/python_scripts/datasets_blood_transfusion.py index cca0c3439..1042f16f1 100644 --- a/python_scripts/datasets_blood_transfusion.py +++ b/python_scripts/datasets_blood_transfusion.py @@ -46,7 +46,7 @@ # * `Recency`: the time in months since the last time a person intended to give # blood; # * `Frequency`: the number of time a person intended to give blood in the past; -# * `Monetary`: the amount of blood given in the past (in c.c.); +# * `Monetary`: the amount of blood given in the past (in cmยณ); # * `Time`: the time in months since the first time a person intended to give # blood. # diff --git a/python_scripts/metrics_classification.py b/python_scripts/metrics_classification.py index e6304c851..9af7d5cfd 100644 --- a/python_scripts/metrics_classification.py +++ b/python_scripts/metrics_classification.py @@ -78,7 +78,7 @@ # predictions a classifier can provide. 
# # For this reason, we will create a synthetic sample for a new potential donor: -# they donated blood twice in the past (1000 c.c. each time). The last time was +# they donated blood twice in the past (1000 cmยณ each time). The last time was # 6 months ago, and the first time goes back to 20 months ago. # %% From cffc4b70916936ed4df91fc658ef1293be2692ff Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 15 Mar 2024 10:02:54 +0100 Subject: [PATCH 098/108] Update parameter_tuning_nested.py (#768) --- python_scripts/parameter_tuning_nested.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_scripts/parameter_tuning_nested.py b/python_scripts/parameter_tuning_nested.py index 37fe73a9c..5c13cd28d 100644 --- a/python_scripts/parameter_tuning_nested.py +++ b/python_scripts/parameter_tuning_nested.py @@ -215,7 +215,7 @@ # ``` # # However, this evaluation only provides us a single point estimate of the -# generalization performance. As recall at the beginning of this notebook, it is +# generalization performance. As you recall from the beginning of this notebook, it is # beneficial to have a rough idea of the uncertainty of our estimated # generalization performance. Therefore, we should instead use an additional # cross-validation for this evaluation. From 4c23380f405ba6ce49e39f9b149a6c6237a3875c Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 29 Mar 2024 17:52:32 +0100 Subject: [PATCH 099/108] Add copyright key to indicate correct year (#774) --- jupyter-book/_config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/jupyter-book/_config.yml b/jupyter-book/_config.yml index 42816945e..e4e73c41d 100644 --- a/jupyter-book/_config.yml +++ b/jupyter-book/_config.yml @@ -3,6 +3,7 @@ title : Scikit-learn course author: scikit-learn developers logo: 'scikit-learn-logo.png' +copyright: "2022-2024" # Information about where the book exists on the web description: >- From 4ea9df962eac07d35b07beeaff8f7c6c7a59efb2 Mon Sep 17 00:00:00 2001 From: Till Korten Date: Fri, 26 Apr 2024 15:17:53 +0200 Subject: [PATCH 100/108] FIX fix several small typos (#767) --- jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md | 2 +- notebooks/cross_validation_ex_01.ipynb | 4 ++-- python_scripts/cross_validation_learning_curve.py | 2 +- python_scripts/cross_validation_train_test.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md b/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md index eb1380853..4011cb852 100644 --- a/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md +++ b/jupyter-book/tuning/parameter_tuning_manual_quiz_m3_01.md @@ -1,7 +1,7 @@ # โœ… Quiz M3.01 ```{admonition} Question -Which parameters below are hyperparameters of `HistGradientBosstingClassifier`? +Which parameters below are hyperparameters of `HistGradientBoostingClassifier`? Remember we only consider hyperparameters to be those that potentially impact the result of the learning procedure and subsequent predictions. diff --git a/notebooks/cross_validation_ex_01.ipynb b/notebooks/cross_validation_ex_01.ipynb index 381202093..b55e6efff 100644 --- a/notebooks/cross_validation_ex_01.ipynb +++ b/notebooks/cross_validation_ex_01.ipynb @@ -52,7 +52,7 @@ "exercise.\n", "\n", "Also, this classifier can become more flexible/expressive by using a so-called\n", - "kernel that makes the model become non-linear. Again, no requirement regarding\n", + "kernel that makes the model become non-linear. 
Again, no understanding regarding\n", "the mathematics is required to accomplish this exercise.\n", "\n", "We will use an RBF kernel where a parameter `gamma` allows to tune the\n", @@ -160,4 +160,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/python_scripts/cross_validation_learning_curve.py b/python_scripts/cross_validation_learning_curve.py index b2bb23335..a337a8ed9 100644 --- a/python_scripts/cross_validation_learning_curve.py +++ b/python_scripts/cross_validation_learning_curve.py @@ -13,7 +13,7 @@ # generalizing. Besides these aspects, it is also important to understand how # the different errors are influenced by the number of samples available. # -# In this notebook, we will show this aspect by looking a the variability of +# In this notebook, we will show this aspect by looking at the variability of # the different errors. # # Let's first load the data and create the same model as in the previous diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py index f249a91fb..a11cbd587 100644 --- a/python_scripts/cross_validation_train_test.py +++ b/python_scripts/cross_validation_train_test.py @@ -34,7 +34,7 @@ # notebook. The target to be predicted is a continuous variable and not anymore # discrete. This task is called regression. # -# This, we will use a predictive model specific to regression and not to +# Thus, we will use a predictive model specific to regression and not to # classification. # %% From 913cc1c30e04db5ccf21ccc15f293d1bda42cd5d Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:47:27 +0200 Subject: [PATCH 101/108] FIX Several typos accross landing pages (#773) Co-authored-by: ArturoAmorQ --- README.md | 2 +- jupyter-book/index.md | 6 +++--- workflow-notes.md | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7e8318e17..fd3d7bab2 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ๐Ÿ“ข ๐Ÿ“ข ๐Ÿ“ข A new session of the [Machine learning in Python with scikit-learn MOOC](https://www.fun-mooc.fr/en/courses/machine-learning-python-scikit-learn), is available starting on November 8th, 2023 and will remain open on self-paced -mode. Enroll for the full MOOC experience (quizz solutions, executable +mode. Enroll for the full MOOC experience (quiz solutions, executable notebooks, discussion forum, etc ...) ! The MOOC is free and hosted on the [FUN-MOOC](https://fun-mooc.fr/) platform diff --git a/jupyter-book/index.md b/jupyter-book/index.md index cab66acbc..aeb3cbea9 100644 --- a/jupyter-book/index.md +++ b/jupyter-book/index.md @@ -36,8 +36,8 @@ interpreting their predictions. "Machine learning in Python with scikit-learn MOOC" , - is available starting on October 18, 2022 and will last for 3 months. Enroll for - the full MOOC experience (quizz solutions, executable notebooks, discussion + is available starting on November 8th, 2023 and will remain open in self-paced mode. + Enroll for the full MOOC experience (quiz solutions, executable notebooks, discussion forum, etc ...) !
    The MOOC is free and the platform does not use the student data for any other purpose @@ -79,7 +79,7 @@ You can cite us through the project's Zenodo archive using the following DOI: [10.5281/zenodo.7220306](https://doi.org/10.5281/zenodo.7220306). The following repository includes the notebooks, exercises and solutions to the -exercises (but not the quizz solutions ;): +exercises (but not the quizzes' solutions ;): https://github.com/INRIA/scikit-learn-mooc/ diff --git a/workflow-notes.md b/workflow-notes.md index af57afbc9..e79247110 100644 --- a/workflow-notes.md +++ b/workflow-notes.md @@ -45,7 +45,7 @@ https://gitlab.inria.fr/learninglab/mooc-scikit-learn/mooc-scikit-learn-coordina The `jupyter-book` folder has the same structure as the `jupyter-book` folder on the github repo, but the gitlab one contains only quizzes `.md` files. If you work on quizzes, you need to to do it in the gitlab repo, the github repo -quizz files are generated from the gitlab repo (by stripping solution) with +quiz files are generated from the gitlab repo (by stripping solution) with `make exercises`. Useful: to get the `.py` code from a quiz `.md`, look at @@ -84,7 +84,7 @@ There are two ways this can be applied on the FUN side: duplication of logic between data-hide and sklearn_mooc.js (e.g. to remove navigation items from JupyterBook) but oh well ๐Ÿคทโ€โ™‚๏ธ ... the best we can do is to use the `remove-from-content-only` class in JupyterBook. - + ### Notebooks Note: FUN use notebooks so if you only update the `.py` files, FUN participants @@ -107,9 +107,9 @@ Note that some notebook changes need manual action in FUN: decide to use some FUN specific thing like using FUN hint when we put "Hint:" in the markdown. Not sure that is a great idea in itself but oh well ... -### Empty wrap-up quizz notebooks and sandbox notebook +### Empty wrap-up quiz notebooks and sandbox notebook -Wrap-up quizz notebooks or sandbox notebooks are created on the FUN side. In other words, we do not have +Wrap-up quiz notebooks or sandbox notebooks are created on the FUN side. In other words, we do not have an empty notebook for each wrap-up quiz in our github repo. 
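Relatedly, when a `.py` script changes, its notebook needs to be regenerated.
A minimal sketch of one way to do that conversion, assuming the jupytext
Python API (the repo's actual tooling goes through the Makefile and may
differ):

```python
# Purely illustrative: regenerate one notebook from its python_scripts source.
import jupytext

notebook = jupytext.read("python_scripts/metrics_classification.py")
jupytext.write(notebook, "notebooks/metrics_classification.ipynb")
```
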
## Discourse forum From 09ad771674fc8c75033a2a86ea69d166f38443b1 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:48:28 +0200 Subject: [PATCH 102/108] MAINT Remove head method (#766) Co-authored-by: ArturoAmorQ --- python_scripts/01_tabular_data_exploration.py | 9 +++++++++ python_scripts/02_numerical_pipeline_hands_on.py | 8 ++++---- python_scripts/02_numerical_pipeline_introduction.py | 4 ++-- python_scripts/03_categorical_pipeline.py | 4 ++-- .../03_categorical_pipeline_column_transformer.py | 2 +- python_scripts/cross_validation_train_test.py | 6 +++--- python_scripts/linear_models_ex_02.py | 2 +- python_scripts/linear_models_sol_02.py | 2 +- python_scripts/linear_regression_without_sklearn.py | 2 +- python_scripts/parameter_tuning_grid_search.py | 4 ++-- python_scripts/parameter_tuning_manual.py | 2 +- python_scripts/parameter_tuning_randomized_search.py | 2 +- python_scripts/trees_dataset.py | 2 +- 13 files changed, 29 insertions(+), 20 deletions(-) diff --git a/python_scripts/01_tabular_data_exploration.py b/python_scripts/01_tabular_data_exploration.py index 4b07c4add..3441f490f 100644 --- a/python_scripts/01_tabular_data_exploration.py +++ b/python_scripts/01_tabular_data_exploration.py @@ -70,6 +70,15 @@ # %% adult_census.head() +# %% [markdown] +# An alternative is to omit the `head` method. This would output the intial and +# final rows and columns, but everything in between is not shown by default. It +# also provides the dataframe's dimensions at the bottom in the format `n_rows` +# x `n_columns`. + +# %% +adult_census + # %% [markdown] # The column named **class** is our target variable (i.e., the variable which we # want to predict). The two possible classes are `<=50K` (low-revenue) and diff --git a/python_scripts/02_numerical_pipeline_hands_on.py b/python_scripts/02_numerical_pipeline_hands_on.py index 913b78105..ba4212017 100644 --- a/python_scripts/02_numerical_pipeline_hands_on.py +++ b/python_scripts/02_numerical_pipeline_hands_on.py @@ -34,7 +34,7 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # drop the duplicated column `"education-num"` as stated in the first notebook adult_census = adult_census.drop(columns="education-num") -adult_census.head() +adult_census # %% [markdown] # The next step separates the target from the data. We performed the same @@ -44,7 +44,7 @@ data, target = adult_census.drop(columns="class"), adult_census["class"] # %% -data.head() +data # %% target @@ -95,7 +95,7 @@ # the `object` data type. # %% -data.head() +data # %% [markdown] # We see that the `object` data type corresponds to columns containing strings. 
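#
# As a purely illustrative aside, the numerical columns do not have to be
# hard-coded by name: they can also be selected from the dtypes. A minimal
# sketch, assuming the `data` dataframe used in this script:

# %%
numerical_columns_from_dtypes = data.select_dtypes(include="number").columns.tolist()
numerical_columns_from_dtypes
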
@@ -105,7 +105,7 @@ # %% numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"] -data[numerical_columns].head() +data[numerical_columns] # %% [markdown] # Now that we limited the dataset to numerical columns only, we can analyse diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py index ca56a13fb..8a245611a 100644 --- a/python_scripts/02_numerical_pipeline_introduction.py +++ b/python_scripts/02_numerical_pipeline_introduction.py @@ -39,7 +39,7 @@ # Let's have a look at the first records of this dataframe: # %% -adult_census.head() +adult_census # %% [markdown] # We see that this CSV file contains all information: the target that we would @@ -56,7 +56,7 @@ # %% data = adult_census.drop(columns=[target_name]) -data.head() +data # %% [markdown] # We can now linger on the variables, also denominated features, that we later diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index 62cd9be98..64b516070 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -81,7 +81,7 @@ # %% data_categorical = data[categorical_columns] -data_categorical.head() +data_categorical # %% print(f"The dataset is composed of {data_categorical.shape[1]} features") @@ -194,7 +194,7 @@ # %% print(f"The dataset is composed of {data_categorical.shape[1]} features") -data_categorical.head() +data_categorical # %% data_encoded = encoder.fit_transform(data_categorical) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index fd429749e..3ee06fad7 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -165,7 +165,7 @@ # method. As an example, we predict on the five first samples from the test set. # %% -data_test.head() +data_test # %% model.predict(data_test)[:5] diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py index a11cbd587..f5bd73b01 100644 --- a/python_scripts/cross_validation_train_test.py +++ b/python_scripts/cross_validation_train_test.py @@ -41,7 +41,7 @@ print(housing.DESCR) # %% -data.head() +data # %% [markdown] # To simplify future visualization, let's transform the prices from the 100 @@ -49,7 +49,7 @@ # %% target *= 100 -target.head() +target # %% [markdown] # ```{note} @@ -218,7 +218,7 @@ import pandas as pd cv_results = pd.DataFrame(cv_results) -cv_results.head() +cv_results # %% [markdown] # ```{tip} diff --git a/python_scripts/linear_models_ex_02.py b/python_scripts/linear_models_ex_02.py index fdfdaf610..12a78bc5a 100644 --- a/python_scripts/linear_models_ex_02.py +++ b/python_scripts/linear_models_ex_02.py @@ -52,7 +52,7 @@ data = penguins_non_missing[columns] target = penguins_non_missing[target_name] -data.head() +data # %% [markdown] # Now it is your turn to train a linear regression model on this dataset. First, diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py index 03aa72005..0b0717f10 100644 --- a/python_scripts/linear_models_sol_02.py +++ b/python_scripts/linear_models_sol_02.py @@ -46,7 +46,7 @@ data = penguins_non_missing[columns] target = penguins_non_missing[target_name] -data.head() +data # %% [markdown] # Now it is your turn to train a linear regression model on this dataset. 
First, diff --git a/python_scripts/linear_regression_without_sklearn.py b/python_scripts/linear_regression_without_sklearn.py index acc06a0ec..a83c0cf4b 100644 --- a/python_scripts/linear_regression_without_sklearn.py +++ b/python_scripts/linear_regression_without_sklearn.py @@ -22,7 +22,7 @@ import pandas as pd penguins = pd.read_csv("../datasets/penguins_regression.csv") -penguins.head() +penguins # %% [markdown] # We aim to solve the following problem: using the flipper length of a penguin, diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py index 12bbffb57..39de4251d 100644 --- a/python_scripts/parameter_tuning_grid_search.py +++ b/python_scripts/parameter_tuning_grid_search.py @@ -36,7 +36,7 @@ # %% data = adult_census.drop(columns=[target_name, "education-num"]) -data.head() +data # %% [markdown] # Once the dataset is loaded, we split it into a training and testing sets. @@ -193,7 +193,7 @@ cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values( "mean_test_score", ascending=False ) -cv_results.head() +cv_results # %% [markdown] # Let us focus on the most interesting columns and shorten the parameter names diff --git a/python_scripts/parameter_tuning_manual.py b/python_scripts/parameter_tuning_manual.py index 15d047a80..d010a52dc 100644 --- a/python_scripts/parameter_tuning_manual.py +++ b/python_scripts/parameter_tuning_manual.py @@ -38,7 +38,7 @@ # Our data is only numerical. # %% -data.head() +data # %% [markdown] # Let's create a simple predictive model made of a scaler followed by a logistic diff --git a/python_scripts/parameter_tuning_randomized_search.py b/python_scripts/parameter_tuning_randomized_search.py index b146b832d..0bcd4761d 100644 --- a/python_scripts/parameter_tuning_randomized_search.py +++ b/python_scripts/parameter_tuning_randomized_search.py @@ -44,7 +44,7 @@ # %% data = adult_census.drop(columns=[target_name, "education-num"]) -data.head() +data # %% [markdown] # Once the dataset is loaded, we split it into a training and testing sets. diff --git a/python_scripts/trees_dataset.py b/python_scripts/trees_dataset.py index 888eee5a7..457e85c3c 100644 --- a/python_scripts/trees_dataset.py +++ b/python_scripts/trees_dataset.py @@ -48,7 +48,7 @@ # Let's check the dataset more into details. 
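#
# As a purely illustrative complement, per-column summary statistics give a
# quick overview of the same dataframe:

# %%
penguins.describe()
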
# %% -penguins.head() +penguins # %% [markdown] # Since that we have few samples, we can check a scatter plot to observe the From 12372259c6a189613cfbd646c60f9862ed7cb4a8 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:49:47 +0200 Subject: [PATCH 103/108] MAINT Fix typos and wording across the mooc (#764) Co-authored-by: ArturoAmorQ --- python_scripts/02_numerical_pipeline_introduction.py | 2 +- python_scripts/03_categorical_pipeline.py | 9 +++++---- python_scripts/cross_validation_train_test.py | 2 +- python_scripts/ensemble_sol_02.py | 7 +++++++ python_scripts/linear_models_ex_04.py | 2 +- python_scripts/linear_models_sol_04.py | 2 +- python_scripts/metrics_regression.py | 5 +++-- 7 files changed, 19 insertions(+), 10 deletions(-) diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py index 8a245611a..940065dc3 100644 --- a/python_scripts/02_numerical_pipeline_introduction.py +++ b/python_scripts/02_numerical_pipeline_introduction.py @@ -59,7 +59,7 @@ data # %% [markdown] -# We can now linger on the variables, also denominated features, that we later +# We can now focus on the variables, also denominated features, that we later # use to build our predictive model. In addition, we can also check how many # samples are available in our dataset. diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index 64b516070..844a072ca 100644 --- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -253,7 +253,7 @@ # and check the generalization performance of this machine learning pipeline using # cross-validation. # -# Before we create the pipeline, we have to linger on the `native-country`. +# Before we create the pipeline, we have to focus on the `native-country`. # Let's recall some statistics regarding this column. # %% @@ -329,9 +329,10 @@ print(f"The accuracy is: {scores.mean():.3f} ยฑ {scores.std():.3f}") # %% [markdown] -# As you can see, this representation of the categorical variables is -# slightly more predictive of the revenue than the numerical variables -# that we used previously. +# As you can see, this representation of the categorical variables is slightly +# more predictive of the revenue than the numerical variables that we used +# previously. The reason being that we have more (predictive) categorical +# features than numerical ones. # %% [markdown] # diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py index f5bd73b01..68c640da4 100644 --- a/python_scripts/cross_validation_train_test.py +++ b/python_scripts/cross_validation_train_test.py @@ -12,7 +12,7 @@ # of predictive models. While this section could be slightly redundant, we # intend to go into details into the cross-validation framework. # -# Before we dive in, let's linger on the reasons for always having training and +# Before we dive in, let's focus on the reasons for always having training and # testing sets. Let's first look at the limitation of using a dataset without # keeping any samples out. 
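#
# As a minimal sketch of what keeping samples out means in practice (with a
# tiny made-up dataset):

# %%
from sklearn.model_selection import train_test_split

X = [[0], [1], [2], [3], [4], [5], [6], [7]]
y = [0, 1, 2, 3, 4, 5, 6, 7]

# hold out 25% of the samples for testing and train on the remaining 75%
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
print(len(X_train), len(X_test))

# %% [markdown]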
# diff --git a/python_scripts/ensemble_sol_02.py b/python_scripts/ensemble_sol_02.py index 232ec2c04..061be3e52 100644 --- a/python_scripts/ensemble_sol_02.py +++ b/python_scripts/ensemble_sol_02.py @@ -103,3 +103,10 @@ plt.plot(data_range[feature_name], forest_predictions, label="Random forest") _ = plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") + +# %% [markdown] tags=["solution"] +# The random forest reduces the overfitting of the individual trees but still +# overfits itself. In the section on "hyperparameter tuning with ensemble +# methods" we will see how to further mitigate this effect. Still, interested +# users may increase the number of estimators in the forest and try different +# values of, e.g., `min_samples_split`. diff --git a/python_scripts/linear_models_ex_04.py b/python_scripts/linear_models_ex_04.py index dd9ae6bb1..473013074 100644 --- a/python_scripts/linear_models_ex_04.py +++ b/python_scripts/linear_models_ex_04.py @@ -17,7 +17,7 @@ # In the previous Module we tuned the hyperparameter `C` of the logistic # regression without mentioning that it controls the regularization strength. # Later, on the slides on ๐ŸŽฅ **Intuitions on regularized linear models** we -# metioned that a small `C` provides a more regularized model, whereas a +# mentioned that a small `C` provides a more regularized model, whereas a # non-regularized model is obtained with an infinitely large value of `C`. # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge` # model. diff --git a/python_scripts/linear_models_sol_04.py b/python_scripts/linear_models_sol_04.py index 942aed56d..623afad8e 100644 --- a/python_scripts/linear_models_sol_04.py +++ b/python_scripts/linear_models_sol_04.py @@ -11,7 +11,7 @@ # In the previous Module we tuned the hyperparameter `C` of the logistic # regression without mentioning that it controls the regularization strength. # Later, on the slides on ๐ŸŽฅ **Intuitions on regularized linear models** we -# metioned that a small `C` provides a more regularized model, whereas a +# mentioned that a small `C` provides a more regularized model, whereas a # non-regularized model is obtained with an infinitely large value of `C`. # Indeed, `C` behaves as the inverse of the `alpha` coefficient in the `Ridge` # model. diff --git a/python_scripts/metrics_regression.py b/python_scripts/metrics_regression.py index 494447732..4c6d91db7 100644 --- a/python_scripts/metrics_regression.py +++ b/python_scripts/metrics_regression.py @@ -97,8 +97,9 @@ # %% [markdown] # The $R^2$ score represents the proportion of variance of the target that is # explained by the independent variables in the model. The best score possible -# is 1 but there is no lower bound. However, a model that predicts the expected -# value of the target would get a score of 0. +# is 1 but there is no lower bound. However, a model that predicts the [expected +# value](https://en.wikipedia.org/wiki/Expected_value) of the target would get a +# score of 0. 
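#
# To make the "score of 0" statement concrete, recall the usual definition
#
# $$
# R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},
# $$
#
# so a model that always predicts the mean of the target, $\hat{y}_i = \bar{y}$,
# makes the numerator equal to the denominator and therefore scores exactly
# $R^2 = 1 - 1 = 0$.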
# %%
from sklearn.dummy import DummyRegressor

From 9a9e74ff0c80c53ae174c67f45f79ac59182c04d Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 17 May 2024 10:41:03 +0200
Subject: [PATCH 104/108] FIX Make cross-validation figures less inexact (#765)

Co-authored-by: ArturoAmorQ
---
 .../cross_validation_train_test_diagram.png   | Bin 24147 -> 60248 bytes
 figures/nested_cross_validation_diagram.png   | Bin 34676 -> 114661 bytes
 figures/plot_parameter_tuning_cv.py           | 162 ++++++++++++------
 3 files changed, 110 insertions(+), 52 deletions(-)

diff --git a/figures/cross_validation_train_test_diagram.png b/figures/cross_validation_train_test_diagram.png
index 1e485c737b21f4f02b264da127331c0aabd373a3..64bc3622cfb6aead818d203e9febbea278bba180 100644
GIT binary patch
literal 60248
[base85-encoded binary data for the regenerated PNG figures elided]
zam%=?_%T_-IOi{A3N^9=x{W6hN>?L15*tJ)Y`N9j=!O+g9 zPoG}Ex=0)6>)@X`yVI)juF;33OP7XqZaVT%`1tYTt2DTklTD0h!_uxj_nDL75HkvX z`gHp;uC3I8TRQ`RvIbgS1hH>A!nboL$I5lGMSe824zrVvvCpdu`R37j2m+ykU8mn4 zoUW5hM^i1rJ2Yx40z`b}F{Tv0X2;zJ55jKT;5yUrcrn@9z+IL1^)?y3bkpwHp1XAA zy1M`#q>FobdspELH*Vc(M`d+VU5rY;1Z_{*%}uKDsWmMxFRy{2VLOnbL7~r{43k%( zsB%fJgWrOKgZ=XJ-6M9&c}-G^%s=}3F9DN~lAMK`zkZz~PTn^Ho7QY%pauA#v@&6@ zHcIK_f>* zl~p}CrRAs`7ZbxNbFQD>#>S?*w>Q*v^KPG6DZI}Zf!e;hOw0N#^LOvw`Q5uG)RYqG$5>N?u#C%M`Fp}wJeJWj*JY-%oIPC zYq#9l*;&?Wk{Q?X7F(Bvmp9Vnl}9TUN@Z>BXMK6q2yPbNK_KI(gS7f%XVuxmIJR7I z|JB=^zl}UXptTD@L6*3HPESR;{`)Sh!$vGWnnn!C^PHomxRDpluZt z6by}yUPwt11j2J{D`6nW3_W6Kc=&Y)+s2N9b%Gl=f^*#kX2d10zOVN{$mgQlNCyL; z!o|HoD_XJohQ`KJupXM5o3Fhc@BO;TWuSR=B(MBF6o9I9osHKv>8}_b9+niCX8b*u z#uvYmrorYZI(FKjgn(*G-LFOdv=TP8n~$G7X#rG9fK2%{k)$lI@hW|k%?lToY+T3U z%~EqiRv=O7;u&9_s~cxb>nZxq4>6o}-wW;}8F_iVoYeQFioZV4@63s*tJ;e<^~}Hr zx1x0V^yvsml-1&r5Vle$l-u{(LNlhX{F_xZ83OwAtE#J6b8MTgCuw?kJpb6870AR{ zZo5fHXkSk^2MQB_B?DgSBI@<~_v)?mR}$Cw?*PPzZEwkQwDfQM@Br145v+~H?;(qX z@BGxM2+7r}R|9mg9nUDkrr#8<@{Rssa`J`*6cvlu#Y`L>!oP>xRTHqtG!{JWaN@fI zUc7iweZm%)QYm^eW#2HypD3eLRs3>-|}LYYAn9S=si8k(&TdX z>?g-xTMV)knhRZe4yTm4_f)5F3&>tv$tza7fET^0_S46Yt3dD`J@q~@-rpcFzU}xs z=9CjpOb2ahAHL5nl!||nA6`V7hLMp`iDc5g@Nx-BJ^_KfT-#1w`Z$(gMR#4sG}38P zp>x+k$3j4UaI_Te6W?CC3v9RlX0*93Wq*v(%pyK)_k9Dwp{71lDx2He+mostg|Ec0 z7*KRrhxv80GZTY>C~qO1*bz~KHayp;-y#`Drn0~@+vb6F zL@o))W6y2L;aRmRIjb3$TUxfCKZ+ieDsatGmh_cu2Fj+FUkYVK@XBXoJ&N9>|8tsV zgShEAyXL(0*HPhYGCKF{*|V4mAbY0u&C*!!cpqznr&ih-zfOqDpY7ER?0me^TYtRu zFuU)%$SC`#?np$3wYFbkkzINjP6FvQ*MtyAQlE=QY z!$H*s9fDat3v;f~8IQ1@K4`uXR$F9)U1G8OY;Vnb$ME}YU%J|X+#fvk8W+(izo+Uw zJE@&^3^5_`DRiOi{ z;{z=Zs%`vl-MU2`7vy!m)thtrt7x{kySkL#fjcsKXH>Q08zTE=1`G8YO|)O~vUmJ8 zAGf|#9yc+SeYWpNY#oitW38?;K-39u6_k5Rv#%+q=t2mpOjKbCoI%eNnHRfr_%JhS z7MpLpl%qvP=ge5&Qa+p3YLHWr&8eB0@u@FFtgWnmkl&G4c>3%8G=pq@zX0CcprD}1 znbI&$rkW!rYN;1#6hMWlP*yDZ3{Qg-Qja+jKm9sa*TA{qdhG|Mo@?tVu3dJwi=Chf zaLIYO#Wls8nwuVL8zgu=Bg4+W@yU}Xd+`t12eg0!3cP113e=HH+9|pCF<_!(SL+w& zo8ILrYHDh6meIAoW4-Lqu||wc^?*9NQ7Zf%*7-yBDBt4x^W$pLOsGl-Cq{!M_cR@4 zPdzcs1g+7QdZfZ1^&^PaNBF5v*vwdWRg!9+$CxNpR8_?SM0D7owzzoFcdy^!lMDS_ zYH^^f!bVaid-v|W9XgAZuDoyG&mE&)*;!c%kYd;e>}wy!y0u>lc$xdsV@zYV1#Hf` zfz(suOQv6ME$bM6o-Eqao}Q54pKVzty7`o)<$2uM&&F?{P($B+7hX3s96jl&q_l`M z79i->2bl#*`A(fb1_v{T&i10~(=T7Xm;(3dl==DyXsXT9ZE~p9s9kqMW<@FZsz{4z z{h;dNIjh%>^}_6EiWQYqne69sQgEegjZj^1G{oX^W8Fz`Ox-{wuL`emOZ4&8?suUC+-Q zfq;_@8f(-vHax6|2IzprIty+jZ|jo)Z}C{0Mz0+fS zYide>51oJ2RaP=XUTixSH$p%k_VH`r0-(`A=#!#GAH;IIETD>MrymwfIl(wHm$o(D zG|)S|x3{;rqC(5d)ZoQA+lUvR_o=9aL_Ug*UH$yruVZNv%uyat5_L6>9y=!55oXnuN*go#lz2;PoK+gaeBUW zhg~yQ;r#R_@P(+GH$(S$juQq^U*NqPg78Rl94InYX`<6q|NHmARX>lJ?BCA>7F>L1 zx9iYA%Wgs|-oAY+IyQ!$_|kV_9d#)EaEcM9^``>L!RM9t3`eGEAxG1P3h6mryE;dc z3TvnVhae}A?-O!{E$>)CF_d?yMn2GBFCCAOp*9+}&+P@AILa@x{8}>bsi_>cKO}?9U zzMnUck(TZ}CMTX*wPwv4Xiyh(b9YgETed8B9cX?H%7x;xf=ECLwKnIlq>uAEH!|3w{DYu z2HoWypVEzAoCAxcIb&-}G#l;Fz#U z`=IM)ecCDiioiG@C&~>i#ucTphOgJ6(#;sb};23Df~7S4?uMp(iKkZm7*b!U6#VbL~HXG$J@7%f5qGfW@EyZ_vZf$?>J^?xE6PzLPS!pNe6CeCpvHxw}}BB z!lLkk!z$SC4#~%i_I_nrx`La!e3l#3`vNd>d5ri5Vzu*c%c+fanlB0q4h?jv zf0dqYoqeE3m=UP^ZM^S117F|XMvyo5dA8=mAjv@SXrCmRB(pFY43{O1QgDdjsa3%ntyqmK%KUb&}WVcxU(l_zIcMZ7AQ z*TLIT{-C|dsi~G=_fltjS7jLG2aI%n==%Nc39Ald_@v3w4W5hkNB5i;{3UH;WfcsC zA1zU(+eGd8&(!vDNvfren(^ zEGE%Qwz%uX#|Hamr+!8HlE?1~^7hweIdkfqQ z>?v>!IC|ZlP>6iA6v*PCT)P(jp7`LJ)Q&lZu;2o+hJZ#x14@KUIvWc91p z4u)LLpiMc5ruQapk0%JjQmVetS01(czJtec^LKJ>Kf&|!uUEW#2gQ4p-$U_L_ah1x zQ4lVM#mBX^wc9L{)XZmh)I;Nkr(@w+zK@>c`pdX4u!V*{XxhCOKt}?wFmHH#g2+Uu zXtaaav;MLKj$OPIdn=5SZit0ZkCt1WI3XxT?=$%A 
zmC>NC>-4B~a&j{D?zxSH35p5XLkhK~@y>>H^OM8v?x#(o?j5+b=-si6zaU!$R^O8( zI^@;MSfLeXuPKe+a5Ai)*TEw!gA!HJmFwbfrnlov{h^b`bxry{Y%?pOh9{P<-APeI zhk=K{JtMDI9rpyz9o*4DG zw)>WZ<-UXo^D}2I0a4JtlNo@bA+UZu4ayn~)!oyR02L?jn>b54FUa6E`v&=CKgR=Q zW;j0*kP5J&!lAESULY6rxNsG*98kDQS*~cn9uCBt4Drnd{&t8G%1-9pyjZbFyl^)0 z>6vjBnQ9OBDDML)P%8Ay^U*Y;2kcTp`@ej}Lp;)v#0P{>3F(A78X|MT^w^i`>MVy= zrQwAs>C~S;f3~Sfskl}g7MkH%o}zv)02Szg)T1*%W0HNht*osR0FD{^u0QQMGv0T$ zDp7@zZFc~l8UCQrB-(HJ;e(Bb9o*xR4ye;lrBc-5Km{E|la=}XGc)b|r{}MzZ(yCB z{aw+v0B!0LYUF*1#Afx(DQw4-Iu|1&ZsKt~eFV=!c5KT5D@p-c(~0r?{QQvjMG%Rh zbO>tMo=WX?mXMOltGud-=bao1F*Q_25JfixMC88F1dzt$z>C3>(jNBra>b{ywd@WKKMl#a`Y{;k8(4A6`E0m+L4cik_W2ICsa)cfFZkil z!QaCvNf~6B(LgGTPWp&4HS|^AH2vk<1CG{kaK{I~i~Cf{Id#w%O!sbb9Mv&j12R=# zfu|n`A*xeY;IHcvp_C3ys`%`sYpb`K`MH(701~qpZYz~8hskPDc~>z%Vu?;PbUIWo zih^M7mzkMaoo<2$wYf6jHMJFXtmB1`P!+S$CAm*jS66R!8#03+NBeF%;JSA?3-$4a z1T9pfBQh;cDAg7(GcqzHXW*lT{rF*)l6|u9mcIEzMGG@CKA|0PvhD$b`>zLwX#>le z4DhT>(P(te!K!s|bflq-@|_mT&kpU4C{tpl)4Z{j24zLJ@rXIEp^3>CyG=AdJ+f`eK_#=xu;GJou>`{S<_P8! zm$)R9xD}tnnI8U}&_J*zC}H_nh8Ls>C|0|7+BM&>xiW?_6g5TrK6OwWF$-Y*AZ31< z%V%9*06U4=Zi9IZ0QJ#v-4l6E*aYh*BKP9QD+87>%03^~>2;GY4+o$B=sA2>@%&c1 zW~mgLyNm&T=;uYBJ}rmt|87i{NDH@j^02b<|2A-*=BtVc3(I5EE_inOtH#UdVH7I~ zk5MZrDJcaV9X5p{O8hfz6#(JIxMSuuYeKOerFQR5a_;&}gl8CGQBhI;Kr_wH&oU7Y zSyOYFsD8#J2yAR1w{`T#gl*127#--a29Izei05bH|kE`AkAIi2+*=(4zj7XS#^jtg7a1jdQ z@~e>@h^rJyj%@eeCyictuq)Ii=B(Xir#toeOhfFA8$W<{mC<2g1irnr`uxKD3=wI; zrIcMh!#!I&ooKX&s+y5ZV0>q zde~avq$2V_czBHO7Sh)}&|isKu@`Gh&Sy3l!fpa?*y?-19{dwzKtjN4;Uq3nsBIL+ zb*(DAtq#M86(UjAFo*H}qr~S#M@6!s6!Mf~XF2=!Gxf~CV@L8XYLbO+`o;OnGaLQv?DbE;IAEfYPJ5$LBJ=o zQ>TImaD+|caPHhXH!Xt0y}Kn<#Yes&Y7KxRdLN<6>PBPIm&ye3*)3D+P@@Y1Da zv(tT<2G35jLjny)IVdVGXGHJ=UPKvspksGsf^24#j8!F{g!Vji5$b52;doDi@@ODS zaCaDu!faGm$NQ^o^Y9cOG{eG)X`VyeC;)%ZENt*yvW)9;(q~wEbl5G5{87RWW9C#% zzC-NLn0Z%`eLQCFMn_0Q1P3<{PxRY87A5qsa7F8+ocs3cL3aQ2MOxbV=OsAP!nLU2 zfkR*>Ub(4ihYnpPZz`p$9O|Q#$7q^HSq>PAt5e52Bh%k~HR%FSf`rd(6wi#t&gcAq(k@vyGG)6lb5JFJ&-E!s z1YHuat3~&0q@f3~CEF|zyYjuQxP0dR`*DL%#6@1MKwM=#v0raeBA2Gz2$J&9l_>7%kR6b5L(I2&F%23 zaR=R!C5h*LHE!Ly)z9CbMpW zf*(2z9MKBU+SYQDG3@+#S{RZS&>o8_Dpo>%QNk^iSdI=26-CO=2V?U=FUdGGJv&^+ z%d&1=1h3B|4YYTESes9uJu|np4TsX(hHq6+R=!Z2?m095>kTv%3G3<&+ihx>0U!oN zL_}~o>RqqAog|3UB-aev<0qHdt`tpLsieIx_@_65+ z(Y`vZlZKfl#0t7)({g3C>1bV^GXW7U#i%A-eHWLyU>p?&bfyu`a&-1}hrY4ibXjEi{UW+=}jYAAZ- zVX)+has}jm$OD8^4ssX-1E>wf5;DT&kDus5KfnoM=H|WxB9=sgFI83k2)+;j3Rr;n zdTr0uLzk_Vq!TwSAaS;>$V5E_>%apKNJv<}efxHjLIDqZYTLwtC=W4vpbj!RGA8jO zGzqD9J)DP&q+5~bOI=;ie-Wm;inJtdM`ShF=VGoK5&=)_zP$_~FY5E>&q&Ff$1ea+ z4fXS*h^2ON)XH3^Y$i&StQZpGL%ih1R`W&RX>{{%6sJ3{J}|9u1o3uM9P|MQWiOsLm?9~#?{EB)`| z|Np|D2l4;eBl&XJmHPWa;&&@D^U()9f5XQwx*ZsmHAK0^JorY@-xPLv5Q_RspSd+~ z@6Z;Lp1)boFY1C8NFs7*+Q{^LT@$AKBc7o^7y>;p>Eq(op<{jJSwRxo+9B*bJmD*O zz-}kc_5ebBTf7!-UI&Iz{-F+p=V`#xHU8+#2?J|{Q2|G_a-c={NK7Ty}j~1bK{K% zyF(X6ajR3KA67jK_Y!mB<>C2soR;Oi*}r`oie?wo{0q2i>7S%A_HS+T(vW7Ek{<}0qS(3wS_p4gX!EGcmcYeM!Zi|SzlpB(_M7ah2yo;PC zsfF3ue~1f72=U7MTx%*wRB1jR5P`_7lCH?1F2-ECf%pOp5OrZ zsmQ-a#l=McA0)u6Cj}F@9Vj*k6ls$zC-jd}5H0NjFL$gfL?IYR*jqzGqkuE15RPHN z{IpGdVZ9QG6wt z?!n1e2d&2sl^S5Y@`bBC08?-$w$>$-{o;=wmtox!Eb#}l+4khGU}Pj&IJkr2aq&Pe z<}mRXc|A|iDHee%RKcJyIChL&Qcg||*H$wMPqm}geQtWky@Pi`AM_ceG>4TfiWKD*+Ygv*qIIzErTNSA&AM>wit2% z(NDGp@e*4ELcz}mri38U(eWXpRPWZQ|hMq;&W& zEBG|>$t#hw`kCeou%DNo|0iGs*9JlO64-RP#ot}`w;x}(vySsx?D3P^V1%j66Iv7`cE-5kb$G|`Yc*2{; z#x+R4=6Ozt`Aqk0B*Or_a$bj~UgCk?#Y!3a{re5rG$N?y(P8;84**ORgax@`ovg4< z7hn#dV1xoe^Ts+Npb!DKJhx%~lMoHACMOlY-jP*sD3>ihseDC4eoNAQPYd0#dj_oM 
zIQAEJRH*20C_EeXB2H$8?((2PAkPQBeuE6BJ?ay!mtA_`S4=;)(P{Oi}RNvItBr|RL+ZIlAcYoagRmiJwUVqW$(eC_X=pO9=@ zfBf)6Elz|~PWeInNrW?m0$Y?>I3EfI9oD&6Q2}9o67?eaXdpZ_z`I`k~@s;Fh6}-bbfxu{t~w|>-_vY}3c-5T?{m+sEkV`lFzuZMn54`rdU(ASqLLhh_G$JX$|g$riT zABZZ~Y#F=DejO1;AQioS`*w*!t%nViv9}ns*^1y6rWsUIeoKPF!OP%$QEjnl?w=m1(M;mj$9im_+4DZB+M@m*o`?8&?$(J!NI{HI;M5sa#0^s^v&3a zce$Cza|FFKITq$zOzlPtbMyMrUsTN2vVGgINGnM)v&jW{*~ zWQY(z3qjtWLJn{9(Z_VSAD9ikaghQr8L@IXuj*6C-bR9+79p4q*DOZe3_%h`h?~$z zXpnSTFJwg(ef`RghJ=!JAg=xL!N6PFt*>Z~Fj8c!1#JDpnQamhuMyF4ewu`h0jHh= zJ*Et-#(~Hj;M}VD@RT)g?L44qLgNRLBBXi4Z?Iry1-kBAti)0bHh`v@A%xjopDnP0 zTk7tR0ctrBnPQ#WdwN2^g)H%M!KOrfW`|V~VgS1&Vg(F7WdF16zssR@hs|JpllUEx z#2|UU#>ElwaM9%VeD3X5mo-N;Jic9AwmR{tX*ut~^&9?lr$pAW@83;(=24!_z&g2P zT!gV!gK+i5U%s#*df1UIB} zprmq9PMZ5=;Ym_=!P_Jm)}*5pz4`V{PD_GPC=F$&p|FEK>%0OfHpd7QEQ)O}U%hJW z>0!{)(z=bY2?~1O{j7|a$`OzhtLO0AC_ntBmmY$?ehQFEO0=yT&~jJeU6T;+=Xm+d z*&+&VD_a2HU36v1_YeA^>M&ALh>hnR@DPIr7Y5<=XX8rt6%qfnXxw;dsY+|X(2xUC zxqPA;Q&yiZT%glUJEYht_W(&t*B3w==$=r6E&*P>Zf-W}+%dPVd4_3Ic5c%R);Uam3w3bk8fqR?~V;DGK+E!8mfp6>3u(2hu&XW-kbFz{2+ z@HzYf$`7*pCho5rjW5l?$!yDkJk<&=T#BUUp#-JHgpa}RCpk|eW8=<)>B#}@^#AaP zBSM)1uAWezNOw<56MlN8ffd@53LJsMIy#Um=@gk&-zBG{uxrFqchShlO}T6}n%L+R zPrh(bj)d}ByfZvJ{P!wa3cb+`H%4Si5)Ve@9sSkBNrJd#5j@VDZx(<8<6dcSo1~T? za_ra9aeSCt>y8Y&Uz_v8?z|@(T|1)R7c6{n7Kq;SeoVyL;!t#yc6Em?6kWVp@d;Ir ziTC(7*>PZ@VkR$xQEMTKtcE?i5oL`Qz*ZNTkcz(KsMAvaz@FS%>|w&I3FC!%1759l zvXGD7J?^Le6}39sX_w_HHo17|SSL(x zl8xLYQTUNkI1kE!M;@2DQ(gLh>#*XYOQRXS=)575`T@Y>25r$XNY4s1m+^=1m!=Nw z94&U#Q?gGG=m`i8riWzv18<#F-4e;_-_0;-^KI#tF69)y7=S^nt1tHK*r8{n^Rb~p zy6)WR+J_8O9)fcPmQ}U2`;b7|>e!|X_Pm;g1|wO>*>}7+a3%5|TAe-vYC};7baxJz zW|=8EUexJMcKx;6;fJ5pP2m?k8vi zHiOa*e2KJr7|;osnUMs~_SU4wk5wM|TW{MSZ%V`KT1O3`izPr{A!1K?^rj%>N%^;K zB{2;Iofpsm2$IC`AW1_J*yrrLmk2gU)g!nTED!`CQ1A4%ZQIzIa_v>&lz^OGLhVjM zDL@{35y_+=+=tspQVHX*xPy!=5%K_9_Mr&3!p$ZY1HP1%o&6f$8P+kWCxMlLHNRhW z7t)pIAr8WX3q&ulz%Ybp?lXuulhe~}$e38+b&zX@sZtDs2g?&q)@#6rVZ=$Z4`SmX z9z*0q-TS%>xN0SUZF4BxsQIsvS#tjQQ6zM)_}3dnqD$~86{3~-F8B~m0tgKHjmf2* zc-;#}#kZ>c<%!K`kLQ7t+29uu_zS~`7Fcg7ZoWciBr*9ughC9#Y(;HFGv(X786hq& zk~wfd6n^-#@BMG#m5}&hgIp*qln3!AR)*1wbIY+)2^-0=YYBzVQI4OPx&n1)B`_GY z&-ECG!xAGe3^N)zk!W_b9gx*{v=gFP&IR>oL*{2;iG-Xb6b5TD2?G#tOV~ePl!;A#Zj=psN$eZLRmPQrN(Lq5 zwqT7>Ma1NU;HJbMMMh&FVzHAixpW^w@6&nZ&q|~S7Irp8<_gF4vaZ2=AO zhoA1##h5UVq0^a(iHV@X!ouqLRnJ818&^{hZ?WtSS>Gy;_*z@bL9)%w-T)WW_W#NDIK8-Z!LG2*&L16K)-YYS7N#?!LYu!e>S#IvM4aNGo zzgrWsE!`0o6#I2l(dW+xTWt@5yDy`Fob9qp|IB`}3h10Ro+Jo4Ki!)ynJ z+JAlXYXsm%o*t5}it%e-9(H+jAONJB@q^lgppcO0=HKY{=J@xbV z$kqV`do*xJW0eZiPo173T*x$(CknE!s`+X_H%dW6BeYnOtX`;}txHt@Y%yi!(O9!Z z59q;6Xei(S_UlvdmndPLd235c+FiSLMNcnVxl;e;GFXa#KBk+u`s*QZ^^uyj7fq0@ z_O5+fwTRC3zZV`Hd|U!Q z3SRtOKxo*4ZD6%dA8zmQtE|J_gGl@Nx9 z{6`9Xyval*vfh@+$q{;sYQe(Fy6DuYQx7ph_(}WkMq^=#Ho`(Y4@OMfV#2UVu#HgP zrY6ggj`sw$0iJ*5;s5WVv_n!-1}Kd%EdE63@4YL||9AV{Pu{TnPc{2L;uv1b|FdfL z|NaZs>ANY>H8N|CrCA+c|KyWVox*JH$i^9`tKXlO8f&$9|ICdmsb8iPs3BT+mFrgL z0~5tw4%a9;Bg!3! 
zM}$K3Kjkv|d&Os{okT{UilC;gS#!HyN!c{#%FuKD`>JN%NH{~KRolN`=NL4Vj2r(w zeu!pT(a-6QI5|oFtcZ_9x+6FZ(E%A%*{;d_YxWmy!5UrG8E;wLnAx*~l z%C^n8vOga+4|EUI#_h1-z(mn`*%u0-8P%vMrGSTLwV_iGJ;<(8(L zrmKa@fUKE6aOeHqtNuKYMDaK@D(VMp3k;`OXLRPCX$U|RpUhx^y>A(HL;buKo7{Op4Dk>;=3+S)fCn_)xbkC;kyV?h>Umku^bAFbhK4FsOdC z7Q&nP`LBrTF+2no03@gipA;FfW%!3k zKAB<03yJEK>h_h;A>knrSoRu?5V7|(qy& zPbRCeoHtcU&y0FOiQ$uy;^L8U@e2%Gd?`G<_1CX(glB?Jpl(*CYVnf51#CXVg(fjO z-jTD(Ta(DPSFQx0b&#MoUK0Uu_!xq2DI3p85e1E;0pm`=0(R-Hq(R#IilH2uDp7WZbEFoT48o7{C|=19bt(v+^?)7-J*)a(3LSSFa#Aiou;@0$eIa zbH~(GOpJa6-d#Jq_$_X3Zo%F#ytRgghM0XJh+OV-MA)1FN!AG%i%fWSIQJljQUG>k z!9IU9G>HR9L?Agb0<0+ksw_sYBiBOF0mtQc=|({SXTn(rizr1XMF6WTP)9c6RRr5Y z>E`IZ?d>5)rn40BfRWHyg;qoc-)Q{E%pUsC5u|wofUq}ntzFB6vKJ5-7---MNl3c! z4kpDZ1#pR6H4R zOwzps3eMQt+L8e~1|)1bkeb9}M|MWn!-&yP-L2&+D()(6yN4y|SdtVUMokbDpyqSM zuJdDogOHc9nwmSfx~d{&(frZHF(o|dqugKJl*YVFl79E~uJo4K+ARYfz5Q$o1(wEt z6xYw`M)taAwJzQNw!Py2h;LI7(RCj0L!&bd%lj62ZI1rB*v&`d*hA&O1GD}8KQ99M zM)zSfybT#k^vH(VN?|=V!x1uK&xPRwG&~kOBHT^v90pt!(WZ%BjC?LWsCR3kzhdB0 z_t7!Ed$TAC#2&z#C9!pkU?qO^=}Ak`Dje|e^dzwt=rEFTu7#o-b%GHXz=ohoi6wv!QO~$!Yy&RSP?fQESHc(g1I@F{Yvzio2LA*e#rqk5ET?L=;MIX z7-DZAelY~*#vO?QzP2jUR33$}75N2{Wz4gO_)K01Oq(D9)rh;0FepDlnjvRmjDv^= zmC(Ag55mQO|3n8Oi=0#XX#J{tq)|^kXp1%j6O+!t<6=Z&;jn>fmBS}i@g3}LYXqOD zyIrM!zpyaVQ_$RzYv)*}b1I%1;jt+$JFZ-y=f&eA+G_cD(%z((J(+ybHGb+*nawMQ znc3T)EB=V~|0MiW8B0N&VT2$d5;kh&G$KOX*_01g_ngOrvVhJCUxP3A72;@SFouXg znS(~iz$VnOd#57vR+uEz(GQ=GM}8_Pp(E@L)unhayp+ZA90E!Zz1b);kWcc15E?`z z`y6Jc$l4{R)Zj3hTS?2op^0>ihtie<0mgFZ$7>?BB88Xfv}phHceAK}$FJy9*= z>-NZ~-U*0%LWTm7s%Tpn8yg#jg471ELI59H(s>LJhN*KvEd3ZS&x>m+A>#>^mCVV2-AO7CQ?f%M~cY)=n3P0AVMSJC7s6lT!YDjIS!Pyv zVuZvCd|Cj7UrbDlh~g1*u)4buae=ZCf{f(517>3bErvqwf@G_2Hh#Wc>&fNhq+~Km z4-4hPrOCth_`1$O(3Z~0&&yNzJ0^#vdI&oh6o(ktKptpSokCd{1&KsQ>#R3{utp3i zl7Pf!az0>Yj1(Qr97T{IXlKbJ$A?%B8M63$#K1i_F>%O+@7}5DW*8l{mQOhVBqUNk z@)x~pSB*?el))M$!ULIe!&F6qp^aM9!>6F8bB zJudP0OxF%G{4o~Bet0sl9FShQ>#R`o+TqGT1|va1TbJxRO~ z0241g$}shA*F(rKPPsAg9;Ek-l*R~#Sj{L4ew zr2%;nv?cS55Ifm(_ouF$o|;nJ-{)(&83&J%BZJms5fbMMzY+t*Ymu5&V5TGLDCN}L zDwdi2ei@Xhz!S+KceL2`FOgXB5o=9_THWIQ6Y_e9_2oCEyuSsfF}fYYQ#;{i24N1Q z4Ue@Rx?sV~z;4}46P`bV&$+LPrA~#m#fZ^dYw}Uq@BhGQxv)%X1CD5+p;&Q}0V-X- zJp4I}sg90^ooa{|k+Y=mURFR;mU3*PyQdW&0YC0F%uCQcy_=&m)&FyTRSx370|dCK z`Ix7np`pe3&PR7uOu0<|z>BA$5W1#?0jG`0K`x|a)vp(?`Fb3JC12l3)6kOA()TSU zMiM7CzG8??yy5q-wE0QQmeVjE%8+imeC-;6H5Skii1G#fv9-PZJm6#KiLZ{IbyLcB z+%1=mHqjvyO?>qE^T$nRTTq~wI5rm{T+M<^E77t*H;N%(NqCN*0Zqvk@|aQ zbaiV#t%D>Xn)|33q{bt5($)p*?c0@~~4fgSLY3f0C(UY3i^jNM|Q7Bp6bEkZrfF58pNIwM#UW!IU zb9_s4d|aH#-E;E#DPsQYoSZ@EhvcAydc=xxo=q@fEhO05`X-2o_!z2$Q7w4#(O(9a z_5*agv_u7Z6`D7tFz-={I0oehtj0ZqvqDO>jTU!;_5k2fMQ~E1EPcTk;hGBM+9_Jp z)YUD${A|f#6XLoceo0(iB!HdzeSz`aOyrGq9!g>I`!3Gyp!{%lKuA+qpW~-U5Jd81 zrE}!nh7@iyHbBERY8u9I8qc0RyJ0E3toqqglCXxAs$~7!S?4~h0V<{ySQtb>_$5S{ zM(WoL!YVmB>7Hh601}PZF~mT|vlajPd;j0NW4z^Z{|62VWX35gz(_!Tox&RrxXfWn zLozH%RYfHM;Yjy`v%>sQYUJ=NstB{ASx`0{$#GDX=<&m!-2zu9xg-oj^ym;V!DAR5 zXjw}U8PKTE$G908_aH13erVT#DJl?oe%GAyAwCV-iFEBnqDesrq$qSnwp`A-_M1dQ z$4L4HaQG5iQc*E9eK|HZ1n=_AhYy?T&f$;>5=KC3cn0wNj^%;OygaGdsjhe&J`#~< z3-m$1Y*`7$!;Y5=ZQN*yQz|G5FgmI82-be=B)+Wylxa7r2qcramH_#oj@gt*-ovHdAx(Hwg`MiofDQ&|O06FICc zsTt}b0d-hu2`C6;q7{+st`{~pJ#(SziXpa#jxLPH0cA=AAGvWnHZBU}RhPlH>95u60T zm}ZzOhGsPcl~558NC4;ZIHiT{`tY0N*aYWiQg`(_14F=xu%av_;3eC~Vu~ag87CFWtO(a}u#;;`fVQ zQg8TFO7`r@k6h$b7xP#xk=;HNvS*)vFaLydk0S5{yV4Ju%V!)0!q=GSIyb9OyT@H-?Y9h& zetneM$oBXfe+9naU`EsQW3&%|9w+e>C2(#f(UVD}0q1jpy5YNr+De%)Y8#gHgru!O z-0$zQqI~DZI00qT4>tse$!Pt(9o?J%*(5FIhs1EfOcc)33`f$PB&)FoR6(LpbcJ;8 
zfx=IA#+Z_WV0Wm9^kLmYA=Pu``uA8)YxlAmP(Rw+Nyvxz%UBpHnE9ay?8I`|i^O{k zcf#hiYuBQ@r0edKm3>*{dhl;rgP&hiUE{=}{bW>EQ0%Cn7DzERq=j-;>q z7uziD!kEo#l+J8m3!Jf6-oGEGox*A%C(L3fm#krQ4@}1B=sZ;}703BnSW(^8Df>h6 z(v8$dN`;?Ed!Iv&6BY{iw-WxY=tUC5xIX>*}UB26L-h}UKnt8W{9rzR8 zq2Mb$<8B&B+-jQjr{|`*s2$V_Y<(-zyCO;9+t{7+l8w&;YrkPn6Lt@1{gcakF*&~( z2S~46DU=y393;Nee~0(93+0!Raw5XEv^ zIkT`o1KNFP@@ftA7(}Fqb#yNyGt=~T0!!7YwlL!o+{ypN*qea$w6=fWt7M9esLUiH zg^b&rsez28QPCunxwMUuA(Al^nIc6B5u$`dqV{HsOc_&#G7k~q{ha-3|DWeMp7%Xo z$8q2HzPE<;TkBfab)Mhpe6@}2O_H5?km*30piYR3OACbCWs|2a!b}IdWJn5L5=$Ubtrusl@C(dCqS}1 zt_~AY8?n)?1j&^?lY&GV!4Buan1q7^5WvA2#Ga3|S^ILQ8tUR`#xt8n)|j|Fz--7S zyri|th8}zwS3{4krqBXww`}mL_wQTM@DU=wkvs(XDhKw!=L4>zxx3C=cftSWgljmDw*~a40!%iQsR=&cltv7ahk;69I?1upVPoFMj zZ&p`0l3Qh9h>SXMQbRt!h{zRoL=o~2W6P*RGq zkEZow>cqdHmLc0bVL~){r8qq=)9iG?^-6lgnEkJL3DuO}WuNN8a+#c><11-}Ep*hV zQK*cZ*8jwu4*;3T(Snoa56VADH4rYFwT9@xv>+D7Z#8?Y2coajf4yk}&bEf$7qgh= z`Ld6-AdSNwjhsztlO#1$d5G2$8t{c->+!ReP*ai=vw}2TS^%UMR;SOL$sSn*9Mc5A z40($#J(4xD4Uq1qkS~&zBU-tjE^28K|HvC6l14z|u2=sN_MVzqKFfsji)9;^>_|37 zfn24;B7q_1%~uiyIgtiI7X!3idge`@GgOWQHG@r7C%twRS*$iTHqxwhpMKxA?Nx&? zP<%?f3;2(CUxLUxrifh1p(hvsc)M6E9UsvMN=5tUpVbsRKdY}-;pqt^+F=J(H%}o7 z-Q3+Xb4@_uB|dx+gc+&9<@)vLxLY)ta*e!*7eRPLs1a^|HFA#wO$ju^B?O53eM@m0 zi=fcFk!p9kvGpCe2weHbO_C5J{Wztpnj-RL*=E4tvE7_;oIyqc}XNPQrxG70&H z&!XHLIU*_J@@1)Zd*A>0e8Cg>mUNID4}L2W^rtin;<{k# z`S;XJSE=z>7kVIWV8(yQYs2e@RW@wLWD%%?L)L{Dlj@YH3VCyYMLygdJ)jlEm>8-) zBC9=CtqtO#}c%{W$p<(WG#Fk`smpa-BeK);VT!( z*uqpPd!&m&4=7~`s*Fou+S=%>qagf%1NnyR$8~2w8;EQ8{P_~pn3{rkNN4LowZ#3G z9wHGTz5s5oso=D;U7=Sb)doSQn&@!}LMbSrukZW!_8wv!&Ht0Mzlg3CVC;y&i8OL| zQ|EW%e8T+Jp_@T{5fpU#dSn3~Mw%r>M0&oS>Jo762IdGZv-b4J@89m_?l);ZZI(dZ2vj`i>?!GWzY_8wfmP+wwY;o- zF)=|Xy0b>S_}T3CZb0Em)#u_I>$ae)rEyEXy?2+$FqBpl!&i?zMtR!IZVsSY@^t#U z=GZezi(IYx3>?1 zf=!ILGqA^m2j|FdBK4x%AM19ye#PK^BgKEJawT-y=1r$==N$OofZZ-vHwP@(X-nOw zYcioW&GQMLTun4J0G-+g>T*FwuaEPR`o5bG@esL%G&YEK_s}(XKo>k-rB{R3V*7&y z$X_5^{yElvVbmN@{N5!ba#p!gdiq_tHaNI=>{T(DDYU?Ay1T zgXanVM}!jCDzYk=#31~*&R0l0= z^7MA#t7FP~8R}2(3l!*{mvi!l=Ep``Ny} zbX&Gem=*ciW#RydU=;R~ZMSQ2Z=Iyn%O}z4E-L$`1-MnOVdLwa&flr3NP}9uk1~IW zujOH#=$~a9N7;=hAO(2Nq0MwEE0?#)Wt#$SOmzb@U)^}L_w8=PcTUPXfQHfZ5^p5{ zB~92;jOPcx>Q$&;{BlmNpcLGk@I5D@3)*;lM~jpMs^rq5eTKstR{0If`Qg{kg|qk0 z_qthK%`kP13u=|qZ*5APhUBylXQ3I*`a6rG%Hi#lZcmnUnY_W6GEVtdl*s*rTO<(f%+)FJ-CRX6-s}gu9)D9RyteR{A zhmNktb^uOdP5Be_$!$QbK79sOwTu{9-?Q5X*S*W~t*3lc(T>6_PChaL?75tJzHV;+qHRF8=nkIyIzgs^6Yft1ORPO~U&lxK?l7`H%lsv2)$o z3n`dHO)jA!l*wVqo88Bcf#cAaM z4G0DxnO;PJjaQt;-bSvn4uT{K0rctlpZ1?F(NX|Xh1JKka@cC_-OZMLj3n8iS({cS zB{WX05uwg#Do_#)1BnDd2)TtB*R*sS=1K2l0quR1*JJj&1Oj#zeaUs0M_Q^tdI+aO zzEFotRg9fOSV9pM<43>r6YfILW!k>Z3_6IoWHlpm+s|I0`qrRpWHCFWHh9cs{tGiJ zB7Iu>@Wyr&W}PT>nKjdhgIxFB{YEWYN>a>D!c! 
z6<|{kH%5p6kNWTf)8C`E)vF{)%1PFNRrdObN+79 z>!Y`i7f5ssP0hhv{!wmTvt!7x#9{#eVNQp5=?!H5CGxQ()^FOw6uY;{UsJvIv)wB}AjO zoQo76&HNdj6j1{1QZd&CB!K6U#m07F0p2CWB9Ql$HVIanXj(it-vBXIVHXisz|>vH zJ0whjyVYNsAa?jJtJxrKHXHs3>CZFpc8MnEJD(I3tYE9X004%VGn4)DQfPmkcGlNp z(r&P^fQ!i>6);WVq}UL}smRW@wf$p0EWMXDtJIutk<1kAQTeb5f^jEJix?4QkN|Tn za_&PEEi}Fs+*xU5E>Zc*p`Vm=nXUBG!^MjaIUZ9-0GB~~bOTQvY6@z1HA+OuETHmI z6gb&U2k#(S>EL><8ISiOWhEA}x|FCayHSh&mM{xUm>%3XX+eSP<-8*Qj|L&6_`=AD zkMrBz81ThBp%P&;xY+aYqAE(h?fKNMJ$hu&tG>^hd44aT@w~EcUBIEAK>SGng2<#n zwlupH3>IFUXQ3#7jRu`AA09zDDklxEU%mkC>({T`ytx6@^=fd}kk;!G{btQdqkt59 z4liMAjGKa%^ZKyBf&hK;X|}44>uL#8KmgVyRP&;Ord1&sx;~KjAac9@jx{L{YX#Kk zLco)#o~Z?BQ_KNBW<5=>N>Wgt$Hx?>n>uWNA+$}c`}c2-6cEpP8vUgNqktgl#{MkB zt$UHJOlU1hQ395du7(;@LL2132jVd)D=V8=M7pmA1@kIEAnDWrB4r2R46B#TX^gf& z8{L=iRc;d}t^oFF)wOGV@yg%@0jHs@Xsi9zz2QNd9zpNNF4GeiFDvYNmj@sjG~v-#sdJtK#0V{61v+pl1Ul!NWn!>8xX*giyxm!F?6i5M&s;||V^Znju6c*f0s zud*}RukcAk*(v5i5|IY_`X=Ti(xgxsXJ|JrZ;@_nErx;#;`+rF%ZU@)cc;fmJ~;HU zShTH}cGSl8;BdE*me(}i#7zbEb{N)X(_q&DA*KVkInosKexSK` z5F>7%KIWSnwfPQVc%~lkW#fUj-wZ$gW6pNQ{d7dBvJ%UcAPE_|Emgf|KyShFYN+#G z3IK!VI^Omj?u~jHd(ApNsWY)-K7d`rS?x|5a=A2U7cjr@o-{ZGb&@sy%31VKvJOcj zTGP#}{POs(?`8+L51uk@+RoNaULGFH@C*rjE^`4+5w$rG2`jim$G4qpS$=Llaq%1d zeW+Ue^H=lgW3G%&xvcPT5=X8w4+-x~@y9X*c4c~s$o3cvIGBMuu= zs=jHEHo3vnLEj5bPDu19EN$OAAg-I3gpAG#BZ4uAbF(e2zWby})f7P@Kn_>%2JB-V zhQ~eno@aC;9LqKir>kz>uGw$hrh9Gh(H8vvUre?tFD)XzJ5_O1H!yA6`r{kpu;Own z?mxq&R9b5~;DF<&ymkp+(%)=;abQZ(!F}aMTQjp52kwOqO?%I{y43C0p6y~9>JUiXHSWe48Y+i!o$kXA;=_(;VtX*@7k z!8NC+b0UJRW^mF<@D+xpG~CE+;#C#AQ4_{LRh>!!S`>q85lkbhl|g}5vFY+H604Q5 zRL87MJjjG3Uh(h5iJku^q7;Go=)24oEa=02DAA(;z)v`cHntzGda350A`Fl8zI|o} zN5L*cLk&#qze;_z*|NyR-)pi(SRx@7r%9+)ZWBlJxa1BDT>pmZzyjmtP?6n~Ld`aROPoeH#V{r!UaBhos}KkA=W_-up+L?cW@-xX*(x)A zttHVn3!{HKw4svPpDIQsfx4eOQS_#EnFHts|9*jzhC?JBhO{v0;1sR7-hWfOD1p?D z;^P9V0duqD$^};oI@Z>}z?~Zfdn1@XBt#dVTZW+ehW+Xs^&NYV} zR4X#|%)>)}=dN9%jh0j=_Clnty|XeiGxxb)Tqz2C!cWv;y|-=OF5%!3(##tZR&s|d zAr1)i8c&d4l!ipS9Bv0b3j8ZN?WA8Cg9akFW2e%f8Ag~o5U@oOkwCDHdf#Ls(wJ!< zRMex+0d~`8oL|%l)f72#1ZGs;^v+egy=ql`a$39^3g9A%({N5KS6oRk!*(T?o@P4q~%#jy~jza{jlr=I{ z2N8>7hl)=Yuv#MJNLQxF3d!T?NqN8&wr6~|41Dz4feC345Y(mV;6@?`P$5-?1V$Xi z)T*Q*VXA_BXtpc+swgW_i51+R$#YQKWA^Y^XlxtVdL`_DTbps}@!~bS;5ze{EYp+0 z1m}PM>MQ3eqyUFoKIMrh6)EvVke3{>Q1wdKj2}2~;QHy*|HuM0Hio%n)gpPE$Y|rt zzo*74n=quFhA@(%E3=kv7X0Y%ldh=Ul{DJXkJ=_TwvHP9P)+*)ZGr#> z0`GIOiF6jtxiGMtW17WY*H6DNrxTg=_3e0W$cwu?s{#EO$RtA*VZPo|We*v$&$h5Z zg{UV8XcOqG@GaDKGl=L_sH6%Hjx?XzZg`veBFcq8B=~yQwprzluqz`c49h+2+>qME z`hEgWSs^d5_vjDx175LDX6NNOKfk6+3z`Z25q6u59_s4r4ol#ptiDowx8O1ckx(IX zp#W40<6Ek>-<(4zH8sButF8&Wr|Z2V$NbK^wQIF+XTRyIugtdH8WR&^HM@S8Zj8~= zs(RPg>~C18UrakwZ^_n4_f#LmDxU2!x!YJ6o`jWCK3;}jP`0f9Y*Tn`8AX7)!bi~= z#Cp!Emq6e{Th28(7Ze5dE z(^UIw(6Bzw08Gr^F6Ny_?bMv4R5RWwuM%@}^QTCm&0Ba5LdJ=BK!YPcgCk3VT_S$+ zHeIAL4oec{9k(!@N6keUXU$q|+INK8!9hTeA|ORiAQh=h9m*+>UI7;9Vy^PX*`5~X zCo4xfK6aJs#~x>9R%dBsGEel$^XJt_%Txw(KHzdF*5B25tfPEA9iclr#>?FJhO|~y zxNc=2mQmJ09WYVe7{h5pQqmTl;g=gsin>1R;7B*b^};29-k70mBT?M7!_RZD68z3g z-eqZX_w3o@QjBVebk&hDB^NJVB+Fc7CL7wE!uhBarH)6->u8|>SDUHRz9N@@#*mOS zAo6-GTGZiE@Cb>Jm+dGP{?}q{qor)X2Y+<-{gT;o$rGpzkv2f)j@f7N2iqyTV0GLK zB0D+;s4aSc!{_m6D`|9?A>F1$T_23%Vz>_@Uod72q6Ymp-od1HvtDB+Enz3eiN2bK zWA)rX!`cE@@|lHjk$IXltM=Y+;$dLfvSU9#%g?u&@hC8XpL7&s1z$q*ZOu6*Y7`ofsx!8h18pz;>eZ_k$j1u$VQd>4Nd151VM7v%($dnJ<)Kvf zJL^}*IbARV^U>frU*3cz_jMf3W)mqr_Z2 zH?CdOPODsP@n5McoUL1?I&sg@G(6+2Eq!Z(R05_;@_Es%$Z#`6{LjzyuPB?6) z%hf~eX~-HkZQ7b<6X*bizWulo$Xbrn!5odB9zLda%y{hS;jyLDXXyIkMPF40LbIX+ zZ~`JDC7%P}{pFgmaktarP%&x@N~1vY?RZ8Y$-pyb7M}a@sT@}_`YGbXGphX`|D=f% 
zznjM94|-hyAzX5r*;T7o_r$(WF-~PUxr>F$PtSfDDSaGEQz{EC$x>cN5-z%ge!;^} zpp-gX+7L9Of&+`w9^%K|@0MyKraN$!R~^8BM#NHNeR!`C^tn8MaqazKE{R&;9%aDD z|KbnoZK`@w#G)OzGL{P<)eka(rJ(?KLQ%C)ABMl9IB#IwKT_s_Xnmi8KR<^*c4@4BPS zdMTJ%JPJJzrp{Pg98uh4Phx0L)|bEi|M+N^pyyv}9~nls%Sx__`K6$= z{q_Y#FURgW#v5q;%0NqF(3rXPGR%qcY-joau1CBh(LI(8<67Pbe^nduMW)Lf(Az2fWiZEiH&m8Kr}diWw3 z$JmsDtuoD-XgOYB9lI%&H!^yeNuQ;K@7~o#d<6S<2;WW`(|={^v7;rQlp;oYS<1~d z?JJGf`Zfh?M!7+x%EK=^&omu&q3X7ebhQOnqjaM|!{&dS_aj|%% zOKLtjzB+6}%*Pa^-mqLwAxc571JG#wQ}gS zH(4ii{tk^4l~ast&D_$m%9ZVdzr9MbY4!hZui|@49gPxd$!~jc;N|xb-?pv2aw@_0 z$kz8=oXUEdh=Z?lxnX-|Wa{nV7H896ZVry^RfcANb#b`Y$EMHp&Id58Kd6AR!tBMBl4%t_hdo zo6QU1K^r0?Fd6>D0+1o97_J&pg$!n#tD@JoZ8bEdJ576$-Agg^q1}%uovDiWM~$jg zuYT5nYZ?0&KLVfNa`;d6MYtdZ4LTe zVCZz(9H|Z9LVdR|`l1n_Tvd(>j_PuIF+=BsCC1iw_05n4Z=?6U5UmF<4hlzkY2IR0uj4k`t%H_m^1wu-!s2YI~&Qnp!5_u z7Agx7hKosh&Z|uiL`;)LX3-eX9Z-QmsMB~#5=6s%v0xzJ0^>SHz5aVxm$voxYby>T z>S^&6^zgYA2qYnWEzL1aRO8-7pWi)hN!o}Iy`sc|1>y%kW&H^3NQ_`@y{XeX0z%M2 zs42*cs4VM#H~@)0KO`4fA>TT5#E2F2z=C~DzTSd~kxC~JPSD--y zDDXSxaPR?--Pi8`|4@Igo#IN*5;1_{Fe+#Kt+0N+&p^3F&kETe8~ayO7*TI?TX zVNp#4b0F7x^Z-;zqU5I{5tXCZ32AHWdRH?dc|x`8S+73I8P`cNL3;rmI9#}d{+7ES@1LP(B~Dco`$C7|e0##bK^D5T z?>M8LNIKIf)5XaG1pbuuliz#I`t{*WW#)^7gGkDyh>v)h?y-4y z6eC0qwW_p?9G0~aN6{3zb1NJ5nx!YS2dXm{<}q-Vp-`#9?q1mTRy0tG8t(!7iZ}=e z3lO$WeBylC>g*CWrKy4mP&rz#3(WplFib9mR0bBN5N?V>C~HR`noE6<2(-ki@CS5- zXr-tjMQn+HOF|K*JxXoN0`<|9>!}Ln zTb&mA%G4($EMfN^@3X8cQl)Z1$n1sEFz=x`V+}@(XpVD;q@9{Xcm}nY2^A-f{F?`e zq~Rab^`?f5{|hyyc&n5r%=K6byL&nTq~I>e+PqUAo2~l%Z3%2PZ|K$6!VO`3@x4`w z08U0-L`=swB}!GP$XIUfG&-6;ND!-dOF+24P6%U_Ql~LQ-L9L=z+IPN+Q? zR-#@ZYR(AA@nhVH1VW76Ka#M9GS`J)n>%=w{;pf*-?F5hMnvNH)7D*@JxRLtAf)i} zt!6PT{~3{ObK}J8c>DDa+Kw7=)y^lTn$h+u7lHz(==G{+zJJk8Htf9kd}o!^ z-5aXc{m|s=mvis@ygrts99c*oI)#IB^|P_L#NdP+rdVPcJdW7%1_`i$7+R_WbtX^_ zlCl_H=V0Fvs?y>lVKVi_+D303#XZx^tKGbhwjwrdr$o!`!O85p>C+e5#jxb&v3k~Q zIR#1quq=tPY;6+@m(bvvS)0Mw!qVGEyTYjBinDv(EBfns05NS{-S8Aw&M^VR!9K-K zSK;zR$)qS^GPpmN&>8bAUQ35*0bHTGo13Co4LBgq%fqK08QDCd8)xQkGqMX3-|Wfg z4Pw}}TQ^CGrAjlL)_T`)PLss0ua5U)?ie`g@Mf!QhdV!1vyB0`lOeH(yNuY`OG}t0 zIBYhhiG_0JF_3vSY6nWlYLw=}NO(r8PVXW=b}c2r{<`7tg5anc%9X4viO5B>EF+O* zxHJHcOnhUhtmM{QxOwyVmFwkZ+cKvUfkqzsfi239HS3X+NoXk-ahX|n_)z>M*IbfalYI?1 z-h`VF!#=0GuslKzVnv93b>vs;vje>1Hw5MwOxB+NV%OYy%t}XgAb1_G!*bNx9N$8^ zKqpX7UWmT*a*_xu$}}p7V-z;` zHVyJ;zZXCndS>Fx488J|5VtOR)g*S>GlTDVQ)fsp9F7T@ETIHaX0#eG!1cC8bK@!7 zyL+oYIsayQh_-`66M$}!JWC#~lv)xI$gGmry_JMIo2v)H7aFEs2ADF|9%7#Q%Ob5O z-INme;OW;a9v-z^R`WG>yf=>_L5wTZNHzc0=Vt2WR znqIqhmTw38zjDt{nAtMkM<)H~)sX&nn`1H{8`*oV-lX2X+KY5b(L1&7nxN3{i9ym2 z;E|0jc(XcACW@kmK0LG()MXuzEcDUbNAnL{Di-wsp)!=Ai(-DJn(P{GFnw5b+}M2$ z9BtU!FXHPJMWr0k6xS;ZL!g5$N)?D*3;jY9_V$B`g_}5&xF#SI5r+UPi;a)IBmd01 zbH+*USOdh{O^7q`w%=ac_JY4=^yJtQO|?&jVxqWdrzjh z`aJ&x>_G+-G7-;E7W6i=zJz^c7Il;U%qffBKT+e54kC8zN|sC`0W28zHd#ry=H<+5 zr<6_}7H7Aw?(8LFVU^^^s;2M}}q?Z8q zmB>f@A(=*J&NHh}pl+VzKn`7o>(yd4v8J$QxU)0TI8Tq*G*r%KIccdNyZhXd;geGP z5|gT?@Pa2j$j*?^GS|hWRq+5!ArcXfaOmdE%aPN=ce?W`eM7xq{Nwg;FVMJNi_3cP z1vdzf;$+w7REaW-Le4DN)dv$!;x(0^8Xl%3Cb~_Uw2~uau(r9tQQ($}-rNLgTM4Uy zZ>|OJAl@s`K30sys8MxLRiN^?Z*_jA(XMgF)Q4-)bCx~+{j()EZ#Dv98QwxoedV8j z%rOvB)zyUy0B?@9xx7%eO{6aDPSQ(Z%azJorjk-c=)+-qyHfO`bvk0}3d*lK=5udA z849z4!Qdi0J_CFQ!Gi!|NgO94H64-`mLK}bnDnG;|DGOKI^VZ}s=q_J$}%)(xxE#9FH9XePAbR?_+s$RB5xETvxj&cTRs|xivCoEhaaNJ~HbKR6e4=DHd!2=g*JZPYOLN*Td%e~(b3@Ji9 zIbu&ISh>o#nXlZ@{)wqkyvgw4cDh4m%;+SMVah;itoUioRKLLHd}?Yx(KE?-MQ|!= zLMa#kksB)iVu3R1Z?-$OPrQH4L%d@7Fe}ngv-(;MAIHc*H<}QNFPoUNy4`61De^No z_-???B&Sf|dO*R59up$ME1{}E`EQKceib1Kd)-Bt_NC&SLZ-&2H|MIlKy=CWNu6Ll 
zcbi$0gZ6EXQm+VmFQdJAyOm$g({pJUByA5RNZ7Ax=ZS%n4PdpX4LRg;Q*gz3MF7oM700@9cd%D^v9}^!4KoQpwa(*lZ=-7;(|=nZBxu zi@#gR-x*URb3)qxk44*7&F=h$QR`%)R$191yaug0c07FRXY?B+H*uZ&sP6jxLO30m zCUZOprJurA@*xm6(Keq+8FEmYvmQS?|Lt4YRg(ufl zEp4556c$4`++^k=YaiWD?g;Wvq?9hWqJ`$iXUx7+^;TlC1M3vs6iE#(MT^-p1b$+2 z$_Ai;y|~Bh8X1)T$lUHxhWg!3ElB9x^;Ry|753LzC{qvKNeSAfu^;b!C+2=@=nga< zBKwuO07usmSjdgy0NZ>2No-awya33Jn$j}hvlcCgbJ60QQ`I}MHmHv-5MmN%IJo>~ z#I5e?SIaH+Np#A^*o|vve!nm@;p5_oAS1@Kmme-vZ#nCWj$MQ_FhGSc6LACAJsM0{ z!+f?u0HdwR@7N`j)U5Za0>HYIymJ{eRuKsR?_v8#?n@mTs9&gY{SkjpJRkDcA6avfKbF|}5AfgMw9BJ#veTl+McT%ZqjT4KADJAy z(!VGyB{F%pd-8X?=$T$?`I3(Bw(Tt5ej}mlY)=aecGi z%Va00TD2;w*g$@;$mF5r8s0lOcK2*V0qc?Zd!XwP{6&fz1eQp3)x-!535K#@YUj)B_jo==<+{r=bH9wry*K9-!gDub6MDw_K#4m?oa)Y8FK7)BKM`lkKdvKVU+6 zhCDfgPae8;>nb@+Kz%r;_B{A9;qkQ;*;uI2OBPa4AdaHs4F17FIf__N^I=Qx7F4#@ z4+q)m0D^&W%h1unqM3-~*mEXMp6uW|(Ju4h!{%UaQ<$YmvA(}zepPq-`M=+~I64MX zPAP!^25mM)HFciwyuh#4)2ZQ|#)r`5ctV@rO`U%{{m+~gSNNsU0gGZ2T7L*S3c&^W zeXWg+Yhjg31N~G}(1TIoTcF24aaxl}pE5@C<(-7c*C{k0q)&>R1AYiuu%OQU=6}?X ze4Igp)PMnHMvznF$F)_;wx)e&qdSIhjQtA>v9918(wE(^Y`GVD4D2%##MRxy!=|9O z2oI$9fTdt7#;fF9&|5uW{khpp{RAfT8t{q!C95cg>KuKXh@q5(P5sdnNWG-U@aJ#& zYfLaKoa*{!VBA`b+jRGx36aKVbCK{+T6;??D@`b422?N8YHe}l`~3Oy#i{+_!w1H` zZt5F&WKKR+KYgdW*SgqQo zv~o>h@~bPUUv4`aBw^fX&+EJ#jjYJ*C}p-$b_6=SXxf~ZFi8fkRqG;}1E~vX!o{$` zZAk}eEYOvd?&chN;B7qs1i@T_P)j{QhYkwM3u=AWd+7GFPyIRZlIvIPz2B4u7Hi0!xyygbFwDBjxZ@4ijm;(AC>b8zGV&-c&^o0|`zAB@hkp>7gw#>QOTcd-}z8 zP)g!ke;$9L-=XZ|je>lRGzkd)K}Rh46o*`r>w~{^v`cEK2#AAGWKcZ~(1krA`&9P7 zxO<6-iAt)~s4dYnZkpLMADmWGB(*3tHB~!_b*!c&U;3AJA^x~9fuI@9c`O&H6DRuT*f~j`yJmQko2=q3m)rDz)Q8QB zpDeHOVbkM@iOZ1mr7xzfacDBtZsE^n@pqh@ovY0AKU6o=cHJm9X?+o?T&4q&;NfNr zeWls$kaV4YAsQ%YkNErWzTMZW)?^lz{RQ(r7RZd_;nRM6zkT7(u+0{Lx``cEXR)_s zojU~v24l;H?jceqaV$*B9Eny^T+WonN$K)=qA5XF$%hee-{2@HtH?u?jcZ4rTLelBpy(HJw^^4ZYM=0$s6_a1gi``qGcy4KVNhM_~HMiq4@O$}S!WzH&LWY8Y# zF?fVmNdN-cMv2dtVK+bzE0HLpeG8fRtYX1UiypR#JV#&GAt?qkS+MoYkKIk^*FMvvu3H*zVzM65+)yq6Zio5AS+*QE8&f9Rr*Fz{2c_Zq3 z)hW46WC)c)7tj{0YvzAdyD9z@|DBuQVqoi=rY*^<7IXew^7o=I+sH^Bzkhc0>Z5Fg zJarfM9SAbEyxMM)lRGx6z8i&A$2>-O(s|uzH?=g-S=kLaK2VdzIQa~!*O{SU zKyH8F&a$6-SJn6T7>p5AQ2?VleeDrlGm<_OPS4KDCFfH#Osh@IGeiA=>?armh1ipK z05#NQsA}!Yj1=G}wYZWrAbcgqIcTY(z_u9LL?T$`)lPQ#F#LJ~7`shq3UPr*7q=Ft zWQ3l(YQ128liW@x^_NyvaHprsAtzdVC}aV7BnuLs>r?3mdBpPO0?C-a8sw|GV%xs} zYyOmh93Y4Vg@s|ol2ssyIaI1Qphu7b=y*#|2?d3OxF;qeIuiSxd6Eg@=|(iz)fGY! 
z&#}(!`1B&Iqf3(+FNPgDT%tBQ{65u)@k~5=RnELv|L2^W?~Y zY6@9_L9f?#R0M=%Pu#eD`?w?DVodr0xW!Cs+D2iuyKy)t`kgV_{CPw%N93$E9vy8# z;T$fBS?>Jrb5z_5x;!;(*8cbNS{HtJ)_BxSuiBwijoy!q=;ZL|T-@6O*Se_LA{=qj zecbTLWG5Hb!0QK6GL7EUy0!kg@0(qp92d4bJ;L)>LTUVxP?OQ;fA@H2^Sn!T-st9@ zVn&HONA!+#Su)zB-Kd?0U)OJC!=N~5&YN${5NkC`U8=^`p!y<-W1~_O+LEv@m>}v% zJici1Ky=u$218Ve2YlHwb<}%V1_4=pM>&!HL};tLg93>u8F|%XBz75kQ0_Z?);5y- z=}E*8x?+U}^7<+`S?ztb-ml*YGjE4Qrv}(P9KFj^J@(IIACGr9XAr<+LU4M2Se@B#FX^I%JfaC@g>qt&bGz;tA~2i$Fi2D}hd{ z!-<-67^D0%3h0o-SQu4FX&qKfpWkQB%Rs=hd`=9J-4_&Pf3!G{_Ch{Oj1Spnb>`0f znz&-zk=csjgBgxH0pN#dXLS8SB9DKB!(vpH?f@f*RQ9KTPrR(hdpRB?sTyE@dz+rT zf+Mzd-a9n;_;J70&q_;63FjH@zH7%0=VQ)2Sq)G5Skb6^-xi)5om2bd^V$`G88!(t zIjsi_XvxbzcCMkB?LV`ss*p6TgA%zf*_6m`wY9ZrGKN2FrDfgYLdCdPYlCjLatB}T z`SVtDi=akD_9K10!F=s}$Lq)4s~lJyHxV1e&c8;j1_@Z#B0%I3+9f%HMyJciMxujo$9SZAYJOZI9Yc`Bhdb zkv1D!4Y}1^v;3)rW`WB_QG4AzHM3>n@#7L|OpT7($@<~m(Csvh7w~DJHgTA1XuznJ zOO!LxZ}KJ%d-(L$I4*oBM4@bl;v2@+R8&Vbm1C&auW!%l%E)v8CRx`K)l*qvi>?YF z8>x z5Ei>G=~k6vKca1%oSYPe-m!<}VB5u@K~)s!C8vKvZ;cw^W_L^o8EOxHd>Y_#bVWZK zYYk3F9KNI5*JOE7z}Yoq7f~0icD}{rugje`2UzrgtS{ zCp&Dp*X`VoPJnSWxc+;Nynp=}rIe-pbw}i8{haG>Mp8Em>58GdCLKMb$^`%{FoO&( zrd3Fk%MaQ|em0yVOy?}XoF}Bl7D}0~cno!j& zH=!p~E$v`#dU-D&b8ggx2`xmEO1Hg_#(TF}=u~CVIjeSp8{lH}2e` z-q2}vhy@Wshss1?90<`36~<>a=B+1tl1aznrSA$te=n^ty>8-nz@RNY59o0kQAa6? z`~cWKF5oiQzQ72tADwmf_C4C~)37*nXd?W_K+6b#<2j~n<1}2B+64*!2m2u^D1LY0 zxoZ&obkF1BOkU-mZ(Qef&&<(9#iH@+L(3>0_GF=WJx&a7)RF^D5?Glsgi#BcA zo$1exn%Iw9&h`*W(wrnski|slbmZTYti>DPUN!-uip?UoYjpfW0ju``x(7dA4@t4?B3Sufxq&6FO z?dll6dH2CKBW=%iLRb*~Ei`E}dXK?q00~-b?J$F>xb%oJbelfm(3XY9!$evm6AJsT zM}eTfW^Y$dEpzLVrc*4$EOa9sS|XlCOXe#PC@ABwXs>>?zN{Mi8@Z~tJRdXOipIh;00cH zA69PkbHHf#HH)+RMk*rTRY;jQ-Zf*SPNs&vPBIAx zoZ%Zbj1KMd>d*7YGpR2Xg^jw+k0Hd-$)8`X`}Zs!i5EZ#6a#?2)YxQBrEC3w(*-m<*5(vk#%8$;ygem4zJp85gJ zXS(S|2t3fkj4Obc`$HMy=xAu1{m(ykekW8-)%L3#sP4&g^6u}LnX-^gKwv=--9_;0 zqA;c74~>khYpv~NJdIUm)XW)xGSFebY9EVn6zSx>KPba zXQ2skPwy*|H&+YvgfTP6H+N zev7yf=p8~!!ps+pPB`B?)ONXXh3-DqB6P;#;+&W@>EGEj?C(3$lC3;)q!a6RH5b

    m5~uf8cL>_hCx{{J!YCe|+{VpshJWOeBasre-^@XyZ*=41(y#;*YvCS##jP{hTLyC#R%<+*b=roJXVk$CKPzHy%AOb+pDNs7%;3U}RiIMV+gaYnao}yZ_>{ z(?iGG+lQyyIq5W(kSirnT2q%h8)vkBz_KLRW5xO3znY4Sw};P`^bbnL*J=Y_?f4qf zx_^Tr7;(+wp40B`IhuOf5zrfd?UwcJ`wzETU@=FoMD|AI=%jy|1Wk{Q9Ktq~HRnx^ z$ZeH9(r7N8ZZzTWC^s&mkw`RPqK~l+d7M{0&DaGSkEGr|>b~5?rY(Lf4xFuhho0>! z{3c<@VmLPa@wywigW0*_s9`<9PDZ8wqOn*mRvfOJV?F?MEs4K-3VmGT5dj-IW#g#`sqHjg&FzW})SUN?aq6F3anhj`<$*x^BeB?HoX6)&->ws@K&{(m5pk z7uOtH`eZp&VPTh((?inla`sUBqIq0@CTObY_?daANrk~A8iPK6y_OQ3Mdo(*sJk$` z+9eIkbGy|P1b@u}^WQ8_i@%s1vpQmf>Gv8^#DlEK>^2d>L5D(^b{t!Vc!p@6#)=!V z2!Mff=b2}#f+NUuc>-fMr!VGmE>c~wYit1XzQ_sy8(T$VgH*o#R=0&W!H{TOM2|yR ziW;n{9QX*Ztk#MKc3W33On#x$gptAd|B2Uh?YC!B288yr(s{ zzW*1*qKqq&uV;@gSJC_VUF7p0BM{d&3s)zp9dIuMumoAcraQR39w+>p$tp3{XoUmSQtbqUdcq3>!e|l zQA5-rEBRqtzi4e4no@r{5SP4dPKOuPA~f2B)g!k@;1#yX87*JZ^#-A_kk7_uVS5u0nwJdX}c~VTV6WqDdeLp^4PZd+x6jAtMw; zDM#S|y$pn%{jcQbkR3Rq%;nR95>>+feGXWB`%31zHP3_hL@mP%<83c$7 z4SVG67}etr9(oZwb`-75ymx26QZ0A$Up01WZi+@YjRPETwbjwLoByut-o2O}6AW?~ z-*@e}gxg-TWmRxb2~s*(5sASaG792XeMbyN^RrYKN%WBaU6ht}O`nIl>2$6l(4esq zm$V2fsdV>HD`k}U&6#6sy&}x){}1E6VUj>m0cU6Whn!6H++N%`%6-=YuO>HLuGdqt z9wr>}3_lgw=d$hOL+lK;3u}5fJ-X>s1$rked4paJn#Y7?Q;Z^2*>O~*eZhZ6R$Qs~ zp^v{FCbKVMG*D#Z%_Q;XmD^`IGWB;qpAhgcMWt1^LeP*!Cbg|U~g>P;50{a%GYR= zN5r;mdcc^-x&lc}qG8MWl5VK4Khh48nY+)C08vy$aU=FHX`at?^*Mwyg(d9X|R>bOO?9k+;zlaitPSm6(qn4r;s`!9D8%`kt~KuE32gys?Q5;P-Ae!*ZqDbeJlQL z==T~P9kZ{Un08MDB$D^XLFG6tPc~$@K}n*LA(2rvj4fDDU=qrn+jpGH#U|bhxlgF8#$0k-#;x{@@M zmE%m*nV1gwQJXw4XB4@xnDdwnUy^8_se+q0l<tN2KiwS~HdWun`0(cwh znXGaeLr8%F+KSWIcI~*i!F?66wu>DL0#1UcthEczMH?Rne6TybsiL4TGwTt0dDqey zBbJ+t_Y!0jF$qgj6;Gd^ANBlY#daDzH12lWqZ3}_=ch;OR6f3yal8{MrYkr&_WPJ@ zO>u=99nQXT#V~1|UJb|7v)scjRc^9o+buiJY_(-*85oAfH=F<@~co zdUHWRfsBm|Yogj``3+&aFB=>+%Bwq|ERZLZ71kdH8FnY8$fSlkwTHw1yr|aRx4voU z>Mc1Eu2&<*baFOqr%)ZJC*q9$vy>YRf^GbVvJXw1^4=&km)Is;R{4l|t$tw4rO+hd z!`9r|e(APZ+Uuvab!anhN$m>@t6d8GQm@Klqnm|Uc@DWhZ}n~~%*%l6{2%IN9o7`r zdS7b$)iEYOZE<+Um#W_ub?I$P z>agpUewPo8I%f5woDV9Ua&24D+ArplTD)!N74H3a!-)CHn750)njcU7d8^ALPGr<( z`cXxPFSk4T2o@pwe|+t+wc8Zc39MX2vAs^tsCv7Yf>x~9BB5u3TZfWaUqMvkJ9m|} zEq^w}(Fg3@*-Vs}GI@z^(8*)rpn8>r1HSgn?66tauUnFrD9$t-SybJ;ri6R!VP=%{ z&m2=~B9n=ANZ7r-wQBMcLdl1>7F1(VotY(I->Cs}Ue;p+vD+)j!Rv8u@8!D@rb{SM z`5YU>=_E7^baT_I5K&6Q)uoHq!VHs&b*g38;$+*SfE$5SKwDFOauS9>bl6WBfh8M6 z8RlyrIJ0??LH(Ly4r2pN&(5xeHOh=go*?d=>QB_iu|J$^tm)h9Y}fneUs3ca3jXQj zkF4h8$5Xw%pVEF?0ty3da=HKQF=e-Qrx~pQK1E_+|FQ|_pJ2h1kB3HpV?lVC-)G8b zeHeRb28_G6t-S0o*OSBCev<#hNo&EuU_7Te_fVt}VGP-BO?AR(bKgSLm%vhN#PR=b zt32Rr7aIlClB4$+(uU+jAelV?PgXyIqhuf+q$KH8jHNg;xC=m)xO0&>$n~`E2Oh`p)enhYCVAgb!7K%5c48w7vV^%JVtceQ7Qp%;d)=L|n2gz5z0Zm5@j! 
zE@z0b8~i2pqMhMNDJy>ediPtr%}(l5VFT$9t8nbDL-v|=KP0LKIzoh5rw4ngRGw@@ zrSjuR-sOxLS+dYueRx>SD2j|$IQ8$}|NSt9Mq4^kLF?(vQG1a;NDGq=$Q?|H6W5_K zRz!c(fgFw7J18-YQlsJyfwOJiJT}yqdQTu(I6p!BsPWt;PK?gEyX79G5Rca;E{Vr# z*X)mqha2j1u*cFE!_ep%AQg0~Ol|T-#;^?F*?-lbipLnO>od~L`jsH~tj`#qri=-L zrZK8e(2z9ZZyTQGgQ{QI!-g+Casg_yLXkufpyG+m4unsBnzUv~WA@ zQbJEeW{HXPtvuZe*RGASbKg?*&ni*kAXJc4C(&KALmCBFV3)zc+M%&NdjKY%0D}5@ zaViL>>2Dj{J5ph7+cwBxx&$(1x5yUIzfkkL~dQHC~0uD%#f5h7#i&;KmhL1<*f z8(#iZ=*BEDj-uVS>oGA1d2rO8Jql7xYlzv3KPSVpuB9yyA&~mT+Ktq3Nf{GVi;5ZhG5AZ=0 z%rQgrIEC_oZ+oFPJlMcd40An=st&4GU|%`&L@Tr?*5Bk2-OC0J5Rqf^wKcH1RD%Kv zn*IYbtwM-pr!Qa3tq*N6@o)BL#Sl3&vxHdxs<#24YBFpQ;AZRH+o%_Dfit?M{g*G< zIpLK%Z}-jP8aN`TkwFruR$Ssr*nDXzJAV1HYv<0arZS9*e)=#fyFHP5;Zg+dXL9sd|i~swKGr?2gc{ z`%i%B>YDy8^W!*1!e#GMTk^qZdBtBB92I+FJVlbRe+4lJuF;$m-L$lz0eo#$%rZjD zG8A`g^gkdXU7$$>S^~8E_Hy3_Bd^zziwe#s`q4=tw{(X3L?yH^0#C)#J!&<{2wwDHwbmG+fP5>E#NTy!c zx`<7i)H$I^%yF>a4+ofM^(}?7_yXozbdB~-|A9GeMjxK+7O{PM6jw)j*Vud8aFq<1 zFk!cg6BiSTe2;1-D1eH^w%&hjQF|`ho}0CC_wM!7sA228ksR?4$#$;D^Q$O~&B;1R z9-=uA1RbhfWyrIZ)>WR`RGL@ucJeH2|D1%hNMyIezHoP=3_b7!%iMK{AKYT{sWq=e+BH-o*XgRU^ zQwNA^)2=?Jyy}mRL2#5|+|>7UKl)H{z|{?ec9E1To+~gpHdUMZGg0Xmw&;d<2w%Uv z!{WLxgb)`lm@}rOQv|no@%LBlSR&^iJUh6Z><(x$5*zgLnyBW0%D#=9yB0JY?uj5d zHlsY)!3{!(f)z}tJE-1@`acLQvUM6ENs;iE$XPHwellu$Ada$Dw|DgJ^6g~HO@yK| zW*kYmsK-y367Tn(wi?Vl!2;(uNj#zJ>TeO%MdMm6d59#J-`g3M`X>JSi9V z`dqH?bKq%zQ8!Hf)9^(dJB=50?N-eB1!80KaBn?8&sz(E&-7vG>XJhw1~IyAqN*zI zde|PYK`K>QJ|Hns=vA3#na*s=)liTAt1K#C4$WIB8uqlih?@RY*(}-VMNrdm+=;+d ztE!8ZA8G;SH+;hCDB)+sI>)*Y(H--?zWrIha#!sk z{=GZmXT!C45w1i`4x>#=HMn|NsPHfE;bcW+h!VwNZfxcCowSFmI(v<%Wr>;LnTg&> zf-}hJkth)E->v4V2n1=&NT;p(|1@{*aXsH}AOCbx!&GE5)=$2&RSs!WO@^qiC8ughsT?9i@k<(_ zgVRcFh{DK_94bpSvO+cb+LjF&!g7kFkQ^c=?&tM2+v9QHf83Azao>O39(&B@P@nJn z^L}6N>v~DcAVfwG5Q|M0XvTh!5aPJ6W}5SkF$cI?>kPsW+< z=yujTt_Yez5aAXf-3s__Fug6tZDkawq~k0#z~9)2AQ8?oNM6K>Ue@4bMWZNhyjG<^55S^fy zQ4$D`xU872kZ`Q3s;VeY^zk_n`w}`bUPEqX%myX}4Q@tWBsMMZXvC`v1X2i@lv-C* zv0oIgdK52{lBL~#`$ArK6sI!OlVC1@D24NJnARyzYF=!2+=8p)QVS^EPKeQ9DE(l>8Cm&>X>FpLm1p4?#?T)0wk zMdg?Wv=jtMEx3gfxi5WyMP&cz=liSDLQZd6a=W3S&n6}Zl?!uf+Vtt~)}46fl5UmA z4bwp@o+O%n-_FUYN`C^BRoZoi{-MDFoW#lF7J1$@^@_^@fS5H^BY)n#QXQICxuGZP~xYx%8!@27p;3~Q*ch`REZTT9YQ zPDi|4W0hD9VxS?4ygPjEl|%E#+`V)l+sJu8ofp^=H|zVy-X0~tsgof5UWX-`LYMUB ziMl6#`f2ybmQP}Vp#8xh?Pr{JW|e{#FFw3?q@V5xGj#=Vv!9)v=hy|o4ovtoje~yO ztcv9IrndBaN^AS9iFru6lpq^R-QC~kq*xom7 zu&e8)edo6N-0LmiDzN)b;58$UqOiM zu`|!JiK@p=7^$SRQVb+N=OF7YX!>Y5`CGQrr(UM;r;!d>eRrP;b;>oeQQ&}Y_VP&X zGDik6w-QIC=GVX0_(&fR99VB(!uOzIF)q~qq=e2XpIi^7tIb~h)dqOt;*CC>1z8TOsC^yl&hg0Z5&o7|agDQr&#BCD5Q zfOpIATFJ^%3a(eFz}=L%&6~RblT58i8=UbKjWmqU2>4>+0HLWcq5^o+=J3LEs02SZ{NAI%_0M+Td{q{ z{q)Rb&Lu)AD37a3hP9I_x5E)lpq{JIPqzg4pX{m|$3-X*g(9_qn`Mtzsi_h18R*Pp zdSXD)ytgRcg@q=3MFb=0QpL~5H+RyKfUbYPZh3YP1QB-1h!;@8xlIBlz-f`esq{F+ zZJyZ4@N3Pw*;gyhof~^;7yzQu@5%eNXGW&^d#8!)nhh^m0W=>ws>;HslU<|M;cO)n zAaJXKqTsAbzQ!vecG&)F&|<5~^^f9}N%)hK`Dn=N z0rp!wz~*EMt^AB2WI2?M_D6 zO7~?ENoFw7oDtqiIz~!<4SARM#%r9Ywf$zCR$z(BXkj7sP-LB`|0I^KW(csN%zq-e z@vV>()Q{v^BEf;T zdSw#3c5uBi@jQSopCP~my&UmK!5_8PnOpR%l{nItOs7w?j?*hmAkbqL<~zBC@*VF<8Vl3L$HCjn8jreL$(lrN2(=Y@xx(wq*ipQVsTyMSw8aD&I5lEXy&bN=^Q<7f}{`Hq% zhEEp7IpjZLqqx-!XdzvVDDlmSTdgU?o$Gr zw4$XtCEMH1TnAb>Yqq>ziSM=(%5v*i z-xN4OsecZvH6|MUcU~n~ja_|Tm&`wNVCaq<@Vo7>`x{kq{m;UnH??u{uqnDpZwjM+ojdrxsa!HD{%QU3 zW#4c%O-)1ky>a%XTk&7NMfzOSLU@Ll=ntuxLPBVcPzkY0%@jQ`tl-w&x^?UL<1*|c z{nBT+Lz<95cTk%WUFEV72Mk^Y`8|l-F`Nd7n}XC)x)^AuEWL*K>`mlBGrT#jSdT2F zN+~ZF*KSHonhxR{FEkGA$W3@qw1+(=<{n4@G+|HEcG&(GKg)Zm%s2L$NuepkFM_9R zjFE84(P}KmD)`Nl?Ucr{ApeZJ_`m 
z?G=DfJj@8^J&#<`3a*xh`lsU*0vIKh5?sVPSYt8S-5brr@(m3k!oO~Oo53yV!-964 z@ai|03mC*8j1bRR?rFhqo9}*qYR%fUUe!l>e(vF+z?qZq%B7F{k4&TyMLDVmq;hiu z5B}g%HF5ByfpjI2t}G|dqZEPMl*lQNDUDW7o)Z9sl}XCjj#Gl9?>exd_{+GPPr-~v zysOC|MO=(S4{x=(p0Y$xX!iY68z<7uWv!0gVf3%`ve|~+-1j^&KMn+Zz`&O@BeRmOwT^=`8 z{!6g9YS@(@Pz@&CKm2Hwk@MMe=Q=XrW@;FSp$5sQs?xY^+mr+S=iW@%gQhsi^Z7uf zf{^-^biYbeRNz6B)i)cilFu|e1^90dd$;^Wz@f-HALyFUj}6E;$sgCZ$nifo+MyVT zSpsyIag;3Vo!0xdq@-9<HK%k)e#3w_Zt0k(oij15i0m5Iq7AInK$Z3O5pdGW9fO(f7~ zJ?U@|-mRLVYj52WN#+lbugNOnLCkItd%>-SP@##7HLY-Q1)w)%+Z1^s2ow~rhHm0| zL*r3Y!wBspIVAl@cO-h}jH?(|h{hGctx!qHv8A)lx=UC9N014-?qbZ^hox^NGvzPj z>VWMq66rJg6X1kGgLPG_Cxhr%_3D+<6cHJz;qi17xf@{S_sCA7>W==B<#Az(k|B)SEbOxIHs;=g0Z!(2^gD((j$>;Cw=qg z%|_0T8yii)mQn|I(w@Y;ALP-d?jY_~-2FSUn$^@$@zt zLX;ZY+$M6jfUJ4cZIsENR*5p{8`AX7+))T8`qFk$N{Qiq88N(YsXJILbWZlc!|Ga-N+s#xjq!AcTCQ%sUo*-ndE za-Z^=hnQ*Fmq1fL_inCkxzEH@*QopOSFO%P8yXl`A0ACw!gK?xKcgWVWwsBJl*&6o zn=Ewv2pZ_cr=TVOusXQgAeZbO2@->KzA_c+h3np!`eOF94k z97vOSrQRd?{9$$E`a^(7k2=*A{Fr#QW$S0s^3Z*Aw` zus+)Ma%Uq2-K(NFIy$;pCcc^a_vf78?gH za!@7_XF>K1r5qJqi6XrjdrG`nARFZz1S)}ruM+ibrw6_PKFB3;!G^gaKnGIhiiWpenr~-qJ=-`ppHa>aM zs?o*{ky5VL#5rIF`3IF%T^!rtIFcHw?%Y`4_z)d5gEIBFRri(+$lNIUYZ{D79KloU zHsNdOG}>e~(G zAep7pYMVP}P5?gM;|oGpLO-C8aIb6K_jMrC=KCu|w*rYBs$41a>Bu4&bL(;>s!7ly zSbNwX7?)=2pQ5Q#Q_xy^{&05mmC&jmOljmUgIG=<(T)Wsiyb+ooh=i#KNJZvgez^! zmwDo18C9)m!$qL7uyB5ItN4se(uD)QifmFPmI)hd=M!Gz<<(X+8JiM*YShWkct{X+uGTJWY>B`kb%DbHm(a4RlG$5+%YQPs)@TM zCtz;8Cicn2>F|+S@kN#)(fDQH9Kwp2dWpR(Sqe(nfZ(TE`o*S|+HJgt%u7xe=%YB= zNTv>`%f{oCbdwP$Bs<2^_b2pwQmm;e1(b_<@jz=2%^xKZa^`YuBuG{R!OpFnp7}e< zn(znJY6+M`C%Vd#y1|qaz9w_JQ=VkV;829xs1#IkJaAJ0-M&YU?a{4JUt(oOrz78yy85PsmxBQ%6%5!mqS zUeF&M1$kK;zOnEyE50esxVj^&>k5Djf$GzpIpcqC`%>^DBB*F&Xv8S zI@31be_VPRhrgZx$zT8ac*p;B9jbi}tp|q0fBEr$Uh4jT{uWPODKn1g{JQNMv(FWN O3|EhK%^5mr>3;wR{1uo0 literal 24147 zcmb@u1yoew`!2eNlopXtN=GH622r}yK~NAB8IT@8K)Sm|1p!eIQ5aeUiJ`lh5hRo$ zl#mAL20^;+H~RaZbMIYeopaVbch*8#!;UZBc%J8d_YS|OsmefmmKK5_hTCdLZ3rSq zLlErT2`UJJexOeN0{_Xlt2}gn;AG|QW#(!L-8FM}ws&&3w=qBGY3b@_MQEJW^xYtcj)n9CgA$TBAjp_-8+l#FJ85~u z%UcJvN4>VH?yIIoMsxnd9XbKE@b_@r$M8EXtZc}p=d^TL5i6ZH3@L=^=x8rrHdoeB zW>$Gf$!u&?bdD|E>(Ynd*ROA!n?5{rSs^^aUW`FV+-}|8kl#>VQP>`KTa9-e^ZUk% zqGbO2@i8e|7lr=*dLQ)}dKpIgw5oGp{_|%Vas>SEN6ZYmg(Q8ZF3}VI9u-y_4CDBF zSaci$y72d?^;1;nzegcW8vg&h*o5IILmd$s%_WwTY~svV^-r1Q_bB^ZDfwXq2o=1tsNOibe&OxLtTeY%3C6g~D;+hbEh{-qyvM>@FG4IZqYNCiky-&Xs@q$ma4&Wc+Jd|_P^Mm9 zN7G-sR|T1UI0zh29BeHVl?wksNpOespc60z83FcoqD1$bt8j2ciy+(qc0jSgbNnrN z+8JdtcB~`sNKcSq_f5*~P^+9!;d>+qgKr*D*0MiHV6d6oi?DBa4<9b$w zxxlV}z0U{|g2_{#V<)&UheK=UF-kCbs*SomC5Yi$c2-E%LMMEk850z)MqO6ZCY(RxQC)0HsgIN7gs`?S;uM#A%N&S`~YJ*x_Vfq^$I%*{J)si<%Zh90sQ zYe(`bnMlaU_&j!WT;YK>Mv|4t5lp9|N1<4lG}-%5+=`3Z6&N8f)N*%JfYYbxC-f98 z%2W5t5qbq341KNMhsn=bfd>jukB597B;SmH`>^S5n^a>N1^-4n?nQ9aD@yhd&k5+o;w;2A+!UhQASor$(sF zYCGa@B0Iubd6>`8z1OjjM5bql)n0-Jy=c{EzI`hC9sHD2+zckn{$#W#d_6uaoqVU) z@sI&t+$a0IyH9*%@sX)sSnCyL8T)s;VPXD)@QT4tG^e9)!;2pvi&ZY(MFz4&v)_@9 zjD>%MUnvROTs3H6w>%uVft(3%U4_3fTed3+8(y98Q7_!`s%Tt4!R%v<)J9%xlXd#7 zGUXBzRx1YI&)tZ%s-t#vSM!1hub{IA`=xcKayRMniZmm4O$FgX$f@A4sjyZ_W*;M@ zKi3bD(seqPXg%Z<_T3RDbJbwehD$|2JXfp-YcxKh)g))sE4gWQK-dsmkz^=+QiV`?M=s)jQx3ds)3~zd-OD1yG4IgWvSrH zhJ9T5m+jz^>I%b(&%VbukZ1K*yP5mO%D0O|+K2kAOb7ekIdj3`OyRr3R)({a>f**+ z+}%_!TItN|$222ax8bfa>iopW-sLq3NGd`tYH|~f_*2b zqR*>T@mbAM$Yw_dCTD^yM3A>}`S(djv-UnZ){xgK0F1v@Uj%ei* zT4rXB7lHR0OS6a652@o!;c!)Zn41Lit`gJ@3O_)(<<_Q82scEIvE=|}?A3gjPp#<2R!EjO5f+zDXjBtSLrX{b526BW7X=lve%Mw*K z{h7mcdgq%`>*&J6c!^E=)Yx+YESPer4<`NVQA~|lhZme!8CMOhkgtV=x!>S0s1lBAkZGAbDDJS=iO;*1irK 
zgt4#5vw7Lglb8^Au__8*^v#Y_YnPK)lmQE)?BZ5_fbE<47`b@`8ygNmV&prGvVF&= zqXREvGIKXOApr;h5w0+w$-}ut$Z;@-KaUT$*A)NCLvfN}qv{U1Gc{%wIi-a%Hjn$+ zh}bM4&+vWQxqmSpT--I;g7h=rqr{OU* zCJm8_H|eXk5tylSux7^K$sRIFFp5Dl0d=}aD%?Vgr_ET%y!m#xhsIrWbE1qVi&s+x zVzyR-=r?DVL)2<`d8shKF-q26igAXko^iwHpwU1PQl^8{&Fb0hK2zM}ebI{8-&Us? zeZE_ZKOs@{RT{f}-;JGb$(wB^BQKy1vZ7DP5}k!<3PuP$2_Bp4rBn7unAu zf!Wo^@)m325*$Q==w@!@Bf(YJ8jMQab$U&gb+y=;TiANqpy2|XQ~&fYa<|WkHq+6M zL{jP8TGn{s#rIHN z11*^2DOoNa-9(y{R&Tv)K7k##dx1+y3~Q>MRG^!g_%?coOPId*Tn~1@IDU*i?8;eC z)g^GJ<06cO+w8H`b$z-O@bxR0ZL_C6l(@Z+MoFdYg+|6{6I!$Yu}Mdt5}HGAE1EK* zc`@G+sW1XXYcDiB)rI?`#l9ZIrc6P!o>#Zybu$xP8XVw19leG4{L>v3@WZi}OPobm zgFX7osizTbc|qH3F&QvS8|qs7;3^q*@0b&Fsjv71)M>bDl!wdAoi2d-$s`6EwGDY< zGn{wm!_{DzP?hu$T8i$bDxz*sGdipl%ConkaZ1y&Fgw(=d_Cl)X zv=NnCJueJ#8`*#+`>3bm;m*7-Ya7r9;!)Y@F`G}j4v!ho7+m0^LxoOXrQ%9!*OO zgE!>}_LBv&6sG01T&pKDCK?7tC(J}ITMc+M>GYQ9R=j3KwCus5p^@(<@O9pyrVN&O zDF0-%F0Q&aWU_Eu6cZ1vAb&winmGqKOW(Xj*O;S~v2tr<<8D|s0vjTa;?s9tE3O_q zS1g_kU2I_p)_zXaeA#Huc=sH zzs)>;c1=-97+%A*URI|>72dj-uW4^KA}~kZ&E3^^x79PZv#}?<&J2;K;Us7C;fQ1A z&{u@G|6bsJAHL}o%C}9?S-aPsdEn8sLpS+MfKA}2bA@gMKr{F0 zr9LNZbR+VH^$hdUsYR{E81HNh6?Tj66vx9@Fbv9qWQ8y67o%T}^B$?u39h=nZ33H5 z8=!YM$$OmTHBNYf*&g*P=z5h$#c&(2JqVFgu3!Zo1`yPx@IB4PbMKB;d9hla*w zsJzWW0;{m2B8Kdy!O2hoxxs(N?7qi18(AIh*Ti^~^Tsv{$U=7Zhl@+*c7`<5#YKk# zPA=kyCj26<`8?D*n?lUDgXc}I5f?-&843G1ThkdAgmDA_W&sEhn^b5gJ>BsFF5xmf zm28e5ZiY(`waWdn>3o7L7h`>jIMTJJSsWqYXsYs<{@lQC^V_J-hb`IyTFf?N-M-x{ zR=R~Z65p8}OE&ekNn%C4UM1%a>19DUevO-gw#G<((bk`?WXd;d@TSjc0%tzE_YR%t zu3I#&aZKyzc|Om$xM2O^60ym_<;fj)@kPvr$Ae+-wvUkmM@H2LyDsNV``6i+gRuE| zTUUDB<3*Q!wtD;~%*%r3%Z-hs3Lj&kTydh`sJ>L+av>cdca^e*Zct}`l`*Mc`$yod zWY4rRN32FjHoNt8V`)f_3}L%R+fWea9a*bD%@+T(EiBb*!(w!UZ9+ zwx7q=y@}|qkQ-eS%yV%*toZd*G#8g347YJU8c{Xr1KV_?PN|`xVg2Cn@Q^#sZ_wkX zvDaYYej4scjB}-)B+-~4hJK1@k!$?qQp6-BvO-4atyTA9j$VGGpB>q_Ap7HNhG9Dm z*81?^GCXbhM{aQZx|cMO$x90Bn97+;*Y>8C;oQlLchtIyWD<;o(Nhm6_0a|@zMo6g z`WMQCQ+cm{pDpRCDi-}yPvtB@Z2IPac=C`5J6Vt2R*Nqe!L==_#8L|QdVX=+#Vyca zR}ziU)p8*XJx!v_ReifNA?wbY0roj*iDn&TC2=Jyja&BYxslf+74)ovh1+~YL`@9? zlc9@pwMUF^kK}81QUq3p{T6P$C-mIB@6k^_`>cPWqhzugKiS~+^XvInd86vIMqI{g z+_oN+$6QuKJvipG^zmq^hl2ZNP3ybrJy=N8GBx{pnFF3fC3R8GAv`aCNg9~ zYe<&4DoZz0`$)3#kAeoPp{Hm0OHL`#a@reP)xrB&7kL`q261DJ31; zpu%zF%EgQ6#?H>pHP0D1%d3GKEs_(GWZY~=z`=K+_gbln?J5^dkP~`&2CTGL(XeM! 
z47QKj>^hQU%8(ZW+Y_#xzZuHt00f5beDd$B1J?CI0Nkl^4_yC3OaS+x{*r>bb)l{=1{ zMLh0$e`|B`rxPm&M}NI$vaH^y|3>>OFyyo7)TlfnZ_t6u90o>ywm0j3n#1GAkB2@# zqn09ebi_IgmG64Z|M+CHxzIb*)ZAPiI1_J8JMze=%oddu_65JUkaI1RiFXtz6(8^3 zyLYdK)2X&7d$K-BCh(xL6R0l_lq4m~%TiNK_xt<%`#@58rQkNB@uAc3c8i~&867q; zd%;FLM$?}jofH;^ixj%4yJc~Lt0lU>IZnDJalSG@J!$n6MU7>C;pUiwo;_}2fH(9) zW{a5z_W}hH8Y%DoGkS=GeTvxdF*3|h(-%xUPU2xJzG%`;pqXhHkiuVjNO%3cSb>VIG zly3oI8thdwm4#(RA&q0j&7#UWOi}99GG^r>3TQklRC|J8sxoDY+q)^ge_7B}FNjlgXDLFB z9)ozhEa4%pa4L&_eSN(P#0)FL3P%PW2`gfBB)6Ou*liK#=9gzYDxAi6N^QC`l7Yyy zbFk5wGv+lG@M^VkGIBD8UrTXmax%?tdxc@=cMI<`*m(Jnqq-767pa|!`}TG7-I=@# zT_0{rLZu*O^v4!Z^LW3%AH}2aV+ zx49UXfW=mBCpy*B`222Sw?MHf9sYUHy7sFvDvDcfYwpb@bz9&nWhMYJ#mlW!UT6>8 zza7KZYd2OOFq2bI?>pWaeWCtAre@v{SjbB7bHu~SI{k-A%h%VpjC7`&me#hm_TnElxJ$?SMbM1m}o1 zvwm9NBc?v4FDI~{_nJHyjOaERW$qwHe{94!_dGkpwfe#oEqZIE=xOQ=c0OHa{U1>s zS`Z_Q<2<}GXMx$yPzihc*6A7f>quKNwUX7#A8|JDV&0h>lJ}4uWG6g0E$IZvW)xkjvjfJH>4HX3?O~PYHvkP!p=?Pjl3&X z1~zw8P+A|ubZd)KW>58ts#Q-Oy!1R7jc*u!C-bgQ@Xb8vIfB#c4OCi zSI#)-R&@75K-0K}tN!{Sm+ah+JtGRkq5i159qiGjW>&eS$*J1OVnSg;?M^q7!v1)1 zP;UiquBr;u759m~`wqh1eYft6yB}8TwntOI4rh`p;{y+`Ec`4;mGj?O<7R!gtg`=h@Jy6uH)|Rz5*;#eQ9Q|U1}`& zs$JjGAN=v*DxBA{Cr7Vk=uo2mXm7F0vMpAnBU#StxRFUIi599D^|fmTNU>ooV0$DX682C-gE-KeqnEmW zWH(anp$T${0WnEN!glY^ye2Q1V!+;?!Ts%3rh}P8r*h+f&E8Dd1Fg@WKU1>sX=EKw zMk(=`di0`X^11%HK&Ui@Sy0JSM5cryYtf;mc#Mubr_^Q;4O$bvKHw zFrzZWlSyo)b>*UP2|zI;ue^!WY;}=*1Kk!tMWS|d|j zzq#*bub#E2Nv*W)T-OH%01V1Yfy;~M=;j%Ox=syPr{*1w`mL()DEjvw|C#Te ztu|TdFS^hr-U*!H^j_WLzFhr|?Xe(US%Aje+71s6x(dx2C4!E2XDRV+ibC{^N~_Iol3{qmCMJM;PHq_Y(}H}gJ#L0 zaWeL{wP#`O7$3GD%LteeEwQJmX+*HBbCQL5p303`o@ZvlLyjvWH*|ibhmr3@*b|y_ zTa7QPya(>YFpQ5OMmDM&Dbl_oM4or|HieqxENH>aYYGiAJ-hjas>o+*x1XE|D&n;A zT{kk49!gWsh0Q6Ttqd2L&?hyVf0Ac7o*npfmnAv{(f#_EVP>QH3?Ed<&Q66seqPmI z#gLJL`)s5m%|OuERArv0V0kQ0FEKQ#h^-++mR!}Ph(MD?)u72TA0XHjDzX=-n;f!m zekxPG*vUOkqOiRk`sn~2pUu5qL3r^QJ~duyH^<1&Mu~$FMgx` z1qBYPj}d0lN{aR;HF93I!8nK;d1nd!8B1-h2FT|R_jja-FEdPkHHc0`w!9Xbx&BA2 zq35Y$EfEWZhdABL8N<&ADT)`gggg5~Yb->k$r8j^TqZiuI>r5pnQHhq z1H6uBOBC<+U3K-1&FyV3S>wZS7Jh%&I0(WW8X6kIa$YOLgTSmGzln?E*E4ca*^=m# zciY(-a`LiVKb!gS<40Ob7Z?mi3zeS}xOT0V9RkKk=TWq^v2oMe+dKS86CeoMohb^2 z%gYsGk+p|gLut?)O3(P6jKg4Qqj8OAI~itw6^}ovS*_>1X4{iB6&xH)Ot@w@6MkZE zdv#2hpMUmg(TC{h=#XzqtE&ajNp5cAEO}6D^Iho4rmMc)m;GS&>BVqzxJsS>;+zO>Fus>G95-nBIom_@#76P{l=rC z2D3ljZ^ApiQ?@PjU6MXHJik<3z*>@e+j)IgyLW8-;QAMp08gc+-O!-8gap-FwKQ%U zmXUd!E;M88UX4Bqc^Eq58xgxa0Dr&kb;l8V8V?uG zvX+_h+t8sM!w&1hHU$wEi|VCANsrO@AJ&;zRph<*>rlA@Y}X2QeSA=tEPMJ zQfC$_N7p3Jpln?c;Ny{wK2=rKR65D$OuQ+Q0Kk+cC2{H3csjHKFkjHS zU&1{=r2c6SU^HGuMP<6csH%7dkWWowio*Ud!*g&W8IOgxs`;x?Tr#fLokHHeJ&gyb zY4&v4T~KH@0R;QAT|u2EVZe94<-}X}T1S~M+NYo{A3dvqvAKQ*h7P2$$$rBUU5?08 zGS*S&;#Ehof3WUM$y7Z2)1A|0}9SF6KXP~tXxm9F;vm%UO=v2oV4d^WHF!=d%Hu^JHE*|duo~*ZM zerW^fj_5HU9^G&VXlQYh|6X2l~lR++Gpx06QAjKnVm^8 z7f3L3XQ`yK!34m}o9&5GN{iqA1_@H9#{Ewdv$LVV-d0ntSl;=wKK)m3k?0#6!=K~h zo3RI`Adw*D6vzhUSIs2k`fSYfrXb92)RmTA9Xor?cB%p_k{ck9)SOozzov#rSC_N- zk*;Low_G-7=vehhB`{DCC@nF@b$;6&0`;ex46EJepNLyDPl@UmUtHh(X&U7S^7qz6 zQ^fY>>KI=C)!Az=7~aTwtt3@>|DG@)X)owg~L3)U0~SZ}vP8cdGq;YZ;K@%hBod>p|31|pGPFu;`9ebXBiRu7GWNP6-5<+!^!vL6QMo|hh`O}&k)FSGlpi=WR( zG$1BQIUV0*Qg~~y&Nl4?9P9%?|28RrU3=}U{j!-h=5ZRRa58NK(m{S$R8$aQ#7A$Q z4Z-~N*}SOsjJ`9ew?_Uw6^0BYvQc*SS<3q6XKLP_oumELXOokYCFJDf9)LbwC%KbN zf8nqAUyY%u!ZOs=I;eSwo^R((VId;&J4u0}y`Z4LJ&tX-lXvUY*8WlAY7<)0c?doR1_nNH&%iL|lGfIo(=#(mLNzb-rKv{c z@+kN&39Pyr<&M_*S!XTGtc$jM3ii?^4H@j$|MzyF?1fGkse!2aBDar=|Dgo6CuN+M z4B6Rho%tKv>_-CalJHZjcdEJNJP+8|*}cl`FOkc7E^$ypV?YMfbaFIgWjK2^>pO6! 
z8%(^42ZlazpK8Vbwj@E?GQUmZ+xravMd6k#|i{Z*hQ!bl9&WPci* z-dcw}tX=;mMBQd#h#UOZ0Chj)2T(GE_wkMYO-h!N1?nRZ9kX-x!=s`)1?|p=S%Kqm z{!nb7B=vgV7@bIBe>pW@*<$!@L&e6a)SqlM3f*j)iEpIovLme66U%4Tio$9)qVL49 zDTfG9cV7|d_7xMkx2w1vqc{})HFfo}>5M#Oo9sWNnR+es2&SHqusZnt zr(5giP@SJw7r-8oB#vowbg;`z$1eJExXez^SyxxL3=q>4ZfR%q!qnH8sDA3lks~Ka zBI{tLR_aUv{%Eg0HE>VRd!X2|Ofyw+*=_oro>d)b;vnd{HFFQKojyJOl|MB%28-qJ znD|12=ewJblycSP$1iqMfAMC;&X2+m3(uJY0PnH2I>!6(-aQP6^TR76HJSL`nZzJS zSzG(fD#!yCcz}qyNIv%Z_3MLJ5yO-ekRpEp9&J6eUi5Ca?5D}lHb73#b3xLx($v%} z1Aj_2D1H3>$-#lY4+v|EPW^@Fh0mRPj|RLZnMk2Ccytk8UWw< zY>#+d=yP*UzJMyq z?PS^buQl-UCsJMkBrvE2sQWrub`xL}USGD3pFSn~vos(|btau!^g72SuV0wWr zyXB9+!~L3Guu|Ix?eFf+1MeO325Z<`mQ8@Vzd0gaQ=Pam-Cm=v%#kD$3Rd{8scP8)%3>voGnfB+{r_7|FYDJx^h zlu0!ZgL3Jv?OR`; zpU>w4i)1qw4x(z7_6rcK-RSEfG{7S;z>Murv#@jzf|Ol{#|ItEeNPh;5)dE&NtfrL zc0Di-&5C0=pj2~002R#!6%BzAou677C~@6iuE-qB1XnTCwYRqqmyq=YQAOy&g~r)f zLr0qffa-!T@$;K@z1IdV4GjBMIY_RS03jIy1h+#V5Vok9cJk7m%(d|N<(dIap@dWk3INVJcGRn9>JSB-e|d=mtO6=a zrE6@=6n5_N<;$HeyI^s?xa`Y%EEwHeA&KjyrKRv^pC((P_A@?x+TGjVZyCY^y7ygF zNr{(VGil9Z(_1L<4)66-Bn;Mj(e=782B?KUNo>xr&Udo|sF9X~u>hx3-FSJXnDTLl zfvdK*_Q3+!Ds!QJU>e;d<`EkcbAgn`<7GW;iJ+#C*xb}~I0}-UL|dF#fd@&Hk#--t z31WZ~305r!>a(dva{YEQ@;uma8CzRhqkF262v%_#%tvUI6ylcc1Ki#JR5_^2nrw50 zZ0l45974k!Edzfe49`@V0&0Wd=rr&`_{VjAIBk+s6HFtn_RE(qt?NR)K2oFJ7O{9d zUevg{G~;kxtbXUCFBo;WG1oFBK+ma)0ay4mu&>{yrji1;`bl)h6F?O{%eZ`C05q{czJKPTkAA3&HOYetMiGdyXAH~jX%c`8tpVV= z*uv*tmHViR7cXAr1YzlWZf>qEAXrfhCr)%_rUuGZWjXxsQy5Z6cyN9+EdNxrA>>Yt zxQEhv*j7)`#X-P>CNol~#5I7(zYaoC`7EeJ6?h#VZkw{5I`w9(z!cweKkXJbr@Xgw z$+{>n1i{%4nTd=JTQzp5~^JcpY$1jLLK=D_yURZGB|X? z3@e>~9kq$!Yg<>|f)xG`lmk`-A7miKOzUe&VBP}M>FL1uJ3Lp0Wzqogur^j0{g$Ck ziH4{kHR{42<;o}&a+C^TiWJrBb(q$bC0Qjc{Qdn6bKa+mA}azdz^AulMTJ((nt(A8 zLBN}3Y?%hdwEdZh^|_H6uje?&=g&ZCp(L_R%xVa1@~^7d6ghJx<0DY*JJ?xKkmO>QuUyu0Pu0Or}O{hSA1AsAEJDn_@B#(mhH>h&ZSRp?-Qex2BPy6pV0F(OzDdbynm{|D4!o)Y5>1#rcV(V+RQ zl`oRM{`exd*yFkp!?-%+^g>MpOD#S|`CbAOIIaKh5rdC!bZ`*ScSbJ=>dw`ZCYElH zqc&q)h0l`a#(Y2gG76Xi&X<<(cv0o=pml6?%EkZlxWPN!Pj-JP#K$)yCBF~%47GS@ zL|0u)Uu4D9z26vD0^lF{Pe*`%RPXAGQGFVuwtFGHw(UL}Fb$`5P$cS+2Fi{Y0FT1X zu`{5+>QE0Vz~dKGBM(lUKAnDPS_bTe0VuILl!AZ8CMG62$dm!cH#Bu?zjlV#f9>;= zAC8MSQjHZTHD8=ahO!h_U{H3sl+^2>^xs(V-4xXK_t&4D1sb8?9}wH|@4WOy*=o-H zO%5SfK#qEwl$2x(Absw<{b9aw%>jOYrKSr+pikAO{8=plQT59J+JVv5o9fc2go3&) zgVM=$P-3+NT*Xn9r1Q&H1>wB^i^BZP`>MBY&5#uP9|L<}gg>=W@@oWgLH3f=#r&F; zKLEtJ%$TnWgBB7xMn=b;iaBaRs1SQI`Yr|9Jxr zM`rrursRDQw;YF0{BRpH@UgK0s>5lL2f8&hHb$qUq~zm%UHUWvnq&?D!i2rzlu~?* zubt#CnY`S$i78jdDP%Jj2TRE-oo)Neb*^7NG1D$3VL<1WTDnT`5Vm6v`WTg0xfy zeFdh+L=u>O_Uswm-(&+s&?h9sQZcp_&YzOSa`vo27Z_yH_&EA-cQ*BzRY%epHvk5b zK(>wAA}4^8$Vra(9JEoK@CH>@~nqsLC5~)e}HZ@rn`8p>R?i`Uqsk zBT^2g#DF|$U&5{463IOYq(^QJK=PD8KED6}xKEQAMt>A+N(48Sel28M|Rh2wQ zxF^@Rro0&}!h%NpmFXz!6rULmG+7}Vu;0kY$d0cDsp4@A zRIc&?0|D+9%=Gm1uf4?rr+VOg<3NYwn(r^N=mR@SW=+K2uKjNJ#aE7ShY4g`&Ytf;1>q*MpU zpeCeTU{ZI*Z*$?D&5)>=7*o)0o9U0Vw6q~07`)F|HXmb+;k)~SmDwI}Bt1Yucp!Qn zdFeWE={&A(MeuN%s{OH;fPgY6sYn2GT?g}V8~C24%A^8N8yMkiFreaI*y(V(g_LO^1QvOFPe)6tk`c9_*x(!p?whJrGz!fq5-GmCgQNGs3%z-rJ;Q z;D$(DIQIcC8U~V{lzzFre*f$9H-8w*m^f`bW=D$qhP5`r*I$NdID0Ehemx|m{QJ|_ zK+bv#q`5ty469y)#z8@%mqfNmDnThAC{yIhnwpXF8$^lkprx(`4gyg!h;ROYAPgoe z22>9M>_etrf+x{Z6R#1EOncDMytB7wPQbvjBW?ahb->wf0UOQ(5N!x7ehcVc+A9Ve z{Q&s&r$I3&*A%Zf1u)zdu&+1vLFKA@9-#WF`S{kj} ze`>J^&``qR7_fh71_(sayA`IRpgF4>1XXr-hGo)UMRtmROat_!xH05}7b&Ln0y8)J z_3M`pAkP||?d>7Uy#VMv03j9F;mxd^mPH31Is!8N3?>gcA|JPeVqhAs>m5R67)U87j8u3o4^~CCfrsd5Mbz)!&FlN59#;tj^icWVUs!Gfn(X-eNz(z@x$EA9w`rR2K33_K| zXV;Uenc85}BuP-9%6fB2A$=41 zFA(COtFSc+%qH~0p7s)H{ajB2BkclOK@2Htw7-4(_GSc|$cvXCba5ll1VO`ucS3r7 
zT;SL6ff8%}fmpC$t=SJU?*t@sf`II`)Gw~oGX3@I*O1sele+3#cka9xBdJq^UO~gP zK5p^6{@iEI#8X40-CsHep)Ih3EWh}yqQCbbi8TXB;1M8V=RsfEt246hkGsKq$4D}# z)$aOqS$_TA0)42XAE?i58wZ#AJkLU}kqS7-YD$nLc1ZMax2UG$_LDH%m8rYkg zn{ykAb+28Kq5bJoOlNKl9qab8qc-_b3Uf=-pa1#fCD0wX{Ry!R!E8}ABk|!!!#=D zz~<6rv@@MJaZk@+qS|vgf%W9c_rTUPKLmjGBd_Z~`(msHT}mU0j)Dlpz5PG)G2&(o z!RbIm-31B!Ddgr4C|8Kpf?>VC&npltOil_^H3F#Dux!{2n9JmLx0X_f5vbwBt2lH6 z8uNf;NckZOXu&Cm0sU8iMW5b>ap1cFOC5DVbK=g)uGCFOr$9wrD< zEk1H&=I`{*o=s2!h~qg))_fB-N4mWBg9l=j<@E>aZCW75$WAn|o4#BE8i?n;d#_24 za+F$B11abLWVnm?wFbH(QtCO&0K)1f$l8CvFbqaW)Sf@pfBmsdR7?E-5xIX^vJ$`Y z9tTv!$C3aWr0MJH_hx9N@^FHEA&ohj=_?PYK!smYkL_6if@=vdDxPI9+VZgR@z2i! zNg@s?Qw@Iy^^`|duC=2xU!f%l=qJp({n`HC$DGxNO^^3}z6CtHhjd=X7({3?wp_kaFXsXzo&W6Bi}*~Xp!3O1@LS=Jer zgSa=O>Wyl4eW=p^plHKgH7SGMw)e*$0jBv)p6{!%flAL;lP;e%-bsLc&o!x~bIZra zhZ2;0LBB@|wQWe9a7#j|zX&MI;?G68P-{`4tzp5ZOB8$V+$1dm`{(L=Nv5OI=RUpx z8?p~0s4@H8L`4l2oQxiXcrf!H=fp{p%o~K(3aNw*?vuh#UKrG%@wUhqIZ9F3ZV*U` zVMd6c8|5p5NC^^XSWOl%EG|$4P7bT39~Tep4TXk3;f^Yi5y3)kq}jl`1)6e3%%b^ib^Il`pqV!mu^rtqy%b zau1J31>n(8v5v=G8CFpK1)nUWBS1ZmlrUtFh|V=V7g6 z&bQjXhlbTQCySt|VOg7GL?owy`+D(xl#5r&-}@p*$IBrN6w8bK#DAB?<;-NiR7rml zM!{O5sD?c2ZF~IhK-O2lX2GJ-3s?o13Nie%!FS_dq1^N$gWmBc;^zN3?j1il;{6Vk za|v^mY4zVaW;Hl30w10dg}eqtd<~E*UT;eIFuIU#rQ3g9JwB_tfc<^gOG-q?UI`4t z0u^(;5yWUfiIDs!T+4_qM*CEBX(9LkwSN6F8Mys-%Ky%Xr;XX*u3{DCnCu06s02J; zN(;Sw(0gfRnEKy1EvNe+VhYaJ*Z<7FFFrM#Bh_cX7OTtmkx`MHjBo+Ur#ktBj$9Gd z%adW_AqU94 zB=!}m_aSrv7EI3ZA5Z~~5J;gX$dY3VaB?n3Pl#j?Zci3-4YKlf{;&DJ;BUc%koDUg z9_Ma4dDLuQ^>4ZZFKGsMPmV_~6OgO#gA8tQ)@{(qVD+OydDH_=>PrR)~tAaUFjPwe9oYtxA;) zR4p(*JFdmJm2wf#Lk;z1|8IoyoiVU}U5K`uLHq;OoqZEw#T-u7MN6n*k74?2K+%~X)FhIzV0>PN zoxtoOECx*6ixu;Ng&XAW3I7{dru;5f;q8=fLmhO`9y!Mc+)!Pi(!U^ZW#bVYRyDk} z1mYn_=%UzV)zg6ZGZjr${`B9~V&*+p-i2vg-<^w;ZWYMJ&q8CcN@@(m!8q;-fkt=})B}aG|3vZ^%`q{~%BW+g zOABcWg*~Hb*g~xR+XY;!g$Nq%ni+PL3_>Y8dX zOO0+DRSF?fjv?uy2_93ODgPQgS%2K z3&_MvZfa>Dm)n!AR4w3Wax`Rs3rtVV8xX6sR>&HQtw< z{>}bdYj=$E$$#e7_Yy%KM1EbeCFU{1BKBpW02ls zO8v~j9M!ZNZMKZO5v_8(*Y8XSr;8>SFKuZUdi)^EIysFt=*&niotDrz@^L7#;V)l) za(mhLio=;7y1}ybJxc%MmYVRa1!qGj1LwuQ{U0ENYWc4^!vFlWMcFzqY?%A0=uYHR zA?al%F3kF3@Vaj`_WvyJ68Jj@<^&`MUbr)E1u;R=0vy9v0Z9xJMGfbz9<@|Qz z-+)dZ^l7sgK8N&|R6W*l!Jt^yks~^M%Fw+W#rqCEGx}H)oj7TV# z+L3ZE%5)-^F4Kh?np29D?=#eQeb@T^*7>dP`hC~>?muVt+LvcP&*yo+pXc*=SxgCf z=V6V{LOx>cu4b2>jz1COjI5m*wsPC&0XtZ|#Zm2XZ$B!j^TB)+2Cx1yv8aSfEuzbC zsFi0jsWqS7mpp79{xJ=CNz@3h+&&A-*G}V?i*(xldGW_z90CsVY*-snN{l1p65D*5 zOi;twQwg3e0dze=eNOgQJwlUSZ16AtgeexfV3yPqbzWoBf8e~_w4O`Ad_BUN`f#^W zFU|6w+G+~M^$9|Q5g8w&B4sEHT*=*ncPg)4gC5t0M@B}5hqr^wn3mh`+_|(3x-o1? zWo6?EhD0pUE>s1nW|xHW_`^rzui=*dOak6T1Iruy(nIM#8gtF|XfQMzm!NG|t~1a! zsuB1~H2(}qg->VVRSLf9_(fEj&s%&I;)0Gc0%-U}5XF8wF_8%5e4>Lc31Cu!2gn1X zv+x9-qXNDjtUw+CeGo-@BNSLkp8gD^uO!;^oIX9U4LKrTNU_($t$`F7eeBikjeRbU z?V0#gTA3R*5TW@Q!V~$_@dl;bUWu>**QIckD|RiWUCWDTc~wFWa8Au1v&1!^R9^bBStioqk-qApOrB{H3irgNi5Bjk_HuuGLZ`sKAC2ms zi)CpSp3m8`BIbn5HbY&m6*|5n{C_p`|Ax3bFA#GMMi~6f{@H|IDE&`z@&qIM_$t5= zX5hw955O0Aa|&);fVG&oIpVxC6H_Nb70-GX$p3M*(4jfMqT#?NkC-9}%~-TStOYm5 zX=08BbTBi?6o z+orzBXuMBKjsCV7uth*KRfCdgfYzh1C}{W8)SF_CnYAmUFR>1SLgu2GkUDmJEu%vZ zy;?75^hc#xy#28ygdEUF4TEeRMpTkxDR0z7#{b%C9KsIQ;{DG#_*8aa7v_sDsvl_? zeNt~N;y_%WlE+M|*plszjn)R`cif>KB9uW5u$kQJ0XhLp8`{uN;vR6t1X7dgZ>sU%rEib@ir5V!>|k{C(SJFldF(7L44qY~-{jI&Q<3 zy(*lqOY8>y7&oQl$R`ga9&Z7&>%B*NfoKtY=Ri)%m^z^jyvUP0+e@SR+0k12wOH{= zGC+cnDnfUr%uIF%J3(t&$Ed*QT zeNU#TjQYG6trz>|lVW&leR0BavmU-tHE9G(AcT3sWUrh+)o! 
zEYYnN$zr$K$m)@@oRI)dh5;o;J7`r^k)ZrRm{z}#w7xwa46ze-qv%<{pN9_Y<9a^h z{S2>xmdkD|0JHiBbYa)B!C>v~*_x(^NwSt_V?`>@2Y{D#=FRq@Q)qC!Fn7~P*T`eG z-4~HBCqsvlN$Pb1$MX7g54Go)HHc&RCVdy+7v%LJllCbQE*(_``*JU*0i!)!Pqur_ zaE7U@T?8}*-+5qvjg4zf_h}F%=(D(Tg4e*n#^eSKaK9m>gcnI(OC=78u~Gq(VMN-g zK$r_g_P*E!Ba+`y(={MPp;xZIU_}Sn{9xUOPbzqoXzX>dgl0?0C7)Boe+9SZC7K0d zAFO;yfBE*z^Sf*PPH{Vx2}G`#oyooDsWT}znn%JF`Q=%1J66-3vB#{C@@gA(F_~Yl zx5@AB6`6~y`2#k#X)yq8}< zRQF{@*{TL*l+Hpb-|h5I;(C6tf&d(Geuc6<<2&Lz310;SHAg;o07%9Pq|GZjm3C%xK52yubrBP zUj3EPt(|Hn=Ge-@dAQVcfTYoS(V0;|f{4wHAQ9|V)Q~%8h1-HfAdVd-4|8-#`eM!` zc(T++c0DIeM2@`qvcRqru7kD$Sg!U5Z3eb%LwBG|GY93j=xF^LyvhJ^9V zi!1e_5A%{7qbZq&sG{)OcN)5K<1$A|h!J|2VaR5*cm>8TX9y~;f0OqoLK_)|9*pji z^01)9x?r^JsgvF(hk#sO_Tf4c$}oeGHMRJ@1eM<4WwT$u_N2r-m<^YCjxRu!)EZ-x z9QW`H%Om2T%7{FZvqh#T%-m3^2?UO+u4`B4IF9*%To|IE-Oz+K4=(C?W026e4Ct%d zPX+@uk>Qm}1OWHR*GZQuUaW?+!_k?PquvM2qXj7dAFiT43*(pq2u?jUfzM&tp7>1mDE}3BfI89xGK^#k1bsW_aro(1WPH~w6y<`^ zP3d`6f*|c^nR$I_j}1SE=}MtV$8(Jvy-3i>o~eLQK!xpg{@3G#-c9>RldPQm2^Sh+9sqEdBJT<#LZz(1l#{<45`>vOl#NmBw^RG~@od&$3Hw4q_VO0*e>)?37T%BKPfgufV)O?@ zw1V$Snje=YpY0ehj16vd(JpE!t;|TD=xY^W^Yf3waQmgj?ayTI9+Yx(g0ZqsG?z*pSb_kk%6 z0TkxpM_gb{`g1Ni+1HEZa$C;t7eB9*sjv3h-EqnocF9P;*778Bphcmke35CTY4a^G z3RkI&53?jm&y(*L#W_rV!(|44g~>hj4xz>OpAD>{o5q6&Q!kkiL>XhdWfFf_{tY?|V0~iqxwXJr$ymasC3}X~FJNV!3WxxB7j%1@C5Wy=&KL3)S5u^WtZowV`G5ga=T!P7W@Xe$YzI z3NykO*sa=YNR4fPI?77v1zjBTj5s;{uES zY76C$@N7b~aW7Q55bpsH@{n#j)qV#^6ir(QSpGsELv%q+WLJe+(2TEDgYQ0L%J={y z0AYfqBzrRWoVz8;O^0n71K}5UzqEIJkLDk8wUno85M#HQ;R89Zg!M|v>;}-7iXNC1EzpLB(@H2{4`5{s&=}Wb_gPHrmG&Euvo=F$_lFX0B zQ-i;HwwEyis7=MKtEwM4D>q%Q2nMg;=mBSea;S3}oCI6i(TdMFALH;M{zsMW^cj4tP#}Jn{y0%GUs~|QW O04E0*M!^c-z5f8-FRl~- diff --git a/figures/nested_cross_validation_diagram.png b/figures/nested_cross_validation_diagram.png index 6ad68ec6a4b26a1e1b693e4959d892140fe4a547..559404ea5f835c57248eb114ee7e90652cbc84e5 100644 GIT binary patch literal 114661 zcmeFZi943<+ctWMGL%YDB9SH;qfkNvN~n;bB$AXdbD5JeMj~Sr87j9VMIuv#LPTUt zA<0acBWpihb)LtuPsi!6e&ir43l|GTQLKlQ6*Vbp zp%ebs#@5gwUHCE7jjfM7-MedWjF0aa5^Olz3F>n~814fzHC)Bb?t< zA77-vsd{3Is)^OU>zM*(Y{}!l=W2)Bj`qg(W!_mj+~|0xzry`;WQ}{=hgUsrH9bOp z2?Abz|9#cp@S)N3@&Elwpc#2Z{`Vi14~xH2Rs8#vbU*6LroX?)uX#*uWCh#buavEu zo&5j%yL$GOd4K;(QPDhr+2FrFG^^+5*jQLus`b=e zWh|&4+sJZETJUr}URUbMwo60$Df!QzOI;jlOR>qTkd?VOaD1_@lB3ABZOaP^3P#4q zO`iHHv+~F~sn76Kzj>qXxDzjubU#yzhr$xvs-vZKEqKpwPHyQ74Al4je&gZxd`^k8 zegPX2XZY|Mi&qI#>Un)Ph0gldSH8N+m|;@6iYiUDKdh#v#l0M_kh1B|s_#q5%ac`$ z+D3PIc9fASYHH%VJU71DWn$pOg~C1av(Jh%f`Wo9HXh*T*M63joUn&|9~#tgOtm>B;(*=T_I7pPldODDjn)TE=&|XWG+~3S6^&k%(TV=i|qsRqx)7 zemrTdO?&NKb$zwi6OTEUB|I`(xrXHLKNrZ|KN@%A<)vYron;>$9L+dW$}-mbfpOKU zRqtzSuY`oK4GavtPdvEHZM1atPLJ80KYsj3lE;%=&ajVGxR`RE8Qc0S+Q3=TIWZxD zJbiLuhi~t=a^uEQVPWC-+17@$J$vT&UA%ZPd1#-To9wyjSmuKV4{~yH7FAU(`IPHe z62KvJ^VY3nDdIGK0Rg5D_YW)hF6DXu!q(W-!Xi;4@xb@CobBWqJv}`o3|oVPg2IP_ zf`gY)eB$D4pD)kL>7P1P`@*(Mvqwec$dR(y>7S!z+}6sIm34J1?%usS)%zgsW_Y-^ zQI)pyV^x>2?`*4tk5iW}U5bj1-gojO_vX!;O+V#Ix{UXETNlo)x^m@;fS{n}jK{j& z7u9m5S8h4-NGmS!V6ck&=A(D@zqy&43rbnHFR`|^b{J@4!ymu@_|YdNWh>UX-{rY} zL#AcRbY(t9zI8R=ICS)Ad7ksnF9QQh6ciMqqN3uB%WoWiVYB|Otg~kCu2fg6r;{5` z%$=f4o1dCQe>OUL^k~kUN;<^SY_KJCB5+oZ%bJPV6hwHcCi?q?|!Gf zaySO-PHV9uRu#>jO7c~6aL6=0e|`(ka9#5CsN1*ox<|t}cz8rEet&pF7O!m1Q0K!oCj)|W6iiU+}>Rf z_wjXBc;5XV>%Kfjn`oNd{xnddQ=TP$wg<$eoo)|^pw2kJ%x{iUI{ zSDtkAH>0AMsPC=WVoP~s7!(vrWTNM56A#*4x^z%ea|Kn@(!wP#FK^oVT*S`KZu#=% zo~K@3y!YhEa$JX^j?OBn3vI`fb0j1r$PSWq8lkB7k4|iNo9ghjv9sHcUznYnvmN=e zFL0e4m#Cu}!BrMD$5d={~C@yW^YbaZquF)CFb?eCFq;Fy2 zp3bhW+6PB@^UN?&-7H^d2w-&>p<|j 
zpxN13mbl1@H*enTogDgvPC!S!Z)_}&5;uOI?|Ny+g*MI;Cr&UeUR;8j5fmeB9r1GX zZ)z$Yi1P82le_5&X2EjYL^+81-`;kqo{n>$+Da46;7L(F`qhh zD(U|HPSp0D?lTU_Izg3*DJlM=UFF850US=hXLl`KyEA@dd>MQC0eQEnZJwRs55%m0 zM@2^ZdU-9#G_74QH#2_q*|TSv=F|8iCKi_B_I%fnTeq$b7tXK49WQ$j=b@L}B(!1k z=0)_2mh43nqT(I-1A7^K0*D~km@MGKXdbOB=mUAABQ^w(8{pBK0 zx-&XDI?+=2DSJ?0VA4g#W$T2cS1AcOJy470&+4sB^hI?(asM4YrPATU?=%~=j|n-^ z`Qg@vxCT!SG#@h`Pm%Ta_pkeDVreOSSKeLEXm@jf>EXkN#bsslsw6AE&HShos|=d2 zzO(y`ii%2f*CyQ`ix?QfcKZkfbmlttCr!Bjnh4*0AOAvKO>Ve6G&E#Vy_%if58I~h zI2xhYGuq21Pt;>~*`G`KW$=k35BK3_Ss$t^CogaMlff*D#@9_vO`7E@kH=omr)!B= zHw<27T=p8fK{-{tG3})8{li8^_jbBYSejMcKO7TK6SqgK;eKw-_>b<&7v>5 z-IbB0v9YmstpavGza2Ka`+M$JWb4JQvLMY?JlTS(=`Ba^`t{VriCqeeisH<`6RjH# z&70)WcofZ%hX;ML1?yI2Xn4M6&%FLQZJMm3vsK0OLc8vOhxhKSU$ETK(Xq6|cUh#_ zt(X|08{E=IxXK$Ew1ou)1us4BE3gJv&=~xK_LUc^{C<%0RkzUH&F!_9mzQ~vZf#fh z##ZdUBzu&_GY1a%6gxk+Z064zM0Lo?c%wRUKPkyCH#fJ=am&Vy0+oGY=jQ=MMP>iK$ijj=hh9yy*xi?gw0d*H4viEzjqcBU z`gFyW&h!K!m7Q)BE`PJ{qch%cLc{T?{e1vps%m582@6e zsx$N0T{%}3@lP)=rC0pCgLT7o{@28yU8|P#D_7L~-n^_V$DHmhd3kw)Y9l&2hxKEg z4yu+F;)=byzJB`_VAYZt(lYbwSLAM8$IsGWR}U}R-jYNU=D6iPsHdyD+R$Tq6S`Hu z!*6Uji=a<~Gy&0O0SKQ-NWjM_mmX%2vlC>`Ioc&snK2-@p51fA)CUma~?Jr#8z%mn!mK zxkcrU#ryaxWA-*SHr-z7KBQ*~z8Y;lAE$l%cjDIrC&kJQen@V&|EA_$bBm8@3Y#O9 z%Wmt*>N?Bgo8F%I_;|tKV|ODzcd4}3M7obR=t@di zpVB{h$l2L>r`=a3s_6au<#CsP=M+Dek&%g#w1^n%i8OpBmRkL5Gm3VD{6|7)HO0@L zKi`(?xEQzSdt*B9kvREN_akT$f-Uc6yK^6HTDc6jS2#P}$7j)^C7hlp>8JYoySue} zo3JAI=h_M{&)c=?oj7sNs`dE`_g@#2%hS;*UBX-oPCPqTR|Bk(Tn#9+UQ&{SLV@=CnprMqw4(Izmw`vu%-xZE7yQ{aSt*wp4 zSnPzfw6O4_`dub>%R~L+`gUIaoj3LqC66X|H#OBF=g?4C82jtjuPK4Yjph~>Cx@KS z7JLc{nzl95r_Qz_-0`Y3deXYE=+OCV#z zF6QV@8kC+4_$H$3v>slR4G3dI^euRLPGG_*7`~;*Ci`gt~9exrMXOv ztoQ6}c=S~JYlD%M)u|z8lgnb6nVIit+Co^sjCk7;T)5HMvkdVwHRDBB=*4Y26)ivI za-^iBO!cLvk2L5S-c!21iU2ckO~RD!fqU8g(haK;o4#vgWORj&eo@iq&%D@%`;?WL zWE}gnX!({LFM-PeUJ`Be(I$mYJY`t8aA9|~*sIu|^OEP&RUVwY(Hbns-2FGONpDYp zjvJjoR$s>1&lNYr6RC>F8zUY_j0&X6nDEkKlBy~zg~ofl7q3pRXncJ9x#b#BgFL2q zr|$1p=oypJ!(YC9NnpnJk7*~fzCAuNAXYBox=!fl+6ARG#*V{xg!k!anE6-KH8zS( z<~=Q3y|?)@>j|&b4}aU|gIkj|07x1Z8fv#Qn39uR;(}G3I)&-JHtd=kZWVBWK zLbJL48a(=`U%%)Dj(m$y8(Fk)VF%hbHXgjQ_i%TXhNv&W*GvCl*n80)ZD+>J z$j!QU@1FAE!{MQMfWTQYmr8tCljGv{TsAP@6d}9m)}~FrbPl{_I1cXVJ%}GauAj0UpxW&PQW+vVk}%>g&hW z@8RR4u!fc^vHkx28^u8u&*pfyHANM*S{I6><+f*8td0;fTq=^*IWHCV$otVl*%Ct1 zZ94W-rRm{Vx~f?Z4Rux*c8L`ZYmKebHBcKLY*C?ASm=B_(mZKL_@CjFhA@#7jLgjg z$R-6gr$)ZMT{$+`x*B&VOC~)$JbdgYnx2rQ;nTy;F-dQaJw4I|xQl;BOF5=|ZJe>l zb+`t6i^lf^3IGA#85tRx?p?F9ZdI|np6PWpo*j7FGYX&!hGm;$6$cJG*zmGwzVkBR zYiDOCP0m0mY%X6wSXj})VTWhuYs0q^~(aEe_<}vkm~^)t%j!Y_L_%~LyibQxIoA_~t!HD;i@yW0pm_mAG_H?umW zd8ex-sp{7afq=T70!Q6nJ`W9Ze+MS6(wH#dx{LG>u|G0s!-gO($>Nk_58fxMuBJ+} zj{ce)4!(Kwre4g?AnEp-C{1?kY)WjG=jXE93c@uc*4Xvcr`WY_-n41GtgL>@sKrV2 zx#aXDvY=XQ(xeCu2WnBgwfQirr-|W_u!z}~%sjH8M-nA!9G!`FLx4iMZzy-2c6fGn z_SmyW--bXGlU=G%1|oIF3kLA7x|)fJiSFtRLJKuw!kqI_AG6wAawOfYDrT4JCV|V< zZN7oN^~i=}7z80XF5F#g3qOCtj5m>a6I?*182}~2LKUBGv<5~#y;phEeuaxaR1q5lR{mHqj8yFNcI{Eo>qU+=k zZo)$BjdCD-bGOHUogIVOpBJg;$qatF^f^4n^o85B1OWZ!x#{loE4n_Y2ALAugoG9k z7yNqM_6|ks=TRb@s016XE|~tt4KZc`C1u&B4`Y0#w=a@^0l@dT;e)(RJqL(`t3Q*7ZNC3QNskDN;65{e@vjqs|co?KfT-z0hJRNn(s)Y8>|DGeyF!JPNhTz9^ z!(hAt|JJR{TaLtf-nz9?e(DST&Rx6KZ`r~KfvG9S{y`=U>g-!h&EqHb%#P6C;N2r? 
z#oqAT>g4~TvfSX=?SuUp4;j+N)^^|N)2CSt%*G)k82zJ5Ga2MLUBQQ7+kJ7M2;xfZ zJq5-cmQBk+_R4`NZ3bH;W98ksnwy(}ZZuMC_wV2T1YMPro7?zH(UtG@DJvhPrQJ(T z_6AC1U|{(2CS-%A=23(F`MT)_&|mi%7_1}Rfx?@I=p5?r?^n$_ai#P2z4hzYH$5?; z*F1Ktw(#=hvA%}IR~NB*1qHEUJBCC=_>7F$=)bVprziQ_?(lKYV#}6HW-PP=cs-rR z`u@1Z#eS0C2;pXem#`bE< zyM>&G{C*s(QBmVHB-NxO)3-lZf9$YoVMo&-i|7h~FA-#KZ8_?5{rX}6z~x+A0niG+ zyuGyrD#(?%IK!aVC-0)*-NRbKYAFIZu?6X#nwpB&PStVx@s)XE_;WD&XQD}E#5O6Z zvhn`L{;w@67w$`g!zU#r71h?VwY0QgyPXsI>G3(o{=D1#+$0z>pqkDzX-Nl zw^yThBtM6+rPCz3aigLog!G(A)rt4@_3Y9R6KV5|yl(0#1^5J=ou$_~%0^MmwhmYr z8*j+2AtjNElT*vw1LDspI*48QG_>2-_*8-e{R0ESWu>M5Qc_agWr=}!Z(CsV@PIk8 zUH{W*h2$Q@#l zk%5QNT?Xd1>D$+@T2ps5YKN^&;u--JucK|Ubp1cY*sods2?7Q4zkjimJLedhe6IXG zH+=azK2vuY+yWs8!-8uQrP}{t<~9re7&zxmP5p@4`R3;OotK8zqKk;1E}~%L=nHnJ zuhe^P>Hq${`aoOGN+NJM<&*$CzKfCeefDfS zP{LKH;IHcHo@&iwRlX`OXQZQ}gC@Ozf5V2+sZnF7XGL$`FfCoWbYyJoHR>P}wxG8y zQBr|+oAR8uKQ=79QD0wgr%Fc^MH+fA1Ami!4%wp^K#L$is#G$5aB#2|04Q(zTZ|Lx zdve1yF>VqDuds^fF z;Ox=Sa^6IIVDPHR9C_#fqWUjj=urB(4$PCopAW)UQPj9b+^$TstKiwld0 z5EYw0q7W{QeEt9S`X7(<|KzNKh&JQiKQ}=!X(`q@v74{@JuHL6zHhk}&m!%WEAzC{ zu|SCfE=0m^WozM+_5Qhb?8rysM&wl9Cd!Z%_iYd05yq?ud(v6IWx$ zj^hp5cTg@HguM8a=O2D-Y}B>gL9Ta9=FQu;bVa3$wDiuMyC2pjDJiLhm%o>qTG`0V zmanO(p@`=9^yyR3V4uLiCEt5{?*S{(mIj3tG2rD~TwL%1SD<%2efCV7XDPXq{6$Na zEP>-&vcgD9#l=O2Dr#$63ll?!yk%B-Q4xjD@xHEZKXj+M4E{2*DE#vZ z$Qt~AMP~jyD6Kl$lb^>qF78W_|Iq3(d--$4WU5MB?%m4LdKT%r(6GLrwD#-0R&pcG zO{{*uKI-HKe;Hpx2{b&Q&F$13pRp0G))xJ>v}i6y#nD-NcwrAMI)@?MOl5pMiUDmH zZ=c|@Nkp1k39@@Rf8tqSTb2j!*^|^?Sn#tT;eyUf#^TSvtZ`k)^4IZ(n zF77*5_Xz6VIyWvOU!Ck>&#W@er>VxS^hN18t>oc4IeMnz)X9?_U0qj!c}}Ad0rRAU zh|9^z*`nbBS(1dufwH9A93T)-7K_Wu7J=Z?mij(C@?V}P>Cx)u<&04@nE-RP-$&@j zxV0|F3U2l{K6BVM0-d)FZCwXuoG zLRkDc_Bz_oydhXK(bHe$;=bkg@c#ci2>M0KZczMK9PenCOrd`xvLpTTffg7_1Mo{#B; z$D9oH1s8VwUep!+qG*KOL`(doO+^QGXm9o|Nfoi zVIU}s{Py8p%apxarckbR@0?3GsYuZ5LxUUvXT)!N9hDD3^FK0;c z`2iJ?xDK3LT*cV7zy~5(wasuK-lrSxL6Cq65RA~g{R#@*H$+}G;$c8Fyc!%F>~NX1 z_D!l00&YW<8v^(tqWcqN6)6xH0Z*Pk_sPzd#-sc?@wTdppnL=;0^z2i7%uzx@gp9D zXL`CAJn66-S074YA+aoEWJHTzLwN$JZx9t-3Q`2Z?-LWlU0hN!T6>T;MM{}0j)X?V zGiUg7FAfIafv(@Zn>#^3XNsge9GeuoX8=sZ?Y=5Q{d!ehy%f=vkm|YLzlNivthg2| zSil#+AA9FcEwTm-jEr5-L0%JUKQdA`*%?;ZHb_!Q4<3|&8Q?w>um`<@W3y!Ct+=>g zK!z@W=mSPZ>({Pbdov{D3LNl~PoH?8T$s$X<-4xFBjZ>MC4tzpVuW;sBket*nBOh}gTF0fM>FE3bvgp-qak*9>A?@gDrx z#zw{0n4=_20wAN|lVw;mikh05f-KX&aQ=BeV8frZqs#d8%nbkf^>kQm$Fs~i*2y|~ zLaD@GhyUWE)qn~4LcQt4enc!qFf=suJ!m~uDzxh;J{~v;zl;nQapFMJPzA)#yk>$Q zxfD9LmP<91sE(!#V;Hx>XU?1g`2X_Nt9@ZUZ{;a(G`t)H zf4}O^?$snrfEH*24>Bn^xdSYDYU~4~Uu)#Mq^w%5ef@e?)@6(dYVHfi{#9ToBwEu6 zNKUw|K!9D3D!4U#ZJU86am9lNA(3;xaGeyjw6bEu+kuW5PYg7V{QRkajeu(y8R<}X zk(09?sucot>P{k+5h4mPQdVHE^pL;5;OP^sTJ7**Hcn1IxFgy#1!^%;tOtYFd1HHF za}z26+Jwd5^>VmC0jadIVBX;JU|v=C-@iW3W7jR-}0_ry*lQYjywF(%<@mePESOQ*|(fwcCv83l;Cy{ zF36BH8Q}!;>FJAC#irRfi)WOkF^^QgBbF1k2&(P+&t}c@o$g<<292^`*-foHe zl0Vt5fXDvWAYbrO0iHEc3^;d~-gKw}6G1y$M6H!|S`Gudgi9mw*l@|v%&F&h20?b}NbPb6uZ-dF@L ziJ^w7umOo2xWF$Rj$9Pma`^T&{5Xv31Ij)Wk^}fNq1d>Gq9~$v-abD3($cG8Yc`me{&WA6@lX{|nvj1`cN9>REGsX6(DqZzAg_o-6;Wl0Q}@03*%lDN zZ6Mazt;jAfKs{7>XNJ(orO)o!Rkpgiy5IZyuoe`0=gz^7A3;VhY!@ocB}(Af#J=b8 zeQDY4g5gDB&fus<#>R0ePP<@CfaDs_7`KW%h|X zvd(tV#;ACLlX+O36yJso^jK;kyN4jBsctGTLE?#mUwGEUqyy4Nw;`t|^j!sZB|_sO zA|mWs^L|$ErUIm+BSSt_;$1cf38fZ0`)M6gZJ<-1+n9RwLxR$EiMWUgbagrkAEIAa zlr$E8#}x=}aR}*rL8ilLVqgtw@HUq*Q*yDaTszdwg^?E})mu+*)_QHeOl)i{``yLM zmX(7Lv+?kp8sY@-Fhy`8xhFF_JJMBuIYU%T3<+|Q7sKuolDm%$(bk0o2Bm&3Ib~qJ z4cwol+`-ls1!$Qd4?R_NTXHc3Fmd_0v13Gb=G<0>dR7CsLS9`gMi4j!CswN?Mu-?dO)Vd}yP#VZ(;xGn6MRp)tRw{RO zwLWK`T`^_y^tH2z&c}+-Y}EM|=!|XUkY6YY>`ZfykdP2IQwOO7zyYLpBal}NhYs)y 
z5ja6ddpQ-OzbGlY~cI%;~hJJ>li7dRZ=T5&}q8h zE*2p-W`nM$c1MPT8i7-$?W~q?8z|>(!$XY?l)&Uljpkh^PAc6$awh<&O-1hOT zt@?!?`B0!gV$G(%bw3YDi&)IULK}PW^mbH*ykC7Sw{;$! zm6a8?M7Z9j=$M%6)&jh~7~BdD`&cC9V^J)W-ls~=?d^gvUF;Hhhy0{F{r@LUIIIaDrs zD)$=tCSV4fyKqB+V7LUZp>87Fk4g!R^NCsv)cUHpJ-i4bhTU*S8-l~45G}!hdwTrY zIabfQ$x!9+DO=IE4S==)yL@Py0FfcTQknq7=qM5@>mMN`(yvdXdGi6p@71H?Lk0zT z*Q`0wYfHai0gZQU$n7Ujo)qaCx_e)q|Mi)GJizv4-h@1&Yo&$czQn?QFv>)6P6$VUK zVgG)w%>9k!)Ms-D?!6O)%#pA z-3t^m9<_onakT1?J9k*2*t-3m?jdv+k6(-DOjhQFPh78Fy^7j%Ssp=Zax+o2j36pR zNn5W&>YR=Oa}VcAhhmK?z5B==8E>!yJ}{1>aq>#gpb64ZyS;-MrtgImc{F9R776bF zMfgLf@BxuO8ijS-iEu7$wEwa2?%lgVB8}VgoOe3*vBMiBe2sbK=F`CDb!EfhVC{q% zbU=G9#!J}(_FVJvDTWCE{>v$4$$21<^9tFe?&OVNOcQB-**e+1rltaL)N0XAmT*fg zF?1VM^4=um-^a2tF9~q@CyCsX>>&1YU3KB1Lx(tbUlfLS0c)1&#fujYWwZ#ENibij z#0Cl-ej)wgL??-e#kZvgSFTt|kes4&k^y@-Q$|)+n#1%>G|-AUzJ;E}Tv_1vpcO*) zZ4fq)XYa0lV5;V9*w09^!&5Ue5xdVhH=2X;&HtV(w5##<^`#dmFboG)|C4@(2N`ZI z_5l!GlRG^;>HCbVoLpu8;|kgGaA(%0mXK0%Fms5g;Sk_QJUdZe;$&lFPRwgoQD7fL>aUqZZV2*V`$ajxVHMaU!- zk4q0xg-UvSK^jF1KXu&gGzGo!LWveOs}2 zNHBN`tUCE{9(o-K^#}6oUWGRk$&Qd_3t95t1-$uxFD6dT&e|?*2f{I`q zXQ;(2+cUcyMrjcu`-Gy;V{B;(buQIWPEl(&*2Fak@y4T_rL?8V1{#_7c-Xeyzt>0o zDffN`k^op&`IvySfi#==!jCDds1)NdA-H@Ews28tX>y%0R0@durxBX4yY*56HKQ(; zap~HJL&;G3HRQ_wox#*#?XOE-OcXz|ArY$6KHwXaY$mIYDt%aZbBqQUM^k=uLc-XJto2QKa zzz#kYd~7)b>1uoS6bjxWwicHW>pwH133_!{N=O2xA1HhJ{5dhVQ7NyY0EM?spgSS* zv~^%`(BkX`>@-^8i+{&Qm03x6FhQUbHY4tx82fYA(awd0r!aYW;u$v@T_cyu@>fz$ zxk84XKtzcNN+{yz`QNJnsWGB*(8EI>V$2Aheo1NR!quYso=El(pxO6?80rxjiu7cY z;i8FZiJZC@>gpHJOQIyTT}%+zk5R9a&n?L`MBh#oa-A0X|IX1SZ3eo&dinP>BRUX+ zdTf=*^l}C|(5o`U^Tjbb#Bq=e+&W$n`gfeRUKy*DC@p8to#WrQk<6|9Nvb*Y)icA) z7rb$kKmZgdMXmIxKCYt2M#jm%D0P~l_Dub_e$cp+n99V;K(2k3I@9UVEC+(9y(#kA zgXKB2Mqhi|$98L^7xme?#j48nxM(9Nl8BB&+#)|eEw$xJ8d{Fa6(t}_{BLZPDQk6? z6TkrIdXP14+<5=~w4(yQ%lK&iQTNekQU?F|(mW6 z^uOhI9?ew9F_3JNm`CIZ{qyxdv&r>ENr!HYl&&_^$|Zq<(GFkUk7vlMYwUW4qm(nzk^8r35}k)v)oz=3iKfG%4%wlw0=Ps=Z6Z6Sa;Ui=F|tE3Q#O5 z3QmNnsf`soTmH;c`-6fhwSTT4-|IyLVx`~5;ZyzjNv)6o(%`tW|J-cz=!m~V*a^2u z2}H)62b2y-N$1bZs#fLSh5m4z9-=9LV9AUmzZfaAd6+~o0XLC`Ld8j$67(|%ZClQ8 zugVtI9+p4ErR1{3OO{-Nsy{aND4G&FRO_fdXJlcqc;E8n+}x>s>V9AU&S5{iyFy9B zg~?03ucliLjb7ZWZX+ZH-G<1$>2xsO@p>TOS~Pd4!w{rk8W2sZvNk0Gf;n$&(OH2&A1F&P z90brk-KK+hKtNJt!V=R3{Q{aUpD<;78VwJFP&*Ju!ywl)yVl{Bti|B)1R6tiu#mkA zxe9Tc41@Z3l0ozAbUh2z!^QB z`nq*DP?5JM%mI5(8kW+krZ&01*59t#^m>7m! z)6vnc)o__YowFIcV{Bqg)j`X_(>EP*0B3SqSvQ;I0aq^6Q6Ba!{v`op&%R5w4%1aFfu4Uee~#;-j+y8Ui>Ya1{k*#^S%Nk=igD@W;l)ImG(x+s zbC-hddD1%H8&_IsWr#Kl+CVfYWPZIMgOb@}NTO;Oa>GD@cED8w!@or@$tU#It&p2H zJ(0+P23ps z?f3-;m)}$HwV4_*dUWdLsyMeFhq8>>mDq4qnuo6f@DptmkMNdypll43!BYCP_5Zs(F*r|T} zxE~ED~8mt=ZPbfa+wcMT``&2k{XU zaBU=430?mcf(9~T+cA-WXG%}O8)Aii1NmWtgv1JfvAt;5PzAMmIsN_?Tjd~Bj79jU z2Y}v4u*u2AB>^qcju)4r?f7DhD9tszTRPM3p4) z1*#MY{(}yi)+I5YtBShfyNp+7cLB)j8QcY+^%6`t_(PB+X?Vh#cG{fIVKy)^j@@Cl zLR5e1^EU#9qH3AGKGYmC>v9V~>? zMbe5$<&yLyv`ox5vBLjdyUUIN*n&2e{|mcsgd7ooDF+A;WNsDO8F5{bk~Til&shP_ zv?2GrSd65_s#RhJfkfnRhjJ0v)~1iZ{z^DNSK+5(k=P=GNpMuqECo-BhTiT8n9XL2rLLPzlHh#oz6d! 
z>U*#xfN~kJx5?xL@p$0omqJ}19qT*mb|^g5=28WTYi>UN=a4t#pK8a%BihNPSv=M;{#O<7j0 zI!5!kei=#N-_%X1gj?m!&1;J09}k)!?P(WW**7omI`OD(l!#Vj5TAe$R7)e+vt)oD zA_>RGfV3gGV`J^)6a;9qh%Ou%a)z9N%`A{t0A)Pv(2~kbtJYN*JF1%#t3g$xD7a0C zQQT}jpu;N%?;al6707bLEkc-+EJH{pioig%h#S9n`4W~-qE_0ugo;jSwUtR=d=dRbe3{t!Hc?@4_oV|O6b>VwYyVI94_nb(`QW#=mUn#4y{>(49VD-14h0yhPrSYG=CxGN|jE2MmYA_ zxp!XJH%L)jh02?FdH(m4%uEtb3DetjnBi}pNvjJb@I9jV#QH-CDk+?w8#WJ)4hf;d z(GQw0>yf)rP3BVRYtP>e@gl56(f{FQ;C2LO7EBpxN+bB)Vl&YFW64v^rDTK{sQuqj z0(F)BAM0tyabKyuDk=fRQg7eBwF_?44%{SZ+4Q6>%Cpm&CCu_#!Bob0N%9^WbJ^P!g%1AW1<|7fIXb z8ZAtxs1*r`9f}(4ZxV$kQ#SIM{hz1BWNFjtly@#@$V;rtecY+523Oe=JHdPGt zwQd#`RvQ}44anFC`bHJOVBvTka_82mlc#DM8;J>w8)W-y!fMGnSxykXrWdw}^#?d6 zA3X*4<#=omk-zBdWTEWSA8Weh4u%tCf=y3O7a?M@2(vYCW^oJ)!TH3QubM_wr?;R$|HBJW zGgH%eba2WO&Ar#@lw7(t!KaWQB6fMq|8`HQzh?O?>jx>xIMfD;ZxQ4Zi;YqzL_VY4 zBQVnNCTh&(zaGo#Fx<|wxA#1kWHWAR8r{6Y4VcB{5jO81_xFJ=QDd_H#>+|3nkFfK+-QB_s6=@Fzxa z&PTUoB8r4i1@AvT&SxY47-_A`odZgwFjs}9e!DC3%s0R*(`wp z$w{Isn-rLYAYhMXX7o06Q(Xp$Sv5AUh8iZ;AO(39+S4Km>7xA|=RE&CuL@-{X;Hh! zFDsn9=I*_klAr#%a)-soCFpNX@bA#Vec`T<91eo0#hABGE%xhml!CXv!}R@Lm9MNA zy*hX9oJq^!Xo-c8T1dtXseVu~%nHa9y?RB5k5vi)aNJnj9etTo&SeMTHEvTjnU>8f zU~gZ6!3j=~ar)JQoH^RcMBx-UtCk&^7L7|mKA{N+Kie8DiOv|Bwx{?(dQMBQY@OEmL1_ z9mO&h9Y1vVa4AN5;bTz1R>Zp_13c&{7{6Ku=z|KiuSNLALy5opY{5up>0%_0v}ZD4 zyU7IAeE6^u<2Jq{88Nqv#A05BdMAL%eXwwFz|<1lMFbHca4kT7gp3FW5jxUc5w7_Qv1%h67=ZN9axivcSz*ZQ zxSrk_JR%Z5g$I88L*(J~o&ROcwRrs!j?-rMDXDynd9)Z*&aO~u0G1Q>13z#R8T-h{ z5XTZY`5=SsZC6jv7I*hTStKA|0oHeWn4ZDlr3QQV#4!iiZQIm4^pUD24@%WtT{Bud zl`9bd7$*Z+xXq0>vKVXDuNF9R0Bv$5=%rTEHx*G@R`#5GLRS%{rJOFFBW2 zfpa4wTniHV83QaB#7^b1gSnBKfKW`z;cpcm5bz?vsXpL*Lf>BrjHxvB*I)mJ zwkmO}5QIMJ^o0vci6q&R23G>J{}03=9o3 z1PX5XyCgN~vF?DW&)Jbf7F9a_j2EIlm=ikuH(@V;A7};t?eQsJ zY$-gjyKu8pg693|S^uh{=BIHxfV<1L#JPDPo$&Xak6-=gILC)RAm7WegzDn{7*Qa+ zo|qhy<;F{hb`VAZ?2qgbC$cd<5NPx=P46X``uE*M?FSJh8B8)n+R*V{ z$n!r|^G69??k5c2s$;0t&sh1Y<9nZfc*exWS$%#gEQF&Ub2Sd5>lD=NOaCm;C+Anf^XLMU@KC zH&;6*thMoXvaXkzxlH>%HaUSlPl4P(WktCa%RY?gBcXwXW#2aOyL>W3*qq3K5CV@N za!c%Zk|tWbT2u?FG3AMGpcOD;o)aG7;S~FSPSR^pwnBkHXZGR?Ykn^beoJ`Q`?QnN zC?h0Cj`%NDgcRs;cl99f^uFauE%$NtbTH(Ijkad{IRSSO&>5WX#DLQz;?vWYp{`>b zEJf|5Si#f+@EX(!mU2M>0ng;*O^6lX#E`FLYtdP-I7t40B)&h~C*emgfC}#}d6-3H zE*8LY0Zw#7Dkl-S1QL3LP5N~r9z9$|O$w~uaDi2K&Rf}|ujt;tiG_4x!VS zdtMMfRYLO1e}-3q>oK)$bo#U>n%oyy!0MccUGqZd1U>{_7(rS)9%l(|+qP{lp^%P! z^_nq;L+A^51z!lC0INijL0(oU@D#fe83BYdi7mJW@lB#dL0dN~%~d;nnh*C23yY5W zb0Apko1y7SqF|vnlmL*C_E8`4>FSYq!Uk4 z``?YKxf1)69K=(LdPhVNa_$SlAL_Kjn2AC5kE8(r2RW2f(i349Psc0K>_ADT#(UVL~KY9;VXwri=~Ws_$G!YRALaw1^i?2pAz`N;+3%%89ZfP;|cOB$ApgYrU+y8b?>B$T?Kk7|fTUtFG- zC&Dh``h@eshH*mLn!}(A31nZAGlNe{#sOfrh(l}Cv;y|?CGu{L-9mH#D6!$8wf}0M z(YCl8W2A5Rk^0AftZTyOCoUElzQG;wwiSV@jfW9AR5*5y6 z;DhLf6$*#*^`A4|s>n&OSj{%}_6op^2!gWV?;u{dO@6urE$bCJBWc}~XKJbtlAIK_ zI#mR>f5Y55@|-`r{S4YMf#}sAFC7CfK;L9~1dr|J-Mcp=Ek06z$PiHI+x&JAxRT=H z1+}%cWbA-UKVyPsYu=v!rHs)d7WN7j2*4PzugEF27>NpRy$Bf>V&2jf4IIq@8qf#? 
z$AkU-Uttqa6zK16ggZChc?Ky3COxU)kixv*YGM4GoEZs(TqkbQKR1B@0XpV2Ff0d4 zpduV*N)I9l_M=XYU`o$x2qQ29w+lH)GXI?Oli7>Zsjyt@b?eqmK@g>*$T<_(xk>im z?wG7vN{)KP$9RPUN@W@$pAn`&Tqa}`Msbhd+_6X1!Xje>DNZ&rqK~lXVx+sx}Ipy7UVqkIlDmiXO| zFHKWx?3IGj|3>mvP1T&6o9n)T6Ezn?^b&k0w&A4s%v0%hC;Gj%&euY5NGB_R1X+dT zFdWAVfILWY7Qw|!-Zm`WCoNqivD@w!Ud{Aei)vGuRwsN zyS{U~a}N&{5XYzl4o@P6%a$zzcYmy64Ap@bx zAUFN^&H{{+YBf?9S-Ev{gHs;U{m=NQ)_Ku_^LaD&qxm=O|1Z+s1T5$LZ~MP682d1G z*^88YWG%A0vZsYMYnHN>q>`F33}wBvSd$bjDk7wkWkR8dB(kQ`qEOlC|9V&4&+mDj z`~N)0@pK&banEn2uIsyeKJVo`&-eLBziLrn;M07Uum9S;4AD~W1pVD^x+NRE#=3w8&^|7nut8HjYe1Dl^JJn6+cb`d*7GOo&k(54O*I%ep)ey4MsKf93bNgEs!jW z{4Cyb4b9#dm9iPUT`Z_kpb5oImE_%TvZ_{84LYMgPoF8YA=KO>5j=U*_ak z>%5KCFW8{bRvNm^oA2qfj(6_e+avs;)>LR$a<19(zcvLQO;WAT3BTQQDr9X9{ke^d z{m&g;mUO0kBS{0OXQ!8!x_7Ug=`RlsUKNnlsHrv7Uvt0Rk3Cfx+S(%}bQzvz&w$@2#Rn)l0scD4Ok%rrs>sZgZ&>6y=cL@uhOfCe-P>(|s< zJKs&F8kujG+%OTC)A2`lI88;O~9>M!5A}_KT^NQyu@0wjbXK z0pT#)`a_+Mv+H?2j5yhzDe^b_6;XR8Ud1W9k>XY6jPHY@s6LHyWmWd&aXqFt`^ zghgW%EIK(GBYVHPW5fQkoww9k?+4BhSO=APgXzutt#}e=mr(o#j`SIJ9qyfe6*jf~ zxCxQ51eaP0ngfdsfKJ z)!hD{0mJMPHYNOrK0oa(^iwSbBSbTn$80DtUNNHAoq5xlm%w*v12lD$DrcRU{>D(B}I zOPu^UdHA$-D`*gYYu4;9Vq80ces!#D7u(r5b3AtN57R1XW3V^8PQz-v8|oLBgZ}i!;y7=gjfX zYSsW{hqTRj7m)q<4$wYq@JI}CY4*o4wBEB< zFR?E+d`)Y!opS#LMX_+1)$kM7h*nCNL&W_dcDjP13jYu3@(jEL5KUausdmNgNiGRTsovzcaJpz83X+BgzON@bm&fnJ2}j!U=XKJh(1RSgAj0*xdwgw zGQiVRey3;|I{>H2j0Z+?TLeI-E)LLSzIfzk{byoxch~ml=Pf_wp8{~B-PtVDYv2pW z`tFSz20|m`zsMk%N(dcl;oL}oe?=#6!)ZSF+DIn~zR`vr{3=Yc;J5S=FJz*eH}Slv z$YOWrj9};u{Iu&S#E8eo%qeU$rbB?sqC%zL>I_m%3XwAenA7gzdN4tqMp<|r_TENN z52oXPQ&ZbYaU+Ws>aI32oIc^r!|Maa%U}%s(e+!m#?GAC8jz?jxtVESP_+w2E|`Qd zuH6uzrjlO(FGu7p;_!;ErA?mT45xLiW^>;S(%Oo55xW~f+YyQip#dp^f^KCMsQmmo zNK?iAWs70Dk55ZRg@wJKLd85doAUZ!8kC9Ug}i#OF=io~(1uG=m3eDWv%rBMRg~J| z)FeGCOymo)uYi3ZfASnq6O1;?pBy=8M?4crp^4=1hbL(=++u-i?Ti9mzQ^WWvp}L0hjmRn+>=f@yYvy@FqO) zrg-V1UP}M?v3wR`L}nZr!M4Dx2<*u8!F^~8p$sUYm`|GFIMU=IgcGCG?r3f>qBm;a zIQ?*P+=uDk{#^v~$)M}ak9tpr$bp19+Y(Ihfyk%v|N(Jx@ZG0?6*X4afU?+t;B~{qK)?J*# zf0_{K{MNeluM;QUFgV+!e*IC9;l0L<3EXqMWl7;2NUs)I>a2EGC7o?Al9NY2xy%^hCSz z&(Swxj0X8W)0rGHJhTj@0qBg^P!sfQM5R%8hPI#H9Ifo1?8mOmc7d1$Z?W=EY{>Y%b)h{*RQ-e2NXRQPe|MBR9{j_2X zO@I4kqqgI0a{M1MOtoj=WLJ}(fD+TwPwc5vxk#q#4VZtmAuiLM2oN;uG&162u|3H} z!XS1NYjH`w@a?_1@$8zrk`dtIzQ9>GAw!!Bfe7Y2Ywy%CMwUMW@kUJ$Qe6QV`D`C) z6+dU;!a9t$w-`4z@Bt9)Uli&eKYlbi?{&R!0r-!!Me!*VLEpJFE;NegOez>LekXqId`&kNXB*}LHifz67sgJv_vX9o`* z>ht-;7-Jw<0dmAf4b>p_NO!dVt|kfs+OEbu8O2IAb=O7u___1u)dUbkyD35ifr0uh zeb5*BEp(_+fawz?9z3@{9iKRjPd}_s2$@5ynGa5fd4-%vROT^fOC5{#|9f(I9 z2)j)4Z(y)!^>VhUOyaIAdo^7q25Dq>54#GWt`O@YcvX9p^eDYdy+1dde(NrJE$SGV z0-eimiuf(4U?+0yi>&g>ADv+0w@A`}%O+|#)|i6@#z#CzYA8T}aOK#Q)ep^&jM@=I zeTN*qd1S@Y3(<~;$5fg^ywN3Wk{KU*Rq$N$c-rfUSk!3Ox!mhEtiiiODT{9KnxfV;GoF~y3AD&GA@_V8w;PI~8)cs;Fjq3f$buJ^ z7{KK+#f$tKxR-3oT5ZP^DNaZtgnV(U_S(L8^DFD0!kojDa%CysBn^W&lEL2(un{BO#y=um!K2 zF(z|uj`xzzT|#^QK&_@(JVmY(nD%351z15)|hqRc;nuKus)3wynGg1+@uT%TN=EaPF9e~CPeR$C*>DBXd+^CX_kL!3>w}+qO+?q3InyEWBt>)Z{yuo(G?p zZEX;Vb_aM1Yf$4>KbD|^ZN}QP$j(Lt*pL!+6gaO4OF#-wyzd=J~mS@QVm_8l~6JVrD&+D2WRxBdLOE4)XMVak;Qu8B$n;?uZ& z%jx03!L<~Ww%7GWfaNJJvQIn-I}@&P9+_0pT9zYBTi|@u7krWKkrdJjo@sYV;=li1 z#DZY?r!A@l?M|S!2sW9o5s*>$$7|3DW=xr|CC3rbysRR@!1)-a&uyB19=5EOLVS5? 
zghUxi{7gIgm+1EBpI|@a-wz%(>?`tdEK$iw;vgU!U!LG~Zer3@pIouoHGeNj`v4iFDbap`57ja{%n9c_{vgdk4J zHm5%M)8MdH`chPgD2kd$VmPJ?Vrjs@fx5J?(qrUZoI+y4(ZDzeyxLxPSA6 zHvd(Ehz8MPUQh?#l6Qv;sAlL*s*s5IXIp~JI&h#9z?*n)5f1t|^d=}X!l@6f@GK`M zDB#yEuZ*#6s4TN~wdG}v95=2GrE&lRRaUm_cp(Z<&l(kc1nj7+GVrd4`(PpWarjPR zme!vR#?<$Dvpcr`C2bHoTxLsfQ3-9l@l~`$W9dXW7)f-;lB2)}Sw-UIBpOz+`0oEr zkU`WS{QC^o+hfWMDi4s5{-TkbU?m-&5UOGUPD@KM_9~iWETgN4d25kK`7oJJ1|1Ni zAlaQ^B6|4!N31Zc1>#;?fyzyRu3gSan7d*{6L4Fw&*aw}YJq^<(sypN_;+G)B-8Db zHB@S2#wdO=+YNYaLEkMXt08s9j2ov_)b-aZmpIrJA+?=px&A|1HElQRUrVu3t(>Z~qycKbjpc+1 zmzR!ZMWyG==6nAFIrT$_hc2G!9|bU7&_h%fA|2!lW#UA17sh`ZaJddrVTPLia%9|Z z`=brdH>vC`rdp=f&c3Rzw|xiGn)?bEE1TrdXimn7lZN(t3)(8&4H>BW$3Fcglb)gP z)wI8SP`Xw;eqo_A%=f%A0a{Ln1bv3{mO&=z4rraHOUUn#=jvc_u#{Z0#O%_?h;nSd32$4^KOv|LL_?&|*Qqu@ej{Sn+GTa?R z0w9$S%D1si!y=LC^G0ea$m@p>UwUo3`BnNgX}Es0sg1<&1iDA)&y5xNYgVuR8?g=$ z&+PGcc#gY5f5ha#Iy`Fz(l)^8nS{n8rx0*CcBK4L@_#|2BrE@*AmfEIbjB_NvI5!M zFm$twgBor5`O?K|h5MpnGI}tO$tRfv#Ax8#MSg8U(9dNh_ChrkfQLa@YY^dxQv*KX zG{gm62C#e#JO6ye7bzEOC?@*!#IW5zYal{5vCo%N&OkXt6YkKuGuIhX^GC3^k)s*K z+)m`@GI9e9u<7)chhT4;P&i1J4f9r8(G$HIx&=e)>%0J?y48o4@=?Yq2wG$=Ola71 zoWHet9n*F6VG4O^lswOw4Kg*}BY*bK&pYZWv)NEmC$K0JT8x+~cm?_#+I#WInK^T2 z@X$%zTxi!HRbI$mX;pw=3@v5TIJ=&#Qx%=KivQ*8UKxu{*M4z9=eG~x#-X^BW>$O% zUzZSC7E%~9=V^^|ZZkr9M;63Cvncq()OE)W`;ey=#;$!V9^bDWcDc9F`P8yA9e&JD zh90zae&lCR9~UFRdUv=*ici4v&|W6 zY=QbwCOW|h`*8{u_@AauF5>$)j2km+Ahxy0PbBc3KK=3OP{Urc-c6o`7+BU}<{X2; zYu|LjZM!jfiE*?04|OGyxz!VgUSi7$F@B)PC#h^g)qCLJ!Cip*s59jNB04zHHHp=w zk@HPPM4D32@?ByU3GQL@w*F?V)W?r4i{GMispRsS%%7j#Kd-#3%tj}IP%w|4QQ_bB zl5MrC>RT3WgC9%X<^j{wGA&7Wtq;{dv_p#)8%3YbR}1%qvhEIB&R|;`HMNE#e|wgf zckN~GbnRBXuSYWCm>CNf?i~Bqn*6jNV7w4UR=b3@Z1lscxGZ!pE)!@$O|`I0q6zp& z^`TIZSU~42g0A&MO9qjRxuwkj6WM_)ek{`T&ahXN2VX>;(oT509~2W7Y&y%?{BYCs z5I}aZZlIE*`P)cIE{>^U$iDmw@d?pz4aj?u9mBw*q4<*!GmjO|*{XFF6fj~$s(XbE z*Xh^4$~0s4J;24;I;nw;A-eeE)aJPgB^;i<~IpTmby z(QKg2L@gE>8EN>TP`m<|>!a1L2UZHzERBj3%|Og&`@f!w0}Ms(6^0k&bXzpNGUW%J z(RsiCqhhrvm+t1SuD}5-kIRT0IU-g57R5x5=meD6A~hG$lfVN&pXfrnIxfwOf6w6o zj3Xiw>;3qUz`>~O<*Y96^WFP1ly-$$j#`<%lwlpinA3zp_5a0{K5?^gDxJuP1 z;x7?i&?ty%K`03P4MYzFsHu5Zj$hb=zn)C2GKnIyi@*hAC>+IBm&SzS&0ZH47dIAy zS08;k%7)&0Yy@wQbjGCU2gf36Bj?DOl@z^~a6v^8X8rJ3V4y!kRDzuU*0Ey~>ShF3 za~KkpiD$G7-WO&=$%}dl!NXP{Sf+yIU`yd00G*|n^v(YRN7Ca~@oj1M@k$&8IP;eA zH<5J+n%lVJaEsytLT(75z|NCsj(W(Q!d`se`8Ur2P-I3^lxIL)_;5dG%U%)#yl-np zaphEF9wND_vdzWisK)xw`Q54VYB56cv(8nXA_CtNwz}b%@}FiZ(Re(Q7exramUUP|A^1Ecik6-@!pXr#lFm4bz-rk> z1EWG26r-DxPlg~F&C(KG57hwaTx8ComxB%q8ZM`dv8wJurT1W`%i@AqjoI4nD_D#%q`IPL}v|z z^}0s-H)wB*@ODkngFlHUJz6}aT$AS_UZK(*qAL|Zi=FQCXayV5;MQoP{RZ7C#6X!L z3h$vK7)wEMHiz9Wr(M56tyoh;$O;9u{8%7dgYr97Gov;dPvFLmn?;A-J3xpMy%BUj zFy&$_gPb>eg{UY)=0VvzEh{);Vpgu>6ZE0rfj#f$kqmm< zPa>+o9E)63l*3G*%|U-^m&c5>VaSIaQmdd$630L}30?RkrX6;raFWe@G=t(VC}V_- z$^HKO?*7@i=&VS0K?PgRG6*B!CkdM=>*aF39wMlmC%aMC$rKV(FJ#bZAz!q9tvUQl z(H}y4ie|df*Z>J#UFJXYIog*O?Avw*^d$veFGs6@&p&&O=8;j2>eUNb{e}Hi|B+ZH z5R|AZ8R-`tqi7Dx2Ft&95X9W~RDm&CvIdNlsGU%IUgv8o53g02F5Az9tg zT0}uI6#_;*9(Dtnhv;-e+)xPe8wKiCn2^SODUP+?9((@$D9BNkEO}^mfz6>GKv%{5 zl9X)Q;dd)`B#g-XeOL3Q%|`yxBKq%XH+DP>xYzvf9z!Do?Z;v5qwfVrhflG+>fg@n zwAr{l`N2k$driM_TQ$>Wi_S?;jr(K%KJc{Tg&|$-nhlwilJvZMo$k9m=SF;Rn4K}? 
z)&Vu&FIk6@B95u0g6Inmkd>QKlYbvCt~Acv$(b3h7W^~ zLRc?cXmH+j>)-0kzUE~s@y&W+v1u#x6&%qQ+|^5=I;Y$r^pPbf28(D{|3j(8TS313e zd)FhEIAK(wIRCg|bnc`qP7HBL$-ft}8>!ub@1=yT@?#^uDY0gWmw5s@_#5B{`ky${ zc;WIlHMiRkh6}U^Uto`wfn)O4WjsG5fnl#LO)?2FVvId01H`cFL?+!84I2`bVS&1Eb@An`b z4yTj9o|*Z{FXqp*O?CRn<9hXe+g`XllYk+_|I_q`0LDvrm-~j;A0=_j@PELfZx-I@ z#72rvUg?o=@W3bb>)*FI36Cw7=sIx2u6Q7tom#k!u-7&JeeXxO78C0Wn>_LsRt!N^a27%e+*O>Q}U<+S!%XS{@L6= zzSrqHp1K1Fg-*`S1a{#z>teSZ4m`*=?4PQ!xM8iFqaWYiKIPn;Qe*?#xXpl1NIk>1^L$v zZPn@zB8KzR&JOh8>DMj8qO(V4bMU>=nWK8G`|mCO#;S;WJN;619J{P5SA%SL(={ z8oynwU+O^s5A}Sa=6$C3jm|E&^zJd^YS!tSEAo~rZw&8nSw&c#keo2muh*BD^*P?A z^~+AJK8Am(E9U|a#lw_rhbDdi`)edg`abTaA>4e+=S^##8NZ>G+|$yYE*V(|hgG=O z9r=E8IMB$4F5xryqOs%0+d}l>QuL?#5?;i4oYbAR>#dnH{Og-5s_%~L+%-N~YHyCR zzEQBHyWed{v|tz_ZDw#$lJpzVB<*$f9QFK@*E*a!ca=iM!)Ww$+MVvHw!{bTK(#@~ zZcOA&khbbudLo0zEG-b#=V;_h_c?T`lkziZxqf`NIZXzvt-)np+(f-|81>ggmGPqQ zH9u@%S%lf;BpXbbavuQFbJ=7i$GG37;JBF$Xs5DUubU9)P@FYu(_AKS!dtf6fetf+ zzQnWFAr3bcYwjL-+d9U_#i=WiKHO|UVbP&<*Dvo+IF&B+y!ISf$v8A>K66_SN$H(B zks>SS`SZiTK!rl8_&khq^Qmw5`(*zChpU23qI^qRyz6n5-C6jMS=)3FA?V-vl~|j?cWauGg1tbLe8Oe>k4`!syh($$I7Y zX7t}X$(cgWKCUr}-!V`V0}nm6-6p5LlP>>wI-utxx`(n7w+eIB&23isZo?cU>QI>P zxrP8If&whFeB3Do}==393^)f zOdU&QRw(P02Y7S7SO9|A-cw8UT>Zu3)sEqxCm5#WHCy>h)`CW|_&n}FcuLrin<2It zrH2~l6cmI~{O8m2r~)yv(Z-O&IMnE*&1>x07Bi5P%ui{ZagHy#jnv$RPCAh0rx5AL z9pI8kSwno`E(#G)$8bWAQiZeoitR%jiw__a_UA)_4TB6qsb`pdad`7=B#8TDI5lcW zlkJB-0o~-Mx_v(d)E-Dm2nAfTb8s+56Dh+b&0N+Iq(UUO@j$icTJky@NIJcGO$H({pC=1_QdaD)f{3^XKMW=EBh~c5zmP_5)N+d>38I!Z9Me> zjoks}PeP9QIU5kZW}r^zcUHgnx86Z8HkF+&=d151;BCilYSJ-+Zn}1zI$BV&xCu_< zpmsppWAv)6zGT-iXx7L|UXCW?By;x-cRdd|g-*FQD{mU$%{wOhSnK8g{ytR`3jzf4 zCgHRhPi-KeRe?+$$=VhWe%13xhn2eAq!IHAA~p!Oj5T2q?ZWOm)F>o?y9lF?^Vqs| zKx#>Pm@kcQ2+P41_n3!RJIVi)7eGDw`R<|bfwynpR+I7WXjCxqcyB+%JDCP=>2CW- zB;?U-+sPa|iICKI-n;<%JRM3s+U}&d4Vl-j9lL0YH*E&n;Se(--Ee<5lp80&r5~N# zt{g9HP?MCmX48TUTHyIJ3m$O$dh8Uz=cjTn&fw@em`{BJ%}D%yu0iO!Pwzg{N9v#v z`}XEa69X)*S-R=4?c0Dh;n8bb-RRqpw=>%9Yj($SPIa?IHVcF03n_2RmxvS^j-&wp z{2{sGWVf+?i_;V{@{l<>4U>%Bq~UvL;u>`vo>kzsu2#`D$C~n5rfrpXm0{&FIWqpe z*~y46oEb1dg`_fq{Kd&e;WIPpdOV6>vv<(a2h)gZy>FYL5KgB*(*eN_vk@Coxsbea z6p~BuH;TCm?FFsiUGgw9c2Tb$rV$6ezdxIhzryW{d4z4|pk@skH%@WqiKXxz zZ`LUZ0Gl{-VDVcss~vp?5B}&^@wnJQFCkCEoZBelAs67^7@wE**UNQ9C$GJWJDl|x zQI|X|9?u#5(i-D0wuzIQd8QCBzRjk?47zyb_I~D1gL#ygH;dtO8&KQ$8#PuQ@~;Z9 zjW70Xhz>=wZ{OQbf)v-K2>>2ZFZ~pmDIY?IKbdLRX`H1jAb*I;>$xjrzYk&OtT8dWh zy?aC8;9Y!cb`sJ0K3%6M8Nnj>QQK>3XpE&Ax(~2+nuWNgq^71xb%;~D*q~3GKcrSX z=BS}2;rlqzW`2oEF%!P`DCX3QX?@=J4l+tA+_XmFvAwDm8|)6ws6{TND>0=jXKiWg ztaxHUfp}|BF_#`aeIq%s!;cFDwu@_cWvBYyN!^w)K2H-Aw@6KPaq)`k^jU*te>Lk_ zZH40cxT?gjrq5Z^@e>vGU$Pn`M8(C&N8{Q(WZ1Arv4M51_^&b~KuP$dRip;2W(IBN z05-w4jw7Dvu20aL;_qMe{R;ParC_23An2d*u~<@k*#-kE4@AL4&3i}2H*xW;HTajd zl`mW|GigEp9Tp$L(Y;ltlQ*}!xos8I4cQoHF(HiHToXltC zrKK|{wh_KBdi28G(4*=hDNGDvS516Ua{Pp2o3VD~-+F6hTg5H=Q%3TMYpHr4)M6c{ z&$DtZH;(^Y5ca-zElr)O>(%ce2M#L@dHtl;c(wWW`!*a66Kj3)ew~*Z+S(JP-JILH z;Auhv>(9OJrrq(C4Ta(*Lv6aZ4viqyWhB^bL-SSQH`_6#)>3OWHvbQm%7>hC^A*6QC-Jo;x~%kJk=JOr>JDKT*@S8|ur zRkW}4Eki1t_hp?P^;-XiU9VN7o|4Fhlk5yWj&dU>FYnlhug{xEP<&_6hx^8gZVMdH z%l!NvZfT)~gP=+7=ZE0`O5dB~-KA8*wc=a{=U1I6 zPsd4*MDMcGR_{vR0bml!GzivPwyifE23jwT<0fwO-KsWdW~~kjik{V;7TS7v&VtZp z^BlIShrOu(`}{(Scj2A9UwpnkHsXax=T0ephT2I_u1+Kj$aMx+qn~{Wfmv z>if-p$X}2xCY$DJcIwnAqxCei#Kc6Uiafq1HHK+M&nIpbuB_%TDjBUAN4I`5b9O!t zT62WWs66W!l@Vq-l_YQ*em$I>-3t*v9vl6xG21s5x4qlwa$dfC8E6)5lZ1GTT0ISH zDSB1*xbjzbM&pN}W#~6dqg%HEbn4QW0&^Xs8cy!p5VNmo$ko#LU#s!sCT6|#cpPQz zNonX$$(m2VzPV}{s0>3T2h>COJ{FB1+(QH<&|=8lwruB5&I4H{)R9c@j#UNn={Hh2 
[GIT binary patch data omitted: literal 34676]
zfz4JN#tZxawf%cbK+*<{Aa63KfkgmHmUqM)5lZfePr^#x-2eXmtxKLJ2m0Ob{-U8l zR6-Jndh|`QZ|-%JB3$O_{FB@i^7~QjJA&8mcPB$G+|N z`KlG+MF-v$89Vg-zK~Y7I}WEczSMvpy9;N& z+0foUR!%u%A1iqEx8whNz2Q~ZiJPzuJlwh`vf^;Itm(x;NYWPgrSuuR#&S{ zcb*qQmpe;&!yur33N;0hgq-kr=LpfzJKH-tuyvt<_c!}f^!bz_c&q-< zRSl^19*IBgxwxn{w_}zP-ul(@jR9BhT8^ibDq6uPjoT;%eog+z!Y}!DBF`dh5uOYU zNXuX7Aom>_EyoeU&$0@=tf-6KCu#B6y2EfHG@dP(gMt{FD##;*5#`=@zyz&0$UzoMefZ%`Ghq2KTbTb#pn|Ck_T%14J$6%v{E_+GqUDw!dQe5yVP>#w`d8f!7pTzS`c3^@2`H64_^t(mD>|7=Ks}=7 zC-=*1GrHxVK6Q)>0a?l?%zu-AQRJXte(tFMnv{SjruYf5kPXuhnhYY%dP@_R1PjIP zNc4z*c7$CP+}QVC(c%brgsq_GAk$=fN0GPzvFJQSG(=}wWLSkJnwUqh-K>A~X@}hY zODJ)HRK9p)+)x<(8EJ!{Ash#CQe`?wB|D|ga-^n(X@z?h<^&%Lh>^+Zoa7iMRFfNa;V>z9eTYhk6Lv#S z9kECS_F1lBk?r_m4Mi_}s~|=ES1;kmf}aps*cW@we#$Fd&JwB6LJB*+@SmB*SipMy z>had$h-&1HxTkhiA`4rl2`H+qYPHL@&rbr4A@*`Zi5;Ph3D`LKM!p>(?>FTKmppLp zu9t5VXq3(=@^{2GJRO`kCKSHacQNhL@v$Ovd`>CL4~(qbMV%%uP$@HWWvT7TLay(b6C)67bcgOY^yW(9`!UY(bhm_5!B4_s6=a3vNsLK)Bx>5p+hVwhrUHqTlhe=4 z=ah7uL{2a36Q19%W4p!D zb*w$EOysN({jqH-?Sck;c#@zhb0)R=W+najLC- z_(H||VzY}7G1wPXMrV!Iz;}}Bdme7~4_7?HeBMRWHd&J~dW7an`$BW|-rUFOb0B~t z5b9w9HYQv-+HjK}SEx+lG}Ov`)(dw0fULc!JG9~RBeKEG3GQaCC#um9@WA>LqSNYq zM|)I zx6dsqUL_daFhdwIZ*|nQc;@Sld|3yMo8^aqVMT90Z@0eKo0+sc@;FB+t}BhyrJMdu zH|%dyf6P^+FzI~tUsft4Vup_|ysaT~B&yx>na6gZ?wIB-1jw}-MMZt_h82{XHg-@@ z6>#jY{>&@Q*3Z0xk#)5 z{iPq}hx!7+d2r?aFjrXkl5xCtgG$F9ICf)4Bov=EIt68+zT9Jvslfv|qiRr;tp39* zkZOl9lxh>^GIvK>S^wx2E!~cV^!%LBPE_mo;hlON^`fy~k9Q$uYtc;>a5}|d`%T*q zO(<%r{=??)je6i%Drw8o|YuoKv zgo65X%gIo?B1=!Pqm)pJ6n1*CS?cLh&~U)8`>-D^E>W7VL0H}so4w+r2mb)y!1^7; zsOXV&0zaVvDNZV2pOC3GnzkAa^KJlQdBcd^uoA5yF*tD~kxz_&am9{Ga;ie~<4oFP zY~74qe<96#tdn)m)(~CZE zo`$AgN1woYY##~)+sADe78Js73Jxn-i)g*NcAE*G@rpY+P68W7)9fuH%Fd8OL5$OH8pj0zUA&361fFAxjZ(*>A9TH9r1&iqmw9V@@%WTVpl8@e$q2 ze?ogYbXW|kR|xWu>waebMz7m^n5h)(7uj&{=8=TqvD%+(&NN5Tb&3QBke{EtIzi15 zHhZ~7xS(P@k2XPQ6>8y|6*AANr+r&asOOzOi}^(sV+T!?b8&;Dd0?p54Z+ZBzmL|96KSY-eFVgU~+clVNdGioOE@h&DG|))(kz{$pzX_a0(3#F|O+i z?_(sPdK)+hlfHP1c%$tBA>^Urke@Awq^m!7i^haDydE0X&*nS(%t`O-2g6HL#AoA> z&%egjvv*j-1CeU&29Lp~QH6@OGvc4;w3I%*(0hbp`1K6RI(^_!8Eb#-7rceziu7$Ih0V(PsZryDp0&P{ z3G~-3bCrTE#L;`TeAC*)#zH9sMPGrH70u@H@nSXCV49Y*@a0c=FlXjT%`cM@9F^?UIEk0y`SX#x3PQY~d*n_IUD7TFB~Cv+DOPzNbk`Zyp^w zcgO>7M=qJm3W^4==QOt)0hXDxLr)@KpG0Vcu!QvdcIjZV&mK9ihwshCE!1qC;D7p?q4`GKmPv%YyPh~KL3|~ f(?2|(z9hV9 Date: Fri, 17 May 2024 10:43:03 +0200 Subject: [PATCH 105/108] ENH Improve wording in stratification notebook (#760) Co-authored-by: ArturoAmorQ Co-authored-by: Guillaume Lemaitre --- .../cross_validation_stratification.py | 71 +++++++++++-------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/python_scripts/cross_validation_stratification.py b/python_scripts/cross_validation_stratification.py index ad39bdb3f..d1f8abc95 100644 --- a/python_scripts/cross_validation_stratification.py +++ b/python_scripts/cross_validation_stratification.py @@ -36,10 +36,11 @@ model = make_pipeline(StandardScaler(), LogisticRegression()) # %% [markdown] -# Once we created our model, we will use the cross-validation framework to -# evaluate it. We will use the `KFold` cross-validation strategy. We will define -# a dataset with nine samples and repeat the cross-validation three times (i.e. -# `n_splits`). +# Once the model is created, we can evaluate it using cross-validation. We start +# by using the `KFold` strategy. +# +# Let's review how this strategy works. For such purpose, we define a dataset +# with nine samples and split the dataset into three folds (i.e. `n_splits=3`). 
@@ -51,12 +52,12 @@
print("TRAIN:", train_index, "TEST:", test_index)

# %% [markdown]
-# By defining three splits, we will use three samples for testing and six for
-# training each time. `KFold` does not shuffle by default. It means that it will
-# select the three first samples for the testing set at the first split, then
-# the next three samples for the second split, and the three next for the
-# last split. In the end, all samples have been used in testing at least once
-# among the different splits.
+# By defining three splits, we use three samples (1-fold) for testing and six
+# (2-folds) for training each time. `KFold` does not shuffle by default. It
+# means that the first three samples are selected for the testing set at the
+# first split, then the next three samples for the second split, and the final
+# three for the last split. In the end, all samples have been used in testing
+# at least once among the different splits.
#
# Now, let's apply this strategy to check the generalization performance of our
# model.
@@ -73,8 +74,8 @@

# %% [markdown]
# It is a real surprise that our model cannot correctly classify any sample in
-# any cross-validation split. We will now check our target's value to understand
-# the issue.
+# any cross-validation split. We now check our target's value to understand the
+# issue.

# %%
import matplotlib.pyplot as plt
@@ -86,18 +87,17 @@
_ = plt.title("Class value in target y")

# %% [markdown]
-# We see that the target vector `target` is ordered. It will have some
-# unexpected consequences when using the `KFold` cross-validation. To illustrate
-# the consequences, we will show the class count in each fold of the
-# cross-validation in the train and test set.
+# We see that the target vector `target` is ordered. This has some unexpected
+# consequences when using the `KFold` cross-validation. To illustrate the
+# consequences, we show the class count in each fold of the cross-validation in
+# the train and test set.
#
# Let's compute the class counts for both the training and testing sets using
# the `KFold` cross-validation, and plot this information in a bar plot.
#
-# We will iterate given the number of split and check how many samples of each
-# are present in the training and testing set. We will store the information
-# into two distincts lists; one for the training set and one for the testing
-# set.
+# We iterate over the splits and check how many samples of each class are
+# present in the training and testing set. We then store the information into
+# two distinct lists; one for the training set and one for the testing set.

# %%
import pandas as pd
@@ -114,8 +114,8 @@
test_cv_counts.append(target_test.value_counts())

# %% [markdown]
-# To plot the information on a single figure, we will concatenate the
-# information regarding the fold within the same dataset.
+# To plot the information on a single figure, we concatenate the information
+# regarding the fold within the same dataset.

# %%
train_cv_counts = pd.concat(
@@ -138,13 +138,13 @@
train_cv_counts.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
plt.ylabel("Count")
-_ = plt.title("Training set")
+_ = plt.title("Training set class counts")

# %%
test_cv_counts.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
plt.ylabel("Count")
-_ = plt.title("Test set")
+_ = plt.title("Test set class counts")
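The failure mode this hunk describes can be reproduced outside the notebook (an editorial sketch, not part of the patch): with the ordered iris target, every `KFold` test fold holds a single class that is entirely absent from the corresponding training set.

```python
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.model_selection import KFold

# The iris target stores 50 samples of class 0, then 50 of class 1, then 50
# of class 2, so contiguous folds isolate one class at a time.
_, iris_target = load_iris(return_X_y=True)
for train_index, test_index in KFold(n_splits=3).split(iris_target):
    print(
        "train:", Counter(iris_target[train_index]),
        "test:", Counter(iris_target[test_index]),
    )
```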
# %% [markdown]
# We can confirm that in each fold, only two of the three classes are present in
@@ -168,7 +168,7 @@
# 90%.
# Now that we solved our first issue, it would be interesting to check if
# the class frequency in the training and testing set is equal to our original
# set's class frequency. It would ensure that we are training and testing our
-# model with a class distribution that we will encounter in production.
+# model with a class distribution that we would encounter in production.

# %%
train_cv_counts = []
@@ -191,13 +191,13 @@
train_cv_counts.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
plt.ylabel("Count")
-_ = plt.title("Training set")
+_ = plt.title("Training set class counts\n(with shuffling)")

# %%
test_cv_counts.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
plt.ylabel("Count")
-_ = plt.title("Test set")
+_ = plt.title("Test set class counts\n(with shuffling)")

# %% [markdown]
# We see that neither the training nor the testing sets have the same class
@@ -242,18 +242,27 @@
train_cv_counts.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
plt.ylabel("Count")
-_ = plt.title("Training set")
+_ = plt.title("Training set class counts\n(with stratifying)")

# %%
test_cv_counts.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
plt.ylabel("Count")
-_ = plt.title("Test set")
+_ = plt.title("Test set class counts\n(with stratifying)")

# %% [markdown]
# In this case, we observe that the class counts are very close both in the
# train set and the test set. The difference is due to the small number of
# samples in the iris dataset.
#
-# In conclusion, this is a good practice to use stratification within the
-# cross-validation framework when dealing with a classification problem.
+# In other words, stratifying is more effective than just shuffling when it
+# comes to making sure that the distributions of classes in all the folds are
+# representative of the entire dataset. As training and testing folds have
+# similar class distributions, stratifying leads to a more realistic measure of
+# the model's ability to generalize. This is especially important when the
+# performance metrics depend on the proportion of the positive class, as we will
+# see in a future notebook.
+#
+# The interested reader can learn about other stratified cross-validation
+# techniques in the [scikit-learn user
+# guide](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-with-stratification-based-on-class-labels).

From 31bfaaf2f629dfdfcac677cfc3cbde6e52e4b4d2 Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 17 May 2024 10:46:34 +0200
Subject: [PATCH 106/108] ENH Rework narrative of GBDT notebook (#763)

Co-authored-by: ArturoAmorQ
Co-authored-by: Guillaume Lemaitre
---
 python_scripts/ensemble_gradient_boosting.py | 225 +++++++++----------
 1 file changed, 111 insertions(+), 114 deletions(-)

diff --git a/python_scripts/ensemble_gradient_boosting.py b/python_scripts/ensemble_gradient_boosting.py
index 874c3ed20..8eac8b822 100644
--- a/python_scripts/ensemble_gradient_boosting.py
+++ b/python_scripts/ensemble_gradient_boosting.py
@@ -6,37 +6,37 @@
# ---

# %% [markdown]
-# # Gradient-boosting decision tree (GBDT)
+# # Gradient-boosting decision tree
#
-# In this notebook, we will present the gradient boosting decision tree
-# algorithm and contrast it with AdaBoost.
+# In this notebook, we present the gradient boosting decision tree (GBDT)
+# algorithm.
#
-# Gradient-boosting differs from AdaBoost due to the following reason: instead
-# of assigning weights to specific samples, GBDT will fit a decision tree on the
-# residuals error (hence the name "gradient") of the previous tree. Therefore,
-# each new tree in the ensemble predicts the error made by the previous learner
-# instead of predicting the target directly.
+# Even if AdaBoost and GBDT are both boosting algorithms, they are different in
+# nature: the former assigns weights to specific samples, whereas GBDT fits
+# successive decision trees on the residual errors (hence the name "gradient")
+# of their preceding tree. Therefore, each new tree in the ensemble tries to
+# refine its predictions by specifically addressing the errors made by the
+# previous learner, instead of predicting the target directly.
#
-# In this section, we will provide some intuition about the way learners are
-# combined to give the final prediction. In this regard, let's go back to our
-# regression problem which is more intuitive for demonstrating the underlying
+# In this section, we provide some intuition about the way learners are combined
+# to give the final prediction. For this purpose, we tackle a single-feature
+# regression problem, which is more intuitive for demonstrating the underlying
# machinery.
+#
+# Later in this notebook, we compare the performance of GBDT (boosting) with
+# that of a Random Forest (bagging) for a particular dataset.

# %%
import pandas as pd
import numpy as np

-# Create a random number generator that will be used to set the randomness
-rng = np.random.RandomState(0)
-

def generate_data(n_samples=50):
    """Generate synthetic dataset. Returns `data_train`, `data_test`,
    `target_train`."""
    x_max, x_min = 1.4, -1.4
-    len_x = x_max - x_min
-    x = rng.rand(n_samples) * len_x - len_x / 2
-    noise = rng.randn(n_samples) * 0.3
+    rng = np.random.default_rng(0)  # Create a random number generator
+    x = rng.uniform(x_min, x_max, size=(n_samples,))
+    noise = rng.normal(size=(n_samples,)) * 0.3
    y = x**3 - 0.5 * x**2 + noise

    data_train = pd.DataFrame(x, columns=["Feature"])
@@ -60,9 +60,9 @@ def generate_data(n_samples=50):
_ = plt.title("Synthetic regression dataset")

# %% [markdown]
-# As we previously discussed, boosting will be based on assembling a sequence of
-# learners. We will start by creating a decision tree regressor. We will set the
-# depth of the tree so that the resulting learner will underfit the data.
+# As we previously discussed, boosting is based on assembling a sequence of
+# learners. We start by creating a decision tree regressor. We set the depth of
+# the tree to underfit the data on purpose.

# %%
from sklearn.tree import DecisionTreeRegressor
@@ -74,29 +74,47 @@ def generate_data(n_samples=50):
target_test_predicted = tree.predict(data_test)

# %% [markdown]
-# Using the term "test" here refers to data that was not used for training. It
-# should not be confused with data coming from a train-test split, as it was
-# generated in equally-spaced intervals for the visual evaluation of the
-# predictions.
+# Using the term "test" here refers to data not used for training. It should not
+# be confused with data coming from a train-test split, as it was generated in
+# equally-spaced intervals for the visual evaluation of the predictions.
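A quick way to confirm the underfitting claim above (an editorial sketch, not part of the patch; it assumes the `tree`, `data_train` and `target_train` variables defined in notebook cells outside the visible hunks):

```python
# The R2 score of the shallow tree on its own training data stays well
# below 1, which confirms that the model underfits.
train_r2 = tree.score(data_train, target_train)
print(f"Training R2 of the shallow tree: {train_r2:.2f}")
```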
+#
+# To avoid writing the same code in multiple places, we define a helper function
+# to plot the data samples as well as the decision tree predictions and
+# residuals.
+

# %%
def plot_decision_tree_with_residuals(y_train, y_train_pred, y_test_pred):
    """Plot the synthetic data, predictions, and residuals for a decision tree.
    Handles are returned to allow custom legends for the plot."""
    _, ax = plt.subplots()
    # plot the data
    sns.scatterplot(
        x=data_train["Feature"], y=y_train, color="black", alpha=0.5, ax=ax
    )
    # plot the predictions
    line_predictions = ax.plot(data_test["Feature"], y_test_pred, "--")
+
+    # plot the residuals
+    for value, true, predicted in zip(
+        data_train["Feature"], y_train, y_train_pred
+    ):
+        lines_residuals = ax.plot(
+            [value, value], [true, predicted], color="red"
+        )
+
+    handles = [line_predictions[0], lines_residuals[0]]
+
+    return handles, ax

# %%
handles, ax = plot_decision_tree_with_residuals(
    target_train, target_train_predicted, target_test_predicted
)
legend_labels = ["Initial decision tree", "Initial residuals"]
ax.legend(handles, legend_labels, bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = ax.set_title("Decision Tree together \nwith errors on the training set")

# %% [markdown]
# ```{tip}
@@ -104,15 +122,15 @@ def generate_data(n_samples=50):
# for all the residual lines.
# ```
# Since the tree underfits the data, its accuracy is far from perfect on the
-# training data. We can observe this in the figure by looking at the difference
-# between the predictions and the ground-truth data. We represent these errors,
-# called "Residuals", by unbroken red lines.
+# training data. We can observe this in the figure above by looking at the
+# difference between the predictions and the ground-truth data. We represent
+# these errors, called "residuals", using solid red lines.
#
-# Indeed, our initial tree was not expressive enough to handle the complexity of
+# Indeed, our initial tree is not expressive enough to handle the complexity of
# the data, as shown by the residuals. In a gradient-boosting algorithm, the
-# idea is to create a second tree which, given the same data `data`, will try to
-# predict the residuals instead of the vector `target`. We would therefore have
-# a tree that is able to predict the errors made by the initial tree.
+# idea is to create a second tree which, given the same `data`, tries to predict
+# the residuals instead of the vector `target`, i.e. we have a second tree that
+# is able to predict the errors made by the initial tree.
#
# Let's train such a tree.
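The unchanged notebook lines elided by the hunk below perform this training step; it amounts to something like the following sketch (the `max_depth` value is an assumption, since the notebook's actual value sits outside the visible hunks):

```python
# Residuals of the first tree on the training data.
residuals = target_train - target_train_predicted

# Fit a second shallow tree on those residuals instead of on `target`.
tree_residuals = DecisionTreeRegressor(max_depth=5)  # depth is an assumption
tree_residuals.fit(data_train, residuals)
target_train_predicted_residuals = tree_residuals.predict(data_train)
target_test_predicted_residuals = tree_residuals.predict(data_test)
```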
@@ -126,84 +144,56 @@ def generate_data(n_samples=50):
target_test_predicted_residuals = tree_residuals.predict(data_test)

# %%
-sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5)
-line_predictions = plt.plot(
-    data_test["Feature"], target_test_predicted_residuals, "--"
-)
-
-# plot the residuals of the predicted residuals
-for value, true, predicted in zip(
-    data_train["Feature"], residuals, target_train_predicted_residuals
-):
-    lines_residuals = plt.plot([value, value], [true, predicted], color="red")
-
-plt.legend(
-    [line_predictions[0], lines_residuals[0]],
-    ["Fitted tree", "Residuals"],
-    bbox_to_anchor=(1.05, 0.8),
-    loc="upper left",
+handles, ax = plot_decision_tree_with_residuals(
+    residuals,
+    target_train_predicted_residuals,
+    target_test_predicted_residuals,
)
-_ = plt.title("Prediction of the previous residuals")
+legend_labels = [
+    "Predicted residuals",
+    "Residuals of the\npredicted residuals",
+]
+ax.legend(handles, legend_labels, bbox_to_anchor=(1.05, 0.8), loc="upper left")
+_ = ax.set_title("Prediction of the initial residuals")

# %% [markdown]
-# We see that this new tree only manages to fit some of the residuals. We will
-# focus on a specific sample from the training set (i.e. we know that the sample
-# will be well predicted using two successive trees). We will use this sample to
+# We see that this new tree only manages to fit some of the residuals. We now
+# focus on a specific sample from the training set (as we know that the sample
+# can be well predicted using two successive trees). We will use this sample to
# explain how the predictions of both trees are combined. Let's first select
# this sample in `data_train`.

# %%
-sample = data_train.iloc[[-2]]
+sample = data_train.iloc[[-7]]
x_sample = sample["Feature"].iloc[0]
-target_true = target_train.iloc[-2]
-target_true_residual = residuals.iloc[-2]
+target_true = target_train.iloc[-7]
+target_true_residual = residuals.iloc[-7]

# %% [markdown]
-# Let's plot the previous information and highlight our sample of interest.
-# Let's start by plotting the original data and the prediction of the first
-# decision tree.
+# Let's plot the original data, the predictions of the initial decision tree,
+# and highlight our sample of interest: this is just a zoom of the plot
+# displaying the initial shallow tree.

# %%
-# Plot the previous information:
-#   * the dataset
-#   * the predictions
-#   * the residuals
-
-sns.scatterplot(
-    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
+handles, ax = plot_decision_tree_with_residuals(
+    target_train, target_train_predicted, target_test_predicted
)
-plt.plot(data_test["Feature"], target_test_predicted, "--")
-for value, true, predicted in zip(
-    data_train["Feature"], target_train, target_train_predicted
-):
-    lines_residuals = plt.plot([value, value], [true, predicted], color="red")
-
-# Highlight the sample of interest
-plt.scatter(
+ax.scatter(
    sample, target_true, label="Sample of interest", color="tab:orange", s=200
)
-plt.xlim([-1, 0])
-plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
-_ = plt.title("Tree predictions")
+ax.set_xlim([-1, 0])
+ax.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
+_ = ax.set_title("Zoom of sample of interest\nin the initial decision tree")

# %% [markdown]
-# Now, let's plot the residuals information. We will plot the residuals computed
-# from the first decision tree and show the residual predictions.
+# Similarly, we plot a zoom of the plot with the prediction of the initial
+# residuals.

# %%
handles, ax = plot_decision_tree_with_residuals(
    residuals,
    target_train_predicted_residuals,
    target_test_predicted_residuals,
)
plt.scatter(
    sample,
    target_true_residual,
@@ -211,14 +201,18 @@ def generate_data(n_samples=50):
    color="tab:orange",
    s=200,
)
-plt.xlim([-1, 0])
-plt.legend()
-_ = plt.title("Prediction of the residuals")
+legend_labels = [
+    "Predicted residuals",
+    "Residuals of the\npredicted residuals",
+]
+ax.set_xlim([-1, 0])
+ax.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
+_ = ax.set_title("Zoom of sample of interest\nin the initial residuals")

# %% [markdown]
# For our sample of interest, our initial tree is making an error (small
# residual). When fitting the second tree, the residual in this case is
-# perfectly fitted and predicted. We will quantitatively check this prediction
+# perfectly fitted and predicted. We can quantitatively check this prediction
# using the fitted tree. First, let's check the prediction of the initial tree
# and compare it with the true value.

@@ -265,7 +259,9 @@ def generate_data(n_samples=50):
# second tree corrects the first tree's error, while the third tree corrects the
# second tree's error and so on).
#
-# We will compare the generalization performance of random-forest and gradient
+# ## First comparison of GBDT vs. random forests
+#
+# We now compare the generalization performance of random-forest and gradient
# boosting on the California housing dataset.

# %%
@@ -322,11 +318,12 @@ def generate_data(n_samples=50):
print(f"Average score time: {cv_results_rf['score_time'].mean():.3f} seconds")

# %% [markdown]
-# In term of computation performance, the forest can be parallelized and will
+# In terms of computing performance, the forest can be parallelized and thus
# benefit from using multiple cores of the CPU. In terms of scoring performance,
# both algorithms lead to very close results.
#
-# However, we see that the gradient boosting is a very fast algorithm to predict
-# compared to random forest. This is due to the fact that gradient boosting uses
-# shallow trees. We will go into details in the next notebook about the
-# hyperparameters to consider when optimizing ensemble methods.
+# However, we see that gradient boosting is overall faster than random forest.
+# One of the reasons is that random forests typically rely on deep trees (that
+# overfit individually) whereas boosting models build shallow trees (that
+# underfit individually) which are faster to fit and predict. In the following
+# exercise, we explore in more depth how these two models compare.
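To make the additive intuition above concrete, here is a standalone sketch (an editorial illustration, not part of the patch) using `staged_predict`, which yields the ensemble prediction after each successive tree; the test error typically shrinks as trees are added:

```python
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

data, target = fetch_california_housing(return_X_y=True, as_frame=True)
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0
)
gbdt = GradientBoostingRegressor(n_estimators=100, random_state=0)
gbdt.fit(data_train, target_train)
# One prediction per boosting stage: each new tree refines the previous ones.
errors = [
    mean_absolute_error(target_test, predictions)
    for predictions in gbdt.staged_predict(data_test)
]
print(f"Test MAE after 1 tree: {errors[0]:.3f}")
print(f"Test MAE after {len(errors)} trees: {errors[-1]:.3f}")
```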
From 6c7355271f77534758e7f6f9c3529093021a237f Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 17 May 2024 10:48:12 +0200
Subject: [PATCH 107/108] ENH Improve wording in group-aware cross-validation
 notebook (#776)

Co-authored-by: ArturoAmorQ
Co-authored-by: Guillaume Lemaitre
---
 python_scripts/cross_validation_grouping.py | 76 +++++++++++--------
 1 file changed, 47 insertions(+), 29 deletions(-)

diff --git a/python_scripts/cross_validation_grouping.py b/python_scripts/cross_validation_grouping.py
index 3c473ecdf..20347b0af 100644
--- a/python_scripts/cross_validation_grouping.py
+++ b/python_scripts/cross_validation_grouping.py
@@ -7,9 +7,8 @@

# %% [markdown]
# # Sample grouping
-# We are going to linger into the concept of sample groups. As in the previous
-# section, we will give an example to highlight some surprising results. This
-# time, we will use the handwritten digits dataset.
+# In this notebook we present the concept of **sample groups**. We use the
+# handwritten digits dataset to highlight some surprising results.

# %%
from sklearn.datasets import load_digits
@@ -18,8 +17,17 @@
data, target = digits.data, digits.target

# %% [markdown]
-# We will recreate the same model used in the previous notebook: a logistic
-# regression classifier with a preprocessor to scale the data.
+# We create a model consisting of a logistic regression classifier with a
+# preprocessor to scale the data.
+#
+# ```{note}
+# Here we use a `MinMaxScaler` as we know that each pixel's gray-scale is
+# strictly bounded between 0 (white) and 16 (black). This makes `MinMaxScaler`
+# more suited in this case than `StandardScaler`, as some pixels consistently
+# have low variance (pixels at the borders might almost always be zero if most
+# digits are centered in the image). In that case, using `StandardScaler` can
+# result in a very high scaled value due to division by a small number.
+# ```

# %%
from sklearn.preprocessing import MinMaxScaler
@@ -29,8 +37,10 @@
model = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1_000))

# %% [markdown]
-# We will use the same baseline model. We will use a `KFold` cross-validation
-# without shuffling the data at first.
+# The idea is to compare the estimated generalization performance using
+# different cross-validation techniques and see how such estimations are
+# impacted by underlying data structures. We first use a `KFold`
+# cross-validation without shuffling the data.

# %%
from sklearn.model_selection import cross_val_score, KFold
@@ -59,9 +69,9 @@
)

# %% [markdown]
-# We observe that shuffling the data improves the mean accuracy. We could go a
-# little further and plot the distribution of the testing score. We can first
-# concatenate the test scores.
+# We observe that shuffling the data improves the mean accuracy. We can go a
+# little further and plot the distribution of the testing score. For this
+# purpose, we concatenate the test scores.

# %%
import pandas as pd
@@ -72,29 +82,29 @@
).T

# %% [markdown]
-# Let's plot the distribution now.
+# Let's now plot the score distributions.
# %%
import matplotlib.pyplot as plt

-all_scores.plot.hist(bins=10, edgecolor="black", alpha=0.7)
+all_scores.plot.hist(bins=16, edgecolor="black", alpha=0.7)
plt.xlim([0.8, 1.0])
plt.xlabel("Accuracy score")
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = plt.title("Distribution of the test scores")

# %% [markdown]
-# The cross-validation testing error that uses the shuffling has less variance
-# than the one that does not impose any shuffling. It means that some specific
-# fold leads to a low score in this case.
+# Shuffling the data results in a higher cross-validated test accuracy with less
+# variance compared to when the data is not shuffled. It means that some
+# specific fold leads to a low score in this case.

# %%
print(test_score_no_shuffling)

# %% [markdown]
-# Thus, there is an underlying structure in the data that shuffling will break
-# and get better results. To get a better understanding, we should read the
-# documentation shipped with the dataset.
+# Shuffling the data breaks the underlying structure and thus makes the
+# classification task easier for our model. To get a better understanding, we
+# can read the dataset description in more detail:

# %%
print(digits.DESCR)
@@ -165,7 +175,7 @@
groups[lb:up] = group_id

# %% [markdown]
-# We can check the grouping by plotting the indices linked to writer ids.
+# We can check the grouping by plotting the indices linked to writers' ids.

# %%
plt.plot(groups)
@@ -176,8 +186,9 @@
_ = plt.title("Underlying writer groups existing in the target")

# %% [markdown]
-# Once we group the digits by writer, we can use cross-validation to take this
-# information into account: the class containing `Group` should be used.
+# Once we group the digits by writer, we can incorporate this information into
+# the cross-validation process by using group-aware variations of the strategies
+# we have explored in this course, for example, the `GroupKFold` strategy.

# %%
from sklearn.model_selection import GroupKFold
@@ -191,10 +202,12 @@
)

# %% [markdown]
-# We see that this strategy is less optimistic regarding the model
-# generalization performance. However, this is the most reliable if our goal is
-# to make handwritten digits recognition writers independent. Besides, we can as
-# well see that the standard deviation was reduced.
+# We see that this strategy leads to a lower generalization performance than the
+# other two techniques. However, this is the most reliable estimate if our goal
+# is to evaluate the capabilities of the model to generalize to new unseen
+# writers. In this sense, shuffling the dataset (or alternatively using the
+# writers' ids as a new feature) would lead the model to memorize the different
+# writers' particular handwriting.

# %%
all_scores = pd.DataFrame(
@@ -207,13 +220,18 @@
).T

# %%
-all_scores.plot.hist(bins=10, edgecolor="black", alpha=0.7)
+all_scores.plot.hist(bins=16, edgecolor="black", alpha=0.7)
plt.xlim([0.8, 1.0])
plt.xlabel("Accuracy score")
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = plt.title("Distribution of the test scores")
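Before the conclusion below, a toy illustration of the guarantee `GroupKFold` provides (an editorial sketch, not part of the patch): samples sharing a group id always land on the same side of the split, so a "writer" never appears in both the train and the test fold.

```python
import numpy as np
from sklearn.model_selection import GroupKFold

toy_data = np.arange(12).reshape(6, 2)
toy_groups = np.array([1, 1, 2, 2, 3, 3])  # three "writers", two samples each
for train_index, test_index in GroupKFold(n_splits=3).split(
    toy_data, groups=toy_groups
):
    print(
        "train groups:", sorted(set(toy_groups[train_index])),
        "test groups:", sorted(set(toy_groups[test_index])),
    )
```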
# %% [markdown]
-# As a conclusion, it is really important to take any sample grouping pattern
-# into account when evaluating a model. Otherwise, the results obtained will be
-# over-optimistic in regards with reality.
+# In conclusion, accounting for any sample grouping patterns is crucial when
+# assessing a model's ability to generalize to new groups. Without this
+# consideration, the results may appear overly optimistic compared to the actual
+# performance.
+#
+# The interested reader can learn about other group-aware cross-validation
+# techniques in the [scikit-learn user
+# guide](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data).

From 00379f891c68e5c3f5e96dcdd250a87e2a3b437e Mon Sep 17 00:00:00 2001
From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Date: Fri, 17 May 2024 10:50:08 +0200
Subject: [PATCH 108/108] FIX correction for some typos (#779)

Co-authored-by: ArturoAmorQ
---
 python_scripts/cross_validation_ex_01.py            | 2 +-
 python_scripts/cross_validation_sol_01.py           | 2 +-
 python_scripts/linear_regression_non_linear_link.py | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python_scripts/cross_validation_ex_01.py b/python_scripts/cross_validation_ex_01.py
index 8fd6e4ea8..bf332c1e9 100644
--- a/python_scripts/cross_validation_ex_01.py
+++ b/python_scripts/cross_validation_ex_01.py
@@ -45,7 +45,7 @@
# exercise.
#
# Also, this classifier can become more flexible/expressive by using a so-called
-# kernel that makes the model become non-linear. Again, no requirement regarding
+# kernel that makes the model become non-linear. Again, no understanding regarding
# the mathematics is required to accomplish this exercise.
#
# We will use an RBF kernel where a parameter `gamma` allows tuning the
diff --git a/python_scripts/cross_validation_sol_01.py b/python_scripts/cross_validation_sol_01.py
index 78518309c..4221c75e7 100644
--- a/python_scripts/cross_validation_sol_01.py
+++ b/python_scripts/cross_validation_sol_01.py
@@ -39,7 +39,7 @@
# exercise.
#
# Also, this classifier can become more flexible/expressive by using a so-called
-# kernel that makes the model become non-linear. Again, no requirement regarding
+# kernel that makes the model become non-linear. Again, no understanding regarding
# the mathematics is required to accomplish this exercise.
#
# We will use an RBF kernel where a parameter `gamma` allows tuning the
diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py
index ca88b8799..155175a20 100644
--- a/python_scripts/linear_regression_non_linear_link.py
+++ b/python_scripts/linear_regression_non_linear_link.py
@@ -37,7 +37,7 @@

# %% [markdown]
# ```{tip}
-# `np.random.RandomState` allows to create a random number generator which can
+# `np.random.RandomState` allows creating a random number generator which can
# be later used to get deterministic results.
# ```
#
@@ -172,6 +172,7 @@
# of the absolute values of the differences between the features generated by
# both methods and checking that it is close to zero:

+# %%
np.abs(polynomial_expansion.fit_transform(data) - data_expanded).max()

# %% [markdown]