From 216bf472a779cd9b8ced8b4ec630b39359c090b7 Mon Sep 17 00:00:00 2001
From: Bihan Rana
Date: Thu, 5 Sep 2024 21:27:08 +0545
Subject: [PATCH] Add TPU examples with optimum-tpu and vLLM

---
 docs/docs/concepts/fleets.md | 4 +-
 docs/examples/accelerators/tpu/index.md | 0
 examples/accelerators/tpu/README.md | 199 ++++++++++++++++++
 examples/deployment/optimum-tpu/.dstack.yml | 18 ++
 .../deployment/optimum-tpu/service.dstack.yml | 28 +++
 .../deployment/optimum-tpu/task.dstack.yml | 23 ++
 .../deployment/vllm/service-tpu.dstack.yml | 40 ++++
 .../optimum-tpu/llama31/.dstack.yml | 31 +++
 .../optimum-tpu/llama31/config.yaml | 10 +
 .../optimum-tpu/llama31/train.dstack.yml | 25 +++
 .../fine-tuning/optimum-tpu/llama31/train.py | 140 ++++++++++++
 mkdocs.yml | 1 +
 12 files changed, 517 insertions(+), 2 deletions(-)
 create mode 100644 docs/examples/accelerators/tpu/index.md
 create mode 100644 examples/accelerators/tpu/README.md
 create mode 100644 examples/deployment/optimum-tpu/.dstack.yml
 create mode 100644 examples/deployment/optimum-tpu/service.dstack.yml
 create mode 100644 examples/deployment/optimum-tpu/task.dstack.yml
 create mode 100644 examples/deployment/vllm/service-tpu.dstack.yml
 create mode 100644 examples/fine-tuning/optimum-tpu/llama31/.dstack.yml
 create mode 100644 examples/fine-tuning/optimum-tpu/llama31/config.yaml
 create mode 100644 examples/fine-tuning/optimum-tpu/llama31/train.dstack.yml
 create mode 100644 examples/fine-tuning/optimum-tpu/llama31/train.py

diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md
index 76eb56c56..f9639ddb9 100644
--- a/docs/docs/concepts/fleets.md
+++ b/docs/docs/concepts/fleets.md
@@ -223,8 +223,8 @@ you can set the [`termination_idle_time`](../reference/dstack.yml/fleet.md#termi
 
 ## What's next?
 
-1. Read about [dev environments](dev-environments.md), [tasks](tasks.md), and
-   [services](services.md)
+1. Read about [dev environments](../dev-environments.md), [tasks](../tasks.md), and
+   [services](../services.md)
 2. Join the community via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd)
 
 !!! info "Reference"
diff --git a/docs/examples/accelerators/tpu/index.md b/docs/examples/accelerators/tpu/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/accelerators/tpu/README.md b/examples/accelerators/tpu/README.md
new file mode 100644
index 000000000..52abefaa6
--- /dev/null
+++ b/examples/accelerators/tpu/README.md
@@ -0,0 +1,199 @@
+# TPU
+
+If you're using the `gcp` backend, you can use TPUs. Just specify the TPU version and the number of cores
+(separated by a dash) in the `gpu` property under `resources`.
+
+> Currently, a maximum of 8 TPU cores can be specified, so the largest supported values are `v2-8`, `v3-8`, `v4-8`, `v5litepod-8`,
+> and `v5e-8`. Multi-host TPU support, allowing for larger numbers of cores, is coming soon.
+
+Below are a few examples of using TPUs for deployment and fine-tuning.
+
+## Deployment
+
+### Running as a service
+You can use any serving framework, such as vLLM or TGI. Here's an example of a [service](https://dstack.ai/docs/services) that deploys
+Llama 3.1 8B using
+[Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"}
+and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm){:target="_blank"}.
+
+=== "Optimum TPU"
+ + ```yaml + type: service + name: llama31-service-optimum-tpu + + image: dstackai/optimum-tpu:llama31 + env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_TOTAL_TOKENS=4096 + - MAX_BATCH_PREFILL_TOKENS=4095 + commands: + - text-generation-launcher --port 8000 + port: 8000 + + spot_policy: auto + resources: + gpu: v5litepod-4 + + model: + format: tgi + type: chat + name: meta-llama/Meta-Llama-3.1-8B-Instruct + ``` +

    Note that Optimum TPU sets `MAX_INPUT_TOKEN` to 4095 by default, so we must set `MAX_BATCH_PREFILL_TOKENS` to 4095.

    ??? info "Docker image"
        The official Docker image `huggingface/optimum-tpu:latest` doesn’t support Llama 3.1 8B.
        We’ve created a custom image with the fix: `dstackai/optimum-tpu:llama31`.
        Once the [pull request :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu/pull/87){:target="_blank"} is merged,
        the official Docker image can be used.

=== "vLLM"
+ + ```yaml + type: service + name: llama31-service-vllm-tpu + + env: + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - HUGGING_FACE_HUB_TOKEN + - DATE=20240828 + - TORCH_VERSION=2.5.0 + - VLLM_TARGET_DEVICE=tpu + - MAX_MODEL_LEN=4096 + commands: + - pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install -r requirements-tpu.txt + - apt-get install -y libopenblas-base libopenmpi-dev libomp-dev + - python setup.py develop + - vllm serve $MODEL_ID + --tensor-parallel-size 4 + --max-model-len $MAX_MODEL_LEN + --port 8000 + port: + - 8000 + + spot_policy: auto + resources: + gpu: v5litepod-4 + + model: + format: openai + type: chat + name: meta-llama/Meta-Llama-3.1-8B-Instruct + ``` +

    Note that when using Llama 3.1 8B with a `v5litepod`, which has 16GB of memory per core, we must limit the context size to 4096 tokens to fit into memory.

### Memory requirements

Below are the approximate memory requirements for serving LLMs with their corresponding TPUs.

| Model size | bfloat16 | TPU          | int8  | TPU          |
|------------|----------|--------------|-------|--------------|
| **8B**     | 16GB     | v5litepod-4  | 8GB   | v5litepod-4  |
| **70B**    | 140GB    | v5litepod-16 | 70GB  | v5litepod-16 |
| **405B**   | 810GB    | v5litepod-64 | 405GB | v5litepod-64 |

Note that `v5litepod` is optimized for serving transformer-based models. Each core is equipped with 16GB of memory.

### Supported frameworks

| Framework | Quantization   | Note                                                                                                                                                                                                                                                                                              |
|-----------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **TGI**   | bfloat16       | To deploy with TGI, Optimum TPU must be used.                                                                                                                                                                                                                                                     |
| **vLLM**  | int8, bfloat16 | int8 quantization still requires the same memory because the weights are first moved to the TPU in bfloat16, and then converted to int8. See the [pull request :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm/pull/7005){:target="_blank"} for more details. |

### Running a configuration

Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the
cloud resources and run the configuration.

## Fine-tuning with Optimum TPU

Below is an example of fine-tuning Llama 3.1 8B using [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"}
and the [Abirate/english_quotes :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/Abirate/english_quotes){:target="_blank"}
dataset.
+ +```yaml +type: task +name: optimum-tpu-llama-train + +python: "3.11" + +env: + - HUGGING_FACE_HUB_TOKEN +commands: + - git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git + - mkdir -p optimum-tpu/examples/custom/ + - cp examples/fine-tuning/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py + - cp examples/fine-tuning/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml + - cd optimum-tpu + - pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install datasets evaluate + - pip install accelerate -U + - pip install peft + - python examples/custom/train.py examples/custom/config.yaml + + +resources: + gpu: v5litepod-8 +``` + +
+ +[//]: # (### Fine-Tuning with TRL) +[//]: # (Use the example `examples/fine-tuning/optimum-tpu/gemma/train.dstack.yml` to Finetune `Gemma-2B` model using `trl` with `dstack` and `optimum-tpu`. ) + +### Memory requirements + +Below are the approximate memory requirements for fine-tuning LLMs with their corresponding TPUs. + +| Model size | LoRA | TPU | +|------------|-------|--------------| +| **8B** | 16GB | v5litepod-8 | +| **70B** | 160GB | v5litepod-16 | +| **405B** | 950GB | v5litepod-64 | + +Note, `v5litepod` is optimized for fine-tuning transformer-based models. Each core is equipped with 16GB of memory. + +### Supported frameworks + +| Framework | Quantization | Note | +|-----------------|--------------|---------------------------------------------------------------------------------------------------| +| **TRL** | bfloat16 | To fine-tune using TRL, Optimum TPU is recommended. TRL doesn't support Llama 3.1 out of the box. | +| **Pytorch XLA** | bfloat16 | | + +## Dev environments + +Before running a task or service, it's recommended that you first start with +a [dev environment](https://dstack.ai/docs/dev-environments). Dev environments +allow you to run commands interactively. + +## Source code + +The source-code of this example can be found in +[examples/deployment/optimum-tpu :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/llms/llama31){:target="_blank"} +and [examples/fine-tuning/optimum-tpu :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/trl){:target="_blank"}. + +## What's next? + +1. Browse [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu), + [Optimum TPU TGI :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference) and + [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html). +2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), + [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/fleets). 
diff --git a/examples/deployment/optimum-tpu/.dstack.yml b/examples/deployment/optimum-tpu/.dstack.yml new file mode 100644 index 000000000..f34d3e9bb --- /dev/null +++ b/examples/deployment/optimum-tpu/.dstack.yml @@ -0,0 +1,18 @@ +type: dev-environment +# The name is optional, if not specified, generated randomly +name: vscode-optimum-tpu + +# Using a Docker image with a fix instead of the official one +# More details at https://github.com/huggingface/optimum-tpu/pull/87 +image: dstackai/optimum-tpu:llama31 +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN +ide: vscode + +resources: + # Required resources + gpu: v5litepod-4 + +# Use either spot or on-demand instances +spot_policy: auto diff --git a/examples/deployment/optimum-tpu/service.dstack.yml b/examples/deployment/optimum-tpu/service.dstack.yml new file mode 100644 index 000000000..1b9ad8db3 --- /dev/null +++ b/examples/deployment/optimum-tpu/service.dstack.yml @@ -0,0 +1,28 @@ +type: service +# The name is optional, if not specified, generated randomly +name: llama31-service-optimum-tpu + +# Using a Docker image with a fix instead of the official one +# More details at https://github.com/huggingface/optimum-tpu/pull/87 +image: dstackai/optimum-tpu:llama31 +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_TOTAL_TOKENS=4096 + - MAX_BATCH_PREFILL_TOKENS=4095 +commands: + - text-generation-launcher --port 8000 +port: 8000 + +resources: + # Required resources + gpu: v5litepod-4 + +# Use either spot or on-demand instances +spot_policy: auto + +model: + format: tgi + type: chat + name: meta-llama/Meta-Llama-3.1-8B-Instruct \ No newline at end of file diff --git a/examples/deployment/optimum-tpu/task.dstack.yml b/examples/deployment/optimum-tpu/task.dstack.yml new file mode 100644 index 000000000..8a581e14b --- /dev/null +++ b/examples/deployment/optimum-tpu/task.dstack.yml @@ -0,0 +1,23 @@ +type: task +# The name is optional, if not specified, generated randomly +name: llama31-task-optimum-tpu + +# Using a Docker image with a fix instead of the official one +# More details at https://github.com/huggingface/optimum-tpu/pull/87 +image: dstackai/optimum-tpu:llama31 +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - MAX_TOTAL_TOKENS=4096 + - MAX_BATCH_PREFILL_TOKENS=4095 +commands: + - text-generation-launcher --port 8000 +ports: [8000] + +resources: + # Required resources + gpu: v5litepod-4 + +# Use either spot or on-demand instances +spot_policy: auto \ No newline at end of file diff --git a/examples/deployment/vllm/service-tpu.dstack.yml b/examples/deployment/vllm/service-tpu.dstack.yml new file mode 100644 index 000000000..230a1c539 --- /dev/null +++ b/examples/deployment/vllm/service-tpu.dstack.yml @@ -0,0 +1,40 @@ +type: service +# The name is optional, if not specified, generated randomly +name: llama31-service-vllm-tpu + +env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-8B-Instruct + - DATE=20240828 + - TORCH_VERSION=2.5.0 + - VLLM_TARGET_DEVICE=tpu + - MAX_MODEL_LEN=4096 + +commands: + - pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp311-cp311-linux_x86_64.whl + - pip install torch_xla[tpu] -f 
https://storage.googleapis.com/libtpu-releases/index.html
+  - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+  - git clone https://github.com/vllm-project/vllm.git
+  - cd vllm
+  - pip install -r requirements-tpu.txt
+  - apt-get install -y libopenblas-base libopenmpi-dev libomp-dev
+  - python setup.py develop
+  - vllm serve $MODEL_ID
+    --tensor-parallel-size 4
+    --max-model-len $MAX_MODEL_LEN
+    --port 8000
+
+# Expose the vllm server port
+port: 8000
+
+spot_policy: auto
+
+resources:
+  gpu: v5litepod-4
+
+# (Optional) Enable the OpenAI-compatible endpoint
+model:
+  format: openai
+  type: chat
+  name: meta-llama/Meta-Llama-3.1-8B-Instruct
\ No newline at end of file
diff --git a/examples/fine-tuning/optimum-tpu/llama31/.dstack.yml b/examples/fine-tuning/optimum-tpu/llama31/.dstack.yml
new file mode 100644
index 000000000..8dc522e0e
--- /dev/null
+++ b/examples/fine-tuning/optimum-tpu/llama31/.dstack.yml
@@ -0,0 +1,31 @@
+type: dev-environment
+# The name is optional, if not specified, generated randomly
+name: optimum-tpu-vscode

+# If `image` is not specified, dstack uses its default image
+python: "3.11"
+
+# Required environment variables
+env:
+  - HUGGING_FACE_HUB_TOKEN
+
+# Refer to the Note section in examples/accelerators/tpu/README.md for more information about the optimum-tpu repository.
+# Uncomment if you want the dependencies to be pre-installed
+#init:
+#  - git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git
+#  - mkdir -p optimum-tpu/examples/custom/
+#  - cp examples/fine-tuning/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py
+#  - cp examples/fine-tuning/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml
+#  - cd optimum-tpu
+#  - pip install -e . 
-f https://storage.googleapis.com/libtpu-releases/index.html +# - pip install datasets evaluate +# - pip install accelerate -U +# - pip install peft + +ide: vscode + +# Use either spot or on-demand instances +spot_policy: auto + +resources: + gpu: v5litepod-8 \ No newline at end of file diff --git a/examples/fine-tuning/optimum-tpu/llama31/config.yaml b/examples/fine-tuning/optimum-tpu/llama31/config.yaml new file mode 100644 index 000000000..4b0aea529 --- /dev/null +++ b/examples/fine-tuning/optimum-tpu/llama31/config.yaml @@ -0,0 +1,10 @@ +per_device_train_batch_size: 24 +per_device_eval_batch_size: 8 +num_train_epochs: 1 +max_steps: -1 +output_dir: "./finetuned_models/llama3_fine_tuned" +optim: "adafactor" +dataset_name: "Abirate/english_quotes" +model_name: "meta-llama/Meta-Llama-3.1-8B" +lora_r: 4 +push_to_hub: True \ No newline at end of file diff --git a/examples/fine-tuning/optimum-tpu/llama31/train.dstack.yml b/examples/fine-tuning/optimum-tpu/llama31/train.dstack.yml new file mode 100644 index 000000000..04fdfb744 --- /dev/null +++ b/examples/fine-tuning/optimum-tpu/llama31/train.dstack.yml @@ -0,0 +1,25 @@ +type: task +# The name is optional, if not specified, generated randomly +name: optimum-tpu-llama-train + +python: "3.11" + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + +# Commands of the task +commands: + - git clone -b add_llama_31_support https://github.com/dstackai/optimum-tpu.git + - mkdir -p optimum-tpu/examples/custom/ + - cp examples/fine-tuning/optimum-tpu/llama31/train.py optimum-tpu/examples/custom/train.py + - cp examples/fine-tuning/optimum-tpu/llama31/config.yaml optimum-tpu/examples/custom/config.yaml + - cd optimum-tpu + - pip install -e . -f https://storage.googleapis.com/libtpu-releases/index.html + - pip install datasets evaluate + - pip install accelerate -U + - pip install peft + - python examples/custom/train.py examples/custom/config.yaml + +resources: + gpu: v5litepod-8 \ No newline at end of file diff --git a/examples/fine-tuning/optimum-tpu/llama31/train.py b/examples/fine-tuning/optimum-tpu/llama31/train.py new file mode 100644 index 000000000..0c8c8a614 --- /dev/null +++ b/examples/fine-tuning/optimum-tpu/llama31/train.py @@ -0,0 +1,140 @@ +from dataclasses import dataclass, field +from typing import Optional + +from datasets import load_dataset +from optimum.tpu import AutoModelForCausalLM, fsdp_v2 +from peft import LoraConfig, TaskType, get_peft_model +from transformers import ( + AutoTokenizer, + DataCollatorForLanguageModeling, + HfArgumentParser, + Trainer, + TrainingArguments, +) + + +@dataclass +class ScriptArguments: + per_device_train_batch_size: Optional[int] = field( + default=8, metadata={"help": "Batch size per device for training."} + ) + per_device_eval_batch_size: Optional[int] = field( + default=8, metadata={"help": "Batch size per device for evaluation."} + ) + num_train_epochs: Optional[int] = field( + default=1, + metadata={"help": "The number of training epochs for the SFTTrainer."}, + ) + max_steps: int = field( + default=-1, metadata={"help": "How many optimizer update steps to take"} + ) + output_dir: str = field( + default="./results", + metadata={ + "help": "The output directory where the model predictions and checkpoints will be written." 
+ }, + ) + optim: Optional[str] = field( + default="adafactor", + metadata={"help": "The optimizer to use."}, + ) + dataset_name: Optional[str] = field( + default="Abirate/english_quotes", + metadata={"help": "The dataset to use."}, + ) + model_name: Optional[str] = field( + default="meta-llama/Meta-Llama-3.1-8B", + metadata={ + "help": "Only models Gemma 2B, Gemma 7B, Llama-2 7B and Llama-3 8B Llama-3.1 8B are tested with TPU v5e" + }, + ) + lora_r: Optional[int] = field(default=4, metadata={"help": "LoRA attention dimension."}) + max_seq_length: Optional[int] = field( + default=1024, metadata={"help": "Maximum sequence length to use."} + ) + packing: Optional[bool] = field( + default=True, + metadata={"help": "Use packing dataset creating."}, + ) + push_to_hub: Optional[bool] = field( + default=True, + metadata={"help": "Push fined tuned model to hub."}, + ) + + +def create_and_prepare_model(args): + base_model = AutoModelForCausalLM.from_pretrained(args.model_name) + lora_config = LoraConfig( + r=args.lora_r, # the dimension of the low-rank matrices + lora_alpha=8, # scaling factor for LoRA activations vs pre-trained weight activations + lora_dropout=0.05, + bias="none", + inference_mode=False, + task_type=TaskType.CAUSAL_LM, + target_modules=["o_proj", "v_proj"], + ) # + + model = get_peft_model(base_model, lora_config) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + # Add custom token for padding Llama + tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token}) + return model, tokenizer + + +def create_and_prepare_trainer(model, tokenizer, dataset, args): + data = dataset.map(lambda samples: tokenizer(samples["quote"]), batched=True) + fsdp_training_args = fsdp_v2.get_fsdp_training_args(model) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=args.per_device_train_batch_size, + num_train_epochs=args.num_train_epochs, + max_steps=args.max_steps, + output_dir=args.output_dir, + optim=args.optim, + logging_steps=1, + dataloader_drop_last=True, # Required by FSDP v2 and SPMD. + **fsdp_training_args, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + + return trainer + + +def parse_config() -> ScriptArguments: + import sys + + import yaml + + # Ensure a YAML file is provided as an argument + if len(sys.argv) != 2: + sys.exit(1) + + config_path = sys.argv[1] + + # Read the YAML file + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + # Parse arguments using HfArgumentParser + parser = HfArgumentParser(ScriptArguments) + script_args = parser.parse_dict(config)[0] + return script_args + + +if __name__ == "__main__": + args = parse_config() + fsdp_v2.use_fsdp_v2() + dataset = load_dataset(args.dataset_name) + model, tokenizer = create_and_prepare_model(args) + trainer = create_and_prepare_trainer(model, tokenizer, dataset, args) + trainer.train() + if args.push_to_hub: + kwargs = { + "finetuned_from": args.model_name, + "dataset": args.dataset_name, + } + trainer.push_to_hub(**kwargs) diff --git a/mkdocs.yml b/mkdocs.yml index 2b8d2a029..59b3b03ff 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -235,6 +235,7 @@ nav: - TRL: examples/fine-tuning/trl/index.md - Accelerators: - AMD: examples/accelerators/amd/index.md + - TPU: examples/accelerators/tpu/index.md - LLMs: - Llama 3.1: examples/llms/llama31/index.md - Blog: