Changes for orcacode experiment #3612

Merged · 7 commits · Jul 29, 2023
72 changes: 72 additions & 0 deletions model/model_training/configs/config.yaml
@@ -847,3 +847,75 @@ falcon_7b_ntk_test:
    alpha: 2
  datasets:
    - dolly15k

llama2_13b_orcacode2_8k:
  rng_seed: 0xe1291f21
  random_offset_probability: 0.0
  use_custom_sampler: true
  sort_by_length: false
  dtype: fp16
  log_dir: "llama2_log_13b_orcacode2_8k"
  output_dir: llama2_13b_orcacode2_8k
  learning_rate: 1e-5
  model_name: OpenAssistant/llama2-13b-orca-8k-3319
  deepspeed_config: configs/zero_config_pretrain.json
  weight_decay: 1e-6
  max_length: 8192
  warmup_steps: 100
  peft_model: false
  use_flash_attention: true
  gradient_checkpointing: true
  gradient_accumulation_steps: 4
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 1
  residual_dropout: 0.0
  eval_steps: 200
  save_steps: 500 # (total steps: 1558, bs: 64)
  num_train_epochs: 1
  save_total_limit: 4
  datasets:
    - dolphin-mix:
        num_samples: 1000000 # total entries 2840090
        max_char_len: 32000
        val_split: 0.1
        max_val_set: 2000
        seed: 44
    - oasst_export:
        lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
        input_file_path: 2023-07-23_oasst_ready.tar.gz
        top_k: 1
        val_split: 0.05
    - wizard_evol_instruct_v2:
        val_split: 0.01
        fraction: 0.1
    - evol-codealpaca-v1:
        fill_min_length: 20000
        val_split: 0.1
    - cot_submix_original:
        fill_min_length: 20000
        val_split: 0.1
    - megacode:
        fill_min_length: 24000
        val_split: 0.1
        max_val_set: 1000
    - evol_instruct_code:
        fill_min_length: 24000
        val_split: 0.1
        max_val_set: 1000
  # Dataset composition:
  # Train:
  #   dolphin-mix: 40374
  #   oasst_export: 11441
  #   wizard_evol_instruct_v2: 15236
  #   evol-codealpaca-v1: 5623
  #   cot_submix_original: 8651
  #   megacode: 14320
  #   evol_instruct_code: 4093
  # Valid:
  #   dolphin-mix: 2000
  #   oasst_export: 603
  #   wizard_evol_instruct_v2: 1540
  #   evol-codealpaca-v1: 625
  #   cot_submix_original: 962
  #   megacode: 1000
  #   evol_instruct_code: 455
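The save_steps comment above assumes an effective batch size of 64. A minimal sanity check of that arithmetic, using the per-dataset train counts from the composition comment and assuming 8 training GPUs (the world size is not part of this config):

# Rough check of the "(total steps: 1558, bs: 64)" comment.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
num_gpus = 8  # assumed, not stated in the config
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus  # 64

# Train-split sizes from the "Dataset composition" comment above.
train_counts = {
    "dolphin-mix": 40374,
    "oasst_export": 11441,
    "wizard_evol_instruct_v2": 15236,
    "evol-codealpaca-v1": 5623,
    "cot_submix_original": 8651,
    "megacode": 14320,
    "evol_instruct_code": 4093,
}
total_train_samples = sum(train_counts.values())  # 99738
steps_per_epoch = total_train_samples // effective_batch_size
print(effective_batch_size, steps_per_epoch)  # 64 1558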
5 changes: 4 additions & 1 deletion model/model_training/custom_datasets/__init__.py
@@ -20,6 +20,7 @@
    TranslatedQA,
    Vicuna,
    WebGPT,
    WizardEvolInstructV2,
    load_alpaca_dataset,
)
from model_training.custom_datasets.rank_datasets import AugmentedOA
@@ -110,7 +111,7 @@ def get_one_dataset(
        eval = SummarizationDataset(dataset_name, data_path, "validation")
        train = dataset
    elif dataset_name in INSTRUCTION_DATASETS:
        dataset = InstructionDataset(dataset_name, data_path, "train")
        dataset = InstructionDataset(dataset_name, data_path, "train", **kwargs)
    elif "ted_trans" in dataset_name:
        language_pair = dataset_name.split("_")[-1]
        dataset = TEDTalk(pair=language_pair, split="train")
@@ -143,6 +144,8 @@ def get_one_dataset(
        dataset = TranslatedQA(data_path)
    elif dataset_name == "vicuna":
        dataset = Vicuna(cache_dir=data_path, **kwargs)
    elif dataset_name == "wizard_evol_instruct_v2":
        dataset = WizardEvolInstructV2(cache_dir=data_path, **kwargs)
    elif dataset_name == "oasst_export":
        train, eval = load_oasst_export(data_path=data_path, val_split=val_split, mode=mode, **kwargs)
    elif dataset_name == "hf_summary":
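A minimal usage sketch of the **kwargs change above: per-dataset options from the YAML config (here fill_min_length from the "megacode" entry) were previously not passed to InstructionDataset at this call site and are now forwarded; the cache path is illustrative. The new "wizard_evol_instruct_v2" branch dispatches to the class added in qa_datasets.py further below.

from model_training.custom_datasets.instruction import InstructionDataset

# fill_min_length now arrives via **kwargs instead of being dropped at this call site.
megacode = InstructionDataset("megacode", ".cache/huggingface", "train", fill_min_length=24000)
print(len(megacode))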
7 changes: 6 additions & 1 deletion model/model_training/custom_datasets/instruction.py
@@ -30,6 +30,8 @@
"wizardlm_70k": "ehartford/WizardLM_alpaca_evol_instruct_70k_unfiltered",
"megacode": "rombodawg/MegaCodeTraining112k",
"evol_instruct_code": "nickrosh/Evol-Instruct-Code-80k-v1",
"evol-codealpaca-v1": "theblackcat102/evol-codealpaca-v1",
"cot_submix_original": "conceptofmind/cot_submix_original",
}


@@ -42,9 +44,12 @@ def __init__(self, dataset, cache_dir, split, mode="sft", fill_min_length: Optio
        if dataset == "minimath":
            self.instruction_column = "question"
            self.response_column = "answer"
        elif dataset in ("wizardlm_70k", "evol_instruct_code"):
        elif dataset in ("wizardlm_70k", "evol_instruct_code", "evol-codealpaca-v1"):
            self.instruction_column = "instruction"
            self.response_column = "output"
        elif dataset == "cot_submix_original":
            self.instruction_column = "inputs"
            self.response_column = "targets"
        elif dataset == "megacode":
            self.instruction_column = "prompt"
            self.response_column = "completion"
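Illustrative only: the column pairs the two new mappings read. The example row below is hypothetical and not taken from conceptofmind/cot_submix_original.

# "cot_submix_original" rows expose "inputs"/"targets"; "evol-codealpaca-v1"
# reuses the "instruction"/"output" pair shared with wizardlm_70k.
row = {"inputs": "Q: A car has 4 wheels. How many wheels do 3 cars have?", "targets": "3 * 4 = 12 wheels."}
question, answer = row["inputs"], row["targets"]
print(question, answer, sep="\n")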
68 changes: 34 additions & 34 deletions model/model_training/custom_datasets/prompt_dialogue.py
@@ -2,9 +2,8 @@
import json
import re
from pathlib import Path
from typing import List, Optional, Union
from typing import List, Mapping, Optional, Sequence, Union

import numpy as np
import requests
from datasets import load_dataset
from model_training.custom_datasets.formatting import DatasetEntrySft, Role, Utterance
@@ -199,45 +198,46 @@ def __getitem__(self, idx):
class DolphinMix(Dataset):
name = "dophin-mix"

def __init__(self, cache_dir, num_samples=100000, max_char_len=8000, seed=42):
self.dataset = load_dataset(
"ehartford/dolphin", data_files="flan5m-alpaca-uncensored.jsonl", cache_dir=cache_dir
)
self.dataset = self.dataset["train"].shuffle(seed).select(range(num_samples))
def __init__(
self,
cache_dir: Optional[str] = None,
num_samples: Optional[int] = None,
max_char_len: int = 8000,
seed: int = 42,
data_files: Union[
str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]
] = "flan5m-alpaca-uncensored.jsonl",
split: str = "train",
):
# flan5m-alpaca-uncensored.jsonl has total entries 2840090
self.dataset = load_dataset("ehartford/dolphin", data_files=data_files, cache_dir=cache_dir)
self.dataset = self.dataset[split].shuffle(seed).flatten_indices()
if num_samples:
self.dataset = self.dataset.select(range(num_samples))
self.max_char_len = max_char_len
instructions = set([item["instruction"] for item in self.dataset])
instructions = sorted(set([item["instruction"] for item in self.dataset]))

self.conversations = []
for inst in instructions:
data_sample = self.dataset.filter(lambda example: example["instruction"] == inst)
available_indices = np.arange(0, len(data_sample)).tolist()
removed_indices = []
for idx in available_indices:
conversation_len = len(inst)
if idx not in removed_indices and conversation_len < self.max_char_len:
conversation = {"conversation": []}
conversation["instruction"] = inst
input, output = [data_sample[idx][key] for key in ("input", "output")]
conversation["conversation"].append({"input": input, "output": output})
conversation_len += len(input) + len(output)
removed_indices.append(idx)
while conversation_len < self.max_char_len:
indices_to_pick = np.setdiff1d(available_indices, removed_indices)
if len(indices_to_pick) > 0:
idx = np.random.choice(indices_to_pick, size=1)[0]
input, output = [data_sample[int(idx)][key] for key in ("input", "output")]
conversation["conversation"].append({"input": input, "output": output})
conversation_len += len(input) + len(output)
removed_indices.append(idx)
else:
break

self.conversations.append(conversation)

def __len__(self):
conversation_len = len(inst)
conversation = []
for entry in data_sample:
input, output = entry["input"], entry["output"]
conversation.append({"input": input, "output": output})
conversation_len += len(input) + len(output)
if conversation_len >= self.max_char_len:
self.conversations.append({"conversation": conversation, "instruction": inst})
conversation_len = len(inst)
conversation = []

if len(conversation) > 0:
self.conversations.append({"conversation": conversation, "instruction": inst})

def __len__(self) -> int:
return len(self.conversations)

def __getitem__(self, idx):
def __getitem__(self, idx) -> DatasetEntrySft:
conversation, instruction = [self.conversations[idx][key] for key in ("conversation", "instruction")]
conversation = [(item["input"], item["output"]) for item in conversation]
conversation = list(sum(conversation, ()))
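A short usage sketch of the rewritten packing logic, with the parameter values from the "dolphin-mix" entry in config.yaml above (the cache path is illustrative, and building the full 1M-sample subset is slow):

from model_training.custom_datasets.prompt_dialogue import DolphinMix

# Entries sharing an instruction are packed into multi-turn conversations of
# roughly max_char_len characters; each packed conversation becomes one item.
ds = DolphinMix(cache_dir=".cache/huggingface", num_samples=1_000_000, max_char_len=32_000, seed=44)
print(len(ds))  # number of packed conversations (about 42k per the composition comment above)
print(ds[0])    # a DatasetEntrySft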
34 changes: 31 additions & 3 deletions model/model_training/custom_datasets/qa_datasets.py
@@ -514,9 +514,8 @@ def process_vicuna_conversations(
    def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: int = 32 * 1024) -> None:
        super().__init__()

        self.pairs = []
        if mode not in ("sft", "rl"):
            raise NotImplementedError(f"Currently only the modes 'sft' and 'rl' are implemented. Received {mode}.")
        if mode != "sft":
            raise NotImplementedError(f"Currently only the mode 'sft' is implemented. Received {mode}.")
        self.mode = mode

        dataset = load_dataset(
@@ -526,8 +525,37 @@ def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: i
            revision="7b8551404f3de5704d634e7516b9ff77be3e2700",
        )["train"]

        self.pairs = []
        for data in dataset:
            if (qa := self.process_vicuna_conversations(data, input_max_length=input_max_length)) is not None:
                if len(qa[0]) > 0 and len(qa[0]) == len(qa[1]):
                    self.pairs.append(create_dataset_entry_qa(mode=self.mode, questions=qa[0], answers=qa[1]))

    def __len__(self) -> int:
        return len(self.pairs)

    def __getitem__(self, index: int) -> DatasetEntry:
        return self.pairs[index]


class WizardEvolInstructV2(Dataset):
    def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: int = 32 * 1024) -> None:
        super().__init__()

        if mode != "sft":
            raise NotImplementedError(f"Currently only the mode 'sft' is implemented. Received {mode}.")
        self.mode = mode

        dataset = load_dataset(
            "ehartford/WizardLM_evol_instruct_V2_196k_unfiltered_merged_split",
            cache_dir=cache_dir,
            data_files=["WizardLM_evol_instruct_V2_196k_unfiltered_merged_split.json"],
            revision="34f04cfbc280da93a79ad9ecf339923f9411c1fc",
        )["train"]

        self.pairs = []
        for data in dataset:
            if (qa := Vicuna.process_vicuna_conversations(data, input_max_length=input_max_length)) is not None:
                if len(qa[0]) > 0 and len(qa[0]) == len(qa[1]):
                    self.pairs.append(create_dataset_entry_qa(mode="sft", questions=qa[0], answers=qa[1], lang="en"))
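A usage sketch for the new class; the cache path is illustrative, and indexing assumes the class exposes __len__/__getitem__ like Vicuna above (the remainder of the class is collapsed in this diff view):

from model_training.custom_datasets.qa_datasets import WizardEvolInstructV2

# Reuses Vicuna's conversation parsing and keeps only conversations whose
# question and answer lists are non-empty and of equal length.
wizard = WizardEvolInstructV2(cache_dir=".cache/huggingface", mode="sft")
print(len(wizard))  # number of retained QA conversations
print(wizard[0])    # a QA dataset entry with lang="en"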
5 changes: 0 additions & 5 deletions model/model_training/models/patching.py
@@ -236,9 +236,6 @@ def from_config(cls, config):
        args = config.superhot_config
        return cls(model_name, **args)

    def update_config(self, model, scaling_factor):
        model.config["rope_scaling"] = {"type": self.rope_type, "factor": scaling_factor}

    def patch(self, model):
        if self.architecture == "FalconForCausalLM":
            self.patch_falcon_model(model, **self.args)
@@ -247,8 +244,6 @@ def patch(self, model):
        else:
            raise NotImplementedError()

        self.update_config(model, self.args.get("scaling_factor"))

    def patch_falcon_model(self, model, **kwargs):
        for each in model.transformer.h:
            each.self_attention.maybe_rotary = self.patch_fun(model.config.head_dim, **kwargs)
3 changes: 3 additions & 0 deletions model/model_training/trainer_sft.py
@@ -422,6 +422,9 @@ def main():
    if superhot:
        superhot.patch(model)

    print(f"rope_scaling: {model.config.rope_scaling}")
    print(f"max_position_embeddings: {model.config.max_position_embeddings}")

    if training_conf.peft_model:
        print("Using PEFT model")
        model = peft_model(model, training_conf)
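For reference, a hedged sketch of what the two printed attributes look like on a Hugging Face Llama config; the values below are illustrative and not read from OpenAssistant/llama2-13b-orca-8k-3319, and the snippet assumes a transformers version (>= 4.31) that supports rope_scaling:

from transformers import LlamaConfig

# Illustrative values only: linear RoPE scaling by a factor of 2.
config = LlamaConfig(max_position_embeddings=4096, rope_scaling={"type": "linear", "factor": 2.0})
print(f"rope_scaling: {config.rope_scaling}")                         # {'type': 'linear', 'factor': 2.0}
print(f"max_position_embeddings: {config.max_position_embeddings}")   # 4096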