Changes for orcacode experiment #3612

Merged · 7 commits · Jul 29, 2023
72 changes: 72 additions & 0 deletions model/model_training/configs/config.yaml
@@ -847,3 +847,75 @@ falcon_7b_ntk_test:
    alpha: 2
  datasets:
    - dolly15k

llama2_13b_orcacode2_8k:
  rng_seed: 0xe1291f21
  random_offset_probability: 0.0
  use_custom_sampler: true
  sort_by_length: false
  dtype: fp16
  log_dir: "llama2_log_13b_orcacode2_8k"
  output_dir: llama2_13b_orcacode2_8k
  learning_rate: 1e-5
  model_name: OpenAssistant/llama2-13b-orca-8k-3319
  deepspeed_config: configs/zero_config_pretrain.json
  weight_decay: 1e-6
  max_length: 8192
  warmup_steps: 100
  peft_model: false
  use_flash_attention: true
  gradient_checkpointing: true
  gradient_accumulation_steps: 4
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 1
  residual_dropout: 0.0
  eval_steps: 200
  save_steps: 500 # (total steps: 1558, bs: 64)
  num_train_epochs: 1
  save_total_limit: 4
  datasets:
    - dolphin-mix:
        num_samples: 1000000 # total entries 2840090
        max_char_len: 32000
        val_split: 0.1
        max_val_set: 2000
        seed: 44
    - oasst_export:
        lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
        input_file_path: 2023-07-23_oasst_ready.tar.gz
        top_k: 1
        val_split: 0.05
    - wizard_evol_instruct_v2:
        val_split: 0.01
        fraction: 0.1
    - evol-codealpaca-v1:
        fill_min_length: 20000
        val_split: 0.1
    - cot_submix_original:
        fill_min_length: 20000
        val_split: 0.1
    - megacode:
        fill_min_length: 24000
        val_split: 0.1
        max_val_set: 1000
    - evol_instruct_code:
        fill_min_length: 24000
        val_split: 0.1
        max_val_set: 1000
  # Dataset composition:
  # Train:
  #   dolphin-mix: 40374
  #   oasst_export: 11441
  #   wizard_evol_instruct_v2: 15236
  #   evol-codealpaca-v1: 5623
  #   cot_submix_original: 8651
  #   megacode: 14320
  #   evol_instruct_code: 4093
  # Valid:
  #   dolphin-mix: 2000
  #   oasst_export: 603
  #   wizard_evol_instruct_v2: 1540
  #   evol-codealpaca-v1: 625
  #   cot_submix_original: 962
  #   megacode: 1000
  #   evol_instruct_code: 455
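The save_steps comment above assumes an effective batch size of 64. A minimal sanity check of that arithmetic, using the per-dataset train counts from the composition comment and assuming 8 training GPUs (the world size is not part of this config):

# Rough check of the "(total steps: 1558, bs: 64)" comment.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
num_gpus = 8  # assumed, not stated in the config
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus  # 64

# Train-split sizes from the "Dataset composition" comment above.
train_counts = {
    "dolphin-mix": 40374,
    "oasst_export": 11441,
    "wizard_evol_instruct_v2": 15236,
    "evol-codealpaca-v1": 5623,
    "cot_submix_original": 8651,
    "megacode": 14320,
    "evol_instruct_code": 4093,
}
total_train_samples = sum(train_counts.values())  # 99738
steps_per_epoch = total_train_samples // effective_batch_size
print(effective_batch_size, steps_per_epoch)  # 64 1558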
5 changes: 4 additions & 1 deletion model/model_training/custom_datasets/__init__.py
@@ -20,6 +20,7 @@
    TranslatedQA,
    Vicuna,
    WebGPT,
    WizardEvolInstructV2,
    load_alpaca_dataset,
)
from model_training.custom_datasets.rank_datasets import AugmentedOA
@@ -110,7 +111,7 @@ def get_one_dataset(
        eval = SummarizationDataset(dataset_name, data_path, "validation")
        train = dataset
    elif dataset_name in INSTRUCTION_DATASETS:
        dataset = InstructionDataset(dataset_name, data_path, "train")
        dataset = InstructionDataset(dataset_name, data_path, "train", **kwargs)
    elif "ted_trans" in dataset_name:
        language_pair = dataset_name.split("_")[-1]
        dataset = TEDTalk(pair=language_pair, split="train")
@@ -143,6 +144,8 @@ def get_one_dataset(
        dataset = TranslatedQA(data_path)
    elif dataset_name == "vicuna":
        dataset = Vicuna(cache_dir=data_path, **kwargs)
    elif dataset_name == "wizard_evol_instruct_v2":
        dataset = WizardEvolInstructV2(cache_dir=data_path, **kwargs)
    elif dataset_name == "oasst_export":
        train, eval = load_oasst_export(data_path=data_path, val_split=val_split, mode=mode, **kwargs)
    elif dataset_name == "hf_summary":
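A minimal usage sketch of the **kwargs change above: per-dataset options from the YAML config (here fill_min_length from the "megacode" entry) were previously not passed to InstructionDataset at this call site and are now forwarded; the cache path is illustrative. The new "wizard_evol_instruct_v2" branch dispatches to the class added in qa_datasets.py further below.

from model_training.custom_datasets.instruction import InstructionDataset

# fill_min_length now arrives via **kwargs instead of being dropped at this call site.
megacode = InstructionDataset("megacode", ".cache/huggingface", "train", fill_min_length=24000)
print(len(megacode))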
7 changes: 6 additions & 1 deletion model/model_training/custom_datasets/instruction.py
@@ -30,6 +30,8 @@
"wizardlm_70k": "ehartford/WizardLM_alpaca_evol_instruct_70k_unfiltered",
"megacode": "rombodawg/MegaCodeTraining112k",
"evol_instruct_code": "nickrosh/Evol-Instruct-Code-80k-v1",
"evol-codealpaca-v1": "theblackcat102/evol-codealpaca-v1",
"cot_submix_original": "conceptofmind/cot_submix_original",
}


@@ -42,9 +44,12 @@ def __init__(self, dataset, cache_dir, split, mode="sft", fill_min_length: Optio
        if dataset == "minimath":
            self.instruction_column = "question"
            self.response_column = "answer"
        elif dataset in ("wizardlm_70k", "evol_instruct_code"):
        elif dataset in ("wizardlm_70k", "evol_instruct_code", "evol-codealpaca-v1"):
            self.instruction_column = "instruction"
            self.response_column = "output"
        elif dataset == "cot_submix_original":
            self.instruction_column = "inputs"
            self.response_column = "targets"
        elif dataset == "megacode":
            self.instruction_column = "prompt"
            self.response_column = "completion"
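Illustrative only: the column pairs the two new mappings read. The example row below is hypothetical and not taken from conceptofmind/cot_submix_original.

# "cot_submix_original" rows expose "inputs"/"targets"; "evol-codealpaca-v1"
# reuses the "instruction"/"output" pair shared with wizardlm_70k.
row = {"inputs": "Q: A car has 4 wheels. How many wheels do 3 cars have?", "targets": "3 * 4 = 12 wheels."}
question, answer = row["inputs"], row["targets"]
print(question, answer, sep="\n")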
68 changes: 34 additions & 34 deletions model/model_training/custom_datasets/prompt_dialogue.py
@@ -2,9 +2,8 @@
import json
import re
from pathlib import Path
from typing import List, Optional, Union
from typing import List, Mapping, Optional, Sequence, Union

import numpy as np
import requests
from datasets import load_dataset
from model_training.custom_datasets.formatting import DatasetEntrySft, Role, Utterance
@@ -199,45 +198,46 @@ def __getitem__(self, idx):
class DolphinMix(Dataset):
name = "dophin-mix"

def __init__(self, cache_dir, num_samples=100000, max_char_len=8000, seed=42):
self.dataset = load_dataset(
"ehartford/dolphin", data_files="flan5m-alpaca-uncensored.jsonl", cache_dir=cache_dir
)
self.dataset = self.dataset["train"].shuffle(seed).select(range(num_samples))
def __init__(
self,
cache_dir: Optional[str] = None,
num_samples: Optional[int] = None,
max_char_len: int = 8000,
seed: int = 42,
data_files: Union[
str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]
] = "flan5m-alpaca-uncensored.jsonl",
split: str = "train",
):
# flan5m-alpaca-uncensored.jsonl has total entries 2840090
self.dataset = load_dataset("ehartford/dolphin", data_files=data_files, cache_dir=cache_dir)
self.dataset = self.dataset[split].shuffle(seed).flatten_indices()
if num_samples:
self.dataset = self.dataset.select(range(num_samples))
self.max_char_len = max_char_len
instructions = set([item["instruction"] for item in self.dataset])
instructions = sorted(set([item["instruction"] for item in self.dataset]))

self.conversations = []
for inst in instructions:
data_sample = self.dataset.filter(lambda example: example["instruction"] == inst)
available_indices = np.arange(0, len(data_sample)).tolist()
removed_indices = []
for idx in available_indices:
conversation_len = len(inst)
if idx not in removed_indices and conversation_len < self.max_char_len:
conversation = {"conversation": []}
conversation["instruction"] = inst
input, output = [data_sample[idx][key] for key in ("input", "output")]
conversation["conversation"].append({"input": input, "output": output})
conversation_len += len(input) + len(output)
removed_indices.append(idx)
while conversation_len < self.max_char_len:
indices_to_pick = np.setdiff1d(available_indices, removed_indices)
if len(indices_to_pick) > 0:
idx = np.random.choice(indices_to_pick, size=1)[0]
input, output = [data_sample[int(idx)][key] for key in ("input", "output")]
conversation["conversation"].append({"input": input, "output": output})
conversation_len += len(input) + len(output)
removed_indices.append(idx)
else:
break

self.conversations.append(conversation)

def __len__(self):
conversation_len = len(inst)
conversation = []
for entry in data_sample:
input, output = entry["input"], entry["output"]
conversation.append({"input": input, "output": output})
conversation_len += len(input) + len(output)
if conversation_len >= self.max_char_len:
self.conversations.append({"conversation": conversation, "instruction": inst})
conversation_len = len(inst)
conversation = []

if len(conversation) > 0:
self.conversations.append({"conversation": conversation, "instruction": inst})

def __len__(self) -> int:
return len(self.conversations)

def __getitem__(self, idx):
def __getitem__(self, idx) -> DatasetEntrySft:
conversation, instruction = [self.conversations[idx][key] for key in ("conversation", "instruction")]
conversation = [(item["input"], item["output"]) for item in conversation]
conversation = list(sum(conversation, ()))
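A short usage sketch of the rewritten packing logic, with the parameter values from the "dolphin-mix" entry in config.yaml above (the cache path is illustrative, and building the full 1M-sample subset is slow):

from model_training.custom_datasets.prompt_dialogue import DolphinMix

# Entries sharing an instruction are packed into multi-turn conversations of
# roughly max_char_len characters; each packed conversation becomes one item.
ds = DolphinMix(cache_dir=".cache/huggingface", num_samples=1_000_000, max_char_len=32_000, seed=44)
print(len(ds))  # number of packed conversations (about 42k per the composition comment above)
print(ds[0])    # a DatasetEntrySft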
34 changes: 31 additions & 3 deletions model/model_training/custom_datasets/qa_datasets.py
@@ -514,9 +514,8 @@ def process_vicuna_conversations(
    def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: int = 32 * 1024) -> None:
        super().__init__()

        self.pairs = []
        if mode not in ("sft", "rl"):
            raise NotImplementedError(f"Currently only the modes 'sft' and 'rl' are implemented. Received {mode}.")
        if mode != "sft":
            raise NotImplementedError(f"Currently only the mode 'sft' is implemented. Received {mode}.")
        self.mode = mode

        dataset = load_dataset(
@@ -526,8 +525,37 @@ def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: i
            revision="7b8551404f3de5704d634e7516b9ff77be3e2700",
        )["train"]

        self.pairs = []
        for data in dataset:
            if (qa := self.process_vicuna_conversations(data, input_max_length=input_max_length)) is not None:
                if len(qa[0]) > 0 and len(qa[0]) == len(qa[1]):
                    self.pairs.append(create_dataset_entry_qa(mode=self.mode, questions=qa[0], answers=qa[1]))

    def __len__(self) -> int:
        return len(self.pairs)

    def __getitem__(self, index: int) -> DatasetEntry:
        return self.pairs[index]


class WizardEvolInstructV2(Dataset):
    def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: int = 32 * 1024) -> None:
        super().__init__()

        if mode != "sft":
            raise NotImplementedError(f"Currently only the mode 'sft' is implemented. Received {mode}.")
        self.mode = mode

        dataset = load_dataset(
            "ehartford/WizardLM_evol_instruct_V2_196k_unfiltered_merged_split",
            cache_dir=cache_dir,
            data_files=["WizardLM_evol_instruct_V2_196k_unfiltered_merged_split.json"],
            revision="34f04cfbc280da93a79ad9ecf339923f9411c1fc",
        )["train"]

        self.pairs = []
        for data in dataset:
            if (qa := Vicuna.process_vicuna_conversations(data, input_max_length=input_max_length)) is not None:
                if len(qa[0]) > 0 and len(qa[0]) == len(qa[1]):
                    self.pairs.append(create_dataset_entry_qa(mode="sft", questions=qa[0], answers=qa[1], lang="en"))
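A usage sketch for the new class; the cache path is illustrative, and indexing assumes the class exposes __len__/__getitem__ like Vicuna above (the remainder of the class is collapsed in this diff view):

from model_training.custom_datasets.qa_datasets import WizardEvolInstructV2

# Reuses Vicuna's conversation parsing and keeps only conversations whose
# question and answer lists are non-empty and of equal length.
wizard = WizardEvolInstructV2(cache_dir=".cache/huggingface", mode="sft")
print(len(wizard))  # number of retained QA conversations
print(wizard[0])    # a QA dataset entry with lang="en"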
5 changes: 0 additions & 5 deletions model/model_training/models/patching.py
@@ -236,9 +236,6 @@ def from_config(cls, config):
        args = config.superhot_config
        return cls(model_name, **args)

    def update_config(self, model, scaling_factor):
        model.config["rope_scaling"] = {"type": self.rope_type, "factor": scaling_factor}

    def patch(self, model):
        if self.architecture == "FalconForCausalLM":
            self.patch_falcon_model(model, **self.args)
@@ -247,8 +244,6 @@ def patch(self, model):
        else:
            raise NotImplementedError()

        self.update_config(model, self.args.get("scaling_factor"))

    def patch_falcon_model(self, model, **kwargs):
        for each in model.transformer.h:
            each.self_attention.maybe_rotary = self.patch_fun(model.config.head_dim, **kwargs)
3 changes: 3 additions & 0 deletions model/model_training/trainer_sft.py
@@ -422,6 +422,9 @@ def main():
    if superhot:
        superhot.patch(model)

    print(f"rope_scaling: {model.config.rope_scaling}")
    print(f"max_position_embeddings: {model.config.max_position_embeddings}")

    if training_conf.peft_model:
        print("Using PEFT model")
        model = peft_model(model, training_conf)
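For reference, a hedged sketch of what the two printed attributes look like on a Hugging Face Llama config; the values below are illustrative and not read from OpenAssistant/llama2-13b-orca-8k-3319, and the snippet assumes a transformers version (>= 4.31) that supports rope_scaling:

from transformers import LlamaConfig

# Illustrative values only: linear RoPE scaling by a factor of 2.
config = LlamaConfig(max_position_embeddings=4096, rope_scaling={"type": "linear", "factor": 2.0})
print(f"rope_scaling: {config.rope_scaling}")                         # {'type': 'linear', 'factor': 2.0}
print(f"max_position_embeddings: {config.max_position_embeddings}")   # 4096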