
add bs > 1 support
Signed-off-by: mymusise <mymusise1@gmail.com>
mymusise committed Mar 19, 2023
1 parent 136f656 commit f7ba507
Showing 3 changed files with 24 additions and 19 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -25,7 +25,7 @@
 python tokenize_dataset_rows.py \
     --jsonl_path data/alpaca_data.jsonl \
     --save_path data/alpaca \
-    --max_seq_length 512
+    --max_seq_length 320
 ```

 - `--jsonl_path` path to the fine-tuning data, in jsonl format; the ['text'] field of each line is encoded
@@ -38,7 +38,7 @@ python tokenize_dataset_rows.py \
 python finetune.py \
     --dataset_path data/alpaca \
     --lora_rank 8 \
-    --per_device_train_batch_size 1 \
+    --per_device_train_batch_size 2 \
     --gradient_accumulation_steps 1 \
     --max_steps 52000 \
     --save_steps 1000 \
@@ -56,6 +56,6 @@ python finetune.py \
 
 # TODO:
 
-- bs > 1 support
+- ~~bs > 1 support~~
 - Use Chinese data
 - Add RLHF
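
(Aside, not part of the commit: with `--per_device_train_batch_size 2` and `--gradient_accumulation_steps 1` on a single GPU, the effective batch size is 2 × 1 = 2; raising either flag scales the effective batch, and the per-device value also scales memory use.)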
20 changes: 11 additions & 9 deletions examples/finetune.ipynb
@@ -138,23 +138,25 @@
 "    )\n",
 "\n",
 "\n",
-"\n",
 "def data_collator(features: list) -> dict:\n",
-"    return {\n",
-"        \"input_ids\": torch.stack([\n",
-"            torch.LongTensor(f[\"input_ids\"])\n",
-"            for f in features\n",
-"        ])\n",
-"    }\n",
+"    len_ids = [len(feature['input_ids']) for feature in features]\n",
+"    longest = max(len_ids)\n",
+"    input_ids = []\n",
+"    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x:-x[0]):\n",
+"        ids = feature['input_ids']\n",
+"        _ids = torch.LongTensor(ids + [150004] * (longest - ids_l))\n",
+"        input_ids.append(_ids)\n",
+"    return {\"input_ids\": torch.stack(input_ids)}\n",
 "\n",
 "\n",
 "class ModifiedTrainer(Trainer):\n",
 "\n",
 "    def compute_loss(self, model, inputs, return_outputs=False):\n",
+"        input_shape = inputs[\"input_ids\"].shape\n",
 "        return model(\n",
 "            input_ids=inputs[\"input_ids\"],\n",
-"            attention_mask=torch.ones_like(inputs[\"input_ids\"]).bool(),\n",
-"            labels=inputs[\"input_ids\"],  # HF model does the slicing for us\n",
+"            attention_mask=torch.ones(1, 1, input_shape[-1], input_shape[-1]).bool(),\n",
+"            labels=inputs[\"input_ids\"],\n",
 "        ).loss"
 ]
 },
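For reference, a minimal standalone sketch of what the new collator does (the pad id 150004 comes from the diff above; the toy feature dicts and the printed output are illustrative only):

```python
import torch

PAD_ID = 150004  # pad id used by the collator in this commit

def data_collator(features: list) -> dict:
    # Right-pad every example to the length of the longest one,
    # iterating longest-first as in the commit.
    len_ids = [len(f["input_ids"]) for f in features]
    longest = max(len_ids)
    input_ids = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        input_ids.append(torch.LongTensor(ids + [PAD_ID] * (longest - ids_l)))
    return {"input_ids": torch.stack(input_ids)}

# Toy batch of two examples with different lengths (ids are arbitrary).
batch = data_collator([
    {"input_ids": [5, 6, 7]},
    {"input_ids": [8, 9]},
])
print(batch["input_ids"])
# tensor([[     5,      6,      7],
#         [     8,      9, 150004]])
```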
17 changes: 10 additions & 7 deletions finetune.py
@@ -23,20 +23,23 @@ def forward(self, x): return super().forward(x).to(torch.float32)
 class ModifiedTrainer(Trainer):
 
     def compute_loss(self, model, inputs, return_outputs=False):
+        input_shape = inputs["input_ids"].shape
         return model(
             input_ids=inputs["input_ids"],
-            attention_mask=torch.ones_like(inputs["input_ids"]).bool(),
+            attention_mask=torch.ones(1, 1, input_shape[-1], input_shape[-1]).bool(),
             labels=inputs["input_ids"],
         ).loss
 
 
 def data_collator(features: list) -> dict:
-    return {
-        "input_ids": torch.stack([
-            torch.LongTensor(f["input_ids"])
-            for f in features
-        ])
-    }
+    len_ids = [len(feature['input_ids']) for feature in features]
+    longest = max(len_ids)
+    input_ids = []
+    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x:-x[0]):
+        ids = feature['input_ids']
+        _ids = torch.LongTensor(ids + [150004] * (longest - ids_l))
+        input_ids.append(_ids)
+    return {"input_ids": torch.stack(input_ids)}
 
 
 def save_tunable_parameters(model, path):
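One caveat worth flagging, not addressed by this commit: the padded positions still appear in `labels`, so pad tokens contribute to the loss. A minimal sketch of one common workaround, relying only on the fact that Hugging Face models ignore label positions set to -100 (the `MaskedLossTrainer` name is made up here):

```python
import torch
from transformers import Trainer

PAD_ID = 150004  # same pad id as the collator above

class MaskedLossTrainer(Trainer):
    # Variant of the commit's ModifiedTrainer; not part of this commit.
    def compute_loss(self, model, inputs, return_outputs=False):
        input_ids = inputs["input_ids"]
        seq_len = input_ids.shape[-1]
        # -100 is the ignore index of the cross-entropy loss used by
        # Hugging Face models, so padded positions drop out of the loss.
        labels = input_ids.masked_fill(input_ids == PAD_ID, -100)
        return model(
            input_ids=input_ids,
            attention_mask=torch.ones(1, 1, seq_len, seq_len).bool(),
            labels=labels,
        ).loss
```

The sketch keeps the commit's (1, 1, L, L) all-ones attention mask as-is and only rewrites the labels.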
