forked from hiyouga/ChatGLM-Efficient-Tuning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config_data.py
69 lines (67 loc) · 2.73 KB
/
config_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
List all the available datasets.
Data format:
"dataset_name": {
"hf_hub_url": the name of the dataset repository on the HuggingFace hub. (if specified, the 3 arguments below are ignored)
"script_url": the name of the directory containing a dataset loading script. (if specified, the 2 arguments below are ignored)
"file_name": the name of the dataset file in the local `dataset_dir` directory. (required if neither of the above is specified)
"file_sha1": the SHA-1 hash value of the dataset file. (optional)
"columns": { (optional, if not provided, use the default values)
"prompt": the name of the column in the datasets containing the prompts. (default: instruction)
"query": the name of the column in the datasets containing the queries. (default: input)
"response": the name of the column in the datasets containing the responses. (default: output)
"history": the name of the column in the datasets containing the history of chat. (default: None)
}
}
"""
# Repository identifier of the ChatGLM-6B model.
# NOTE(review): presumably a Hugging Face Hub repo id -- confirm against the model loader.
CHATGLM_REPO_NAME = "THUDM/chatglm-6b"

# 40-character hexadecimal hash associated with the repository above.
# NOTE(review): presumably the pinned commit/revision of that repo -- confirm against the caller.
CHATGLM_LATEST_HASH = "4de8efebc837788ffbfc0a15663de8553da362a2"

# Misspelled legacy name ("LASTEST"); kept as an alias so existing imports keep working.
CHATGLM_LASTEST_HASH = CHATGLM_LATEST_HASH
# Registry of known datasets, keyed by dataset name.
# Each value is a dict describing where the data comes from ("hf_hub_url",
# "script_url" or "file_name" + optional "file_sha1") and, optionally, a
# "columns" mapping that overrides the default column names.
DATASETS = {
    # --- Alpaca family ---
    "alpaca_en": {
        "hf_hub_url": "tatsu-lab/alpaca",
    },
    "alpaca_zh": {
        "file_name": "alpaca_data_zh_51k.json",
        "file_sha1": "e655af3db557a4197f7b0cf92e1986b08fae6311",
    },
    "alpaca_gpt4_en": {
        "file_name": "alpaca_gpt4_data_en.json",
        "file_sha1": "647f4ad447bd993e4b6b6223d1be15208bab694a",
    },
    "alpaca_gpt4_zh": {
        "file_name": "alpaca_gpt4_data_zh.json",
        "file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845",
    },
    # --- BELLE family ---
    "belle_0.5m": {
        "hf_hub_url": "BelleGroup/train_0.5M_CN",
    },
    "belle_1m": {
        "hf_hub_url": "BelleGroup/train_1M_CN",
    },
    "belle_2m": {
        "hf_hub_url": "BelleGroup/train_2M_CN",
    },
    "belle_dialog": {
        "hf_hub_url": "BelleGroup/generated_chat_0.4M",
    },
    "belle_math": {
        "hf_hub_url": "BelleGroup/school_math_0.25M",
    },
    "belle_multiturn": {
        "hf_hub_url": "BelleGroup/multiturn_chat_0.8M",
    },
    # Loaded through a dataset script; has a history column and no query column.
    "belle_multiturn_chatglm": {
        "script_url": "belle_multiturn",
        "columns": {
            "prompt": "instruction",
            "query": None,
            "response": "output",
            "history": "history",
        },
    },
    # --- Other datasets ---
    "guanaco": {
        "hf_hub_url": "JosephusCheung/GuanacoDataset",
    },
    # Uses "input"/"target" column names instead of the defaults.
    "firefly": {
        "hf_hub_url": "YeungNLP/firefly-train-1.1M",
        "columns": {
            "prompt": "input",
            "query": None,
            "response": "target",
            "history": None,
        },
    },
    # Alternatively, use `"file_name": "example_dataset/examples.json"`
    # in place of the "script_url" entry.
    "example": {
        "script_url": "example_dataset",
        "columns": {
            "prompt": "instruction",
            "query": "input",
            "response": "output",
            "history": "history",
        },
    },
}