From a12dd114eccf83120ae29c1607fe5eeb42c03dd9 Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Mon, 7 Aug 2023 10:50:48 +0800 Subject: [PATCH 1/6] update dataset usage --- chatllms/configs/data_args.py | 36 ++++++++------------------ chatllms/data/data_utils.py | 6 ++--- data/alpaca_zh.yaml | 48 +++++++++++++++++++++++++++++++++++ data/belle_group.yaml | 40 +++++++++++++++++++++++++++++ data/dataset_info.yaml | 2 +- 5 files changed, 103 insertions(+), 29 deletions(-) create mode 100644 data/alpaca_zh.yaml create mode 100644 data/belle_group.yaml diff --git a/chatllms/configs/data_args.py b/chatllms/configs/data_args.py index 52101dc..84a51a5 100644 --- a/chatllms/configs/data_args.py +++ b/chatllms/configs/data_args.py @@ -12,7 +12,6 @@ class DatasetAttr(object): hf_hub_url: Optional[str] = None local_path: Optional[str] = None dataset_format: Optional[str] = None - dataset_sha1: Optional[str] = None load_from_local: bool = False multi_turn: Optional[bool] = False @@ -34,18 +33,11 @@ def __post_init__(self): @dataclass class DataArguments: - # 微调数据集是 alpaca - dataset_name: Optional[str] = field( - default='alpaca', - metadata={ - 'help': 'Which dataset to finetune on. See datamodule for options.' - }) - # 数据集的本地路径,如果load_from_local为True,那么就从本地加载数据集 - dataset_dir: str = field( - default=None, + dataset_cfg: Optional[str] = field( + default='./data/alpaca_zh.yaml', metadata={ 'help': - 'where is dataset in local dir. See datamodule for options.' + 'Path to the dataset config file. Please refer to `./data/README.md` to see how to prepare your datasets for training.' }) instruction_template: str = field( default='default', @@ -82,19 +74,13 @@ class DataArguments: ) def init_for_training(self): # support mixing multiple datasets - dataset_names = [ds.strip() for ds in self.dataset_name.split(',')] - this_dir = os.path.dirname(os.path.abspath(__file__)) - datasets_info_path = os.path.join(this_dir, '../..', 'data', - 'dataset_info.yaml') - with open(datasets_info_path, 'r') as f: - datasets_info = yaml.safe_load(f) - - self.datasets_list: List[DatasetAttr] = [] - for i, name in enumerate(dataset_names): - if name not in datasets_info: - raise ValueError('Undefined dataset {} in {}'.format( - name, datasets_info_path)) - + assert self.dataset_cfg is not None and os.path.exists( + self.dataset_cfg + ), f'{self.dataset_cfg} does not exist! Please check the path.'
+ datasets_info = yaml.safe_load(open(self.dataset_cfg, 'r')) + self.dataset_names = list(datasets_info.keys()) + self.dataset_attr_list: List[DatasetAttr] = [] + for i, name in enumerate(self.dataset_names): dataset_attr = DatasetAttr() dataset_attr.dataset_name = name dataset_attr.dataset_format = datasets_info[name].get( @@ -126,4 +112,4 @@ def init_for_training(self): # support mixing multiple datasets dataset_attr.history_column = datasets_info[name][ 'columns'].get('history', None) - self.datasets_list.append(dataset_attr) + self.dataset_attr_list.append(dataset_attr) diff --git a/chatllms/data/data_utils.py b/chatllms/data/data_utils.py index 41c4b4a..01e96a2 100644 --- a/chatllms/data/data_utils.py +++ b/chatllms/data/data_utils.py @@ -460,16 +460,16 @@ def make_data_module(args): """ train_datasets: List[Dataset] = [] eval_datasets: List[Dataset] = [] - dataset_name_list = args.dataset_name.split(',') + dataset_name_list = args.dataset_names print(f'Loading datasets: {dataset_name_list}') mutliturn_lst = [ - dataset_attr.multi_turn for dataset_attr in args.datasets_list + dataset_attr.multi_turn for dataset_attr in args.dataset_attr_list ] assert mutliturn_lst.count(mutliturn_lst[0]) == len( mutliturn_lst ), 'All datasets should be multi-turn or single-turn. As follwing we will concat all datasets, so they should be in the same format.' - for dataset_attr in args.datasets_list: + for dataset_attr in args.dataset_attr_list: print('=' * 80) print('DatasetAttr: {}'.format(dataset_attr)) diff --git a/data/alpaca_zh.yaml b/data/alpaca_zh.yaml new file mode 100644 index 0000000..c238221 --- /dev/null +++ b/data/alpaca_zh.yaml @@ -0,0 +1,48 @@ +# The dataset_info.yaml file contains the information of the datasets used in the experiments. +coig: + hf_hub_url: BAAI/COIG + local_path: /home/robin/prompt_data/COIG/train_alpaca.json + dataset_format: alpaca + multi_turn: False + +cvalues_comparison_train: + hf_hub_url: '' + local_path: /home/robin/prompt_data/CValues-Comparison/train_alpaca.json + dataset_format: alpaca + multi_turn: False + +cvalues_comparison_test: + hf_hub_url: '' + local_path: /home/robin/prompt_data/CValues-Comparison/test_alpaca.json + dataset_format: alpaca + multi_turn: False + +huatuogpt: + hf_hub_url: FreedomIntelligence/HuatuoGPT-sft-data-v1 + local_path: /home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json + dataset_format: alpaca + multi_turn: False + +olcc: + hf_hub_url: '' + local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json + dataset_format: alpaca + multi_turn: False + +100PoisonMpts: + hf_hub_url: 'damo/100PoisonMpts' + local_path: /home/robin/prompt_data/100PoisonMpts/train_alpaca.json + dataset_format: alpaca + multi_turn: False + +safety_prompt_part1: + hf_hub_url: '' + local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json + dataset_format: alpaca + multi_turn: False + +safety_prompt_part2: + hf_hub_url: '' + local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json + dataset_format: alpaca + multi_turn: False diff --git a/data/belle_group.yaml b/data/belle_group.yaml new file mode 100644 index 0000000..10ecc5c --- /dev/null +++ b/data/belle_group.yaml @@ -0,0 +1,40 @@ +belle_0.5m: + hf_hub_url: BelleGroup/train_0.5M_CN + local_path: '' + dataset_format: alpaca + multi_turn: False + +belle_1m: + hf_hub_url: BelleGroup/train_1M_CN + local_path: '' + dataset_format: alpaca + multi_turn: False + +belle_2m: + hf_hub_url: BelleGroup/train_2M_CN + local_path: '' + dataset_format: alpaca + 
multi_turn: False + +belle_dialog: + hf_hub_url: BelleGroup/generated_chat_0.4M + local_path: '' + dataset_format: belle_dialog + multi_turn: False + +belle_math: + hf_hub_url: BelleGroup/school_math_0.25M + local_path: '' + dataset_format: alpaca + multi_turn: False + +belle_multiturn: + hf_hub_url: BelleGroup/multi_turn_0.5M + local_path: '' + dataset_format: belle_multiturn + multi_turn: True + columns: + prompt: instruction + query: '' + response: output + history: history diff --git a/data/dataset_info.yaml b/data/dataset_info.yaml index 3e8fd55..01a1375 100644 --- a/data/dataset_info.yaml +++ b/data/dataset_info.yaml @@ -66,7 +66,7 @@ olcc: multi_turn: False 100PoisonMpts: - hf_hub_url: '' + hf_hub_url: 'damo/100PoisonMpts' local_path: /home/robin/prompt_data/100PoisonMpts/train.jsonl dataset_format: 100PoisonMpts multi_turn: False From a4398846f92244f51b3e94a7546a3e5dd18e6897 Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Mon, 7 Aug 2023 11:15:10 +0800 Subject: [PATCH 2/6] update dataset info --- data/vicuna_zh.yaml | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 data/vicuna_zh.yaml diff --git a/data/vicuna_zh.yaml b/data/vicuna_zh.yaml new file mode 100644 index 0000000..e3dc41b --- /dev/null +++ b/data/vicuna_zh.yaml @@ -0,0 +1,42 @@ +# The dataset_info.yaml file contains the information of the datasets used in the experiments. +coig: + hf_hub_url: BAAI/COIG + local_path: /home/robin/prompt_data/COIG/train_vicuna.json + dataset_format: sharegpt + multi_turn: True + +cvalues_comparison_train: + hf_hub_url: '' + local_path: /home/robin/prompt_data/CValues-Comparison/train_vicuna.json + dataset_format: sharegpt + multi_turn: True + +cvalues_comparison_test: + hf_hub_url: '' + local_path: /home/robin/prompt_data/CValues-Comparison/test_vicuna.json + dataset_format: sharegpt + multi_turn: True + +olcc: + hf_hub_url: '' + local_path: /home/robin/prompt_data/olcc/olcc_vicuna.json + dataset_format: sharegpt + multi_turn: True + +100PoisonMpts: + hf_hub_url: '' + local_path: /home/robin/prompt_data/100PoisonMpts/train_vicuna.json + dataset_format: sharegpt + multi_turn: True + +safety_prompt_part1: + hf_hub_url: '' + local_path: /home/robin/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json + dataset_format: sharegpt + multi_turn: True + +safety_prompt_part2: + hf_hub_url: '' + local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json + dataset_format: sharegpt + multi_turn: True From 3f9b5806cff4ab24d6ea1a34a944cde21bce5587 Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Mon, 7 Aug 2023 11:37:40 +0800 Subject: [PATCH 3/6] update dataset info --- chatllms/configs/lora_args.py | 2 +- data/alpaca_zh.yaml | 8 +------ data/alpaca_zh_pcyn.yaml | 42 +++++++++++++++++++++++++++++++++++ data/vicuna_zh.yaml | 2 +- data/vicuna_zh_pcyn.yaml | 42 +++++++++++++++++++++++++++++++++++ 5 files changed, 87 insertions(+), 9 deletions(-) create mode 100644 data/alpaca_zh_pcyn.yaml create mode 100644 data/vicuna_zh_pcyn.yaml diff --git a/chatllms/configs/lora_args.py b/chatllms/configs/lora_args.py index bfce2df..7d8763b 100644 --- a/chatllms/configs/lora_args.py +++ b/chatllms/configs/lora_args.py @@ -11,7 +11,7 @@ class LoraArguments: lora_dropout: float = field(default=0.0, metadata={'help': 'Lora dropout.'}) # 每个GPU上可使用的显存大小,以MB为单位。默认是A100高端版本的80GB - max_memory_MB: int = field(default=8000, + max_memory_MB: int = field(default=80000, metadata={'help': 'Free memory per gpu.'}) lora_weight_path: str = '' bias: str = 'none' diff 
--git a/data/alpaca_zh.yaml b/data/alpaca_zh.yaml index c238221..a2a7bc6 100644 --- a/data/alpaca_zh.yaml +++ b/data/alpaca_zh.yaml @@ -17,12 +17,6 @@ cvalues_comparison_test: dataset_format: alpaca multi_turn: False -huatuogpt: - hf_hub_url: FreedomIntelligence/HuatuoGPT-sft-data-v1 - local_path: /home/robin/prompt_data/HuatuoGPT-sft-data-v1/HuatuoGPT_alpaca.json - dataset_format: alpaca - multi_turn: False - olcc: hf_hub_url: '' local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json @@ -45,4 +39,4 @@ safety_prompt_part2: hf_hub_url: '' local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json dataset_format: alpaca - multi_turn: False + multi_turn: False \ No newline at end of file diff --git a/data/alpaca_zh_pcyn.yaml b/data/alpaca_zh_pcyn.yaml new file mode 100644 index 0000000..4b37dad --- /dev/null +++ b/data/alpaca_zh_pcyn.yaml @@ -0,0 +1,42 @@ +# The dataset_info.yaml file contains the information of the datasets used in the experiments. +coig: + hf_hub_url: BAAI/COIG + local_path: /userhome/jianzhnie/prompt_data/COIG/train_alpaca.json + dataset_format: alpaca + multi_turn: False + +cvalues_comparison_train: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_alpaca.json + dataset_format: alpaca + multi_turn: False + +cvalues_comparison_test: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_alpaca.json + dataset_format: alpaca + multi_turn: False + +olcc: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_alpaca.json + dataset_format: alpaca + multi_turn: False + +100PoisonMpts: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_alpaca.json + dataset_format: alpaca + multi_turn: False + +safety_prompt_part1: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_alpaca.json + dataset_format: alpaca + multi_turn: False + +safety_prompt_part2: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_alpaca.json + dataset_format: alpaca + multi_turn: False \ No newline at end of file diff --git a/data/vicuna_zh.yaml b/data/vicuna_zh.yaml index e3dc41b..c90c927 100644 --- a/data/vicuna_zh.yaml +++ b/data/vicuna_zh.yaml @@ -39,4 +39,4 @@ safety_prompt_part2: hf_hub_url: '' local_path: /home/robin/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json dataset_format: sharegpt - multi_turn: True + multi_turn: True \ No newline at end of file diff --git a/data/vicuna_zh_pcyn.yaml b/data/vicuna_zh_pcyn.yaml new file mode 100644 index 0000000..9c03bff --- /dev/null +++ b/data/vicuna_zh_pcyn.yaml @@ -0,0 +1,42 @@ +# The dataset_info.yaml file contains the information of the datasets used in the experiments. 
+coig: + hf_hub_url: BAAI/COIG + local_path: /userhome/jianzhnie/prompt_data/COIG/train_vicuna.json + dataset_format: sharegpt + multi_turn: True + +cvalues_comparison_train: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/train_vicuna.json + dataset_format: sharegpt + multi_turn: True + +cvalues_comparison_test: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/CValues-Comparison/test_vicuna.json + dataset_format: sharegpt + multi_turn: True + +olcc: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/olcc/olcc_vicuna.json + dataset_format: sharegpt + multi_turn: True + +100PoisonMpts: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/100PoisonMpts/train_vicuna.json + dataset_format: sharegpt + multi_turn: True + +safety_prompt_part1: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/attack_scenarios_vicuna.json + dataset_format: sharegpt + multi_turn: True + +safety_prompt_part2: + hf_hub_url: '' + local_path: /userhome/jianzhnie/prompt_data/Safety-Prompts/safety_scenarios_vicuna.json + dataset_format: sharegpt + multi_turn: True \ No newline at end of file From 9456689c76b064bf0205ec495e9f54a31095cc80 Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Mon, 7 Aug 2023 12:32:00 +0800 Subject: [PATCH 4/6] update qlora scripts --- .../qlora_finetune/finetune_baichuan_7b.sh | 40 ----------------- ...e.sh => finetune_baichuan_7b_alpaca_zh.sh} | 22 +++++----- ...c.sh => finetune_baichuan_7b_vicuna_zh.sh} | 14 +++--- .../qlora_finetune/finetune_guanaco_13b.sh | 43 ------------------ scripts/qlora_finetune/finetune_guanaco_7b.sh | 41 ----------------- scripts/qlora_finetune/finetune_int8_qlora.sh | 19 -------- ...une.sh => finetune_llama2_7b_alpaca_zh.sh} | 17 +++---- .../finetune_llama2_guanaco7b.sh | 44 ------------------- ...co7b.sh => finetune_llama_7b_alpaca_zh.sh} | 20 ++++----- 9 files changed, 37 insertions(+), 223 deletions(-) delete mode 100755 scripts/qlora_finetune/finetune_baichuan_7b.sh rename scripts/qlora_finetune/{multiturn_llama_finetune.sh => finetune_baichuan_7b_alpaca_zh.sh} (58%) mode change 100644 => 100755 rename scripts/qlora_finetune/{finetune_baichuan_7b_olcc.sh => finetune_baichuan_7b_vicuna_zh.sh} (68%) delete mode 100755 scripts/qlora_finetune/finetune_guanaco_13b.sh delete mode 100755 scripts/qlora_finetune/finetune_guanaco_7b.sh delete mode 100644 scripts/qlora_finetune/finetune_int8_qlora.sh rename scripts/qlora_finetune/{multiturn_baichuan_finetune.sh => finetune_llama2_7b_alpaca_zh.sh} (66%) mode change 100644 => 100755 delete mode 100755 scripts/qlora_finetune/finetune_llama2_guanaco7b.sh rename scripts/qlora_finetune/{finetune_llama_guanaco7b.sh => finetune_llama_7b_alpaca_zh.sh} (69%) diff --git a/scripts/qlora_finetune/finetune_baichuan_7b.sh b/scripts/qlora_finetune/finetune_baichuan_7b.sh deleted file mode 100755 index ae8eb34..0000000 --- a/scripts/qlora_finetune/finetune_baichuan_7b.sh +++ /dev/null @@ -1,40 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 python train_qlora.py \ - --model_name_or_path /userhome/jianzhnie/checkpoints/baichuan7b \ - --dataset_name oasst1 \ - --data_dir /userhome/jianzhnie/prompt_datasets \ - --load_from_local \ - --output_dir ./work_dir/oasst1-baichuan-7b \ - --num_train_epochs 4 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy steps \ - --eval_steps 50 \ - --save_strategy steps \ - --save_total_limit 5 \ - --save_steps 100 \ - --logging_strategy steps \ - 
--logging_steps 1 \ - --learning_rate 0.0002 \ - --warmup_ratio 0.03 \ - --weight_decay 0.0 \ - --lr_scheduler_type constant \ - --adam_beta2 0.999 \ - --max_grad_norm 0.3 \ - --max_new_tokens 32 \ - --source_max_len 512 \ - --target_max_len 512 \ - --lora_r 64 \ - --lora_alpha 16 \ - --lora_dropout 0.1 \ - --double_quant \ - --quant_type nf4 \ - --fp16 \ - --bits 4 \ - --gradient_checkpointing \ - --trust_remote_code \ - --do_train \ - --do_eval \ - --sample_generate \ - --data_seed 42 \ - --seed 0 diff --git a/scripts/qlora_finetune/multiturn_llama_finetune.sh b/scripts/qlora_finetune/finetune_baichuan_7b_alpaca_zh.sh old mode 100644 new mode 100755 similarity index 58% rename from scripts/qlora_finetune/multiturn_llama_finetune.sh rename to scripts/qlora_finetune/finetune_baichuan_7b_alpaca_zh.sh index b966ecb..e00acf3 --- a/scripts/qlora_finetune/multiturn_llama_finetune.sh +++ b/scripts/qlora_finetune/finetune_baichuan_7b_alpaca_zh.sh @@ -1,16 +1,16 @@ -CUDA_VISIBLE_DEVICES=8 python train_qlora.py \ - --model_name_or_path ~/checkpoints/llama7b \ - --dataset_name vicuna_merge \ - --output_dir ./work_dir/vicuna_merge_llama-7b-1gpu \ +CUDA_VISIBLE_DEVICES=0 python train_qlora.py \ + --model_name_or_path ~/checkpoints/baichuan7b \ + --dataset_cfg ./data/alpaca_zh_pcyn.yaml \ + --output_dir ./work_dir/alpaca_zh-baichuan-7b \ --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --gradient_accumulation_steps 4 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 8 \ --evaluation_strategy steps \ --eval_steps 1000 \ --save_strategy steps \ --save_total_limit 10 \ - --save_steps 500 \ + --save_steps 1000 \ --logging_strategy steps \ --logging_steps 5 \ --learning_rate 0.0002 \ @@ -28,8 +28,10 @@ CUDA_VISIBLE_DEVICES=8 python train_qlora.py \ --bits 4 \ --model_max_length 1024 \ --gradient_checkpointing \ - --trust_remote_code \ + --trust_remote_code True \ + --use_auth_token True \ --do_train \ --do_eval \ + --sample_generate \ --data_seed 42 \ - --seed 0 \ + --seed 0 diff --git a/scripts/qlora_finetune/finetune_baichuan_7b_olcc.sh b/scripts/qlora_finetune/finetune_baichuan_7b_vicuna_zh.sh similarity index 68% rename from scripts/qlora_finetune/finetune_baichuan_7b_olcc.sh rename to scripts/qlora_finetune/finetune_baichuan_7b_vicuna_zh.sh index 4ed1089..dda8ba0 100755 --- a/scripts/qlora_finetune/finetune_baichuan_7b_olcc.sh +++ b/scripts/qlora_finetune/finetune_baichuan_7b_vicuna_zh.sh @@ -1,11 +1,11 @@ -CUDA_VISIBLE_DEVICES=6 python train_qlora.py \ - --model_name_or_path ~/checkpoints/baichuan7b \ - --dataset_name 'dolly-15k,olcc,alpaca_data_zh_51k,instinwild_ch'\ - --output_dir ./work_dir/zh-baichuan-7b \ +CUDA_VISIBLE_DEVICES=1 python train_qlora.py \ + --model_name_or_path ~/checkpoints/baichuan7b \ + --dataset_cfg ./data/vicuna_zh_pcyn.yaml \ + --output_dir ./work_dir/vicuna_zh-baichuan-7b \ --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --gradient_accumulation_steps 4 \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps 16 \ --evaluation_strategy steps \ --eval_steps 1000 \ --save_strategy steps \ diff --git a/scripts/qlora_finetune/finetune_guanaco_13b.sh b/scripts/qlora_finetune/finetune_guanaco_13b.sh deleted file mode 100755 index e000a19..0000000 --- a/scripts/qlora_finetune/finetune_guanaco_13b.sh +++ /dev/null @@ -1,43 +0,0 @@ -python qlora_int4_finetune.py \ - --model_name_or_path 
huggyllama/llama-13b \ - --output_dir ./output/guanaco-13b \ - --logging_steps 10 \ - --save_strategy steps \ - --data_seed 42 \ - --save_steps 500 \ - --save_total_limit 40 \ - --evaluation_strategy steps \ - --eval_dataset_size 1024 \ - --max_eval_samples 1000 \ - --per_device_eval_batch_size 1 \ - --max_new_tokens 32 \ - --dataloader_num_workers 3 \ - --group_by_length \ - --logging_strategy steps \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --do_mmlu_eval \ - --lora_r 64 \ - --lora_alpha 16 \ - --lora_modules all \ - --double_quant \ - --quant_type nf4 \ - --bf16 \ - --bits 4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type constant \ - --gradient_checkpointing \ - --dataset oasst1 \ - --source_max_len 16 \ - --target_max_len 512 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --max_steps 1875 \ - --eval_steps 187 \ - --learning_rate 0.0002 \ - --adam_beta2 0.999 \ - --max_grad_norm 0.3 \ - --lora_dropout 0.05 \ - --weight_decay 0.0 \ - --seed 0 diff --git a/scripts/qlora_finetune/finetune_guanaco_7b.sh b/scripts/qlora_finetune/finetune_guanaco_7b.sh deleted file mode 100755 index 6a46970..0000000 --- a/scripts/qlora_finetune/finetune_guanaco_7b.sh +++ /dev/null @@ -1,41 +0,0 @@ -python qlora_int4_finetune.py \ - --model_name_or_path decapoda-research/llama-7b-hf \ - --output_dir ./output/guanaco-7b \ - --logging_steps 10 \ - --save_strategy steps \ - --data_seed 42 \ - --save_steps 500 \ - --save_total_limit 40 \ - --evaluation_strategy steps \ - --eval_dataset_size 1024 \ - --max_eval_samples 1000 \ - --per_device_eval_batch_size 1 \ - --max_new_tokens 32 \ - --dataloader_num_workers 3 \ - --group_by_length \ - --logging_strategy steps \ - --remove_unused_columns False \ - --do_train \ - --lora_r 64 \ - --lora_alpha 16 \ - --lora_modules all \ - --double_quant \ - --quant_type nf4 \ - --fp16 \ - --bits 4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type constant \ - --gradient_checkpointing \ - --dataset alpaca \ - --source_max_len 16 \ - --target_max_len 512 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --max_steps 1875 \ - --eval_steps 187 \ - --learning_rate 0.0002 \ - --adam_beta2 0.999 \ - --max_grad_norm 0.3 \ - --lora_dropout 0.1 \ - --weight_decay 0.0 \ - --seed 0 diff --git a/scripts/qlora_finetune/finetune_int8_qlora.sh b/scripts/qlora_finetune/finetune_int8_qlora.sh deleted file mode 100644 index 7244cb2..0000000 --- a/scripts/qlora_finetune/finetune_int8_qlora.sh +++ /dev/null @@ -1,19 +0,0 @@ -python train_lora.py \ - --model_name_or_path decapoda-research/llama-7b-hf \ - --data_path tatsu-lab/alpaca \ - --output_dir work_dir_lora/ \ - --num_train_epochs 3 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy "no" \ - --save_strategy "steps" \ - --save_steps 500 \ - --save_total_limit 5 \ - --learning_rate 1e-4 \ - --weight_decay 0. 
\ - --warmup_ratio 0.03 \ - --lr_scheduler_type "cosine" \ - --model_max_length 2048 \ - --logging_steps 1 \ - --fp16 True diff --git a/scripts/qlora_finetune/multiturn_baichuan_finetune.sh b/scripts/qlora_finetune/finetune_llama2_7b_alpaca_zh.sh old mode 100644 new mode 100755 similarity index 66% rename from scripts/qlora_finetune/multiturn_baichuan_finetune.sh rename to scripts/qlora_finetune/finetune_llama2_7b_alpaca_zh.sh index a1c558e..0a47dea --- a/scripts/qlora_finetune/multiturn_baichuan_finetune.sh +++ b/scripts/qlora_finetune/finetune_llama2_7b_alpaca_zh.sh @@ -1,16 +1,16 @@ -CUDA_VISIBLE_DEVICES=14 python train_qlora.py \ - --model_name_or_path ~/checkpoints/baichuan7b \ - --dataset_name vicuna_merge \ - --output_dir ./work_dir/vicuna_merge_vicuna-baichuan-7b-1gpu \ +python train_qlora.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset_cfg ./data/alpaca_zh_pcyn.yaml \ + --output_dir ./work_dir/alpaca_zh_llama2-7b \ --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --gradient_accumulation_steps 4 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 8 \ --evaluation_strategy steps \ --eval_steps 1000 \ --save_strategy steps \ --save_total_limit 10 \ - --save_steps 500 \ + --save_steps 1000 \ --logging_strategy steps \ --logging_steps 5 \ --learning_rate 0.0002 \ @@ -32,5 +32,6 @@ CUDA_VISIBLE_DEVICES=14 python train_qlora.py \ --use_auth_token True \ --do_train \ --do_eval \ + --sample_generate \ --data_seed 42 \ --seed 0 diff --git a/scripts/qlora_finetune/finetune_llama2_guanaco7b.sh b/scripts/qlora_finetune/finetune_llama2_guanaco7b.sh deleted file mode 100755 index a7c85ac..0000000 --- a/scripts/qlora_finetune/finetune_llama2_guanaco7b.sh +++ /dev/null @@ -1,44 +0,0 @@ -python qlora.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --use_auth \ - --output_dir ./output/llama-2-guanaco-7b \ - --logging_steps 10 \ - --save_strategy steps \ - --data_seed 42 \ - --save_steps 500 \ - --save_total_limit 40 \ - --evaluation_strategy steps \ - --eval_dataset_size 1024 \ - --max_eval_samples 1000 \ - --per_device_eval_batch_size 1 \ - --max_new_tokens 32 \ - --dataloader_num_workers 1 \ - --group_by_length \ - --logging_strategy steps \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --do_mmlu_eval \ - --lora_r 64 \ - --lora_alpha 16 \ - --lora_modules all \ - --double_quant \ - --quant_type nf4 \ - --bf16 \ - --bits 4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type constant \ - --gradient_checkpointing \ - --dataset oasst1 \ - --source_max_len 16 \ - --target_max_len 512 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --max_steps 1875 \ - --eval_steps 187 \ - --learning_rate 0.0002 \ - --adam_beta2 0.999 \ - --max_grad_norm 0.3 \ - --lora_dropout 0.1 \ - --weight_decay 0.0 \ - --seed 0 \ diff --git a/scripts/qlora_finetune/finetune_llama_guanaco7b.sh b/scripts/qlora_finetune/finetune_llama_7b_alpaca_zh.sh similarity index 69% rename from scripts/qlora_finetune/finetune_llama_guanaco7b.sh rename to scripts/qlora_finetune/finetune_llama_7b_alpaca_zh.sh index 716b7ac..72d5b37 100755 --- a/scripts/qlora_finetune/finetune_llama_guanaco7b.sh +++ b/scripts/qlora_finetune/finetune_llama_7b_alpaca_zh.sh @@ -1,29 +1,24 @@ python train_qlora.py \ --model_name_or_path decapoda-research/llama-7b-hf \ - --dataset_name oasst1 \ - --data_dir /home/robin/prompt_data/ \ - --load_from_local \ - --output_dir ./work_dir/oasst1-llama-7b \ + 
--dataset_cfg ./data/alpaca_zh_pcyn.yaml \ + --output_dir ./work_dir/alpaca_zh-baichuan-7b \ --num_train_epochs 3 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ --gradient_accumulation_steps 8 \ --evaluation_strategy steps \ - --eval_steps 200 \ + --eval_steps 1000 \ --save_strategy steps \ - --save_total_limit 5 \ - --save_steps 100 \ + --save_total_limit 10 \ + --save_steps 1000 \ --logging_strategy steps \ - --logging_steps 1 \ + --logging_steps 5 \ --learning_rate 0.0002 \ --warmup_ratio 0.03 \ --weight_decay 0.0 \ --lr_scheduler_type constant \ --adam_beta2 0.999 \ --max_grad_norm 0.3 \ - --max_new_tokens 32 \ - --source_max_len 512 \ - --target_max_len 512 \ --lora_r 64 \ --lora_alpha 16 \ --lora_dropout 0.1 \ @@ -31,7 +26,10 @@ python train_qlora.py \ --quant_type nf4 \ --fp16 \ --bits 4 \ + --model_max_length 1024 \ --gradient_checkpointing \ + --trust_remote_code True \ + --use_auth_token True \ --do_train \ --do_eval \ --sample_generate \ From f826dee8e363c98cd83a825c8058a72086ef1b66 Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Mon, 7 Aug 2023 13:05:02 +0800 Subject: [PATCH 5/6] update --- data/README.md | 65 +++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/data/README.md b/data/README.md index 9e97c9d..5612380 100644 --- a/data/README.md +++ b/data/README.md @@ -47,7 +47,7 @@ We provide the following datasets for the experiments in this framework. ## Dataset formation -The `dataset_info.yaml` file contains the information of the datasets, main including the following fields. +The `dataset_info.yaml` file lists all the datasets that can be used in the experiments. Each dataset entry follows the format below, which mainly includes the following fields. ```yaml dataset_name: @@ -77,37 +77,6 @@ alpaca: multi_turn: False ``` -### How to use in training scripts - -After specifying the dataset information, you can run the following command to train the model. Just specify the `dataset_name` as one of the dataset name in `dataset_info.yaml`. If you want to use more than one dataset, please specify the `dataset_name` as str list with comma separated, e.g., `--dataset_name 'alpaca,dolly'. - -```shell -python train.py \ - --model_name_or_path facebook/opt-125m \ - --dataset_name alpaca \ - --output_dir work_dir/full-finetune \ - --num_train_epochs 3 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 8 \ - --evaluation_strategy "steps" \ - --save_strategy "steps" \ - --eval_steps 1000 \ - --save_steps 1000 \ - --save_total_limit 5 \ - --logging_steps 1 \ - --learning_rate 2e-5 \ - --weight_decay 0. \ - --warmup_ratio 0.03 \ - --optim "adamw_torch" \ - --lr_scheduler_type "cosine" \ - --gradient_checkpointing True \ - --model_max_length 128 \ - --do_train \ - --do_eval -``` - - ## Custom datasets If you are using a custom dataset, please provide your dataset definition in `dataset_info.yaml`. @@ -257,3 +226,35 @@ def main(): if __name__ == '__main__': main() ``` + +### How to use in training scripts + +In the `data/` directory, we provide several dataset config files used in the experiments. The following script shows how to use the `alpaca_zh.yaml` dataset config.
+ +```shell +python train.py \ + --model_name_or_path facebook/opt-125m \ + --dataset_cfg alpaca_zh.yaml \ + --output_dir work_dir/full-finetune \ + --num_train_epochs 3 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 8 \ + --evaluation_strategy "steps" \ + --save_strategy "steps" \ + --eval_steps 1000 \ + --save_steps 1000 \ + --save_total_limit 5 \ + --logging_steps 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --optim "adamw_torch" \ + --lr_scheduler_type "cosine" \ + --gradient_checkpointing True \ + --model_max_length 128 \ + --do_train \ + --do_eval +``` + +You can use `alpaca_zh.yaml` directly, or create a custom dataset config and set the `dataset_cfg` argument to `your_dataset_info.yaml`. From 47ab54d79a4869164a8b618288dcba061f8e57b0 Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Mon, 7 Aug 2023 13:07:25 +0800 Subject: [PATCH 6/6] update README --- README.md | 2 +- README_zh.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ebc311a..84a6a78 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,7 @@ We can also tweak our hyperparameters: ```bash python train_qlora.py \ --model_name_or_path ~/checkpoints/baichuan7b \ - --dataset_name oasst1 \ + --dataset_cfg ./data/alpaca_zh_pcyn.yaml \ --output_dir ./work_dir/oasst1-baichuan-7b \ --num_train_epochs 4 \ --per_device_train_batch_size 4 \ diff --git a/README_zh.md b/README_zh.md index 3623b62..00b7078 100644 --- a/README_zh.md +++ b/README_zh.md @@ -189,10 +189,10 @@ python train_qlora.py –learning_rate 0.0001 --model_name_or_path
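For reference, the sketch below illustrates how a dataset config such as `data/alpaca_zh.yaml` can be parsed into per-dataset attribute objects, in the spirit of the `DataArguments.init_for_training` change in PATCH 1/6. It is a minimal sketch, not the repository's exact implementation: the trimmed-down `DatasetAttr` dataclass and the `load_dataset_attrs` helper are assumed names for illustration, and only the YAML keys that appear in the config files above are handled.

```python
import os
from dataclasses import dataclass
from typing import List, Optional

import yaml  # PyYAML


@dataclass
class DatasetAttr:
    """Per-dataset attributes, limited to the keys used in the YAML configs above."""
    dataset_name: Optional[str] = None
    hf_hub_url: Optional[str] = None
    local_path: Optional[str] = None
    dataset_format: Optional[str] = None
    load_from_local: bool = False
    multi_turn: bool = False


def load_dataset_attrs(dataset_cfg: str) -> List[DatasetAttr]:
    """Parse a dataset config YAML (e.g. data/alpaca_zh.yaml) into DatasetAttr objects."""
    assert os.path.exists(dataset_cfg), f'{dataset_cfg} does not exist! Please check the path.'
    with open(dataset_cfg, 'r', encoding='utf-8') as f:
        datasets_info = yaml.safe_load(f)

    dataset_attr_list = []
    for name, info in datasets_info.items():
        dataset_attr_list.append(
            DatasetAttr(
                dataset_name=name,
                hf_hub_url=info.get('hf_hub_url') or None,
                local_path=info.get('local_path') or None,
                dataset_format=info.get('dataset_format'),
                # Prefer a local copy whenever a local path is configured.
                load_from_local=bool(info.get('local_path')),
                multi_turn=bool(info.get('multi_turn', False)),
            ))
    return dataset_attr_list


if __name__ == '__main__':
    for attr in load_dataset_attrs('./data/alpaca_zh.yaml'):
        print(attr)
```

The actual implementation in `chatllms/configs/data_args.py` additionally maps the optional `columns` entries (prompt, query, response, history) used by multi-turn datasets such as `belle_multiturn`, as shown in the PATCH 1/6 diff above.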