From 94209b603b8707ebe0c8e30d99dcb0aedf2f30da Mon Sep 17 00:00:00 2001
From: jianzhnie
Date: Thu, 27 Jul 2023 14:53:40 +0800
Subject: [PATCH 1/4] update chatllms

---
 chatllms/configs/data_args.py |  3 ++-
 chatllms/data/data_utils.py   | 14 +++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/chatllms/configs/data_args.py b/chatllms/configs/data_args.py
index 4bbac1b..85f29e7 100644
--- a/chatllms/configs/data_args.py
+++ b/chatllms/configs/data_args.py
@@ -16,7 +16,8 @@ class DatasetAttr(object):
     multi_turn: Optional[bool] = False

     def __repr__(self) -> str:
-        return self.dataset_name
+        rep = f'#dataset_name: {self.dataset_name}, #hf_hub_url: {self.hf_hub_url}, #local_path: {self.local_path}, #load_from_local: {self.load_from_local}, #multi_turn: {self.multi_turn}'
+        return rep

     def __post_init__(self):
         self.prompt_column = 'instruction'
diff --git a/chatllms/data/data_utils.py b/chatllms/data/data_utils.py
index 68f73e7..9df7b53 100644
--- a/chatllms/data/data_utils.py
+++ b/chatllms/data/data_utils.py
@@ -242,12 +242,16 @@ def load_data(
     """
     if not os.path.exists(dataset_path):
         # Download dataset from HuggingFace Datasets
+        print(
+            f'Loading dataset from the HuggingFace hub, please refer to https://huggingface.co/datasets/{dataset_path}'
+        )
         dataset = load_dataset(dataset_path,
                                cache_dir='~/.cache/huggingface/datasets')
         return dataset
     else:
         # Load dataset from local file
         try:
+            print(f'Loading dataset from local path: {dataset_path}')
             dataset = local_dataset(dataset_path, eval_dataset_size)
             return dataset
         except:
@@ -346,16 +350,12 @@ def _remove_unused_columns(dataset):
         pass

     # encode_instruction_example
-    print(
-        f'Encoding the instruction example refer to : {instruction_template}')
+    print(f'Applying instruction template: {instruction_template}')
     if instruction_template == 'alpaca':
-        print('Using alpaca prompt template: ', {instruction_template})
         dataset = dataset.map(extract_alpaca_prompt_dataset)
     elif instruction_template == 'random':
-        print('Using random prompt template: ', {instruction_template})
         dataset = dataset.map(extract_random_prompt_dataset)
     else:
-        print('Using default prompt template: ', {instruction_template})
         dataset = dataset.map(extract_default_prompt_dataset)

     # Remove unused columns.
@@ -461,12 +461,12 @@ def make_data_module(args):
     ), 'All datasets should be multi-turn or single-turn. As follwing we will concat all datasets, so they should be in the same format.'
for dataset_attr in args.datasets_list: - print('Loading dataset {}...'.format(dataset_attr)) + print('DatasetAttr: {}...'.format(dataset_attr)) if dataset_attr.load_from_local: dataset_path = dataset_attr.local_path elif dataset_attr.hf_hub_url: - dataset_path = dataset_attr.dataset_name + dataset_path = dataset_attr.hf_hub_url dataset = load_data(dataset_path, eval_dataset_size=args.eval_dataset_size) From 97658f68c4eff34249d7ce28be58e7280115f2cc Mon Sep 17 00:00:00 2001 From: jianzhnie Date: Thu, 27 Jul 2023 16:55:49 +0800 Subject: [PATCH 2/4] update datasets --- chatllms/configs/data_args.py | 5 ++++- chatllms/configs/train_args.py | 2 +- chatllms/data/data_utils.py | 35 ++++++++++++++++++++++------------ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/chatllms/configs/data_args.py b/chatllms/configs/data_args.py index 85f29e7..019fea8 100644 --- a/chatllms/configs/data_args.py +++ b/chatllms/configs/data_args.py @@ -11,12 +11,13 @@ class DatasetAttr(object): dataset_name: Optional[str] = None hf_hub_url: Optional[str] = None local_path: Optional[str] = None + dataset_formate: Optional[str] = None dataset_sha1: Optional[str] = None load_from_local: bool = False multi_turn: Optional[bool] = False def __repr__(self) -> str: - rep = f'#dataset_name: {self.dataset_name}, #hf_hub_url: {self.hf_hub_url}, #local_path: {self.local_path}, #load_from_local: {self.load_from_local}, #multi_turn: {self.multi_turn}' + rep = f'dataset_name: {self.dataset_name}, hf_hub_url: {self.hf_hub_url}, local_path: {self.local_path}, data_formate:{self.dataset_formate} load_from_local: {self.load_from_local}, multi_turn: {self.multi_turn}' return rep def __post_init__(self): @@ -91,6 +92,8 @@ def init_for_training(self): # support mixing multiple datasets dataset_attr = DatasetAttr() dataset_attr.dataset_name = name + dataset_attr.dataset_formate = datasets_info[name].get( + 'data_format', None) dataset_attr.hf_hub_url = datasets_info[name].get( 'hf_hub_url', None) dataset_attr.local_path = datasets_info[name].get( diff --git a/chatllms/configs/train_args.py b/chatllms/configs/train_args.py index 2177b53..ea16250 100644 --- a/chatllms/configs/train_args.py +++ b/chatllms/configs/train_args.py @@ -71,7 +71,7 @@ class TrainingArguments(TrainingArguments): 'Group sequences into batches with same length. Saves memory and speeds up training considerably.' }) model_max_length: int = field( - default=2048, + default=1024, metadata={ 'help': 'Maximum sequence length. Sequences will be right padded (and possibly truncated).' diff --git a/chatllms/data/data_utils.py b/chatllms/data/data_utils.py index 9df7b53..c3cfda7 100644 --- a/chatllms/data/data_utils.py +++ b/chatllms/data/data_utils.py @@ -261,6 +261,7 @@ def load_data( def formate_instruction_dataset( dataset: Dataset, dataset_name: str, + dataset_formate: str, instruction_template: str = 'default') -> Optional[Dict[str, Dataset]]: """ Formats a given dataset based on its name and format. @@ -274,6 +275,7 @@ def formate_instruction_dataset( Args: dataset: A dataset object to be formatted. dataset_name: A string representing the name of the dataset to be formatted. + dataset_formate: A string representing the name of the dataset format to be used. instruction_template: A string representing the name of the prompt template to be used. 
    Returns:
@@ -330,25 +332,26 @@ def _remove_unused_columns(dataset):
         ])
         return dataset

-    print('formate the dataset to the format we need.')
-    if dataset_name == 'dolly-15k':
+    # Format dataset
+    print(f'The {dataset_name} using {dataset_formate} dataset format.')
+    if dataset_formate == 'alpaca':
+        print('By default, we support the Alpaca dataset format.')
+    elif dataset_formate == 'dolly':
         dataset = _format_dolly15k(dataset)
-    elif dataset_name == 'chip2':
+    elif dataset_formate == 'chip2':
         dataset = _format_chip2(dataset)
-    elif dataset_name == 'self-instruct':
+    elif dataset_formate == 'self-instruct':
         dataset = _format_self_instruct(dataset)
-    elif dataset_name == 'hh-rlhf':
+    elif dataset_formate == 'hh-rlhf':
         dataset = _format_hh_rlhf(dataset)
-    elif dataset_name == 'oasst1':
+    elif dataset_formate == 'oasst1':
         dataset = _format_oasst1(dataset)
-    elif dataset_name == '100PoisonMpts':
+    elif dataset_formate == '100PoisonMpts':
         dataset = _format_100Poison(dataset)
     else:
-        print(
-            f'For dataset {dataset_name} with alpaca dataset formation, we do not need additional processing'
+        raise NotImplementedError(
+            f'Unsupported dataset format: {dataset_formate}, Please add the formate function in data_utils.py'
         )
-        pass
-
     # encode_instruction_example
     print(f'Applying instruction template: {instruction_template}')
     if instruction_template == 'alpaca':
@@ -402,13 +405,16 @@ def split_train_eval(
     else:
         # Split train dataset in train and validation according to `eval_dataset_size`
         print(
-            'Splitting train dataset in train and validation according to `eval_dataset_size`'
+            f'Splitting the dataset into train and validation according to `eval_dataset_size`: {eval_dataset_size}'
         )
         dataset = dataset['train'].train_test_split(
             test_size=eval_dataset_size, shuffle=True, seed=42)
         eval_dataset = dataset['test']

     # Reduce evaluation dataset size (if specified)
+    print(
+        f'You have set max_eval_samples = {max_eval_samples}; the evaluation set will be sampled down if it is larger ...'
+    )
     if max_eval_samples is not None and len(
             eval_dataset) > max_eval_samples:
         eval_dataset = eval_dataset.select(np.arange(max_eval_samples))
@@ -418,6 +424,9 @@ def split_train_eval(
     train_dataset = dataset['train']

     # Reduce training dataset size (if specified)
+    print(
+        f'You have set max_train_samples = {max_train_samples}; the training set will be sampled down if it is larger ...'
+    )
     if max_train_samples is not None and len(
             train_dataset) > max_train_samples:
         train_dataset = train_dataset.select(np.arange(max_train_samples))
@@ -461,6 +470,7 @@ def make_data_module(args):
     ), 'All datasets should be multi-turn or single-turn. As follwing we will concat all datasets, so they should be in the same format.'
    for dataset_attr in args.datasets_list:
+        print('=' * 80)
         print('DatasetAttr: {}...'.format(dataset_attr))

         if dataset_attr.load_from_local:
@@ -475,6 +485,7 @@ def make_data_module(args):
             dataset = formate_instruction_dataset(
                 dataset,
                 dataset_name=dataset_attr.dataset_name,
+                dataset_formate=dataset_attr.dataset_formate,
                 instruction_template=args.instruction_template,
             )

From 9e86e927472fdddc996a869ae9acb460dea953e0 Mon Sep 17 00:00:00 2001
From: jianzhnie
Date: Thu, 27 Jul 2023 17:45:04 +0800
Subject: [PATCH 3/4] update datasets

---
 chatllms/configs/data_args.py |   8 +-
 chatllms/data/data_utils.py   |  24 ++---
 data/README.md                | 170 ++++++++++++++++++++++++++++++++--
 data/dataset_info.yaml        |  46 ++++++---
 4 files changed, 209 insertions(+), 39 deletions(-)

diff --git a/chatllms/configs/data_args.py b/chatllms/configs/data_args.py
index 019fea8..9682923 100644
--- a/chatllms/configs/data_args.py
+++ b/chatllms/configs/data_args.py
@@ -11,13 +11,13 @@ class DatasetAttr(object):
     dataset_name: Optional[str] = None
     hf_hub_url: Optional[str] = None
     local_path: Optional[str] = None
-    dataset_formate: Optional[str] = None
+    dataset_format: Optional[str] = None
     dataset_sha1: Optional[str] = None
     load_from_local: bool = False
     multi_turn: Optional[bool] = False

     def __repr__(self) -> str:
-        rep = f'dataset_name: {self.dataset_name}, hf_hub_url: {self.hf_hub_url}, local_path: {self.local_path}, data_formate:{self.dataset_formate} load_from_local: {self.load_from_local}, multi_turn: {self.multi_turn}'
+        rep = f'dataset_name: {self.dataset_name}, hf_hub_url: {self.hf_hub_url}, local_path: {self.local_path}, dataset_format: {self.dataset_format}, load_from_local: {self.load_from_local}, multi_turn: {self.multi_turn}'
         return rep

     def __post_init__(self):
@@ -92,8 +92,8 @@ def init_for_training(self):
             # support mixing multiple datasets
             dataset_attr = DatasetAttr()
             dataset_attr.dataset_name = name
-            dataset_attr.dataset_formate = datasets_info[name].get(
-                'data_format', None)
+            dataset_attr.dataset_format = datasets_info[name].get(
+                'dataset_format', None)
             dataset_attr.hf_hub_url = datasets_info[name].get(
                 'hf_hub_url', None)
             dataset_attr.local_path = datasets_info[name].get(
diff --git a/chatllms/data/data_utils.py b/chatllms/data/data_utils.py
index c3cfda7..7651e9b 100644
--- a/chatllms/data/data_utils.py
+++ b/chatllms/data/data_utils.py
@@ -261,7 +261,7 @@ def load_data(
 def formate_instruction_dataset(
         dataset: Dataset,
         dataset_name: str,
-        dataset_formate: str,
+        dataset_format: str,
         instruction_template: str = 'default') -> Optional[Dict[str, Dataset]]:
     """
     Formats a given dataset based on its name and format.
@@ -275,7 +275,7 @@
     Args:
         dataset: A dataset object to be formatted.
         dataset_name: A string representing the name of the dataset to be formatted.
-        dataset_formate: A string representing the name of the dataset format to be used.
+        dataset_format: A string representing the name of the dataset format to be used.
         instruction_template: A string representing the name of the prompt template to be used.
    Returns:
@@ -333,24 +333,24 @@ def _remove_unused_columns(dataset):
         return dataset

     # Format dataset
-    print(f'The {dataset_name} using {dataset_formate} dataset format.')
-    if dataset_formate == 'alpaca':
+    print(f'The {dataset_name} using {dataset_format} dataset format.')
+    if dataset_format == 'alpaca':
         print('By default, we support the Alpaca dataset format.')
-    elif dataset_formate == 'dolly':
+    elif dataset_format == 'dolly':
         dataset = _format_dolly15k(dataset)
-    elif dataset_formate == 'chip2':
+    elif dataset_format == 'chip2':
         dataset = _format_chip2(dataset)
-    elif dataset_formate == 'self-instruct':
+    elif dataset_format == 'self-instruct':
         dataset = _format_self_instruct(dataset)
-    elif dataset_formate == 'hh-rlhf':
+    elif dataset_format == 'hh-rlhf':
         dataset = _format_hh_rlhf(dataset)
-    elif dataset_formate == 'oasst1':
+    elif dataset_format == 'oasst1':
         dataset = _format_oasst1(dataset)
-    elif dataset_formate == '100PoisonMpts':
+    elif dataset_format == '100PoisonMpts':
         dataset = _format_100Poison(dataset)
     else:
         raise NotImplementedError(
-            f'Unsupported dataset format: {dataset_formate}, Please add the formate function in data_utils.py'
+            f'Unsupported dataset format: {dataset_format}. Please add a format function for it in data_utils.py'
         )
     # encode_instruction_example
     print(f'Applying instruction template: {instruction_template}')
@@ -485,7 +485,7 @@ def make_data_module(args):
         dataset = formate_instruction_dataset(
             dataset,
             dataset_name=dataset_attr.dataset_name,
-            dataset_formate=dataset_attr.dataset_format,
+            dataset_format=dataset_attr.dataset_format,
             instruction_template=args.instruction_template,
         )

diff --git a/data/README.md b/data/README.md
index f45b2fe..7040101 100644
--- a/data/README.md
+++ b/data/README.md
@@ -27,24 +27,176 @@ We provide the following datasets for the experiments in this framework.

 ### Dataset formation

-The `dataset_info.yaml` file contains the information of the datasets. By defaullt, the framework will load the datasets from the HuggingFace hub. If you want to use the datasets from local files, please specify the `local_path` in the `dataset_info.yaml` file. For example, if you want to use the Alpaca dataset from local files, please specify the following in `dataset_info.yaml`.
+The `dataset_info.yaml` file contains the information of the datasets. The format of the file is as follows.
+
+```yaml
+dataset_name:
+  hf_hub_url: # "the name of the dataset repository on the HuggingFace hub. (if specified, local_path is ignored)",
+  local_path: # "the path of the dataset file in this directory. (required if hf_hub_url is not specified)",
+  dataset_format: # "the format of the dataset. (required), e.g., alpaca, dolly, etc.",
+  multi_turn: # "whether the dataset is multi-turn. (default: False)"
+```
+
+For example, the following is the dataset information of the Stanford Alpaca dataset.
+
+```yaml
+alpaca:
+  hf_hub_url: tatsu-lab/alpaca
+  local_path:
+  dataset_format: alpaca
+  multi_turn: False
+```
+This will load the dataset from the HuggingFace hub. If you want to load the dataset from local files, please specify the `local_path` field.

 ```yaml
 alpaca:
   hf_hub_url: tatsu-lab/alpaca
-  local_path: tatsu-lab/alpaca/alpaca.json
+  local_path: path/to/alpaca.json
+  dataset_format: alpaca
   multi_turn: False
 ```

 ### Custom datasets

-If you are using a custom dataset, please provide your dataset definition in the following format in `dataset_info.yaml`.
+If you are using a custom dataset, please provide your dataset definition in `dataset_info.yaml`.
-
-```yaml
-dataset_name:
-  hf_hub_url: # "the name of the dataset repository on the HuggingFace hub. (if specified, ignore below 3 arguments)",
-  local_path: # "the name of the dataset file in the this directory. (required if above are not specified)",
-  multi_turn: # "whether the dataset is multi-turn. (default: False)"
-```
-
-where the `prompt` and `response` columns should contain non-empty values. The `query` column will be concatenated with the `prompt` column and used as input for the model. The `history` column should contain a list where each element is a string tuple representing a query-response pair.
+
+#### hf_hub_url and local_path
+
+By defaullt, the framework will load the datasets from the HuggingFace hub. If you want to use the datasets from local files, please specify the `local_path` in the `dataset_info.yaml` file.
+
+#### dataset_format
+As for the dataset_format field, which is used to specify the format of the dataset, will good for the framework to process the dataset. Currently, we support the following dataset formats.
+
+- `alpaca`: Alpaca dataset
+- `dolly`: Dolly dataset
+- `gpt4`: GPT-4 generated dataset
+- `alpaca_cot`: Alpaca CoT dataset
+- `oasst1`: OpenAssistant/oasst1 dataset
+- `sharegpt`: Multi-turn ShareGPT dataset
+
+If your dataset is not in the above format, there are two ways to use it.
+
+- The first way, Implement the `format_dataset` function in `./chatllms/data/data_utils.py` to formate your dataset. For example, the following is the `_format_dolly15k` function for the Dolly dataset.
+
+```python
+def _format_dolly15k(dataset: Dataset) -> Dataset:
+    """Format Dolly-15k dataset."""
+    dataset = dataset.rename_column('context', 'input')
+    dataset = dataset.rename_column('response', 'output')
+    return dataset
+```
+
+- The second way, convert your dataset to the above format and specify the `dataset_format` field in `dataset_info.yaml`.
+
+For example, if we want to convert the [databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) into Alpaca formate, you can refer to the following code.
+
+```python
+import json
+def convert_dolly_alpaca(in_file, out_file):
+    with open(in_file, 'r') as file:
+        contents = json.load(file)
+    new_content = []
+    for i, content in enumerate(contents):
+        new_content.append({
+            'instruction': content['instruction'],
+            'input': content['context'],
+            'output': content['response'],
+        })
+
+    print(f'#out: {len(new_content)}')
+    with open(out_file, 'w') as file:
+        json.dump(new_content, file, indent=2, ensure_ascii=False)
+```
+
+#### multi_turn
+
+If your dataset is multi-turn, please specify the `multi_turn` field in `dataset_info.yaml`. The framework will automatically process the multi-turn dataset. Flowing is an example of the multi-turn dataset.
+
+```json
+[
+  {
+    "id": "identity_0",
+    "conversations": [
+      {
+        "from": "human",
+        "value": "Who are you?"
+      },
+      {
+        "from": "gpt",
+        "value": "I am Vicuna, a language model trained by researchers from Large Model Systems Organization (LMSYS)."
+      },
+      {
+        "from": "human",
+        "value": "What can you do?"
+      },
+      {
+        "from": "gpt",
+        "value": "I can chat with you."
+      }
+    ]
+  },
+  {
+    "id": "identity_1",
+    "conversations": [
+      {
+        "from": "human",
+        "value": "Who are you?"
+      },
+      {
+        "from": "gpt",
+        "value": "My name is Vicuna, and I'm a language model developed by Large Model Systems Organization (LMSYS)."
+      }
+    ]
+  }
+]
+```
+
+For now, we only support the multi-turn dataset in the above format. If your dataset is not in the above format, please convert it.
We also provide the following code to convert the Dolly dataset to the above format. You can find the code in `./chatllms/data/utils/convert_alpaca.py`.
+
+```python
+import argparse
+import json
+from typing import Any, Dict, List
+
+from datasets import load_dataset
+
+
+def convert_dolly_vicuna(raw_data: List[Dict[str, Any]]):
+    collect_data = []
+    for i, content in enumerate(raw_data):
+        if len(content['context'].strip()) > 1:
+            q, a = content['instruction'] + '\nInput:\n' + content[
+                'context'], content['response']
+        else:
+            q, a = content['instruction'], content['response']
+
+        collect_data.append({
+            'id':
+            f'alpaca_{i}',
+            'conversations': [
+                {
+                    'from': 'human',
+                    'value': q
+                },
+                {
+                    'from': 'gpt',
+                    'value': a
+                },
+            ],
+        })
+    print(f'Original: {len(raw_data)}, Converted: {len(collect_data)}')
+    return collect_data
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--in-file', type=str)
+    parser.add_argument('--out-file', type=str)
+    args = parser.parse_args()
+
+    raw_data = load_dataset('json', data_files=args.in_file)['train']
+    new_data = convert_dolly_vicuna(raw_data)
+    with open(args.out_file, 'w') as file:
+        json.dump(new_data, file, indent=2, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    main()
+```
diff --git a/data/dataset_info.yaml b/data/dataset_info.yaml
index 086d863..47dc433 100644
--- a/data/dataset_info.yaml
+++ b/data/dataset_info.yaml
@@ -2,97 +2,104 @@
 alpaca:
   hf_hub_url: tatsu-lab/alpaca
   local_path: tatsu-lab/alpaca/alpaca.json
+  dataset_format: alpaca
   multi_turn: False

 alpaca-clean:
   hf_hub_url: yahma/alpaca-cleaned
   local_path: ''
+  dataset_format: alpaca
   multi_turn: False

 chip2:
   hf_hub_url: laion/OIG
   local_path: ''
+  dataset_format: chip2
   multi_turn: False

 self-instruct:
   hf_hub_url: yizhongw/self_instruct
   local_path: ''
+  dataset_format: self-instruct
   multi_turn: False

 guanaco:
   hf_hub_url: JosephusCheung/GuanacoDataset
   local_path: ''
+  dataset_format: guanaco
   multi_turn: False

 hh-rlhf:
   hf_hub_url: Anthropic/hh-rlhf
   local_path: ''
+  dataset_format: hh-rlhf
   multi_turn: False

 longformer:
   hf_hub_url: akoksal/LongForm
   local_path: ''
+  dataset_format: longformer
   multi_turn: False

 openassistant-guanaco:
   hf_hub_url: timdettmers/openassistant-guanaco
   local_path: timdettmers/openassistant-guanaco
-  multi_turn: False
-
-evol_instruct:
-  hf_hub_url: WizardLM/WizardLM_evol_instruct_V2_196k
-  local_path: WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json
+  dataset_format: alpaca
   multi_turn: False

 dolly-15k:
   hf_hub_url: databricks/databricks-dolly-15k
   local_path: databricks/databricks-dolly-15k
+  dataset_format: dolly
   multi_turn: False

 olcc:
   hf_hub_url: ''
   local_path: /home/robin/prompt_data/olcc/olcc_alpaca.json
+  dataset_format: alpaca
   multi_turn: False

-share_gpt:
-  hf_hub_url: ''
-  local_path: /home/robin/prompt_data/sharegpt/sharegpt_split.json
-  multi_turn: True
-
 100PoisonMpts:
   hf_hub_url: ''
   local_path: /home/robin/prompt_data/100PoisonMpts/train.jsonl
+  dataset_format: 100PoisonMpts
   multi_turn: False

 # Belle Group
 belle_0.5m:
   hf_hub_url: BelleGroup/train_0.5M_CN
   local_path: ''
+  dataset_format: alpaca
   multi_turn: False

 belle_1m:
   hf_hub_url: BelleGroup/train_1M_CN
   local_path: ''
+  dataset_format: alpaca
   multi_turn: False

 belle_2m:
   hf_hub_url: BelleGroup/train_2M_CN
   local_path: ''
+  dataset_format: alpaca
   multi_turn: False

 belle_dialog:
   hf_hub_url: BelleGroup/generated_chat_0.4M
   local_path: ''
+  dataset_format: belle_dialog
   multi_turn: False

 belle_math:
   hf_hub_url: BelleGroup/school_math_0.25M
   local_path: ''
+  dataset_format: alpaca
   multi_turn: False

 belle_multiturn:
  hf_hub_url: BelleGroup/multi_turn_0.5M
   local_path: ''
+  dataset_format: belle_multiturn
   multi_turn: True
   columns:
     prompt: instruction
@@ -104,6 +111,7 @@ belle_multiturn:
 firefly:
   hf_hub_url: YeungNLP/firefly-train-1.1M
   local_path: ''
+  dataset_format: alpaca
   multi_turn: False
   columns:
     prompt: input
@@ -115,6 +123,7 @@ firefly:
 codealpaca:
   hf_hub_url: sahil2801/CodeAlpaca-20k
   local_path: ''
+  dataset_format: codealpaca
   multi_turn: False

 # alpacacot
@@ -126,6 +135,7 @@ alpaca_cot:
 webqa:
   hf_hub_url: suolyer/webqa
   local_path: ''
+  dataset_format: webqa
   multi_turn: False
   columns:
     prompt: input
@@ -133,7 +143,15 @@ webqa:
     response: output
     history: ''

-novel_tokens512_50k:
-  hf_hub_url: zxbsmk/webnovel_cn
-  local_path: ''
-  multi_turn: False
+# multi-turn datasets
+evol_instruct:
+  hf_hub_url: WizardLM/WizardLM_evol_instruct_V2_196k
+  local_path: WizardLM/WizardLM_evol_instruct_V2_196k/WizardLM_evol_instruct_V2_143k.json
+  dataset_format: sharegpt
+  multi_turn: True
+
+share_gpt:
+  hf_hub_url: ''
+  local_path: /home/robin/prompt_data/sharegpt/sharegpt_split.json
+  dataset_format: sharegpt
+  multi_turn: True

From 7f83fd86e18d00a7cec02a9048cea34ab31d35b4 Mon Sep 17 00:00:00 2001
From: jianzhnie
Date: Thu, 27 Jul 2023 18:30:59 +0800
Subject: [PATCH 4/4] update datasets

---
 data/README.md | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/data/README.md b/data/README.md
index 7040101..00cfcb1 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,7 +1,7 @@
-## How to use the data
+# How to use the data

-### Datasets Supported by the Framework
+## Datasets Supported by the Framework

 We provide the following datasets for the experiments in this framework.

@@ -25,9 +25,9 @@ We provide the following datasets for the experiments in this framework.
 - [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco)
 - [Evol-Instruct](https://huggingface.co/datasets/victor123/evol_instruct_70k)

-### Dataset formation
+## Dataset formation

-The `dataset_info.yaml` file contains the information of the datasets. The format of the file is as follows.
+The `dataset_info.yaml` file contains the information of the datasets, mainly including the following fields.

 ```yaml
 dataset_name:
   hf_hub_url: # "the name of the dataset repository on the HuggingFace hub. (if specified, local_path is ignored)",
   local_path: # "the path of the dataset file in this directory. (required if hf_hub_url is not specified)",
   dataset_format: # "the format of the dataset. (required), e.g., alpaca, dolly, etc.",
   multi_turn: # "whether the dataset is multi-turn. (default: False)"
 ```

 For example, the following is the dataset information of the Stanford Alpaca dataset.

 ```yaml
 alpaca:
   hf_hub_url: tatsu-lab/alpaca
   local_path:
   dataset_format: alpaca
   multi_turn: False
 ```
-This will load the dataset from the HuggingFace hub. If you want to load the dataset from local files, please specify the `local_path` field.
+During training, the framework will load the dataset from the HuggingFace hub. If you want to load the dataset from local files, please specify the `local_path` field.

 ```yaml
 alpaca:
   hf_hub_url: tatsu-lab/alpaca
   local_path: path/to/alpaca.json
   dataset_format: alpaca
   multi_turn: False
 ```

-### Custom datasets
+## Custom datasets

 If you are using a custom dataset, please provide your dataset definition in `dataset_info.yaml`.

-#### hf_hub_url and local_path
+### hf_hub_url and local_path

-By defaullt, the framework will load the datasets from the HuggingFace hub. If you want to use the datasets from local files, please specify the `local_path` in the `dataset_info.yaml` file.
+By default, the framework will load the datasets from the HuggingFace hub. If you want to use the datasets from local files, please specify the `local_path` field.

-#### dataset_format
-As for the dataset_format field, which is used to specify the format of the dataset, will good for the framework to process the dataset. Currently, we support the following dataset formats.
+### dataset_format
+
+The `dataset_format` field specifies the format of the dataset, which determines how the framework will process it. Currently, we support the following dataset formats.

 - `alpaca`: Alpaca dataset
 - `dolly`: Dolly dataset
 - `gpt4`: GPT-4 generated dataset
 - `alpaca_cot`: Alpaca CoT dataset
 - `oasst1`: OpenAssistant/oasst1 dataset
 - `sharegpt`: Multi-turn ShareGPT dataset

 If your dataset is not in the above format, there are two ways to use it.

-- The first way, Implement the `format_dataset` function in `./chatllms/data/data_utils.py` to formate your dataset. For example, the following is the `_format_dolly15k` function for the Dolly dataset.
+- The first way is to implement the `format_dataset` function in [data_utils](./chatllms/data/data_utils.py).
+
+For example, the following is the `_format_dolly15k` function for the Dolly dataset.

 ```python
 def _format_dolly15k(dataset: Dataset) -> Dataset:
     """Format Dolly-15k dataset."""
     dataset = dataset.rename_column('context', 'input')
     dataset = dataset.rename_column('response', 'output')
     return dataset
 ```

-- The second way, convert your dataset to the above format and specify the `dataset_format` field in `dataset_info.yaml`.
+- The second way is to convert your dataset to one of the formats above.

-For example, if we want to convert the [databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) into Alpaca formate, you can refer to the following code.
+For example, the following code converts the [databricks-dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) dataset to the Alpaca format.

 ```python
 import json
 def convert_dolly_alpaca(in_file, out_file):
     with open(in_file, 'r') as file:
         contents = json.load(file)
     new_content = []
     for i, content in enumerate(contents):
         new_content.append({
             'instruction': content['instruction'],
             'input': content['context'],
             'output': content['response'],
         })

     print(f'#out: {len(new_content)}')
     with open(out_file, 'w') as file:
         json.dump(new_content, file, indent=2, ensure_ascii=False)
 ```

-#### multi_turn
+### multi_turn

-If your dataset is multi-turn, please specify the `multi_turn` field in `dataset_info.yaml`. The framework will automatically process the multi-turn dataset. Flowing is an example of the multi-turn dataset.
+If your dataset is multi-turn, please set `multi_turn: True` in `dataset_info.yaml`. The framework will automatically process the multi-turn dataset.
+
+The following is an example of the multi-turn dataset format.

 ```json
 [
   {
     "id": "identity_0",
     "conversations": [
       {
         "from": "human",
         "value": "Who are you?"
       },
       {
         "from": "gpt",
         "value": "I am Vicuna, a language model trained by researchers from Large Model Systems Organization (LMSYS)."
       },
       {
         "from": "human",
         "value": "What can you do?"
       },
       {
         "from": "gpt",
         "value": "I can chat with you."
       }
     ]
   },
   {
     "id": "identity_1",
     "conversations": [
       {
         "from": "human",
         "value": "Who are you?"
       },
       {
         "from": "gpt",
         "value": "My name is Vicuna, and I'm a language model developed by Large Model Systems Organization (LMSYS)."
       }
     ]
   }
 ]
 ```

-For now, we only support the multi-turn dataset in the above format. If your dataset is not in the above format, please convert it. We also provide the following code to convert the Dolly dataset to the above format. You can find the code in `./chatllms/data/utils/convert_alpaca.py`.
+For now, we only support the multi-turn dataset in the above format. If your dataset is not in the above format, please convert it. We also provide the following code to convert the Dolly dataset to the above format. You can find the code in [convert_alpaca](./chatllms/data/utils/convert_alpaca.py).

 ```python
 import argparse
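Taken together, the four patches wire `data/dataset_info.yaml` into training as follows: `init_for_training` in `chatllms/configs/data_args.py` turns each YAML entry into a `DatasetAttr`, and `make_data_module` in `chatllms/data/data_utils.py` loads each dataset via `load_data` and dispatches on its `dataset_format` in `formate_instruction_dataset`. The sketch below is a minimal reading of that flow for quick verification; the YAML path, the `names` argument, and the derivation of `load_from_local` from `local_path` are illustrative assumptions, not part of the patches.

```python
# Minimal sketch: how dataset_info.yaml entries become DatasetAttr objects,
# mirroring init_for_training() and make_data_module() in the patches above.
# Assumptions: the YAML lives at data/dataset_info.yaml, and load_from_local
# is derived from local_path (the patches do not show that derivation).
import yaml

from chatllms.configs.data_args import DatasetAttr


def load_dataset_attrs(yaml_path: str = 'data/dataset_info.yaml',
                       names: tuple = ('alpaca', 'dolly-15k')):
    with open(yaml_path, 'r') as f:
        datasets_info = yaml.safe_load(f)

    dataset_attrs = []
    for name in names:
        info = datasets_info[name]
        dataset_attr = DatasetAttr()
        dataset_attr.dataset_name = name
        dataset_attr.dataset_format = info.get('dataset_format', None)
        dataset_attr.hf_hub_url = info.get('hf_hub_url', None)
        dataset_attr.local_path = info.get('local_path', None)
        dataset_attr.multi_turn = info.get('multi_turn', False)
        # Assumption: prefer a local copy whenever local_path is non-empty.
        dataset_attr.load_from_local = bool(dataset_attr.local_path)
        dataset_attrs.append(dataset_attr)
    return dataset_attrs


if __name__ == '__main__':
    for attr in load_dataset_attrs():
        # __repr__ now prints every field (PATCH 1 and PATCH 3), so this is
        # a quick way to check that dataset_format was picked up correctly.
        print(attr)
```

Running this after applying PATCH 3 should print one line per dataset with every field populated, which makes it easy to spot entries in `dataset_info.yaml` that still lack a `dataset_format`.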