diff --git a/README.md b/README.md
index 3271b4b9..48a02944 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ We include a collection of representative instruction datasets in our exploratio
 
 Please check these datasets for licenses and restrictions around their use!
 
-You can also find the processed [Tulu v1](https://huggingface.co/datasets/allenai/tulu-v1-sft-mixture) and [Tulu v2](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture) SFT datasets on HuggingFace.
+You can also find the processed [Tulu v1](https://huggingface.co/datasets/allenai/tulu-v1-sft-mixture) and [Tulu v2](https://huggingface.co/datasets/allenai/tulu-v2-sft-mixture) SFT datasets on HuggingFace. Note that the train data preparation script will not precisely recreate the Tulu v2 mixture, due to randomness in generation and shifts in data availability - see [this PR](https://github.com/allenai/open-instruct/pull/156) for details. If you need the exact training data used, use the HuggingFace mixture - it is precisely the data used during model training.
 
 ### Model preparation
 
diff --git a/open_instruct/reformat_datasets.py b/open_instruct/reformat_datasets.py
index fd6e1051..1d43bdaa 100644
--- a/open_instruct/reformat_datasets.py
+++ b/open_instruct/reformat_datasets.py
@@ -83,7 +83,7 @@ def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60,
 def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
     os.makedirs(output_dir, exist_ok=True)
     examples = []
-    if num_few_shot_examples > 0:
+    if num_zero_shot_examples > 0:
         with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin:
             zero_shot_examples = [json.loads(line) for line in fin]
             if num_zero_shot_examples < len(zero_shot_examples):
@@ -493,8 +493,14 @@ def convert_lima_data(data_dir, output_dir, num_examples=None):
 
 def convert_wizardlm_data(data_dir, output_dir, num_examples=30000):
     os.makedirs(output_dir, exist_ok=True)
     examples = []
-    with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
-        examples = json.load(fin)
+    # check if the original json file exists
+    if os.path.exists(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json")):
+        # proceed as normal
+        with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
+            examples = json.load(fin)
+    else:  # otherwise, read the parquet backup fetched by prepare_train_data.sh
+        df = pd.read_parquet(os.path.join(data_dir, "train-00000-of-00001-004cd1ba9dc05e6c.parquet"))
+        examples = df.to_dict(orient="records")
     if num_examples:
         examples = random.sample(examples, k=num_examples)
diff --git a/scripts/prepare_train_data.sh b/scripts/prepare_train_data.sh
index 18ae4f03..5e45e872 100755
--- a/scripts/prepare_train_data.sh
+++ b/scripts/prepare_train_data.sh
@@ -1,5 +1,7 @@
-# check if there is $HF_TOKEN in the environment variables
-if [ -z "$HF_TOKEN" ]
+# check if $HF_TOKEN is set in the environment, or if huggingface-cli is installed and logged in
+if huggingface-cli whoami &>/dev/null; then
+    echo "Logged in to HuggingFace."
+elif [ -z "$HF_TOKEN" ]
 then
     echo "Warning: HuggingFace dataset LIMA requires permissive access."
     echo "Warning: Please request the access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script."
@@ -77,13 +79,16 @@ python scripts/split_sharegpt_conversations.py \
     --model-name-or-path oobabooga/llama-tokenizer \
     --max-length 4096
-
 echo "Downloading LIMA dataset..."
-wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl
-
+if huggingface-cli whoami &>/dev/null; then
+    huggingface-cli download GAIR/lima --repo-type dataset --local-dir data/raw_train/lima/
+else
+    wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl
+fi
 
 echo "Downloading WizardLM dataset..."
-wget -P data/raw_train/wizardlm/ https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k/resolve/main/WizardLM_evol_instruct_V2_143k.json
+# original data removed (was: wget -P data/raw_train/wizardlm/ https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k/resolve/main/WizardLM_evol_instruct_V2_143k.json); save the backup under the filename reformat_datasets.py expects
+wget -O data/raw_train/wizardlm/train-00000-of-00001-004cd1ba9dc05e6c.parquet https://huggingface.co/datasets/ai2-adapt-dev/wizardlm-backup/resolve/main/data/train-00000-of-00001.parquet
 
 echo "Downloading the OpenOrca dataset..."
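
For reference, the json-or-parquet fallback this patch adds to convert_wizardlm_data can be exercised on its own. Below is a minimal sketch, assuming pandas and pyarrow are installed and that prepare_train_data.sh has populated data/raw_train/wizardlm/; the load_wizardlm_examples helper and the final key printout are illustrative, not part of the patch:

    import json
    import os

    import pandas as pd

    def load_wizardlm_examples(data_dir):
        # prefer the original json release if it is still on disk
        json_path = os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json")
        if os.path.exists(json_path):
            with open(json_path, "r") as fin:
                return json.load(fin)
        # otherwise fall back to any parquet backup in the same directory
        parquet_files = [f for f in os.listdir(data_dir) if f.endswith(".parquet")]
        df = pd.read_parquet(os.path.join(data_dir, parquet_files[0]))
        return df.to_dict(orient="records")

    examples = load_wizardlm_examples("data/raw_train/wizardlm")
    # both code paths should yield a list of dicts with the same record schema
    print(len(examples), sorted(examples[0].keys()))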