Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug] Passing trust_remote_code=True will be mandatory to load this dataset from the next major release of datasets. #1233

Open
2 tasks done
chairmanQi opened this issue Jun 9, 2024 · 0 comments
Assignees

Comments

@chairmanQi
Copy link

Prerequisite

Type

I have modified the code (config is not considered code), or I'm working on my own tasks/models/datasets.

Environment

{'CUDA available': True, “CUDA available”:True,
'CUDA_HOME': None, 'CUDA_HOME':无,
'GCC': 'gcc (Ubuntu 11.4.0-1ubuntu122.04) 11.4.0',
'GCC':'GCC(Ubuntu 11.4.0-1ubuntu1
22.04)11.4.0',
'GPU 0': 'NVIDIA GeForce RTX 4090 D',
'GPU 0':'NVIDIA GeForce RTX 4090 D',
'MMEngine': '0.10.4', 'MMCengine':'0.10.4',
'MUSA available': False,
'MUSA available':False,
'OpenCV': '4.9.0', 'OpenCV':'4.9.0',
'PyTorch': '2.3.0', 'PyTorch':' 2.3.0',
'PyTorch compiling details': 'PyTorch built with:\n'
'PyTorch编译详细信息':'PyTorch构建时使用:\n'
' - GCC 9.3\n'
GCC 9.3\n的
' - C++ Version: 201703\n'
C++版本:201703\n
' - Intel(R) oneAPI Math Kernel Library Version '
' -英特尔(R)oneAPI数学内核库版本'
'2023.1-Product Build 20230303 for Intel(R) 64 '
'2023.1-英特尔® 64产品构建20230303'
'architecture applications\n'
'体系结构应用程序\n'
' - Intel(R) MKL-DNN v3.3.6 (Git Hash '
' -英特尔(R)MKL-DNN v3.3.6(Git Hash '
'86e6af5974177e513fd3fee58425e1063e7f1361)\n'
'86e6af5974177e513fd3fee58425e1063e7f1361)\n'
' - OpenMP 201511 (a.k.a. OpenMP 4.5)\n'
'—OpenMP 201511(a.k.a. OpenMP 4.5)\n '
' - LAPACK is enabled (usually provided by '
' - LAPACK已启用(通常由'
'MKL)\n' 'MKL)\n'
' - NNPACK is enabled\n'
'—NNPACK已启用\n'
' - CPU capability usage: AVX2\n'
'—CPU能力使用率:AVX2\n'
' - CUDA Runtime 12.1\n'
' - CUDA 12.1\n'
' - NVCC architecture flags: '
' - NVCC体系结构标志:'
'-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n'
'-gencode; arch= compute_50,code =sm_50;-gencode;arch= compute_60,code =sm_60;-gencode;arch= compute_61,code =sm_61;-gencode;arch= compute_70,code =sm_70;-gencode;arch= compute_75,code =sm_75;-gencode;arch= compute_80,code =sm_80;-gencode;arch= compute_86,code =sm_86;-gencode;arch= compute_90,code =sm_90\n'
' - CuDNN 8.9.2\n'
' - Magma 2.6.1\n'
' - Build settings: BLAS_INFO=mkl, '
' -构建设置:BLAS_INFO=mkl,'
'BUILD_TYPE=Release, CUDA_VERSION=12.1, '
'BUILD_TYPE=Release,CUDA_VERSION=12.1,'
'CUDNN_VERSION=8.9.2, ' 'CUDNN_VERSION=8.9.2,'
'CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, '
'CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++,'
'CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 '
'-fabi-version=11 -fvisibility-inlines-hidden '
'-fabi-version=11 -fabability-inlines-hidden '
'-DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO '
'-DLIBKINETO_NOROCTRACER -DUSE_FBGEMM '
'-DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK '
'-DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE '
'-DUSE_XNNPACK -DSYMBOLICATE_移动的_DEBUG_HANDLE '
'-O2 -fPIC -Wall -Wextra -Werror=return-type '
'-O2 -fPIC -Wall -Wextra -Werror=返回类型'
'-Werror=non-virtual-dtor -Werror=bool-operation '
'-Werror=non-virtual-dtor -Werror= bool-operation'
'-Wnarrowing -Wno-missing-field-initializers '
'-Wno-type-limits -Wno-array-bounds '
'-Wno-unknown-pragmas -Wno-unused-parameter '
'-Wno-unused-function -Wno-unused-result '
'-Wno-strict-overflow -Wno-strict-aliasing '
'-Wno-stringop-overflow -Wsuggest-override '
'-Wno-stringop-overflow -Wrest-override'
'-Wno-psabi -Wno-error=pedantic '
'-Wno-psabi -Wno-error=迂腐'
'-Wno-error=old-style-cast -Wno-missing-braces '
'-fdiagnostics-color=always -faligned-new '
'-Wno-unused-but-set-variable '
'-Wno-maybe-uninitialized -fno-math-errno '
'-fno-trapping-math -Werror=format '
'-Wno-stringop-overflow, LAPACK_INFO=mkl, '
'-Wno-stringop-overflow,LAPACK_INFO=mkl,'
'PERF_WITH_AVX=1, PERF_WITH_AVX2=1, '
PERF_WITH_AVX = 1,PERF_WITH_AVX2 = 1,
'PERF_WITH_AVX512=1, TORCH_VERSION=2.3.0, '
PERF_WITH_AVX512 = 1,TORCH_VERSION = 2.3.0,这是怎么回事?
'USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, '
USE_CUDA = ON,USE_CUDNN = ON,USE_CUPARSELT = 1,USE_CUDNN = ON,USE_CUDA = ON,USE_CUDNN = ON
'USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, '
'USE_EXCEPTION_PTR=1,USE_GFLAGS=OFF,'
'USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, '
USE_GLOG = OFF,USE_GLOO = ON,USE_MKL = ON,这是怎么回事?
'USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, '
USE_MKLDNN = ON,USE_MPI = OFF,USE_NCCL = ON,这是怎么回事?
'USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, '
USE_NNPACK = ON,USE_OPENMP = ON,USE_ROCM = OFF,使用:
'USE_ROCM_KERNEL_ASSERT=OFF, \n',
'USE_ROCM_KERNEL_ASSERT=OFF,\n',
'Python': '3.10.14 (main, May 6 2024, 19:42:50) [GCC 11.2.0]',
'Python':'3.10.14(main,May 6 2024,19:42:50)[GCC 11.2.0',
'TorchVision': '0.18.0',
'TorchVision':'0.18.0',
'numpy_random_seed': 2147483648,
'numpy_random_seed':2147483648,
'opencompass': '0.2.5+d656e81',
'opencompass':'0.2.5+ d656e81',
'sys.platform': 'linux'}
"sys. platform":"linux"}

Reproduces the problem - code/configuration sample

Attempt to modify the "run(self)" function in tools/case_analyzer.py

    def run(self):
        filename = get_infer_output_path(
            self.model_cfg, self.dataset_cfg,
            osp.join(self.work_dir, 'predictions')
        )
        root, ext = osp.splitext(filename)
        partial_filename = root + '_0' + ext
        if not osp.exists(osp.realpath(filename)) and not osp.exists(
                osp.realpath(partial_filename)):
            print(f'{filename} not found')
            return
        dataset = build_dataset_from_cfg(self.dataset_cfg)

        print("==========       build-dataset       ========")
        print(dataset)

        if 'dataset_postprocessor' in self.eval_cfg:
            def postprocess(sample):
                s = sample[self.ds_column] 
                proc = TEXT_POSTPROCESSORS.get(
                    self.eval_cfg['dataset_postprocessor']['type']
                )
                sample[self.ds_column] = proc(s)
                print("==========   sample ds_column    ========")
                print(sample)
                print("self.ds_column")
                return sample
            dataset = dataset.map(postprocess)
            print("==========   postprocess dataset    ========")
            print(dataset)

        if osp.exists(osp.realpath(filename)):
            preds = mmengine.load(filename)
        else:
            filename = partial_filename
            preds, offset = {}, 0
            i = 1
            while osp.exists(osp.realpath(filename)):
                _preds = mmengine.load(filename)  
                filename = root + f'_{i}' + ext
                i += 1
                for _o in range(len(_preds)):
                    preds[str(offset)] = _preds[str(_o)]
                    offset += 1
        pred_strs = [preds[str(i)]['prediction'] for i in range(len(preds))]
        if 'in-context examples' in preds['0']:
            print("___________   pred_strs   ↓ ________")
            print(pred_strs)
            print("___________   pred_strs   ↑ ________")
        if 'pred_postprocessor' in self.eval_cfg:
            print("===========    qwq     ==============")
            proc = TEXT_POSTPROCESSORS.get(
                self.eval_cfg['pred_postprocessor']['type']
            )
            pred_strs = [proc(s) for s in pred_strs]
#        if self.ds_split:
#            references = dataset[self.ds_split][self.ds_column]
#        else:
#            references = dataset[self.ds_column]

#        if len(pred_strs) != len(references):
#            print('length mismatch')
#            return
        allcase, badcase = [], []
        print("===========    gen origin_prompt    ===============")
        if 'in-context examples' in preds['0']:
            for i, (pred_str) in enumerate(zip(tqdm(pred_strs))):
                #ref_str = str(reference)
                ref_str = str(i)
#                print("===========    gen origin_prompt    ===============")
#                print("i:        ",i,"ref_str:     ",ref_str)
#                print("")    
                try:
                    pred_prompt = preds[str(i)]['label: ' + str(pred_str)]['testing input']
                    pred_PPL = preds[str(i)]['label: ' + str(pred_str)]['PPL']
                    print("===========    gen origin_prompt    ===============")
                    print(pred_prompt)
                    print("PPL:                   ",pred_PPL)
                    print("")

                    ref_prompt = preds[str(i)]['label: ' + ref_str]['testing input']
                    ref_PPL = preds[str(i)]['label: ' + ref_str]['PPL']
                except KeyError:
                    continue
                item = {
                    'prediction_prompt': pred_prompt,
                    'prediction': pred_str,
                    'prediction_PPL': pred_PPL,
                    'reference_prompt': ref_prompt,
                    'reference': ref_str,
                    'reference_PPL': ref_PPL
                }
                if pred_str != ref_str:
                    badcase.append(item)
                    allcase.append(item)
                else:
                    allcase.append(item)
        else:
            print("===========    gen pred_str    ===============")
            pred_str = zip(tqdm(pred_strs))
#            print(pred_str)
#            print("")
#            print(reference)
#            print("")
            for i, (pred_str) in enumerate(zip(tqdm(pred_strs))):
                #ref_str = str(reference)
                origin_prompt = preds[str(i)]['origin_prompt']
#                print("===========    gen origin_prompt    ===============")
#                print(origin_prompt)
#                print("")
                
                item = {
                    'origin_prompt': origin_prompt,
                    'prediction': pred_str,
                    #'reference': ref_str
                }
                badcase.append(item)
                allcase.append(item)
        out_path = get_infer_output_path(
            self.cfg['model'], self.cfg['dataset'],
            osp.join(self.work_dir, 'case_analysis/bad')
        )
        mkdir_or_exist(osp.split(out_path)[0])
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(badcase, f, indent=4, ensure_ascii=False)

#        out_path = get_infer_output_path(
#            self.cfg['model'], self.cfg['dataset'],
#            osp.join(self.work_dir, 'case_analysis/all')
#        )
#        mkdir_or_exist(osp.split(out_path)[0])

        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(allcase, f, indent=4, ensure_ascii=False)

Reproduces the problem - command or script

python tools/case_analyzer.py configs/eval_demo.py

Reproduces the problem - error message

image

/home/qwq/anaconda3/envs/opencompass/lib/python3.10/site-packages/datasets/load.py:1486: FutureWarning: The repository for winograd_wsc contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/winograd_wsc
You can avoid this message in future by passing the argument trust_remote_code=True.
Passing trust_remote_code=True will be mandatory to load this dataset from the next major release of datasets.
warnings.warn(

Other information

无法读取ppl类数据集内容,但gen类正常

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants