From 4572522310ed8e73bfcf884bceaa782c3affb776 Mon Sep 17 00:00:00 2001 From: Pernekhan Utemuratov Date: Fri, 15 Dec 2023 15:27:56 -0800 Subject: [PATCH 1/2] Add example of tensorrt-llm usage --- example/Dockerfile | 17 +++ example/LICENSE | 201 +++++++++++++++++++++++++++++++ example/README.md | 4 + example/hub.py | 181 ++++++++++++++++++++++++++++ example/launch_triton_server.py | 205 ++++++++++++++++++++++++++++++++ 5 files changed, 608 insertions(+) create mode 100644 example/Dockerfile create mode 100644 example/LICENSE create mode 100644 example/README.md create mode 100644 example/hub.py create mode 100644 example/launch_triton_server.py diff --git a/example/Dockerfile b/example/Dockerfile new file mode 100644 index 0000000..d8eed8e --- /dev/null +++ b/example/Dockerfile @@ -0,0 +1,17 @@ +# syntax = edrevo/dockerfile-plus + +# Newer version of base container can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags +FROM nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 AS base-trtllm + +INCLUDE+ ../dockerfile/Dockerfile.trt_llm_backend + +# install huggingface +RUN pip install huggingface_hub +RUN pip install hf_transfer +RUN env HF_HUB_ENABLE_HF_TRANSFER=1 + +# copy files that will be used for running triton server +COPY launch_triton_server.py . +COPY hub.py . + +ENTRYPOINT ["python3", "launch_triton_server.py"] diff --git a/example/LICENSE b/example/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/example/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/example/README.md b/example/README.md new file mode 100644 index 0000000..cedcdc6 --- /dev/null +++ b/example/README.md @@ -0,0 +1,4 @@ +# tensorrt-llm-example +docker build --target base-trtllm -t trt-example . 
+
+docker run --rm -it -p 80:80 --shm-size=10g --ulimit memlock=-1 --ulimit stack=67108864 --gpus '"device=4,5,6,7"' -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN -e HUGGINGFACE_HUB_CACHE=/data/trt-data -v /data:/data trt-example:latest --world_size=4 --model_id=DeepInfra/Llama-2-70b-chat-hf-trt-fp8 --revision=5de4d5c03ffd13b8ac34bf50fb2e797f4d9be93e --tokenizer_model_id=DeepInfra/Llama-2-70b-chat-tokenizer --tokenizer_revision=f88981891fea1e38150df966c833e6d1e7e798f4 --http_port=80
diff --git a/example/hub.py b/example/hub.py
new file mode 100644
index 0000000..43c4ac7
--- /dev/null
+++ b/example/hub.py
@@ -0,0 +1,181 @@
+# The following code is adapted from the text-generation-inference (https://github.com/deepinfra/text-generation-inference/blob/main/server/text_generation_server/utils/hub.py)
+# License: Apache License, Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0)
+import time
+import os
+
+from datetime import timedelta
+from pathlib import Path
+from typing import Optional, List
+from pathlib import Path
+
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from huggingface_hub.utils import (
+    LocalEntryNotFoundError,
+    EntryNotFoundError,
+    RevisionNotFoundError,  # Import here to ease try/except in other part of the lib
+)
+
+WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
+
+
+def weight_hub_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[str]:
+    """Return the hub file names of `model_id` at `revision` that end with `extension`; raise EntryNotFoundError if none."""
+    api = HfApi()
+    info = api.model_info(model_id, revision=revision)
+    print(info.siblings)
+    filenames = [
+        s.rfilename
+        for s in info.siblings
+        if s.rfilename.endswith(extension)
+    ]
+
+    if not filenames:
+        raise EntryNotFoundError(
+            f"No {extension} weights found for model {model_id} and revision {revision}.",
+            None,
+        )
+
+    return filenames
+
+
+def try_to_load_from_cache(
+    model_id: str, revision: Optional[str], filename: str
+) -> Optional[Path]:
+    """Return the cached snapshot path of `filename`, or None when the repo, revision or file is not in the cache."""
+    if revision is None:
+        revision = "main"
+
+    object_id = model_id.replace("/", "--")
+    repo_cache = Path(HUGGINGFACE_HUB_CACHE) / f"models--{object_id}"
+
+    if not repo_cache.is_dir():
+        # No cache for this model
+        return None
+
+    refs_dir = repo_cache / "refs"
+    snapshots_dir = repo_cache / "snapshots"
+
+    # Resolve refs (for instance to convert main to the associated commit sha)
+    if refs_dir.is_dir():
+        revision_file = refs_dir / revision
+        if revision_file.exists():
+            with revision_file.open() as f:
+                revision = f.read()
+
+    # Check if revision folder exists
+    if not snapshots_dir.exists():
+        return None
+    cached_shas = os.listdir(snapshots_dir)
+    if revision not in cached_shas:
+        # No cache for this revision and we won't try to return a random revision
+        return None
+
+    # Check if file exists in cache
+    cached_file = snapshots_dir / revision / filename
+    return cached_file if cached_file.is_file() else None
+
+
+def weight_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[Path]:
+    """Return local paths of the weight files, taken from a local dir, WEIGHTS_CACHE_OVERRIDE, or the hub cache."""
+    # Local model
+    if Path(model_id).exists() and Path(model_id).is_dir():
+        local_files = list(Path(model_id).glob(f"*{extension}"))
+        if not local_files:
+            raise FileNotFoundError(
+                f"No local weights found in {model_id} with extension {extension}"
+            )
+        return local_files
+
+    try:
+        filenames = weight_hub_files(model_id, revision, extension)
+    except EntryNotFoundError as e:
+        if extension != ".safetensors":
+            raise e
+        # Try to see if there are pytorch weights
+        pt_filenames = weight_hub_files(model_id, revision, extension=".bin")
+        # Change pytorch extension to safetensors extension
+        # It is possible that we have safetensors weights locally even though they are not on the
+        # hub if we converted weights locally without pushing them
+        stems = [Path(f).stem for f in pt_filenames]  # NOTE: str.lstrip() strips a character set, not a prefix
+        prefix_len = len("pytorch_")
+        filenames = [f"{s[prefix_len:] if s.startswith('pytorch_') else s}.safetensors" for s in stems]
+
+    if WEIGHTS_CACHE_OVERRIDE is not None:
+        files = []
+        for filename in filenames:
+            p = Path(WEIGHTS_CACHE_OVERRIDE) / filename
+            if not p.exists():
+                raise FileNotFoundError(
+                    f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}."
+                )
+            files.append(p)
+        return files
+
+    files = []
+    for filename in filenames:
+        cache_file = try_to_load_from_cache(
+            model_id, revision=revision, filename=filename
+        )
+        if cache_file is None:
+            raise LocalEntryNotFoundError(
+                f"File {filename} of model {model_id} not found in "
+                f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. "
+                f"Please run `text-generation-server download-weights {model_id}` first."
+            )
+        files.append(cache_file)
+
+    return files
+
+
+def download_weights(
+    filenames: List[str], model_id: str, revision: Optional[str] = None
+) -> List[Path]:
+    """Download `filenames` from the hub, skipping files already cached, and return their local paths."""
+
+    def download_file(filename, tries=5, backoff: int = 5):
+        local_file = try_to_load_from_cache(model_id, revision, filename)
+        if local_file is not None:
+            print(f"File {filename} already present in cache.")
+            return Path(local_file)
+
+        for i in range(tries):
+            try:
+                print(f"Download file: {filename}")
+                start_time = time.time()
+                local_file = hf_hub_download(
+                    filename=filename,
+                    repo_id=model_id,
+                    revision=revision,
+                    local_files_only=False,
+                )
+                print(
+                    f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - start_time))}."
+                )
+                return Path(local_file)
+            except Exception as e:
+                if i + 1 == tries:
+                    raise e
+                print(e)
+                print(f"Retrying in {backoff} seconds")
+                time.sleep(backoff)
+                print(f"Retry {i + 1}/{tries - 1}")
+
+    # We do this instead of using tqdm because we want to parse the logs with the launcher
+    start_time = time.time()
+    files = []
+    for i, filename in enumerate(filenames):
+        file = download_file(filename)
+
+        elapsed = timedelta(seconds=int(time.time() - start_time))
+        remaining = len(filenames) - (i + 1)
+        eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0
+
+        print(f"Download: [{i + 1}/{len(filenames)}] -- ETA: {eta}")
+        files.append(file)
+
+    return files
diff --git a/example/launch_triton_server.py b/example/launch_triton_server.py
new file mode 100644
index 0000000..2be7a10
--- /dev/null
+++ b/example/launch_triton_server.py
@@ -0,0 +1,205 @@
+# The following code is adapted from the tensorrtllm_backend (https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/scripts/launch_triton_server.py)
+# License: Apache License, Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0)
+import argparse
+import subprocess
+import os
+import sys
+import errno
+from pathlib import Path
+from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub.utils import (
+    LocalEntryNotFoundError,
+    EntryNotFoundError,
+    RevisionNotFoundError,  # Import here to ease try/except in other part of the lib
+)
+from hub import download_weights, weight_files, weight_hub_files
+
+
+def parse_arguments():  # CLI options: tritonserver path, model/tokenizer repos and revisions, ports, logging
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--world_size',
+        type=int,
+        default=4,
+        help='world size, only support tensor parallelism now')
+    parser.add_argument(
+        '--tritonserver',
+        type=str,
+        help='path to the tritonserver exe',
+        default='/opt/tritonserver/bin/tritonserver',
+    )
+    parser.add_argument(
+        '--force',
+        '-f',
+        action='store_true',
+        help='launch tritonserver regardless of other instances running')
+    parser.add_argument(
+        '--log',
+        action='store_true',
+        help='log triton server stats into log_file')
+    parser.add_argument(
+        '--log-file',
+        type=str,
+        help='path to triton log file',
+        default='triton_log.txt',
+    )
+    parser.add_argument(
+        '--model_id',
+        type=str,
+        help='model from the huggingface',
+        default='DeepInfra/Llama-2-70b-chat-hf-trt-fp8',
+    )
+    parser.add_argument(
+        '--revision',
+        type=str,
+        help='revision of the model_id',
+        default='5de4d5c03ffd13b8ac34bf50fb2e797f4d9be93e',
+    )
+    parser.add_argument(
+        '--tokenizer_model_id',
+        type=str,
+        help='tokenizer model from the huggingface',
+        default='DeepInfra/Llama-2-70b-chat-tokenizer',
+    )
+    parser.add_argument(
+        '--tokenizer_revision',
+        type=str,
+        help='revision of the tokenizer_model_id',
+        default='f88981891fea1e38150df966c833e6d1e7e798f4',
+    )
+    parser.add_argument(
+        '--http_port',
+        type=str,
+        help='tritonserver http port',
+        default='8000',
+    )
+    parser.add_argument(
+        '--metrics_port',
+        type=str,
+        help='tritonserver metrics port',
+        default='8002',
+    )
+    parser.add_argument(
+        '--grpc_port',
+        type=str,
+        help='tritonserver grpc port',
+        default='8001',
+    )
+
+    return parser.parse_args()
+
+
+def get_cmd(world_size, tritonserver, model_repo, log, log_file, http_port, metrics_port, grpc_port):
+    cmd = ['mpirun', '--allow-run-as-root']  # one "-n 1 tritonserver ... :" section is appended per rank below
+    for i in range(world_size):
+        cmd += ['-n', '1', tritonserver]
+        if log and (i == 0):
+            cmd += ['--log-verbose=3', f'--log-file={log_file}']
+        cmd += [
+            f'--model-repository={model_repo}',
+            f'--http-port={http_port}',
+            f'--metrics-port={metrics_port}',
+            f'--grpc-port={grpc_port}',
+            '--disable-auto-complete-config',
+            f'--backend-config=python,shm-region-prefix-name=prefix{i}_', ':'
+        ]
+    return cmd
+
+
+def download_hf_model_into(model_id, revision):
+    extension = ""  # empty suffix: str.endswith("") is always true, so every file of the repo is selected
+    try:
+        weight_files(model_id, revision, extension)
+        print("Files are already present on the host. " "Skipping download.")
+        return
+    # Local files not found
+    except (LocalEntryNotFoundError, FileNotFoundError):
+        pass
+
+    is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
+        "WEIGHTS_CACHE_OVERRIDE", None
+    ) is not None
+
+    if not is_local_model:
+        # Try to download weights from the hub
+        try:
+            filenames = weight_hub_files(model_id, revision, extension)
+            print(filenames)
+            download_weights(filenames, model_id, revision)
+            # Successfully downloaded weights
+            return
+
+        # No weights found on the hub with this extension
+        except EntryNotFoundError as e:
+            # Check if we want to automatically convert to safetensors or if we can use .bin weights instead
+            raise e
+
+
+def run_cmd(cmd):
+    try:
+        # Block until the child exits; check=True raises CalledProcessError on a non-zero status
+        subprocess.run(cmd, check=True)
+        # Reached only when the child exited with status 0, so report success rather than failure
+        print("The command exited cleanly.", flush=True)  # flush: os._exit() skips stdio buffer flushing
+        os._exit(0)
+    except subprocess.CalledProcessError as e:
+        # If the command fails, exit the current process with the same exit code
+        os._exit(e.returncode)
+
+
+def symlink(link, folder):
+    if os.path.lexists(link):  # lexists also detects a dangling symlink, which exists() reports as missing
+        os.remove(link)
+    os.symlink(folder, link)
+
+
+def get_cached_dir(model_id, revision):
+    cache_root = os.getenv('HUGGINGFACE_HUB_CACHE', '/data/trt-data')  # honour the cache override; old path is the fallback
+    return f'{cache_root}/models--{model_id.replace("/", "--")}/snapshots/{revision}'
+
+
+def replace_placeholders(folder, tokenizer_repo, model_repo):
+    d = {  # sub-directory of `folder` -> [placeholder, replacement] applied to its config.pbtxt
+        'preprocessing': ["${tokenizer_dir}", tokenizer_repo],
+        'postprocessing': ["${tokenizer_dir}", tokenizer_repo],
+        'tensorrt_llm': ["${gpt_model_path}", f'{model_repo}/tensorrt_llm/1'],
+    }
+
+    for k, v in d.items():
+        path = f'{folder}/{k}/config.pbtxt'
+        replace_string_in_file(path, v[0], v[1])
+
+
+def replace_string_in_file(file_path, old_string, new_string):  # rewrite the file in place with every occurrence replaced
+    with open(file_path, 'r') as file:
+        file_content = file.read()
+    modified_content = file_content.replace(old_string, new_string)
+    with open(file_path, 'w') as file:
+        file.write(modified_content)
+    print(f'File updated:{file_path}')
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'],
+                         capture_output=True,
+                         encoding='utf-8')
+
+    if res.stdout:  # fail fast: refuse before spending time on the model download
+        pids = res.stdout.replace('\n', ' ').rstrip()
+        msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
+        if args.force:
+            print(msg, file=sys.stderr)
+        else:
+            raise RuntimeError(msg + ' Or use --force.')
+
+    download_hf_model_into(args.tokenizer_model_id, args.tokenizer_revision)
+    download_hf_model_into(args.model_id, args.revision)
+    replace_placeholders(
+        get_cached_dir(args.model_id, args.revision),
+        get_cached_dir(args.tokenizer_model_id, args.tokenizer_revision),
+        get_cached_dir(args.model_id, args.revision),
+    )
+    cmd = get_cmd(int(args.world_size), args.tritonserver, get_cached_dir(args.model_id, args.revision),
+                  args.log, args.log_file, args.http_port, args.metrics_port, args.grpc_port)
+    run_cmd(cmd)

From f33679e21a08f0cbfc533ec4a79d4a06bd49e9e0 Mon Sep 17 00:00:00 2001
From: Pernekhan Utemuratov
Date: Fri, 15 Dec 2023 15:38:31 -0800
Subject: [PATCH 2/2] Move files

---
 dockerfile/Dockerfile.trt_llm_backend           | 11 +++++++++++
 {example => dockerfile}/LICENSE                 |  0
 {example => dockerfile}/README.md               |  0
 {example => dockerfile}/hub.py                  |  0
 {example => dockerfile}/launch_triton_server.py |  0
 example/Dockerfile                              | 17 -----------------
 6 files changed, 11 insertions(+), 17 deletions(-)
 rename {example => dockerfile}/LICENSE (100%)
 rename {example => dockerfile}/README.md (100%)
 rename {example => dockerfile}/hub.py (100%)
 rename {example => dockerfile}/launch_triton_server.py (100%)
 delete mode 100644 example/Dockerfile

diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend
index 158e80a..fecc4bb 100644
--- a/dockerfile/Dockerfile.trt_llm_backend
+++ b/dockerfile/Dockerfile.trt_llm_backend
@@ -62,3 +62,14 @@ RUN cd /app/tensorrt_llm/build && pip3 install *.whl
 # Install TensorRT-LLM backend
 RUN mkdir /opt/tritonserver/backends/tensorrtllm
 COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm
+
+# install huggingface
+RUN pip install huggingface_hub
+RUN pip install hf_transfer
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+# copy files that will be used for running triton server
+COPY ../dockerfile/launch_triton_server.py .
+COPY ../dockerfile/hub.py .
+
+ENTRYPOINT ["python3", "launch_triton_server.py"]
diff --git a/example/LICENSE b/dockerfile/LICENSE
similarity index 100%
rename from example/LICENSE
rename to dockerfile/LICENSE
diff --git a/example/README.md b/dockerfile/README.md
similarity index 100%
rename from example/README.md
rename to dockerfile/README.md
diff --git a/example/hub.py b/dockerfile/hub.py
similarity index 100%
rename from example/hub.py
rename to dockerfile/hub.py
diff --git a/example/launch_triton_server.py b/dockerfile/launch_triton_server.py
similarity index 100%
rename from example/launch_triton_server.py
rename to dockerfile/launch_triton_server.py
diff --git a/example/Dockerfile b/example/Dockerfile
deleted file mode 100644
index d8eed8e..0000000
--- a/example/Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-# syntax = edrevo/dockerfile-plus
-
-# Newer version of base container can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags
-FROM nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 AS base-trtllm
-
-INCLUDE+ ../dockerfile/Dockerfile.trt_llm_backend
-
-# install huggingface
-RUN pip install huggingface_hub
-RUN pip install hf_transfer
-RUN env HF_HUB_ENABLE_HF_TRANSFER=1
-
-# copy files that will be used for running triton server
-COPY launch_triton_server.py .
-COPY hub.py .
-
-ENTRYPOINT ["python3", "launch_triton_server.py"]