From 4572522310ed8e73bfcf884bceaa782c3affb776 Mon Sep 17 00:00:00 2001
From: Pernekhan Utemuratov <pernekhan@deepinfra.com>
Date: Fri, 15 Dec 2023 15:27:56 -0800
Subject: [PATCH 1/2] Add example of tensorrt-llm usage

---
 example/Dockerfile              |  17 +++
 example/LICENSE                 | 201 +++++++++++++++++++++++++++++++
 example/README.md               |   4 +
 example/hub.py                  | 181 ++++++++++++++++++++++++++++
 example/launch_triton_server.py | 205 ++++++++++++++++++++++++++++++++
 5 files changed, 608 insertions(+)
 create mode 100644 example/Dockerfile
 create mode 100644 example/LICENSE
 create mode 100644 example/README.md
 create mode 100644 example/hub.py
 create mode 100644 example/launch_triton_server.py

diff --git a/example/Dockerfile b/example/Dockerfile
new file mode 100644
index 0000000..d8eed8e
--- /dev/null
+++ b/example/Dockerfile
@@ -0,0 +1,17 @@
+# syntax = edrevo/dockerfile-plus
+
+# Newer version of base container can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags
+FROM nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 AS base-trtllm
+
+INCLUDE+ ../dockerfile/Dockerfile.trt_llm_backend
+
+# install huggingface
+RUN pip install huggingface_hub
+RUN pip install hf_transfer
+RUN env HF_HUB_ENABLE_HF_TRANSFER=1
+
+# copy files that will be used for running triton server
+COPY launch_triton_server.py .
+COPY hub.py .
+
+ENTRYPOINT ["python3", "launch_triton_server.py"]
diff --git a/example/LICENSE b/example/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/example/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..cedcdc6
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,4 @@
+# tensorrt-llm-example
+docker build --target base-trtllm -t trt-example .
+
+docker run --rm -it -p 80:80 --shm-size=10g --ulimit memlock=-1 --ulimit stack=67108864 --gpus '"device=4,5,6,7"' -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN -e HUGGINGFACE_HUB_CACHE=/data/trt-data -v /data:/data  trt-example:latest --world_size=4 --model_id=DeepInfra/Llama-2-70b-chat-hf-trt-fp8 --revision=5de4d5c03ffd13b8ac34bf50fb2e797f4d9be93e --tokenizer_model_id=DeepInfra/Llama-2-70b-chat-tokenizer --tokenizer_revision=f88981891fea1e38150df966c833e6d1e7e798f4 --http_port=80
diff --git a/example/hub.py b/example/hub.py
new file mode 100644
index 0000000..43c4ac7
--- /dev/null
+++ b/example/hub.py
@@ -0,0 +1,181 @@
+# The following code is adapted from the text-generation-inference (https://github.com/deepinfra/text-generation-inference/blob/main/server/text_generation_server/utils/hub.py)
+# License: Apache License, Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0)
+import time
+import os
+
+from datetime import timedelta
+from pathlib import Path
+from typing import Optional, List
+from pathlib import Path
+
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from huggingface_hub.utils import (
+    LocalEntryNotFoundError,
+    EntryNotFoundError,
+    RevisionNotFoundError,  # Import here to ease try/except in other part of the lib
+)
+
+WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
+
+
+def weight_hub_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[str]:
+    """List the files of the hub repo whose names end with ``extension``.
+
+    Raises ``EntryNotFoundError`` when no file of the repo matches.
+    """
+    repo_info = HfApi().model_info(model_id, revision=revision)
+    print(repo_info.siblings)
+    matching = []
+    for sibling in repo_info.siblings:
+        if sibling.rfilename.endswith(extension):
+            matching.append(sibling.rfilename)
+    if matching:
+        return matching
+
+    raise EntryNotFoundError(
+        f"No {extension} weights found for model {model_id} and revision {revision}.",
+        None,
+    )
+
+
+def try_to_load_from_cache(
+    model_id: str, revision: Optional[str], filename: str
+) -> Optional[Path]:
+    """Look up ``filename`` in the local Hugging Face cache.
+
+    Returns the cached path, or ``None`` when the model, the revision or the
+    file itself is not cached. A ``revision`` of ``None`` means ``"main"``.
+    """
+    revision = "main" if revision is None else revision
+
+    object_id = model_id.replace("/", "--")
+    repo_cache = Path(HUGGINGFACE_HUB_CACHE, f"models--{object_id}")
+    if not repo_cache.is_dir():
+        # Nothing cached for this model at all
+        return None
+
+    refs_dir = repo_cache / "refs"
+    snapshots_dir = repo_cache / "snapshots"
+
+    # A ref file (e.g. refs/main) holds the commit sha the ref points to
+    revision_file = refs_dir / revision
+    if refs_dir.is_dir() and revision_file.exists():
+        revision = revision_file.read_text()
+
+    if not snapshots_dir.exists():
+        return None
+    # Only the exact revision counts; never fall back to some other snapshot
+    if revision not in os.listdir(snapshots_dir):
+        return None
+
+    # The snapshot exists; make sure the requested file is part of it
+    candidate = snapshots_dir / revision / filename
+    if candidate.is_file():
+        return candidate
+    return None
+
+
+def weight_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[Path]:
+    """Resolve the local weight files of ``model_id`` (no weights are downloaded).
+
+    Looks in a local model directory, then ``WEIGHTS_CACHE_OVERRIDE``, then the Hugging
+    Face cache; raises if an expected file is missing locally.
+    """
+    # Local model
+    if Path(model_id).exists() and Path(model_id).is_dir():
+        local_files = list(Path(model_id).glob(f"*{extension}"))
+        if not local_files:
+            raise FileNotFoundError(
+                f"No local weights found in {model_id} with extension {extension}"
+            )
+        return local_files
+
+    try:
+        filenames = weight_hub_files(model_id, revision, extension)
+    except EntryNotFoundError as e:
+        if extension != ".safetensors":
+            raise e
+        # Try to see if there are pytorch weights
+        pt_filenames = weight_hub_files(model_id, revision, extension=".bin")
+        # Converted safetensors may exist locally without being on the hub, so map the
+        # names. Slice off the literal prefix: str.lstrip() strips a character *set*.
+        filenames = [
+            (s[len("pytorch_"):] if s.startswith("pytorch_") else s) + ".safetensors"
+            for s in (Path(f).stem for f in pt_filenames)
+        ]
+
+    if WEIGHTS_CACHE_OVERRIDE is not None:
+        files = []
+        for filename in filenames:
+            p = Path(WEIGHTS_CACHE_OVERRIDE) / filename
+            if not p.exists():
+                raise FileNotFoundError(f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}.")
+            files.append(p)
+        return files
+
+    files = []
+    for filename in filenames:
+        cache_file = try_to_load_from_cache(model_id, revision=revision, filename=filename)
+        if cache_file is None:
+            raise LocalEntryNotFoundError(
+                f"File {filename} of model {model_id} not found in "
+                f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. "
+                f"Please run `text-generation-server download-weights {model_id}` first."
+            )
+        files.append(cache_file)
+
+    return files
+
+
+def download_weights(
+    filenames: List[str], model_id: str, revision: Optional[str] = None
+) -> List[Path]:
+    """Fetch each of ``filenames`` from the hub (or the cache) and return the local paths."""
+
+    def download_file(filename, tries=5, backoff: int = 5):
+        # A file that is already cached locally is reused as-is
+        cached = try_to_load_from_cache(model_id, revision, filename)
+        if cached is not None:
+            print(f"File {filename} already present in cache.")
+            return Path(cached)
+
+        for attempt in range(1, tries + 1):
+            try:
+                print(f"Download file: {filename}")
+                began = time.time()
+                fetched = hf_hub_download(
+                    filename=filename, repo_id=model_id,
+                    revision=revision, local_files_only=False,
+                )
+                took = timedelta(seconds=int(time.time() - began))
+                print(f"Downloaded {fetched} in {took}.")
+                return Path(fetched)
+            except Exception as e:
+                # The final attempt re-raises; earlier ones wait and try again
+                if attempt == tries:
+                    raise e
+                print(e)
+                print(f"Retrying in {backoff} seconds")
+                time.sleep(backoff)
+                print(f"Retry {attempt}/{tries - 1}")
+
+    # Plain prints rather than tqdm, because the launcher parses these log lines
+    started = time.time()
+    total = len(filenames)
+    local_paths = []
+    for done, filename in enumerate(filenames, start=1):
+        local_path = download_file(filename)
+
+        elapsed = timedelta(seconds=int(time.time() - started))
+        left = total - done
+        eta = (elapsed / done) * left if left > 0 else 0
+
+        print(f"Download: [{done}/{total}] -- ETA: {eta}")
+        local_paths.append(local_path)
+
+    return local_paths
diff --git a/example/launch_triton_server.py b/example/launch_triton_server.py
new file mode 100644
index 0000000..2be7a10
--- /dev/null
+++ b/example/launch_triton_server.py
@@ -0,0 +1,205 @@
+# The following code is adapted from the tensorrtllm_backend (https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/scripts/launch_triton_server.py)
+# License: Apache License, Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0)
+import argparse
+import subprocess
+import os
+import sys
+import errno
+from pathlib import Path
+from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub.utils import (
+    LocalEntryNotFoundError,
+    EntryNotFoundError,
+    RevisionNotFoundError,  # Import here to ease try/except in other part of the lib
+)
+from hub import download_weights, weight_files, weight_hub_files
+
+
+def parse_arguments():
+    """Parse the launcher's command-line options (read from ``sys.argv``)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--world_size',
+        type=int,
+        default=4,
+        help='world size, only support tensor parallelism now')
+    parser.add_argument(
+        '--tritonserver',
+        type=str,
+        help='path to the tritonserver exe',
+        default='/opt/tritonserver/bin/tritonserver',
+    )
+    parser.add_argument(
+        '--force',
+        '-f',
+        action='store_true',
+        help='launch tritonserver regardless of other instances running')
+    parser.add_argument(
+        '--log', action='store_true',
+        help='log triton server stats into log_file')
+    parser.add_argument(
+        '--log-file',
+        type=str,
+        help='path to triton log file',
+        default='triton_log.txt',
+    )
+    parser.add_argument(
+        '--model_id',
+        type=str,
+        help='model from the huggingface',
+        default='DeepInfra/Llama-2-70b-chat-hf-trt-fp8',
+    )
+    parser.add_argument(
+        '--revision',
+        type=str,
+        help='revision of the model_id',
+        default='5de4d5c03ffd13b8ac34bf50fb2e797f4d9be93e',
+    )
+    parser.add_argument(
+        '--tokenizer_model_id',
+        type=str,
+        help='tokenizer model from the huggingface',
+        default='DeepInfra/Llama-2-70b-chat-tokenizer',
+    )
+    parser.add_argument(
+        '--tokenizer_revision',
+        type=str,
+        help='revision of the tokenizer_model_id',
+        default='f88981891fea1e38150df966c833e6d1e7e798f4',
+    )
+    parser.add_argument(
+        '--http_port',
+        type=str,
+        help='tritonserver http port',
+        default='8000',
+    )
+    parser.add_argument(
+        '--metrics_port',
+        type=str,
+        help='tritonserver metrics port',
+        default='8002',
+    )
+    parser.add_argument(
+        '--grpc_port',
+        type=str,
+        help='tritonserver grpc port',
+        default='8001',
+    )
+
+    return parser.parse_args()
+
+
+def get_cmd(world_size, tritonserver, model_repo, log, log_file, http_port, metrics_port, grpc_port):
+    """Build the ``mpirun`` argv that starts ``world_size`` tritonserver processes."""
+    shared_flags = [
+        f'--model-repository={model_repo}',
+        f'--http-port={http_port}',
+        f'--metrics-port={metrics_port}',
+        f'--grpc-port={grpc_port}',
+        '--disable-auto-complete-config',
+    ]
+    argv = ['mpirun', '--allow-run-as-root']
+    for rank in range(world_size):
+        log_flags = ['--log-verbose=3', f'--log-file={log_file}'] if log and rank == 0 else []
+        argv += ['-n', '1', tritonserver, *log_flags, *shared_flags,
+                 f'--backend-config=python,shm-region-prefix-name=prefix{rank}_', ':']
+    return argv
+
+
+def download_hf_model_into(model_id, revision):
+    """Make sure the files of ``model_id`` at ``revision`` are available locally.
+
+    Nothing is fetched when the files are already present, when ``model_id`` is a
+    local directory, or when ``WEIGHTS_CACHE_OVERRIDE`` is set.
+    """
+    # An empty suffix matches every file name, so all files of the repo are considered
+    extension = ""
+    try:
+        weight_files(model_id, revision, extension)
+    except (LocalEntryNotFoundError, FileNotFoundError):
+        pass  # Local files not found
+    else:
+        print("Files are already present on the host. " "Skipping download.")
+        return
+
+    model_path = Path(model_id)
+    if model_path.exists() and model_path.is_dir():
+        return
+    if os.getenv("WEIGHTS_CACHE_OVERRIDE", None) is not None:
+        return
+
+    # Try to download weights from the hub; EntryNotFoundError (no file on the
+    # hub with this extension) is left to propagate to the caller
+    filenames = weight_hub_files(model_id, revision, extension)
+    print(filenames)
+    download_weights(filenames, model_id, revision)
+
+
+def run_cmd(cmd):
+    """Run ``cmd`` to completion, then exit: with its status on failure, with 1 on a clean exit."""
+    try:
+        subprocess.run(cmd, check=True)
+        print("The command failed.")  # a zero exit status is still reported as a failure
+        exit_code = 1
+    except subprocess.CalledProcessError as e:
+        exit_code = e.returncode
+    sys.stdout.flush()  # os._exit() skips interpreter cleanup, so buffered output would be lost
+    os._exit(exit_code)
+
+
+def symlink(link, folder):  # (re)create ``link`` as a symlink pointing at ``folder``
+    if os.path.lexists(link):  # lexists() also detects a dangling symlink, which exists() misses
+        os.remove(link)
+    os.symlink(folder, link)
+
+
+def get_cached_dir(model_id, revision):
+    """Snapshot directory of ``model_id`` at ``revision`` under $HUGGINGFACE_HUB_CACHE (default: /data/trt-data)."""
+    return f'{os.getenv("HUGGINGFACE_HUB_CACHE", "/data/trt-data")}/models--{model_id.replace("/", "--")}/snapshots/{revision}'
+
+
+def replace_placeholders(folder, tokenizer_repo, model_repo):
+    """Substitute the path placeholders in the ``config.pbtxt`` of three models under ``folder``."""
+    substitutions = (
+        ('preprocessing', "${tokenizer_dir}", tokenizer_repo),
+        ('postprocessing', "${tokenizer_dir}", tokenizer_repo),
+        ('tensorrt_llm', "${gpt_model_path}", f'{model_repo}/tensorrt_llm/1'),
+    )
+
+    for model_name, placeholder, value in substitutions:
+        replace_string_in_file(f'{folder}/{model_name}/config.pbtxt', placeholder, value)
+
+
+def replace_string_in_file(file_path, old_string, new_string):
+    """Rewrite ``file_path`` with every ``old_string`` replaced by ``new_string``."""
+    with open(file_path, 'r') as src:
+        text = src.read().replace(old_string, new_string)
+    with open(file_path, 'w') as dst:
+        dst.write(text)
+    print(f'File updated:{file_path}')
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    # Look for an already running tritonserver before doing any expensive work
+    res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'],
+                         capture_output=True,
+                         encoding='utf-8')
+    if res.stdout:
+        pids = res.stdout.replace('\n', ' ').rstrip()
+        msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.'
+        if args.force:
+            print(msg, file=sys.stderr)
+        else:
+            raise RuntimeError(msg + ' Or use --force.')
+
+    download_hf_model_into(args.tokenizer_model_id, args.tokenizer_revision)
+    download_hf_model_into(args.model_id, args.revision)
+    model_dir = get_cached_dir(args.model_id, args.revision)
+    tokenizer_dir = get_cached_dir(args.tokenizer_model_id, args.tokenizer_revision)
+    # The model snapshot directory is also passed to tritonserver as its model repository
+    replace_placeholders(model_dir, tokenizer_dir, model_dir)
+
+    cmd = get_cmd(args.world_size, args.tritonserver, model_dir,
+                  args.log, args.log_file, args.http_port, args.metrics_port, args.grpc_port)
+    run_cmd(cmd)

From f33679e21a08f0cbfc533ec4a79d4a06bd49e9e0 Mon Sep 17 00:00:00 2001
From: Pernekhan Utemuratov <pernekhan@deepinfra.com>
Date: Fri, 15 Dec 2023 15:38:31 -0800
Subject: [PATCH 2/2] Move files

---
 dockerfile/Dockerfile.trt_llm_backend           | 11 +++++++++++
 {example => dockerfile}/LICENSE                 |  0
 {example => dockerfile}/README.md               |  0
 {example => dockerfile}/hub.py                  |  0
 {example => dockerfile}/launch_triton_server.py |  0
 example/Dockerfile                              | 17 -----------------
 6 files changed, 11 insertions(+), 17 deletions(-)
 rename {example => dockerfile}/LICENSE (100%)
 rename {example => dockerfile}/README.md (100%)
 rename {example => dockerfile}/hub.py (100%)
 rename {example => dockerfile}/launch_triton_server.py (100%)
 delete mode 100644 example/Dockerfile

diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend
index 158e80a..fecc4bb 100644
--- a/dockerfile/Dockerfile.trt_llm_backend
+++ b/dockerfile/Dockerfile.trt_llm_backend
@@ -62,3 +62,14 @@ RUN cd /app/tensorrt_llm/build && pip3 install *.whl
 # Install TensorRT-LLM backend
 RUN mkdir /opt/tritonserver/backends/tensorrtllm
 COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm
+
+# install huggingface
+RUN pip install --no-cache-dir huggingface_hub hf_transfer
+# ENV persists into the running container; `RUN env VAR=1` only prints the environment of that build step
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+# copy files used to run triton server; paths are relative to the build context (assumed to be the repo root - confirm)
+COPY dockerfile/launch_triton_server.py .
+COPY dockerfile/hub.py .
+
+ENTRYPOINT ["python3", "launch_triton_server.py"]
diff --git a/example/LICENSE b/dockerfile/LICENSE
similarity index 100%
rename from example/LICENSE
rename to dockerfile/LICENSE
diff --git a/example/README.md b/dockerfile/README.md
similarity index 100%
rename from example/README.md
rename to dockerfile/README.md
diff --git a/example/hub.py b/dockerfile/hub.py
similarity index 100%
rename from example/hub.py
rename to dockerfile/hub.py
diff --git a/example/launch_triton_server.py b/dockerfile/launch_triton_server.py
similarity index 100%
rename from example/launch_triton_server.py
rename to dockerfile/launch_triton_server.py
diff --git a/example/Dockerfile b/example/Dockerfile
deleted file mode 100644
index d8eed8e..0000000
--- a/example/Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-# syntax = edrevo/dockerfile-plus
-
-# Newer version of base container can be found at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags
-FROM nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 AS base-trtllm
-
-INCLUDE+ ../dockerfile/Dockerfile.trt_llm_backend
-
-# install huggingface
-RUN pip install huggingface_hub
-RUN pip install hf_transfer
-RUN env HF_HUB_ENABLE_HF_TRANSFER=1
-
-# copy files that will be used for running triton server
-COPY launch_triton_server.py .
-COPY hub.py .
-
-ENTRYPOINT ["python3", "launch_triton_server.py"]