Skip to content

Commit

Permalink
Update S3 crawler Dockerfile to skip library dependencies and add bui…
Browse files Browse the repository at this point in the history
…ld stamps
  • Loading branch information
eric-anderson authored and bsowell committed Nov 18, 2023
1 parent c3105d3 commit 4e07944
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
19 changes: 17 additions & 2 deletions crawler/s3/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Repo name: arynai/sycamore-crawler-s3

# In the root directory:
# docker build -t sycamore_crawler_s3 -f crawler/s3/Dockerfile .
# docker run -it -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_SESSION_TOKEN sycamore_crawler_s3
# docker run -it -v crawl_data:/app/.data/.s3 -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_SESSION_TOKEN sycamore_crawler_s3
FROM python:3

WORKDIR /app
Expand All @@ -13,11 +15,24 @@ ENV POETRY_NO_INTERACTION=1 \
POETRY_CACHE_DIR=/tmp/poetry_cache

COPY pyproject.toml poetry.lock crawler/README.md ./
RUN poetry install --only main,crawler_s3 --no-root && rm -rf $POETRY_CACHE_DIR
RUN poetry install --only crawler_s3 --no-root && rm -rf $POETRY_CACHE_DIR

COPY crawler/s3 .
# Hack because pyproject.toml expects a sycamore directory
RUN mkdir sycamore && touch sycamore/__init__.py
RUN poetry install --only-root && rm -rf $POETRY_CACHE_DIR

# Build stamps. Override with --build-arg; the defaults below apply otherwise.
# Declared after the install steps so a changed stamp does not invalidate them.
ARG GIT_BRANCH="main"
ARG GIT_COMMIT="unknown"
ARG GIT_DIFF="unknown"

# Persist the stamps into the container environment so the process started by
# ENTRYPOINT can read them at runtime.
ENV GIT_BRANCH="${GIT_BRANCH}" \
    GIT_COMMIT="${GIT_COMMIT}" \
    GIT_DIFF="${GIT_DIFF}"

# Record the same stamps as image metadata (visible via `docker inspect`).
LABEL org.opencontainers.image.authors="[email protected]" \
      git_branch="${GIT_BRANCH}" \
      git_commit="${GIT_COMMIT}" \
      git_diff="${GIT_DIFF}"

ENTRYPOINT ["poetry", "run", "python", "./crawler/s3_crawler.py"]
4 changes: 4 additions & 0 deletions crawler/s3/crawler/s3_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ def _get_s3_client(


if __name__ == "__main__":
print("Version-Info, Sycamore Crawler S3 Branch:", os.environ.get("GIT_BRANCH", "unset"))
print("Version-Info, Sycamore Crawler S3 Commit:", os.environ.get("GIT_COMMIT", "unset"))
print("Version-Info, Sycamore Crawler S3 Diff:", os.environ.get("GIT_DIFF", "unset"))

if len(sys.argv) > 3 or (len(sys.argv) > 1 and sys.argv[1] == "-h"):
print("Usage : poetry run python s3_crawler.py bucket_name prefix_value")
else:
Expand Down

0 comments on commit 4e07944

Please sign in to comment.