konfuzio-ai · nengelmann · Oct 11, 2023 · Oct 12, 2023 · Oct 12, 2023 · Oct 20, 2023
diff --git a/docs/_static/img/bbox_pairs.png b/docs/_static/img/bbox_pairs.png
diff --git a/docs/sdk/sourcecode.rst b/docs/sdk/sourcecode.rst
@@ -208,6 +208,17 @@ Utils
 .. autofunction:: get_spans_from_bbox
 .. autofunction:: normalize_name
 
+BboxPairing
+---------------------
+
+`[source] <https://github.com/konfuzio-ai/konfuzio-sdk/blob/master/konfuzio_sdk/trainer/omr.py>`__
+
+.. automodule:: konfuzio_sdk.trainer.omr
+
+.. autoclass:: BboxPairing
+   :members:
+   :noindex:
+
 Tokenizers
 =====================
 
@@ -485,4 +496,5 @@ Containerization utils
 .. autofunction:: prepare_request
 .. autofunction:: process_response
 .. autofunction:: convert_document_to_request
-.. autofunction:: convert_response_to_annotations
+.. autofunction:: convert_response_to_annotations
+
diff --git a/docs/sdk/tutorials/information_extraction/index_nb.md b/docs/sdk/tutorials/information_extraction/index_nb.md
@@ -112,7 +112,7 @@ from konfuzio_sdk.data import Project
 
 project = Project(id_=TEST_PROJECT_ID)
 category = project.get_category_by_id(TEST_PAYSLIPS_CATEGORY_ID)
-categorization_pipeline = CustomExtractionAI(category)
+extraction_pipeline = CustomExtractionAI(category)
 ```
 
 Then, create a sample test Document to run the extraction on.
@@ -127,18 +127,25 @@ print(sample_document.text)
 
 Run the extraction of a Document and print the extracted Annotations.
 ```python
-extracted = categorization_pipeline.extract(sample_document)
+extracted = extraction_pipeline.extract(sample_document)
 for annotation in extracted.annotations(use_correct=False):
     print(annotation.offset_string)
 ```
 
-Now we can save the AI and check that it is possible to load it afterwards.
+Now we can save the AI and check that it is possible to load it afterwards. There are two different ways to save an
+Extraction AI: using the native `save()` method of AbstractExtractionAI, which saves the model into an `lz4`-compressed
+pickle file, or using the `save_bento()` method, which creates a containerized instance of the Extraction
+AI model in `bento` format, allowing the AI to run independently of the server environment.
+
+To save the model to a compressed pickle file, use the following command:
 ```python
-pickle_model_path = categorization_pipeline.save()
+pickle_model_path = extraction_pipeline.save()
 extraction_pipeline_loaded = CustomExtractionAI.load_model(pickle_model_path)
 ```
 
-The custom Extraction AI we just prepared inherits from AbstractExtractionAI, which in turn inherits from [BaseModel](sourcecode.html#base-model). `BaseModel` provides `save` method that saves a model into a compressed pickle file that can be directly uploaded to the Konfuzio Server (see [Upload Extraction or Category AI to target instance](https://help.konfuzio.com/tutorials/migrate-trained-ai-to-an-new-project-to-annotate-documents-faster/index.html#upload-extraction-or-category-ai-to-target-instance)).
+The custom Extraction AI we just prepared inherits from AbstractExtractionAI, which in turn inherits from 
+[BaseModel](sourcecode.html#base-model). `BaseModel` provides `save` method that saves a model into a compressed 
+pickle file that can be directly uploaded to the Konfuzio Server (see [Upload Extraction or Category AI to target instance](https://help.konfuzio.com/tutorials/migrate-trained-ai-to-an-new-project-to-annotate-documents-faster/index.html#upload-extraction-or-category-ai-to-target-instance)).
 
 Activating the uploaded AI on the web interface will enable the custom pipeline on your self-hosted installation.
 
@@ -148,6 +155,142 @@ on app), you need to enable creating them in the Superuser Project settings if y
 If you have the Superuser rights, it is also possible to upload the AI from your local machine using the 
 `upload_ai_model()` as described in [Upload your AI](https://dev.konfuzio.com/sdk/tutorials/upload-your-ai/index.html).
 
+### Create and containerize a custom Extraction AI into the Bento container
+
+It is also possible to save the model using [Bento](https://www.bentoml.com/). Let's take the code we used in the
+previous tutorial and modify it to support Bento archiving.
+
+We will add several new members: `build_bento()` builds the Bento archive that can later be uploaded to the
+Konfuzio app or an on-prem installation, as well as served and used locally. `entrypoint_methods` is a property that
+defines which methods will be exposed in the resulting Bento model. `bento_metadata` is a property that defines what
+will be saved in the model as metadata.
+
+```python
+import json
+import re
+import shutil
+import tempfile
+
+import bentoml
+
+from konfuzio_sdk.data import Document, Span, Annotation, Label
+from konfuzio_sdk.trainer.information_extraction import AbstractExtractionAI
+
+class CustomExtractionAI(AbstractExtractionAI):
+    """Extraction AI that annotates date-like strings and can be packaged as a Bento."""
+
+    def extract(self, document: Document) -> Document:
+        """Annotate every `digits/digits/digits` match in the Document text with the 'Date' Label.
+
+        :param document: Document to run the extraction on.
+        :returns: The Document returned by the parent `extract`, with one Annotation created per regex match.
+        """
+        document = super().extract(document)
+        label_set = document.category.default_label_set
+        label_name = 'Date'
+        # reuse the Category's 'Date' Label when it exists, otherwise create it in the default Label Set
+        if label_name in [label.name for label in document.category.labels]:
+            label = document.project.get_label_by_name(label_name)
+        else:
+            label = Label(text=label_name, project=document.project, label_sets=[label_set])
+        annotation_set = document.default_annotation_set
+        for re_match in re.finditer(r'(\d+/\d+/\d+)', document.text, flags=re.MULTILINE):
+            start_offset, end_offset = re_match.span(1)
+            span = Span(start_offset=start_offset, end_offset=end_offset)
+
+            # NOTE(review): the result is discarded, so the Annotation is presumably registered on the
+            # Document by its constructor - confirm
+            _ = Annotation(
+                document=document,
+                label=label,
+                annotation_set=annotation_set,
+                confidence=1.0,
+                spans=[span],
+            )
+        return document
+
+    @property
+    def entrypoint_methods(self) -> dict:
+        """Return the methods exposed by the resulting Bento model, mapped to their options."""
+        return {
+            'extract': {'batchable': False},
+            'evaluate': {'batchable': False},
+        }
+
+    @property
+    def bento_metadata(self) -> dict:
+        """Return the metadata stored as labels of the built Bento.
+
+        The `requires_*` flags fall back to False when the attribute is not set on the AI.
+        """
+        return {
+            'requires_images': getattr(self, 'requires_images', False),
+            'requires_segmentation': getattr(self, 'requires_segmentation', False),
+            'requires_text': getattr(self, 'requires_text', False),
+            'request': 'ExtractRequest20240117',
+            'response': 'ExtractResponse20240117',
+        }
+
+    def build_bento(self, bento_model):
+        """Build a Bento archive for this AI on top of an already saved Bento model.
+
+        :param bento_model: Saved Bento model; its tag is listed in the `models` of the built Bento.
+        :returns: The object returned by `bentoml.bentos.build`.
+        """
+        # relative path: the SDK checkout has to be reachable from the current working directory
+        bento_module_dir = 'konfuzio-sdk/konfuzio_sdk/bento/extraction'
+        dict_metadata = self.project.create_project_metadata_dict()
+
+        # assemble the build context in a temporary directory that is removed once the build is done
+        with tempfile.TemporaryDirectory() as temp_dir:
+            shutil.copytree(bento_module_dir, temp_dir + '/extraction')
+            with open(f'{temp_dir}/categories_and_label_data.json5', 'w') as f:
+                json.dump(dict_metadata, f, indent=2, sort_keys=True)
+            with open(f'{temp_dir}/AI_MODEL_NAME', 'w') as f:
+                f.write(self._pkl_name)
+
+            built_bento = bentoml.bentos.build(
+                name=f"extraction_{self.category.id_ if self.category else '0'}",
+                service='extraction/rfextractionai_service.py:ExtractionService',
+                include=[
+                    'extraction/*.py',
+                    'categories_and_label_data.json5',
+                    'AI_MODEL_NAME',
+                ],
+                labels=self.bento_metadata,
+                python={
+                    'packages': [
+                        'https://github.com/konfuzio-ai/konfuzio-sdk/archive/refs/heads/bentoml-experiments.zip#egg=konfuzio-sdk'
+                    ],
+                    'lock_packages': True,
+                },
+                build_ctx=temp_dir,
+                models=[str(bento_model.tag)],
+            )
+
+        return built_bento
+```
+
+Let's initialize the AI:
+
+```python
+from konfuzio_sdk.data import Project
+
+project = Project(id_=YOUR_PROJECT_ID)
+category = project.get_category_by_id(YOUR_CATEGORY_ID)
+extraction_pipeline = CustomExtractionAI(category)
+```
+
+To save the model to the `bento` format, run the following code:
+```python
+bento, path_to_bento = extraction_pipeline.save_bento()  # you can specify the path via output_dir argument
+```
+
+Check that the Bento was successfully saved via the following command:
+```commandline tags=["skip-execution", "nbval-skip"]
+bentoml list
+```
+
+Later, the saved Bento file can be uploaded to the server or served locally. To serve it locally, you need to know the
+name and the version of the Bento:
+
+```python
+bento_name = bento.tag.name
+bento_version = bento.tag.version
+print('Bento name: ' + bento_name)
+print('Bento version: ' + bento_version)
+```
+
+Then you can serve it via the command:
+
+```commandline tags=["skip-execution", "nbval-skip"]
+bentoml serve name:version # for example, extraction_11:2qytjiwhoc7flhbp
+```
+
+After that, you can check the Swagger for the Bento on `0.0.0.0:3000` and send requests to the available endpoint(s).
+
+To run a Bento instance as a container and test it, use the following command:
+
+```commandline tags=["skip-execution", "nbval-skip"]
+bentoml containerize name:version # for example, extraction_11:2qytjiwhoc7flhbp
+```
 
 ### The Paragraph Custom Extraction AI
 In [the Paragraph Tokenizer tutorial](https://dev.konfuzio.com/sdk/tutorials/tokenizers/index.html#paragraph-tokenization), we saw how we can use the Paragraph Tokenizer in `detectron` mode and with the `create_detectron_labels` option to segment a Document and create `figure`, `table`, `list`, `text` and `title` Annotations.

diff --git a/konfuzio_sdk/bento/extraction/schemas.py b/konfuzio_sdk/bento/extraction/schemas.py
@@ -1,4 +1,5 @@
 """Define pydantic models for request and response from the Extraction AI."""
+
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from pydantic import BaseModel, PlainSerializer, PlainValidator, WithJsonSchema, errors

diff --git a/konfuzio_sdk/bento/extraction/utils.py b/konfuzio_sdk/bento/extraction/utils.py
@@ -1,10 +1,12 @@
 """Utility functions for adapting Konfuzio concepts to be used with Pydantic models."""
+
 from typing import Optional
 
 from pydantic import BaseModel
 
 from konfuzio_sdk.data import Annotation, AnnotationSet, Document, Page, Project, Span
 
+
 from .schemas import ExtractRequest20240117, ExtractRequest20240117Page, ExtractResponse20240117
 
 NOT_IMPLEMENTED_ERROR_MESSAGE = (
@@ -138,6 +140,7 @@ def convert_document_to_request(document: Document, schema: BaseModel = ExtractR
             },
             pages=pages,
         )
+
     else:
         raise NotImplementedError(NOT_IMPLEMENTED_ERROR_MESSAGE)
     return converted

diff --git a/konfuzio_sdk/bento/omr/__init__.py b/konfuzio_sdk/bento/omr/__init__.py
diff --git a/konfuzio_sdk/bento/omr/checkboxdetector_service.py b/konfuzio_sdk/bento/omr/checkboxdetector_service.py
@@ -0,0 +1,144 @@
+"""Checkbox Detection BentoML service."""
+
+import logging
+import os
+from io import BytesIO
+from typing import Any
+
+import bentoml
+import numpy as np
+from fastapi import FastAPI
+from omr.schemas import CheckboxRequest20240523, CheckboxResponse20240523
+from PIL import Image
+
+# import from the built bento directory src/trainer/omr.py
+from trainer.omr import CheckboxDetectorUtils
+
+from konfuzio_sdk.trainer.omr import BboxPairing
+
+# Read the AI model name from the AI_MODEL_NAME file located in the parent of this module's directory.
+ai_model_name_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'AI_MODEL_NAME')
+with open(ai_model_name_file) as ai_model_name_fh:  # the context manager closes the handle right after reading
+    ai_model_name = ai_model_name_fh.read().strip()
+
+app = FastAPI()
+logger = logging.getLogger(__name__)
+
+
+@bentoml.service
+@bentoml.mount_asgi_app(app, path='/v1')
+class CheckboxService:
+    """Checkbox Detection BentoML service.
+
+    Exposes `extract` as a BentoML API and mounts the module-level FastAPI app under `/v1`.
+    """
+
+    def __init__(self) -> None:
+        """Load the TorchScript checkbox detection model and create the detection and pairing helpers."""
+        # the model name is read at import time from the AI_MODEL_NAME file; ':latest' is appended as its tag version
+        self.extraction_model = bentoml.torchscript.load_model(ai_model_name + ':latest')
+        self.detector_utils = CheckboxDetectorUtils()
+        self.bbox_pairing = BboxPairing()
+
+    @bentoml.api(input_spec=CheckboxRequest20240523)
+    async def extract(self, **request: Any) -> CheckboxResponse20240523:
+        """Detect checkboxes on every page image and pair them with the Annotations located on that page.
+
+        For each paired Annotation, the checked state, the checkbox bbox (in page coordinates) and the confidence
+        are returned as meta data.
+
+        :param request: The request containing the page images and annotations according to the specified schema.
+        :returns: The response containing the meta data of the detected checkboxes according to the specified schema.
+        """
+
+        # re-validate the raw keyword arguments against the request schema
+        request = CheckboxRequest20240523(**request)
+
+        metadata = []
+
+        for page in request.pages:
+            # decode the page image bytes and force three-channel RGB
+            page_image = Image.open(BytesIO(page.image))
+            page_image = page_image.convert('RGB')
+
+            # only the annotations that belong to the current page take part in the pairing
+            annotations = [a for a in request.annotations if a.page_id == page.page_id]
+
+            image_size = page_image.size
+            page_size = (page.width, page.height)
+
+            # convert the annotation bbox from page to image coordinates
+            annotation_boxes = [
+                (coords_page2img(a.bbox.x0, a.bbox.x1, a.bbox.y0, a.bbox.y1, page_size, image_size))
+                for a in annotations
+            ]
+            # coords_page2img returns (x0, x1, y0, y1); reorder to (x0, y0, x1, y1)
+            annotation_boxes = [(x0, y0, x1, y1) for x0, x1, y0, y1 in annotation_boxes]
+            annotation_boxes = np.array(annotation_boxes)
+
+            # compute the checkbox detection
+            image_tensor = self.detector_utils.preprocess(image=page_image, out_shape=(1280, 1280))
+            outputs = self.extraction_model(image_tensor)
+            cls_conf, checkboxes = self.detector_utils.postprocess(
+                outputs, page_image.size, request.detection_threshold
+            )
+            # NOTE(review): assumes each cls_conf entry is (checked score, unchecked score) - confirm
+            checked = [c[0] > c[1] for c in cls_conf]
+            confidence = [max(c) for c in cls_conf]
+
+            # pair the checkboxes to the annotations (both are still in image coordinates at this point);
+            # the two returned index sequences are consumed pairwise below
+            ann_boxes_ind, checkbox_ind = self.bbox_pairing.find_pairs(annotation_boxes, checkboxes)
+
+            # convert the checkboxes from image coordinates to document coordinates
+            # (checkboxes are unpacked as (x0, y0, x1, y1), coords_img2page returns (x0, x1, y0, y1))
+            checkboxes = [coords_img2page(x0, x1, y0, y1, page_size, image_size) for x0, y0, x1, y1 in checkboxes]
+            checkboxes = [{'x0': x0, 'x1': x1, 'y0': y0, 'y1': y1} for x0, x1, y0, y1 in checkboxes]
+
+            # update the metadata of the annotations with the checkbox information
+            for ann_idx, chbx_idx in zip(ann_boxes_ind, checkbox_ind):
+                chbx_meta = {
+                    'is_checked': checked[chbx_idx],
+                    'bbox': checkboxes[chbx_idx],
+                    'confidence': float(confidence[chbx_idx]),
+                }
+                a_id_ = annotations[ann_idx].annotation_id
+                metadata.append({'annotation_id': a_id_, 'checkbox': chbx_meta})
+
+        return CheckboxResponse20240523(metadata=metadata)
+
+
+def coords_img2page(x0: int, x1: int, y0: int, y1: int, page_shape: tuple, image_shape: tuple) -> tuple:
+    """Convert and scale bbox coordinates from image to page.
+
+    The coordinates are scaled by the page/image size ratio and the y axis is flipped relative to the page height.
+
+    :param x0: The x0 coordinate of the bbox in image coordinates.
+    :param x1: The x1 coordinate of the bbox in image coordinates.
+    :param y0: The y0 coordinate of the bbox in image coordinates.
+    :param y1: The y1 coordinate of the bbox in image coordinates.
+    :param page_shape: The shape of the page (width, height).
+    :param image_shape: The shape of the image (width, height).
+    :returns: The converted and scaled bbox coordinates in page coordinates, ordered (x0, x1, y0, y1).
+    """
+    (page_w, page_h) = page_shape
+    (image_w, image_h) = image_shape
+
+    scale_y = page_h / image_h
+    scale_x = page_w / image_w
+
+    # scale
+    y0, y1 = y0 * scale_y, y1 * scale_y
+    x0, x1 = x0 * scale_x, x1 * scale_x
+    # convert: flip the y axis; y0 and y1 swap roles so that y0 stays the smaller value
+    y0, y1 = page_h - y1, page_h - y0
+
+    # int() truncates the fractional part instead of rounding
+    return int(x0), int(x1), int(y0), int(y1)
+
+
+def coords_page2img(x0: int, x1: int, y0: int, y1: int, page_shape: tuple, image_shape: tuple) -> tuple:
+    """Convert and scale bbox coordinates from page to image coordinates.
+
+    The coordinates are scaled by the image/page size ratio and the y axis is flipped relative to the image height.
+
+    :param x0: The x0 coordinate of the bbox in page coordinates.
+    :param x1: The x1 coordinate of the bbox in page coordinates.
+    :param y0: The y0 coordinate of the bbox in page coordinates.
+    :param y1: The y1 coordinate of the bbox in page coordinates.
+    :param page_shape: The shape of the page (width, height).
+    :param image_shape: The shape of the image (width, height).
+    :returns: The converted and scaled bbox coordinates in image coordinates, ordered (x0, x1, y0, y1).
+    """
+    (page_w, page_h) = page_shape
+    (image_w, image_h) = image_shape
+
+    scale_y = image_h / page_h
+    scale_x = image_w / page_w
+    # scale
+    y0, y1 = y0 * scale_y, y1 * scale_y
+    x0, x1 = x0 * scale_x, x1 * scale_x
+    # convert: flip the y axis; y0 and y1 swap roles so that y0 stays the smaller value
+    y0, y1 = image_h - y1, image_h - y0
+
+    # int() truncates the fractional part instead of rounding
+    return int(x0), int(x1), int(y0), int(y1)