Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/ml 345 mapper #76

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4c5eeac
Create mapper from Kili for Image classification (step 1 fully labeled)
FlorianLeRoyKili May 13, 2022
0935251
update
FlorianLeRoyKili May 13, 2022
e77b707
Create Mapper for Image classification task
FlorianLeRoyKili May 13, 2022
b640a53
update with embeddings from prioritize
FlorianLeRoyKili May 13, 2022
0b61e66
Update Mapper branch with Text Classification
FlorianLeRoyKili May 16, 2022
d0e786e
Integrate embedding_images in create.py
FlorianLeRoyKili May 16, 2022
bf39724
update type focus_class
FlorianLeRoyKili May 16, 2022
1aaf83f
correct various mistakes
FlorianLeRoyKili May 16, 2022
556754e
bug corrected
FlorianLeRoyKili May 17, 2022
056be8b
add topic scores
FlorianLeRoyKili May 18, 2022
3742255
fix topic_scores
FlorianLeRoyKili May 18, 2022
698d9d4
Resolve conflict
FlorianLeRoyKili May 16, 2022
4f325d3
correct various mistakes
FlorianLeRoyKili May 16, 2022
e5adf24
bug corrected
FlorianLeRoyKili May 17, 2022
7f3560f
temp
FlorianLeRoyKili May 19, 2022
46f02eb
rebase
FlorianLeRoyKili May 20, 2022
051a9b9
download_assets
FlorianLeRoyKili May 20, 2022
27fbe0d
remove constant
FlorianLeRoyKili May 20, 2022
0f4b334
update to pass test
FlorianLeRoyKili May 23, 2022
92d876a
np.ndarray type: ignore
FlorianLeRoyKili May 23, 2022
52ab91c
download asset with throttling
FlorianLeRoyKili May 23, 2022
32e411d
update click
FlorianLeRoyKili May 23, 2022
d893eea
update click option
FlorianLeRoyKili May 23, 2022
cfb037a
update display for NLP task
FlorianLeRoyKili May 24, 2022
eaee9db
update tooltips text
FlorianLeRoyKili May 24, 2022
ba78f7a
update custom tooltips
FlorianLeRoyKili May 24, 2022
330f9c4
new typing
FlorianLeRoyKili May 25, 2022
4a08267
corrections
FlorianLeRoyKili May 25, 2022
43a574e
remove upper
FlorianLeRoyKili May 25, 2022
c3cc019
add notebook
FlorianLeRoyKili May 25, 2022
4e45226
change < to <=
FlorianLeRoyKili Jun 7, 2022
00c808f
add default cv_fold
FlorianLeRoyKili Jun 8, 2022
b0b1305
temp
FlorianLeRoyKili Jun 13, 2022
441e151
temp
FlorianLeRoyKili Jun 13, 2022
c9909ac
add autoML predictions to Mapper
FlorianLeRoyKili Jun 15, 2022
8b40bdb
add external predictions
FlorianLeRoyKili Jun 15, 2022
9adf3f4
fix issues
FlorianLeRoyKili Jun 15, 2022
c429c9e
add tuto mapper for blog article
FlorianLeRoyKili Jun 29, 2022
2aafeee
rebase Mapper
FlorianLeRoyKili Jul 22, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions commands/mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import warnings
from typing import List, Optional

import click
import numpy as np
import pandas as pd
from kili.client import Kili
from tabulate import tabulate

from commands.common_args import Options, PredictOptions, TrainOptions
from commands.predict import predict_one_job
from kiliautoml.models import PyTorchVisionImageClassificationModel
from kiliautoml.utils.helpers import (
_get_label,
get_assets,
get_content_input_from_job,
get_project,
kili_print,
)
from kiliautoml.utils.mapper.create import MapperClassification
from kiliautoml.utils.type import (
AssetStatusT,
LabelMergeStrategyT,
ModelFrameworkT,
ModelNameT,
ModelRepositoryT,
ProjectIdT,
)


def _read_external_predictions(
    predictions_path: str,
    assets: list,
    n_categories: int,
) -> np.ndarray:
    """Load externally computed class probabilities from a csv file.

    The layout of the file is inferred from its dimensions:

    * ``n_categories + 1`` columns: the first column is used as the index
      (looked up with each asset's ``externalId``) and no header is expected.
    * ``n_categories`` columns: no index column; the file must contain one
      row per asset, optionally preceded by a single header row.

    Args:
        predictions_path: Path to the csv file with the predictions.
        assets: Assets the predictions refer to; each one must expose
            ``asset["externalId"]`` when the csv has an index column.
        n_categories: Number of categories of the classification job.

    Returns:
        An array with one row of predictions per asset.

    Raises:
        ValueError: If the file is empty or its dimensions match none of the
            supported layouts.
    """
    # Inspect the very file that is loaded below. Blank lines are ignored
    # because pandas.read_csv skips them by default as well.
    with open(predictions_path, "r") as predictions_file:
        lines = [line for line in predictions_file if line.strip()]

    if not lines:
        raise ValueError(f"Predictions file {predictions_path} is empty")

    ncol = lines[0].count(",") + 1
    nrows = len(lines)

    if ncol == n_categories + 1:
        index_col = 0
        header = None
    elif ncol == n_categories:
        index_col = None
        if nrows == len(assets):
            header = None
        elif nrows == len(assets) + 1:
            # One extra row compared to the number of assets: it is a header.
            header = 0
        else:
            raise ValueError(
                "When there is no index column in csv file with predictions "
                "the number of row has to be equal to the number of assets "
                "or the number of assets + 1 if there is a header"
            )
    else:
        raise ValueError(
            "Number of column in predictions should be either "
            "the number of category or the number of category + 1 for the external id"
        )

    predictions_df = pd.read_csv(predictions_path, index_col=index_col, header=header)

    if index_col is None:
        return predictions_df.to_numpy()

    # Re-order the rows so that they follow the order of the assets.
    return np.array([predictions_df.loc[asset["externalId"]].to_numpy() for asset in assets])


@click.command()
@Options.api_endpoint
@Options.api_key
@Options.project_id
@Options.clear_dataset_cache
@Options.batch_size
@TrainOptions.epochs
@Options.target_job
@Options.model_framework
@Options.model_name
@Options.model_repository
@TrainOptions.asset_status_in
@Options.label_merge_strategy
@Options.max_assets
@click.option(
    "--assets-repository",
    required=True,
    default=None,
    help="Asset repository (eg. /content/assets/)",
)
@click.option("--predictions-path", default=None, help="csv file with predictions")
@click.option(
    "--focus-class",
    default=None,
    # Turn the comma separated option value into a list of class names.
    callback=lambda _, __, x: x.split(",") if x else None,
    help="Only display selected class in Mapper graph",
)
@PredictOptions.from_model
@PredictOptions.from_project
@click.option(
    "--graph-name",
    default="Mapper",
    help="Name to be displayed in the KMapper html page",
)
def main(
    api_endpoint: str,
    api_key: str,
    project_id: ProjectIdT,
    clear_dataset_cache: bool,
    target_job: List[str],
    model_framework: ModelFrameworkT,
    model_name: ModelNameT,
    model_repository: ModelRepositoryT,
    asset_status_in: Optional[List[AssetStatusT]],
    label_merge_strategy: LabelMergeStrategyT,
    max_assets: int,
    assets_repository: str,
    predictions_path: Optional[str],
    batch_size: int,
    epochs: int,
    focus_class: Optional[List[str]],
    from_model: Optional[ModelFrameworkT],
    from_project: Optional[ProjectIdT],
    graph_name: str,
):
    """
    Main method for creating mapper

    For every selected image classification job, the predictions are either
    computed by training a model and predicting on the assets, or read from
    the csv file given with --predictions-path. They are then used to build
    the Mapper graph.
    """
    # Validate the arguments before doing any network call.
    if max_assets and max_assets < 10:
        raise ValueError("max_assets should be greater than or equal to 10")

    kili = Kili(api_key=api_key, api_endpoint=api_endpoint)
    input_type, jobs, _ = get_project(kili, project_id)

    for job_name, job in jobs.items():
        if target_job and job_name not in target_job:
            continue

        kili_print(f"Create Mapper for job: {job_name}")

        content_input = get_content_input_from_job(job)
        ml_task = job.get("mlTask")
        tools = job.get("tools")

        # Only single-choice image classification jobs are handled here.
        if not (
            content_input == "radio" and ml_task == "CLASSIFICATION" and input_type == "IMAGE"
        ):
            raise NotImplementedError

        # Get assets
        assets = get_assets(
            kili,
            project_id,
            status_in=asset_status_in,
            max_assets=max_assets,
        )

        # Keep the assets annotated for this job, together with their category.
        labeled_assets = []
        labels = []
        for asset in assets:
            label = _get_label(asset, job_name, label_merge_strategy)
            if (label is None) or (job_name not in label["jsonResponse"]):
                asset_id = asset["id"]
                warnings.warn(f"{asset_id}: No annotation for job {job_name}")
            else:
                labeled_assets.append(asset)
                labels.append(
                    asset.get_annotations_classification(job_name)["categories"][0]["name"]
                )

        if predictions_path is None:
            # No external predictions: train a model on the labeled assets,
            # then predict on all the assets.
            image_classification_model = PyTorchVisionImageClassificationModel(
                model_repository=model_repository,
                model_name=model_name,
                job=job,
                model_framework=model_framework,
                job_name=job_name,
                project_id=project_id,
            )

            training_loss = image_classification_model.train(
                assets=labeled_assets,
                batch_size=batch_size,
                epochs=epochs,
                clear_dataset_cache=clear_dataset_cache,
                disable_wandb=True,
                api_key=api_key,
                verbose=4,
            )

            training_losses = [[job_name, training_loss]]
            print(tabulate(training_losses, headers=["job_name", "training_loss"]))

            job_predictions = predict_one_job(
                api_key=api_key,
                api_endpoint=api_endpoint,
                project_id=project_id,
                from_model=from_model,
                verbose=4,
                job=job,
                input_type=input_type,
                assets=assets,
                batch_size=batch_size,
                job_name=job_name,
                content_input=content_input,
                model_repository=model_repository,
                model_name=model_name,
                model_framework=model_framework,
                from_project=from_project,
                ml_task=ml_task,
                tools=tools,
                clear_dataset_cache=clear_dataset_cache,
            )

            predictions = job_predictions.predictions_probability  # type: ignore
        else:
            predictions = _read_external_predictions(
                predictions_path,
                assets,
                len(job["content"]["categories"]),
            )

        mapper_image_classification = MapperClassification(
            api_key=api_key,
            input_type=input_type,
            assets=assets,
            labels=labels,
            job=job,
            job_name=job_name,
            assets_repository=assets_repository,
            predictions=predictions,
            focus_class=focus_class,
        )

        _ = mapper_image_classification.create_mapper(graph_name)


# Run the `main` click command when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading