docs: model benchmarks (#17036)

* model benchmarks
* minor fixes
* formatting
* docs build
* maybe fix reference
* clarify optimal
* use emojis
* wording
* wording
* clarify optimal wording
* bolding
* more detailed instructions
* clarify edge case fix
* early exit in dim loop
2025-03-24 12:02:33 -04:00
parent ad151130f9
commit 4bfef2460a
16 changed files with 2209 additions and 255 deletions
@@ -8,6 +8,11 @@ class ModelSource(StrEnum):
    OPENCLIP = "openclip"


+class ModelTask(StrEnum):
+    FACIAL_RECOGNITION = "facial-recognition"
+    SEARCH = "clip"
+
+
 class SourceMetadata(NamedTuple):
    name: str
    link: str
@@ -22,6 +27,13 @@ SOURCE_TO_METADATA = {
    ),
 }

+
+SOURCE_TO_TASK = {
+    ModelSource.MCLIP: ModelTask.SEARCH,
+    ModelSource.OPENCLIP: ModelTask.SEARCH,
+    ModelSource.INSIGHTFACE: ModelTask.FACIAL_RECOGNITION,
+}
+
 RKNN_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]


@@ -5,16 +5,16 @@ from .models import mclip, openclip


 def export(
-    model_name: str, model_source: ModelSource, output_dir: Path, opset_version: int = 19, no_cache: bool = False
+    model_name: str, model_source: ModelSource, output_dir: Path, opset_version: int = 19, cache: bool = True
 ) -> None:
    visual_dir = output_dir / "visual"
    textual_dir = output_dir / "textual"
    match model_source:
        case ModelSource.MCLIP:
-            mclip.to_onnx(model_name, opset_version, visual_dir, textual_dir, no_cache=no_cache)
+            mclip.to_onnx(model_name, opset_version, visual_dir, textual_dir, cache=cache)
        case ModelSource.OPENCLIP:
            name, _, pretrained = model_name.partition("__")
            config = openclip.OpenCLIPModelConfig(name, pretrained)
-            openclip.to_onnx(config, opset_version, visual_dir, textual_dir, no_cache=no_cache)
+            openclip.to_onnx(config, opset_version, visual_dir, textual_dir, cache=cache)
        case _:
            raise ValueError(f"Unsupported model source {model_source}")
@@ -19,10 +19,10 @@ def to_onnx(
    opset_version: int,
    output_dir_visual: Path | str,
    output_dir_textual: Path | str,
-    no_cache: bool = False,
+    cache: bool = True,
 ) -> tuple[Path, Path]:
    textual_path = get_model_path(output_dir_textual)
-    if no_cache or not textual_path.exists():
+    if not cache or not textual_path.exists():
        import torch
        from multilingual_clip.pt_multilingual_clip import MultilingualCLIP
        from transformers import AutoTokenizer
@@ -39,9 +39,7 @@ def to_onnx(
        _export_text_encoder(model, textual_path, opset_version)
    else:
        print(f"Model {textual_path} already exists, skipping")
-    visual_path, _ = openclip_to_onnx(
-        _MCLIP_TO_OPENCLIP[model_name], opset_version, output_dir_visual, no_cache=no_cache
-    )
+    visual_path, _ = openclip_to_onnx(_MCLIP_TO_OPENCLIP[model_name], opset_version, output_dir_visual, cache=cache)
    assert visual_path is not None, "Visual model export failed"
    return visual_path, textual_path

@@ -37,7 +37,7 @@ def to_onnx(
    opset_version: int,
    output_dir_visual: Path | str | None = None,
    output_dir_textual: Path | str | None = None,
-    no_cache: bool = False,
+    cache: bool = True,
 ) -> tuple[Path | None, Path | None]:
    visual_path = None
    textual_path = None
@@ -49,9 +49,7 @@ def to_onnx(
        output_dir_textual = Path(output_dir_textual)
        textual_path = get_model_path(output_dir_textual)

-    if not no_cache and (
-        (textual_path is None or textual_path.exists()) and (visual_path is None or visual_path.exists())
-    ):
+    if cache and ((textual_path is None or textual_path.exists()) and (visual_path is None or visual_path.exists())):
        print(f"Models {textual_path} and {visual_path} already exist, skipping")
        return visual_path, textual_path

@@ -75,7 +73,7 @@ def to_onnx(
        param.requires_grad_(False)

    if visual_path is not None and output_dir_visual is not None:
-        if no_cache or not visual_path.exists():
+        if not cache or not visual_path.exists():
            save_config(
                open_clip.get_model_preprocess_cfg(model),
                output_dir_visual / "preprocess_cfg.json",
@@ -86,7 +84,7 @@ def to_onnx(
            print(f"Model {visual_path} already exists, skipping")

    if textual_path is not None and output_dir_textual is not None:
-        if no_cache or not textual_path.exists():
+        if not cache or not textual_path.exists():
            tokenizer_name = text_vision_cfg["text_cfg"].get("hf_tokenizer_name", "openai/clip-vit-base-patch32")
            AutoTokenizer.from_pretrained(tokenizer_name).save_pretrained(output_dir_textual)
            _export_text_encoder(model, model_cfg, textual_path, opset_version)
@@ -9,13 +9,13 @@ def _export_platform(
    inputs: list[str] | None = None,
    input_size_list: list[list[int]] | None = None,
    fuse_matmul_softmax_matmul_to_sdpa: bool = True,
-    no_cache: bool = False,
+    cache: bool = True,
 ) -> None:
    from rknn.api import RKNN

    input_path = model_dir / "model.onnx"
    output_path = model_dir / "rknpu" / target_platform / "model.rknn"
-    if not no_cache and output_path.exists():
+    if cache and output_path.exists():
        print(f"Model {input_path} already exists at {output_path}, skipping")
        return

@@ -49,7 +49,7 @@ def _export_platforms(
    model_dir: Path,
    inputs: list[str] | None = None,
    input_size_list: list[list[int]] | None = None,
-    no_cache: bool = False,
+    cache: bool = True,
 ) -> None:
    fuse_matmul_softmax_matmul_to_sdpa = True
    for soc in RKNN_SOCS:
@@ -60,7 +60,7 @@ def _export_platforms(
                inputs=inputs,
                input_size_list=input_size_list,
                fuse_matmul_softmax_matmul_to_sdpa=fuse_matmul_softmax_matmul_to_sdpa,
-                no_cache=no_cache,
+                cache=cache,
            )
        except Exception as e:
            print(f"Failed to export model for {soc}: {e}")
@@ -73,24 +73,24 @@ def _export_platforms(
                    inputs=inputs,
                    input_size_list=input_size_list,
                    fuse_matmul_softmax_matmul_to_sdpa=fuse_matmul_softmax_matmul_to_sdpa,
-                    no_cache=no_cache,
+                    cache=cache,
                )


-def export(model_dir: Path, no_cache: bool = False) -> None:
+def export(model_dir: Path, cache: bool = True) -> None:
    textual = model_dir / "textual"
    visual = model_dir / "visual"
    detection = model_dir / "detection"
    recognition = model_dir / "recognition"

    if textual.is_dir():
-        _export_platforms(textual, no_cache=no_cache)
+        _export_platforms(textual, cache=cache)

    if visual.is_dir():
-        _export_platforms(visual, no_cache=no_cache)
+        _export_platforms(visual, cache=cache)

    if detection.is_dir():
-        _export_platforms(detection, inputs=["input.1"], input_size_list=[[1, 3, 640, 640]], no_cache=no_cache)
+        _export_platforms(detection, inputs=["input.1"], input_size_list=[[1, 3, 640, 640]], cache=cache)

    if recognition.is_dir():
-        _export_platforms(recognition, inputs=["input.1"], input_size_list=[[1, 3, 112, 112]], no_cache=no_cache)
+        _export_platforms(recognition, inputs=["input.1"], input_size_list=[[1, 3, 112, 112]], cache=cache)