Merge branch 'main' into rknn-toolkit-lite2

This commit is contained in:
Yoni Yang
2025-01-22 11:43:59 +08:00
committed by GitHub
130 changed files with 2154 additions and 2380 deletions

View File

@@ -77,29 +77,31 @@ async def lifespan(_: FastAPI) -> AsyncGenerator[None, None]:
async def preload_models(preload: PreloadModelData) -> None:
log.info(f"Preloading models: clip:{preload.clip} facial_recognition:{preload.facial_recognition}")
async def load_models(model_string: str, model_type: ModelType, model_task: ModelTask) -> None:
for model_name in model_string.split(","):
model_name = model_name.strip()
model = await model_cache.get(model_name, model_type, model_task)
await load(model)
if preload.clip.textual is not None:
model = await model_cache.get(preload.clip.textual, ModelType.TEXTUAL, ModelTask.SEARCH)
await load(model)
await load_models(preload.clip.textual, ModelType.TEXTUAL, ModelTask.SEARCH)
if preload.clip.visual is not None:
model = await model_cache.get(preload.clip.visual, ModelType.VISUAL, ModelTask.SEARCH)
await load(model)
await load_models(preload.clip.visual, ModelType.VISUAL, ModelTask.SEARCH)
if preload.facial_recognition.detection is not None:
model = await model_cache.get(
await load_models(
preload.facial_recognition.detection,
ModelType.DETECTION,
ModelTask.FACIAL_RECOGNITION,
)
await load(model)
if preload.facial_recognition.recognition is not None:
model = await model_cache.get(
await load_models(
preload.facial_recognition.recognition,
ModelType.RECOGNITION,
ModelTask.FACIAL_RECOGNITION,
)
await load(model)
if preload.clip_fallback is not None:
log.warning(

View File

@@ -10,7 +10,7 @@ from tokenizers import Encoding, Tokenizer
from app.config import log
from app.models.base import InferenceModel
from app.models.transforms import clean_text
from app.models.transforms import clean_text, serialize_np_array
from app.schemas import ModelSession, ModelTask, ModelType
@@ -18,9 +18,9 @@ class BaseCLIPTextualEncoder(InferenceModel):
depends = []
identity = (ModelType.TEXTUAL, ModelTask.SEARCH)
def _predict(self, inputs: str, **kwargs: Any) -> NDArray[np.float32]:
def _predict(self, inputs: str, **kwargs: Any) -> str:
res: NDArray[np.float32] = self.session.run(None, self.tokenize(inputs))[0][0]
return res
return serialize_np_array(res)
def _load(self) -> ModelSession:
session = super()._load()

View File

@@ -10,7 +10,15 @@ from PIL import Image
from app.config import log
from app.models.base import InferenceModel
from app.models.transforms import crop_pil, decode_pil, get_pil_resampling, normalize, resize_pil, to_numpy
from app.models.transforms import (
crop_pil,
decode_pil,
get_pil_resampling,
normalize,
resize_pil,
serialize_np_array,
to_numpy,
)
from app.schemas import ModelSession, ModelTask, ModelType
@@ -18,10 +26,10 @@ class BaseCLIPVisualEncoder(InferenceModel):
depends = []
identity = (ModelType.VISUAL, ModelTask.SEARCH)
def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> NDArray[np.float32]:
def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> str:
image = decode_pil(inputs)
res: NDArray[np.float32] = self.session.run(None, self.transform(image))[0][0]
return res
return serialize_np_array(res)
@abstractmethod
def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:

View File

@@ -12,7 +12,7 @@ from PIL import Image
from app.config import log, settings
from app.models.base import InferenceModel
from app.models.transforms import decode_cv2
from app.models.transforms import decode_cv2, serialize_np_array
from app.schemas import FaceDetectionOutput, FacialRecognitionOutput, ModelFormat, ModelSession, ModelTask, ModelType
@@ -61,7 +61,7 @@ class FaceRecognizer(InferenceModel):
return [
{
"boundingBox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
"embedding": embedding,
"embedding": serialize_np_array(embedding),
"score": score,
}
for (x1, y1, x2, y2), embedding, score in zip(faces["boxes"], embeddings, faces["scores"])

View File

@@ -4,6 +4,7 @@ from typing import IO
import cv2
import numpy as np
import orjson
from numpy.typing import NDArray
from PIL import Image
@@ -69,3 +70,9 @@ def clean_text(text: str, canonicalize: bool = False) -> str:
if canonicalize:
text = text.translate(_PUNCTUATION_TRANS).lower()
return text
# this allows the client to use the array as a string without deserializing only to serialize back to a string
# TODO: use this in a less invasive way
def serialize_np_array(arr: NDArray[np.float32]) -> str:
    """Return *arr* encoded as a JSON string.

    The array is handed to ``orjson`` with its NumPy serialization option
    enabled, and the resulting bytes are decoded into a ``str``.
    """
    encoded: bytes = orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY)
    return encoded.decode()

View File

@@ -80,7 +80,7 @@ class FaceDetectionOutput(TypedDict):
class DetectedFace(TypedDict):
boundingBox: BoundingBox
embedding: npt.NDArray[np.float32]
embedding: str
score: float

View File

@@ -10,6 +10,7 @@ from unittest import mock
import cv2
import numpy as np
import onnxruntime as ort
import orjson
import pytest
from fastapi import HTTPException
from fastapi.testclient import TestClient
@@ -396,11 +397,11 @@ class TestCLIP:
mocked.run.return_value = [[self.embedding]]
clip_encoder = OpenClipVisualEncoder("ViT-B-32__openai", cache_dir="test_cache")
embedding = clip_encoder.predict(pil_image)
assert isinstance(embedding, np.ndarray)
assert embedding.shape[0] == clip_model_cfg["embed_dim"]
assert embedding.dtype == np.float32
embedding_str = clip_encoder.predict(pil_image)
assert isinstance(embedding_str, str)
embedding = orjson.loads(embedding_str)
assert isinstance(embedding, list)
assert len(embedding) == clip_model_cfg["embed_dim"]
mocked.run.assert_called_once()
def test_basic_text(
@@ -418,11 +419,11 @@ class TestCLIP:
mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True)
clip_encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache")
embedding = clip_encoder.predict("test search query")
assert isinstance(embedding, np.ndarray)
assert embedding.shape[0] == clip_model_cfg["embed_dim"]
assert embedding.dtype == np.float32
embedding_str = clip_encoder.predict("test search query")
assert isinstance(embedding_str, str)
embedding = orjson.loads(embedding_str)
assert isinstance(embedding, list)
assert len(embedding) == clip_model_cfg["embed_dim"]
mocked.run.assert_called_once()
def test_openclip_tokenizer(
@@ -558,8 +559,11 @@ class TestFaceRecognition:
assert isinstance(face.get("boundingBox"), dict)
assert set(face["boundingBox"]) == {"x1", "y1", "x2", "y2"}
assert all(isinstance(val, np.float32) for val in face["boundingBox"].values())
assert isinstance(face.get("embedding"), np.ndarray)
assert face["embedding"].shape[0] == 512
embedding_str = face.get("embedding")
assert isinstance(embedding_str, str)
embedding = orjson.loads(embedding_str)
assert isinstance(embedding, list)
assert len(embedding) == 512
assert isinstance(face.get("score", None), np.float32)
rec_model.get_feat.assert_called_once()
@@ -930,8 +934,10 @@ class TestPredictionEndpoints:
actual = response.json()
assert response.status_code == 200
assert isinstance(actual, dict)
assert isinstance(actual.get("clip", None), list)
assert np.allclose(expected, actual["clip"])
embedding = actual.get("clip", None)
assert isinstance(embedding, str)
parsed_embedding = orjson.loads(embedding)
assert np.allclose(expected, parsed_embedding)
def test_clip_text_endpoint(self, responses: dict[str, Any], deployed_app: TestClient) -> None:
expected = responses["clip"]["text"]
@@ -951,8 +957,10 @@ class TestPredictionEndpoints:
actual = response.json()
assert response.status_code == 200
assert isinstance(actual, dict)
assert isinstance(actual.get("clip", None), list)
assert np.allclose(expected, actual["clip"])
embedding = actual.get("clip", None)
assert isinstance(embedding, str)
parsed_embedding = orjson.loads(embedding)
assert np.allclose(expected, parsed_embedding)
def test_face_endpoint(self, pil_image: Image.Image, responses: dict[str, Any], deployed_app: TestClient) -> None:
byte_image = BytesIO()
@@ -983,5 +991,8 @@ class TestPredictionEndpoints:
for expected_face, actual_face in zip(responses["facial-recognition"], actual["facial-recognition"]):
assert expected_face["boundingBox"] == actual_face["boundingBox"]
assert np.allclose(expected_face["embedding"], actual_face["embedding"])
embedding = actual_face.get("embedding", None)
assert isinstance(embedding, str)
parsed_embedding = orjson.loads(embedding)
assert np.allclose(expected_face["embedding"], parsed_embedding)
assert np.allclose(expected_face["score"], actual_face["score"])