Compare commits
5 Commits
feat/mobil
...
feat/corem
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
57933af9b0 | ||
|
|
1c4a8c3968 | ||
|
|
e2d80755c6 | ||
|
|
5a3b11d603 | ||
|
|
543bc72ae3 |
@@ -148,30 +148,31 @@ Redis (Sentinel) URL example JSON before encoding:
|
||||
|
||||
## Machine Learning
|
||||
|
||||
| Variable | Description | Default | Containers |
|
||||
| :---------------------------------------------------------- | :-------------------------------------------------------------------------------------------------- | :-----------------------------: | :--------------- |
|
||||
| `MACHINE_LEARNING_MODEL_TTL` | Inactivity time (s) before a model is unloaded (disabled if \<= 0) | `300` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_TTL_POLL_S` | Interval (s) between checks for the model TTL (disabled if \<= 0) | `10` | machine learning |
|
||||
| `MACHINE_LEARNING_CACHE_FOLDER` | Directory where models are downloaded | `/cache` | machine learning |
|
||||
| `MACHINE_LEARNING_REQUEST_THREADS`<sup>\*1</sup> | Thread count of the request thread pool (disabled if \<= 0) | number of CPU cores | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS` | Number of parallel model operations | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS` | Number of threads for each model operation | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKERS`<sup>\*2</sup> | Number of worker processes to spawn | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_HTTP_KEEPALIVE_TIMEOUT_S`<sup>\*3</sup> | HTTP Keep-alive time in seconds | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKER_TIMEOUT` | Maximum time (s) of unresponsiveness before a worker is killed | `120` (`300` if using OpenVINO) | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__CLIP__TEXTUAL` | Comma-separated list of (textual) CLIP model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__CLIP__VISUAL` | Comma-separated list of (visual) CLIP model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION__RECOGNITION` | Comma-separated list of (recognition) facial recognition model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION__DETECTION` | Comma-separated list of (detection) facial recognition model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_ANN` | Enable ARM-NN hardware acceleration if supported | `True` | machine learning |
|
||||
| `MACHINE_LEARNING_ANN_FP16_TURBO` | Execute operations in FP16 precision: increasing speed, reducing precision (applies only to ARM-NN) | `False` | machine learning |
|
||||
| `MACHINE_LEARNING_ANN_TUNING_LEVEL` | ARM-NN GPU tuning level (1: rapid, 2: normal, 3: exhaustive) | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_DEVICE_IDS`<sup>\*4</sup> | Device IDs to use in multi-GPU environments | `0` | machine learning |
|
||||
| `MACHINE_LEARNING_MAX_BATCH_SIZE__FACIAL_RECOGNITION` | Set the maximum number of faces that will be processed at once by the facial recognition model | None (`1` if using OpenVINO) | machine learning |
|
||||
| `MACHINE_LEARNING_PING_TIMEOUT` | How long (ms) to wait for a PING response when checking if an ML server is available | `2000` | server |
|
||||
| `MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME` | How long to ignore ML servers that are offline before trying again | `30000` | server |
|
||||
| `MACHINE_LEARNING_RKNN` | Enable RKNN hardware acceleration if supported | `True` | machine learning |
|
||||
| `MACHINE_LEARNING_RKNN_THREADS` | How many threads of RKNN runtime should be spinned up while inferencing. | `1` | machine learning |
|
||||
| Variable | Description | Default | Containers |
|
||||
| :---------------------------------------------------------- | :-------------------------------------------------------------------------------------------------- | :--------------------------: | :--------------- |
|
||||
| `MACHINE_LEARNING_MODEL_TTL` | Inactivity time (s) before a model is unloaded (disabled if \<= 0) | `300` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_TTL_POLL_S` | Interval (s) between checks for the model TTL (disabled if \<= 0) | `10` | machine learning |
|
||||
| `MACHINE_LEARNING_CACHE_FOLDER` | Directory where models are downloaded | `/cache` | machine learning |
|
||||
| `MACHINE_LEARNING_REQUEST_THREADS`<sup>\*1</sup> | Thread count of the request thread pool (disabled if \<= 0) | number of CPU cores | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS` | Number of parallel model operations | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS` | Number of threads for each model operation | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_ARENA` | Pre-allocates CPU memory to avoid memory fragmentation | true | machine learning |
|
||||
| `MACHINE_LEARNING_WORKERS`<sup>\*2</sup> | Number of worker processes to spawn | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_HTTP_KEEPALIVE_TIMEOUT_S`<sup>\*3</sup> | HTTP Keep-alive time in seconds | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKER_TIMEOUT` | Maximum time (s) of unresponsiveness before a worker is killed | `300` | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__CLIP__TEXTUAL` | Comma-separated list of (textual) CLIP model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__CLIP__VISUAL` | Comma-separated list of (visual) CLIP model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION__RECOGNITION` | Comma-separated list of (recognition) facial recognition model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION__DETECTION` | Comma-separated list of (detection) facial recognition model(s) to preload and cache | | machine learning |
|
||||
| `MACHINE_LEARNING_ANN` | Enable ARM-NN hardware acceleration if supported | `True` | machine learning |
|
||||
| `MACHINE_LEARNING_ANN_FP16_TURBO` | Execute operations in FP16 precision: increasing speed, reducing precision (applies only to ARM-NN) | `False` | machine learning |
|
||||
| `MACHINE_LEARNING_ANN_TUNING_LEVEL` | ARM-NN GPU tuning level (1: rapid, 2: normal, 3: exhaustive) | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_DEVICE_IDS`<sup>\*4</sup> | Device IDs to use in multi-GPU environments | `0` | machine learning |
|
||||
| `MACHINE_LEARNING_MAX_BATCH_SIZE__FACIAL_RECOGNITION` | Set the maximum number of faces that will be processed at once by the facial recognition model | None (`1` if using OpenVINO) | machine learning |
|
||||
| `MACHINE_LEARNING_PING_TIMEOUT` | How long (ms) to wait for a PING response when checking if an ML server is available | `2000` | server |
|
||||
| `MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME` | How long to ignore ML servers that are offline before trying again | `30000` | server |
|
||||
| `MACHINE_LEARNING_RKNN` | Enable RKNN hardware acceleration if supported | `True` | machine learning |
|
||||
| `MACHINE_LEARNING_RKNN_THREADS` | How many threads of RKNN runtime should be spinned up while inferencing. | `1` | machine learning |
|
||||
|
||||
\*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones.
|
||||
|
||||
|
||||
@@ -65,7 +65,8 @@ RUN if [ "$DEVICE" = "rocm" ]; then \
|
||||
|
||||
FROM python:3.11-slim-bookworm@sha256:49d73c49616929b0a4f37c50fee0056eb4b0f15de624591e8d9bf84b4dfdd3ce AS prod-cpu
|
||||
|
||||
ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2
|
||||
ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 \
|
||||
MACHINE_LEARNING_MODEL_ARENA=false
|
||||
|
||||
FROM python:3.11-slim-bookworm@sha256:49d73c49616929b0a4f37c50fee0056eb4b0f15de624591e8d9bf84b4dfdd3ce AS prod-openvino
|
||||
|
||||
@@ -82,7 +83,8 @@ RUN apt-get update && \
|
||||
|
||||
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04@sha256:94c1577b2cd9dd6c0312dc04dff9cb2fdce2b268018abc3d7c2dbcacf1155000 AS prod-cuda
|
||||
|
||||
ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2
|
||||
ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 \
|
||||
MACHINE_LEARNING_MODEL_ARENA=false
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install --no-install-recommends -yqq libcudnn9-cuda-12 && \
|
||||
@@ -98,7 +100,8 @@ FROM rocm/dev-ubuntu-22.04:6.3.4-complete@sha256:1f7e92ca7e3a3785680473329ed1091
|
||||
FROM prod-cpu AS prod-armnn
|
||||
|
||||
ENV LD_LIBRARY_PATH=/opt/armnn \
|
||||
LD_PRELOAD=/usr/lib/libmimalloc.so.2
|
||||
LD_PRELOAD=/usr/lib/libmimalloc.so.2 \
|
||||
MACHINE_LEARNING_MODEL_ARENA=false
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd libgomp1 && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
@@ -118,7 +121,8 @@ COPY --from=builder-armnn \
|
||||
|
||||
FROM prod-cpu AS prod-rknn
|
||||
|
||||
ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2
|
||||
ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 \
|
||||
MACHINE_LEARNING_MODEL_ARENA=false
|
||||
|
||||
ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/v2.3.0/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so /usr/lib/
|
||||
|
||||
|
||||
@@ -61,6 +61,7 @@ class Settings(BaseSettings):
|
||||
request_threads: int = os.cpu_count() or 4
|
||||
model_inter_op_threads: int = 0
|
||||
model_intra_op_threads: int = 0
|
||||
model_arena: bool = True
|
||||
ann: bool = True
|
||||
ann_fp16_turbo: bool = False
|
||||
ann_tuning_level: int = 2
|
||||
|
||||
@@ -79,6 +79,7 @@ SUPPORTED_PROVIDERS = [
|
||||
"CUDAExecutionProvider",
|
||||
"ROCMExecutionProvider",
|
||||
"OpenVINOExecutionProvider",
|
||||
"CoreMLExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]
|
||||
|
||||
|
||||
@@ -96,6 +96,14 @@ class OrtSession:
|
||||
"precision": "FP32",
|
||||
"cache_dir": (self.model_path.parent / "openvino").as_posix(),
|
||||
}
|
||||
case "CoreMLExecutionProvider":
|
||||
options = {
|
||||
"ModelFormat": "MLProgram",
|
||||
"MLComputeUnits": "ALL",
|
||||
"SpecializationStrategy": "FastPrediction",
|
||||
"AllowLowPrecisionAccumulationOnGPU": "1",
|
||||
"ModelCacheDirectory": (self.model_path.parent / "coreml").as_posix(),
|
||||
}
|
||||
case _:
|
||||
options = {}
|
||||
provider_options.append(options)
|
||||
@@ -115,7 +123,7 @@ class OrtSession:
|
||||
@property
|
||||
def _sess_options_default(self) -> ort.SessionOptions:
|
||||
sess_options = ort.SessionOptions()
|
||||
sess_options.enable_cpu_mem_arena = False
|
||||
sess_options.enable_cpu_mem_arena = settings.model_arena
|
||||
|
||||
# avoid thread contention between models
|
||||
if settings.model_inter_op_threads > 0:
|
||||
|
||||
@@ -180,6 +180,7 @@ class TestOrtSession:
|
||||
CUDA_EP_OUT_OF_ORDER = ["CPUExecutionProvider", "CUDAExecutionProvider"]
|
||||
TRT_EP = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
ROCM_EP = ["ROCMExecutionProvider", "CPUExecutionProvider"]
|
||||
COREML_EP = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
|
||||
|
||||
@pytest.mark.providers(CPU_EP)
|
||||
def test_sets_cpu_provider(self, providers: list[str]) -> None:
|
||||
@@ -225,6 +226,12 @@ class TestOrtSession:
|
||||
|
||||
assert session.providers == self.ROCM_EP
|
||||
|
||||
@pytest.mark.providers(COREML_EP)
|
||||
def test_uses_coreml(self, providers: list[str]) -> None:
|
||||
session = OrtSession("ViT-B-32__openai")
|
||||
|
||||
assert session.providers == self.COREML_EP
|
||||
|
||||
def test_sets_provider_kwarg(self) -> None:
|
||||
providers = ["CUDAExecutionProvider"]
|
||||
session = OrtSession("ViT-B-32__openai", providers=providers)
|
||||
@@ -284,7 +291,6 @@ class TestOrtSession:
|
||||
assert session.sess_options.execution_mode == ort.ExecutionMode.ORT_SEQUENTIAL
|
||||
assert session.sess_options.inter_op_num_threads == 1
|
||||
assert session.sess_options.intra_op_num_threads == 2
|
||||
assert session.sess_options.enable_cpu_mem_arena is False
|
||||
|
||||
def test_sets_default_sess_options_does_not_set_threads_if_non_cpu_and_default_threads(self) -> None:
|
||||
session = OrtSession("ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
|
||||
@@ -302,6 +308,26 @@ class TestOrtSession:
|
||||
assert session.sess_options.inter_op_num_threads == 2
|
||||
assert session.sess_options.intra_op_num_threads == 4
|
||||
|
||||
def test_uses_arena_if_enabled(self, mocker: MockerFixture) -> None:
|
||||
mock_settings = mocker.patch("immich_ml.sessions.ort.settings", autospec=True)
|
||||
mock_settings.model_inter_op_threads = 0
|
||||
mock_settings.model_intra_op_threads = 0
|
||||
mock_settings.model_arena = True
|
||||
|
||||
session = OrtSession("ViT-B-32__openai", providers=["CPUExecutionProvider"])
|
||||
|
||||
assert session.sess_options.enable_cpu_mem_arena
|
||||
|
||||
def test_does_not_use_arena_if_disabled(self, mocker: MockerFixture) -> None:
|
||||
mock_settings = mocker.patch("immich_ml.sessions.ort.settings", autospec=True)
|
||||
mock_settings.model_inter_op_threads = 0
|
||||
mock_settings.model_intra_op_threads = 0
|
||||
mock_settings.model_arena = False
|
||||
|
||||
session = OrtSession("ViT-B-32__openai", providers=["CPUExecutionProvider"])
|
||||
|
||||
assert not session.sess_options.enable_cpu_mem_arena
|
||||
|
||||
def test_sets_sess_options_kwarg(self) -> None:
|
||||
sess_options = ort.SessionOptions()
|
||||
session = OrtSession(
|
||||
|
||||
Reference in New Issue
Block a user