MinishLab · stephantul · Feb 7, 2025 · Dec 22, 2024 · Dec 22, 2024 · Dec 22, 2024
diff --git a/Makefile b/Makefile
@@ -9,7 +9,7 @@ install:
 	uv run pre-commit install
 
 install-no-pre-commit:
-	uv pip install ".[dev,distill]"
+	uv pip install ".[dev,distill,inference,train]"
 	uv pip install "torch<2.5.0"
 
 install-base:

diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
@@ -60,6 +60,7 @@ def _create_model_card(
     license: str = "mit",
     language: list[str] | None = None,
     model_name: str | None = None,
+    template_path: str = "modelcards/model_card_template.md",
     **kwargs: Any,
 ) -> None:
     """
@@ -70,11 +71,12 @@ def _create_model_card(
     :param license: The license to use.
     :param language: The language of the model.
     :param model_name: The name of the model to use in the Model Card.
+    :param template_path: The path to the template.
     :param **kwargs: Additional metadata for the model card (e.g., model_name, base_model, etc.).
     """
     folder_path = Path(folder_path)
     model_name = model_name or folder_path.name
-    template_path = Path(__file__).parent / "model_card_template.md"
+    full_path = Path(__file__).parent / template_path
 
     model_card_data = ModelCardData(
         model_name=model_name,
@@ -85,7 +87,7 @@ def _create_model_card(
         library_name="model2vec",
         **kwargs,
     )
-    model_card = ModelCard.from_template(model_card_data, template_path=template_path)
+    model_card = ModelCard.from_template(model_card_data, template_path=full_path)
     model_card.save(folder_path / "README.md")
 
 

diff --git a/model2vec/inference/README.md b/model2vec/inference/README.md
@@ -0,0 +1,18 @@
+# Inference
+
+This subpackage mainly contains helper functions for inference with trained models that have been exported to `scikit-learn` compatible pipelines.
+
+If you're looking for information on how to train a model, see [here](../train/README.md).
+
+# Usage
+
+Let's assume you're using our [potion-edu classifier](https://huggingface.co/minishlab/potion-8m-edu-classifier).
+
+```python
+from model2vec.inference import StaticModelPipeline
+
+classifier = StaticModelPipeline.from_pretrained("minishlab/potion-8m-edu-classifier")
+label = classifier.predict("Attitudes towards cattle in the Alps: a study in letting go.")
+```
+
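+`predict` returns an array with one predicted label per input. The dependencies this needs come with the `inference` extra: `pip install model2vec[inference]`.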
diff --git a/model2vec/inference/__init__.py b/model2vec/inference/__init__.py
@@ -0,0 +1,10 @@
+from model2vec.utils import get_package_extras, importable
+
+# Name of the optional-dependency group ("extra") this subpackage relies on.
+_REQUIRED_EXTRA = "inference"
+
+# This check has to run before the import below, because model2vec.inference.model
+# imports skops and sklearn at module level.
+# NOTE(review): `importable` presumably raises an error that points at the extra
+# when a dependency is missing — confirm in model2vec.utils.
+for extra_dependency in get_package_extras("model2vec", _REQUIRED_EXTRA):
+    importable(extra_dependency, _REQUIRED_EXTRA)
+
+# Deliberately imported after the dependency check above.
+from model2vec.inference.model import StaticModelPipeline
+
+__all__ = ["StaticModelPipeline"]
diff --git a/model2vec/inference/model.py b/model2vec/inference/model.py
@@ -0,0 +1,202 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import huggingface_hub
+import numpy as np
+import skops.io
+from sklearn.pipeline import Pipeline
+
+from model2vec.hf_utils import _create_model_card
+from model2vec.model import PathLike, StaticModel
+
+_DEFAULT_TRUST_PATTERN = re.compile(r"sklearn\..+")
+_DEFAULT_MODEL_FILENAME = "pipeline.skops"
+
+
+class StaticModelPipeline:
+    """
+    A StaticModel encoder combined with a scikit-learn pipeline that acts as the head.
+
+    Texts are first encoded by `model`; the resulting matrix is then passed to `head`.
+    """
+
+    def __init__(self, model: StaticModel, head: Pipeline) -> None:
+        """
+        Create a pipeline with a StaticModel encoder.
+
+        :param model: The StaticModel used to encode the input texts.
+        :param head: The scikit-learn pipeline applied to the encoded texts.
+        """
+        self.model = model
+        self.head = head
+
+    @classmethod
+    def from_pretrained(
+        cls: type[StaticModelPipeline], path: PathLike, token: str | None = None, trust_remote_code: bool = False
+    ) -> StaticModelPipeline:
+        """
+        Load a StaticModelPipeline from a local path or huggingface hub path.
+
+        NOTE: if you load a private model from the huggingface hub, you need to pass a token.
+
+        :param path: The path to the folder containing the pipeline, or a repository on the Hugging Face Hub
+        :param token: The token to use to download the pipeline from the hub.
+        :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `sklearn`.
+        :return: The loaded pipeline.
+        """
+        model, head = _load_pipeline(path, token, trust_remote_code)
+        # Replace NaN/inf entries in the embedding matrix with finite numbers so the head never sees them.
+        model.embedding = np.nan_to_num(model.embedding)
+
+        return cls(model, head)
+
+    def save_pretrained(self, path: str) -> None:
+        """
+        Save the encoder, the head and a model card to a folder.
+
+        :param path: The folder to save the pipeline to.
+        """
+        save_pipeline(self, path)
+
+    def push_to_hub(self, repo_id: str, token: str | None = None, private: bool = False) -> None:
+        """
+        Save a model to a folder, and then push that folder to the hf hub.
+
+        :param repo_id: The id of the repository to push to.
+        :param token: The token to use to push to the hub.
+        :param private: Whether the repository should be private.
+        """
+        from model2vec.hf_utils import push_folder_to_hub
+
+        with TemporaryDirectory() as temp_dir:
+            # save_pipeline already saves the encoder and writes the classifier model card last.
+            # Saving the encoder a second time here is redundant and could replace that card.
+            save_pipeline(self, temp_dir)
+            push_folder_to_hub(Path(temp_dir), repo_id, private, token)
+
+    def _predict_and_coerce_to_2d(
+        self,
+        X: list[str] | str,
+        show_progress_bar: bool,
+        max_length: int | None,
+        batch_size: int,
+        use_multiprocessing: bool,
+        multiprocessing_threshold: int,
+    ) -> np.ndarray:
+        """
+        Encode the input with the StaticModel and coerce the output to a matrix.
+
+        Despite its name, this helper does not call the head: it only produces the features
+        that `predict` and `predict_proba` pass to it. All keyword arguments are forwarded
+        unchanged to `StaticModel.encode`.
+
+        :param X: The text or texts to encode.
+        :param show_progress_bar: Forwarded to `encode`.
+        :param max_length: Forwarded to `encode`.
+        :param batch_size: Forwarded to `encode`.
+        :param use_multiprocessing: Forwarded to `encode`.
+        :param multiprocessing_threshold: Forwarded to `encode`.
+        :return: The encoded input as a 2D array.
+        """
+        encoded = self.model.encode(
+            X,
+            show_progress_bar=show_progress_bar,
+            max_length=max_length,
+            batch_size=batch_size,
+            use_multiprocessing=use_multiprocessing,
+            multiprocessing_threshold=multiprocessing_threshold,
+        )
+        # A 1D result (presumably what a single string input yields) gets a leading axis,
+        # so the head always receives a matrix.
+        if np.ndim(encoded) == 1:
+            encoded = encoded[None, :]
+
+        return encoded
+
+    def predict(
+        self,
+        X: list[str] | str,
+        show_progress_bar: bool = False,
+        max_length: int | None = 512,
+        batch_size: int = 1024,
+        use_multiprocessing: bool = True,
+        multiprocessing_threshold: int = 10_000,
+    ) -> np.ndarray:
+        """
+        Predict the labels of the input.
+
+        The input is encoded with the StaticModel and the result is passed to `head.predict`.
+
+        :param X: The text or texts to classify.
+        :param show_progress_bar: Forwarded to the encoder.
+        :param max_length: Forwarded to the encoder.
+        :param batch_size: Forwarded to the encoder.
+        :param use_multiprocessing: Forwarded to the encoder.
+        :param multiprocessing_threshold: Forwarded to the encoder.
+        :return: The output of the head's `predict`.
+        """
+        encoded = self._predict_and_coerce_to_2d(
+            X,
+            show_progress_bar=show_progress_bar,
+            max_length=max_length,
+            batch_size=batch_size,
+            use_multiprocessing=use_multiprocessing,
+            multiprocessing_threshold=multiprocessing_threshold,
+        )
+
+        return self.head.predict(encoded)
+
+    def predict_proba(
+        self,
+        X: list[str] | str,
+        show_progress_bar: bool = False,
+        max_length: int | None = 512,
+        batch_size: int = 1024,
+        use_multiprocessing: bool = True,
+        multiprocessing_threshold: int = 10_000,
+    ) -> np.ndarray:
+        """
+        Predict the probabilities of the labels of the input.
+
+        The input is encoded with the StaticModel and the result is passed to `head.predict_proba`.
+
+        :param X: The text or texts to classify.
+        :param show_progress_bar: Forwarded to the encoder.
+        :param max_length: Forwarded to the encoder.
+        :param batch_size: Forwarded to the encoder.
+        :param use_multiprocessing: Forwarded to the encoder.
+        :param multiprocessing_threshold: Forwarded to the encoder.
+        :return: The output of the head's `predict_proba`.
+        """
+        encoded = self._predict_and_coerce_to_2d(
+            X,
+            show_progress_bar=show_progress_bar,
+            max_length=max_length,
+            batch_size=batch_size,
+            use_multiprocessing=use_multiprocessing,
+            multiprocessing_threshold=multiprocessing_threshold,
+        )
+
+        return self.head.predict_proba(encoded)
+
+
+def _load_pipeline(
+    folder_or_repo_path: PathLike, token: str | None = None, trust_remote_code: bool = False
+) -> tuple[StaticModel, Pipeline]:
+    """
+    Load a model and an sklearn pipeline.
+
+    This assumes the following files are present in the repo:
+    - `pipeline.skops`: The head of the pipeline.
+    - `config.json`: The configuration of the model.
+    - `model.safetensors`: The weights of the model.
+    - `tokenizer.json`: The tokenizer of the model.
+
+    :param folder_or_repo_path: The path to the folder containing the pipeline.
+    :param token: The token to use to download the pipeline from the hub. If this is None, you will only
+        be able to load the pipeline from a local folder, public repository, or a repository that you have access to
+        because you are logged in. The token is used for both the head and the encoder.
+    :param trust_remote_code: Whether to trust the remote code. If this is False,
+        we will only load components coming from `sklearn`. If this is True, we will load all components.
+        If you set this to True, you are responsible for whatever happens.
+    :return: The encoder model and the loaded head
+    :raises FileNotFoundError: If the pipeline file does not exist in the folder.
+    :raises ValueError: If an untrusted type is found in the pipeline, and `trust_remote_code` is False.
+    """
+    folder_or_repo_path = Path(folder_or_repo_path)
+    model_filename = _DEFAULT_MODEL_FILENAME
+    if folder_or_repo_path.exists():
+        # Local folder: the head must sit next to the encoder files.
+        head_pipeline_path = folder_or_repo_path / model_filename
+        if not head_pipeline_path.exists():
+            raise FileNotFoundError(f"Pipeline file does not exist in {folder_or_repo_path}")
+    else:
+        # Not a local path, so treat it as a repository id on the hub.
+        head_pipeline_path = huggingface_hub.hf_hub_download(
+            folder_or_repo_path.as_posix(), model_filename, token=token
+        )
+
+    # Forward the token as well, so the encoder of a private repository can be fetched
+    # with the same credentials as the head.
+    model = StaticModel.from_pretrained(folder_or_repo_path, token=token)
+
+    unknown_types = skops.io.get_untrusted_types(file=head_pipeline_path)
+    # If the user does not trust remote code, we should check that the unknown types are trusted.
+    # By default, we trust everything coming from scikit-learn.
+    if not trust_remote_code:
+        for t in unknown_types:
+            if not _DEFAULT_TRUST_PATTERN.match(t):
+                raise ValueError(f"Untrusted type {t}.")
+    # Every type still in `unknown_types` was either vetted above or accepted explicitly
+    # by the caller through `trust_remote_code`.
+    head = skops.io.load(head_pipeline_path, trusted=unknown_types)
+
+    return model, head
+
+
+def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> None:
+    """
+    Write a pipeline (head, encoder and model card) to a folder.
+
+    The folder is created when it does not exist yet. The head is stored with skops,
+    the encoder through its own `save_pretrained`, and the classifier model card is written last.
+
+    :param pipeline: The pipeline to save.
+    :param folder_path: The path to the folder to save the pipeline to.
+    """
+    target_dir = Path(folder_path)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    skops.io.dump(pipeline.head, target_dir / _DEFAULT_MODEL_FILENAME)
+    pipeline.model.save_pretrained(target_dir)
+
+    # The base model name may be a string or a list; the card gets one name.
+    raw_name = pipeline.model.base_model_name
+    if isinstance(raw_name, str):
+        card_base_name = raw_name
+    elif isinstance(raw_name, list) and raw_name:
+        card_base_name = raw_name[0]
+    else:
+        card_base_name = "unknown"
+
+    _create_model_card(
+        target_dir,
+        base_model_name=card_base_name,
+        language=pipeline.model.language,
+        template_path="modelcards/classifier_template.md",
+    )
diff --git a/model2vec/model.py b/model2vec/model.py
@@ -87,7 +87,7 @@ def normalize(self) -> bool:
     @normalize.setter
     def normalize(self, value: bool) -> None:
         """Update the config if the value of normalize changes."""
-        config_normalize = self.config.get("normalize", False)
+        config_normalize = self.config.get("normalize")
         self._normalize = value
         if config_normalize is not None and value != config_normalize:
             logger.warning(

diff --git a/model2vec/modelcards/classifier_template.md b/model2vec/modelcards/classifier_template.md
@@ -0,0 +1,49 @@
+---
+{{ card_data }}
+---
+
+# {{ model_name }} Model Card
+
+This [Model2Vec](https://github.com/MinishLab/model2vec) model is a fine-tuned version of {% if base_model %}the [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %}a{% endif %} Model2Vec model. It also includes a classifier head on top.
+
+## Installation
+
+Install model2vec using pip:
+```
+pip install model2vec[inference]
+```
+
+## Usage
+Load this model using the `from_pretrained` method:
+```python
+from model2vec.inference import StaticModelPipeline
+
+# Load a pretrained Model2Vec model
+model = StaticModelPipeline.from_pretrained("{{ model_name }}")
+
+# Predict labels
+predicted = model.predict(["Example sentence"])
+```
+
+## Additional Resources
+
+- [All Model2Vec models on the hub](https://huggingface.co/models?library=model2vec)
+- [Model2Vec Repo](https://github.com/MinishLab/model2vec)
+- [Model2Vec Results](https://github.com/MinishLab/model2vec?tab=readme-ov-file#results)
+- [Model2Vec Tutorials](https://github.com/MinishLab/model2vec/tree/main/tutorials)
+
+## Library Authors
+
+Model2Vec was developed by the [Minish Lab](https://github.com/MinishLab) team consisting of [Stephan Tulkens](https://github.com/stephantul) and [Thomas van Dongen](https://github.com/Pringled).
+
+## Citation
+
+Please cite the [Model2Vec repository](https://github.com/MinishLab/model2vec) if you use this model in your work.
+```
+@software{minishlab2024model2vec,
+  author = {Stephan Tulkens and Thomas van Dongen},
+  title = {Model2Vec: Turn any Sentence Transformer into a Small Fast Model},
+  year = {2024},
+  url = {https://github.com/MinishLab/model2vec},
+}
+```
diff --git a/model2vec/model_card_template.md → model2vec/modelcards/model_card_template.md b/model2vec/model_card_template.md → model2vec/modelcards/model_card_template.md
@@ -4,7 +4,7 @@
 
 # {{ model_name }} Model Card
 
-This [Model2Vec](https://github.com/MinishLab/model2vec) model is a distilled version of {% if base_model %}the [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %}a{% endif %} Sentence Transformer. It uses static embeddings, allowing text embeddings to be computed orders of magnitude faster on both GPU and CPU. It is designed for applications where computational resources are limited or where real-time performance is critical.
+This [Model2Vec](https://github.com/MinishLab/model2vec) model is a distilled version of {% if base_model %}the [{{ base_model }}](https://huggingface.co/{{ base_model }}){% else %}a{% endif %} Sentence Transformer. It uses static embeddings, allowing text embeddings to be computed orders of magnitude faster on both GPU and CPU. It is designed for applications where computational resources are limited or where real-time performance is critical.
 
 
 ## Installation