|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import re |
| 4 | +from pathlib import Path |
| 5 | +from tempfile import TemporaryDirectory |
| 6 | + |
| 7 | +import huggingface_hub |
| 8 | +import numpy as np |
| 9 | +import skops.io |
| 10 | +from sklearn.pipeline import Pipeline |
| 11 | + |
| 12 | +from model2vec.hf_utils import _create_model_card |
| 13 | +from model2vec.model import PathLike, StaticModel |
| 14 | + |
# Only types whose fully-qualified name starts with "sklearn." are trusted by
# default when deserializing a skops pipeline (checked in _load_pipeline).
_DEFAULT_TRUST_PATTERN = re.compile(r"sklearn\..+")
# Filename under which the sklearn head of the pipeline is stored in a folder/repo.
_DEFAULT_MODEL_FILENAME = "pipeline.skops"
| 17 | + |
| 18 | + |
class StaticModelPipeline:
    """A pipeline that combines a StaticModel encoder with a fitted sklearn head."""

    def __init__(self, model: StaticModel, head: Pipeline) -> None:
        """
        Create a pipeline with a StaticModel encoder.

        :param model: The StaticModel used to turn input texts into embeddings.
        :param head: A fitted sklearn Pipeline applied on top of the embeddings.
        """
        self.model = model
        self.head = head

    @classmethod
    def from_pretrained(
        cls: type[StaticModelPipeline], path: PathLike, token: str | None = None, trust_remote_code: bool = False
    ) -> StaticModelPipeline:
        """
        Load a StaticModel from a local path or huggingface hub path.

        NOTE: if you load a private model from the huggingface hub, you need to pass a token.

        :param path: The path to the folder containing the pipeline, or a repository on the Hugging Face Hub
        :param token: The token to use to download the pipeline from the hub.
        :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `sklearn`.
        :return: The loaded pipeline.
        """
        model, head = _load_pipeline(path, token, trust_remote_code)
        # Replace any NaNs in the embedding matrix so they cannot propagate
        # into the head's predictions.
        model.embedding = np.nan_to_num(model.embedding)

        return cls(model, head)

    def save_pretrained(self, path: str) -> None:
        """
        Save the pipeline (encoder model and head) to a folder.

        :param path: The path to the folder to save the pipeline to.
        """
        save_pipeline(self, path)

    def push_to_hub(self, repo_id: str, token: str | None = None, private: bool = False) -> None:
        """
        Save a model to a folder, and then push that folder to the hf hub.

        :param repo_id: The id of the repository to push to.
        :param token: The token to use to push to the hub.
        :param private: Whether the repository should be private.
        """
        from model2vec.hf_utils import push_folder_to_hub

        with TemporaryDirectory() as temp_dir:
            # save_pipeline already saves the encoder model alongside the head,
            # so no separate self.model.save_pretrained call is needed here.
            save_pipeline(self, temp_dir)
            push_folder_to_hub(Path(temp_dir), repo_id, private, token)

    def _predict_and_coerce_to_2d(
        self,
        X: list[str] | str,
        show_progress_bar: bool,
        max_length: int | None,
        batch_size: int,
        use_multiprocessing: bool,
        multiprocessing_threshold: int,
    ) -> np.ndarray:
        """
        Encode the input and coerce the output to a 2D matrix.

        A single string input yields a 1D embedding; the head always expects a
        matrix, so we add a leading batch axis in that case.

        :param X: The input text(s) to encode.
        :param show_progress_bar: Whether to display a progress bar during encoding.
        :param max_length: The maximum sequence length used by the encoder.
        :param batch_size: The batch size used by the encoder.
        :param use_multiprocessing: Whether the encoder may use multiprocessing.
        :param multiprocessing_threshold: The minimum input size before multiprocessing kicks in.
        :return: A 2D array of embeddings.
        """
        encoded = self.model.encode(
            X,
            show_progress_bar=show_progress_bar,
            max_length=max_length,
            batch_size=batch_size,
            use_multiprocessing=use_multiprocessing,
            multiprocessing_threshold=multiprocessing_threshold,
        )
        if np.ndim(encoded) == 1:
            encoded = encoded[None, :]

        return encoded

    def predict(
        self,
        X: list[str] | str,
        show_progress_bar: bool = False,
        max_length: int | None = 512,
        batch_size: int = 1024,
        use_multiprocessing: bool = True,
        multiprocessing_threshold: int = 10_000,
    ) -> np.ndarray:
        """
        Predict the labels of the input.

        :param X: The input text(s) to classify.
        :param show_progress_bar: Whether to display a progress bar during encoding.
        :param max_length: The maximum sequence length used by the encoder.
        :param batch_size: The batch size used by the encoder.
        :param use_multiprocessing: Whether the encoder may use multiprocessing.
        :param multiprocessing_threshold: The minimum input size before multiprocessing kicks in.
        :return: The predicted labels, one per input.
        """
        encoded = self._predict_and_coerce_to_2d(
            X,
            show_progress_bar=show_progress_bar,
            max_length=max_length,
            batch_size=batch_size,
            use_multiprocessing=use_multiprocessing,
            multiprocessing_threshold=multiprocessing_threshold,
        )

        return self.head.predict(encoded)

    def predict_proba(
        self,
        X: list[str] | str,
        show_progress_bar: bool = False,
        max_length: int | None = 512,
        batch_size: int = 1024,
        use_multiprocessing: bool = True,
        multiprocessing_threshold: int = 10_000,
    ) -> np.ndarray:
        """
        Predict the probabilities of the labels of the input.

        :param X: The input text(s) to classify.
        :param show_progress_bar: Whether to display a progress bar during encoding.
        :param max_length: The maximum sequence length used by the encoder.
        :param batch_size: The batch size used by the encoder.
        :param use_multiprocessing: Whether the encoder may use multiprocessing.
        :param multiprocessing_threshold: The minimum input size before multiprocessing kicks in.
        :return: A matrix of per-class probabilities, one row per input.
        """
        encoded = self._predict_and_coerce_to_2d(
            X,
            show_progress_bar=show_progress_bar,
            max_length=max_length,
            batch_size=batch_size,
            use_multiprocessing=use_multiprocessing,
            multiprocessing_threshold=multiprocessing_threshold,
        )

        return self.head.predict_proba(encoded)
| 127 | + |
| 128 | + |
def _load_pipeline(
    folder_or_repo_path: PathLike, token: str | None = None, trust_remote_code: bool = False
) -> tuple[StaticModel, Pipeline]:
    """
    Load a model and an sklearn pipeline.

    This assumes the following files are present in the repo:
    - `pipeline.skops`: The head of the pipeline.
    - `config.json`: The configuration of the model.
    - `model.safetensors`: The weights of the model.
    - `tokenizer.json`: The tokenizer of the model.

    :param folder_or_repo_path: The path to the folder containing the pipeline.
    :param token: The token to use to download the pipeline from the hub. If this is None, you will only
        be able to load the pipeline from a local folder, public repository, or a repository that you have access to
        because you are logged in.
    :param trust_remote_code: Whether to trust the remote code. If this is False,
        we will only load components coming from `sklearn`. If this is True, we will load all components.
        If you set this to True, you are responsible for whatever happens.
    :return: The encoder model and the loaded head
    :raises FileNotFoundError: If the pipeline file does not exist in the folder.
    :raises ValueError: If an untrusted type is found in the pipeline, and `trust_remote_code` is False.
    """
    folder_or_repo_path = Path(folder_or_repo_path)
    model_filename = _DEFAULT_MODEL_FILENAME
    if folder_or_repo_path.exists():
        # Local folder: the head must be present on disk.
        head_pipeline_path = folder_or_repo_path / model_filename
        if not head_pipeline_path.exists():
            raise FileNotFoundError(f"Pipeline file does not exist in {folder_or_repo_path}")
    else:
        # Not a local path: treat it as a hub repo id and download the head.
        head_pipeline_path = huggingface_hub.hf_hub_download(
            folder_or_repo_path.as_posix(), model_filename, token=token
        )

    # Forward the token so private repositories also work for the model weights,
    # not just for the pipeline head downloaded above.
    model = StaticModel.from_pretrained(folder_or_repo_path, token=token)

    unknown_types = skops.io.get_untrusted_types(file=head_pipeline_path)
    # If the user does not trust remote code, we should check that the unknown types are trusted.
    # By default, we trust everything coming from scikit-learn.
    if not trust_remote_code:
        for t in unknown_types:
            if not _DEFAULT_TRUST_PATTERN.match(t):
                raise ValueError(f"Untrusted type {t}.")
    head = skops.io.load(head_pipeline_path, trusted=unknown_types)

    return model, head
| 175 | + |
| 176 | + |
def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> None:
    """
    Serialize a pipeline into a folder.

    Dumps the sklearn head via skops, saves the encoder model alongside it,
    and writes a model card based on the classifier template.

    :param pipeline: The pipeline to save.
    :param folder_path: The path to the folder to save the pipeline to.
    """
    out_dir = Path(folder_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    skops.io.dump(pipeline.head, out_dir / _DEFAULT_MODEL_FILENAME)
    pipeline.model.save_pretrained(out_dir)
    # The base model name may be a (possibly empty) list, a string, or something else;
    # fall back to "unknown" when no usable name is available.
    base = pipeline.model.base_model_name
    if isinstance(base, str):
        name = base
    elif isinstance(base, list) and base:
        name = base[0]
    else:
        name = "unknown"
    _create_model_card(
        out_dir,
        base_model_name=name,
        language=pipeline.model.language,
        template_path="modelcards/classifier_template.md",
    )
0 commit comments