
feat: Add multiprocessing #141

Merged
merged 9 commits on Dec 27, 2024

Changes from 1 commit

Added multiprocessing threshold
Pringled committed Dec 27, 2024
commit 345f590a4c739f11e0e6720688ad5d8b8d71fe71
17 changes: 7 additions & 10 deletions model2vec/model.py
@@ -7,21 +7,17 @@
 from typing import Any, Iterator, Union
 
 import numpy as np
+from joblib import delayed
 from tokenizers import Encoding, Tokenizer
 from tqdm import tqdm
 
-from model2vec.utils import load_local_model
+from model2vec.utils import ProgressParallel, load_local_model
 
 PathLike = Union[Path, str]
 
 
 logger = getLogger(__name__)
 
-
-from joblib import delayed
-from tqdm.auto import tqdm
-
-from model2vec.utils import ProgressParallel
-
+MULTIPROCESSING_THRESHOLD = 6000
 
 class StaticModel:
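
For context, ProgressParallel is imported from model2vec.utils and its definition is not part of this diff. A common way to implement such a helper, and a plausible sketch of what it does here, is a joblib.Parallel subclass that mirrors joblib's task counters into a tqdm bar; the use_tqdm and total keyword arguments match the call sites in the hunks below, everything else is an assumption:

from __future__ import annotations

from typing import Any

import joblib
from tqdm.auto import tqdm


# Sketch only: the real model2vec.utils.ProgressParallel is not shown in this diff.
class ProgressParallel(joblib.Parallel):
    """A joblib.Parallel that reports progress through a tqdm bar."""

    def __init__(self, use_tqdm: bool = True, total: int | None = None, *args: Any, **kwargs: Any) -> None:
        self._use_tqdm = use_tqdm
        self._total = total
        super().__init__(*args, **kwargs)  # n_jobs etc. are forwarded to joblib.Parallel

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        # Keep one bar open for the lifetime of the parallel run.
        with tqdm(disable=not self._use_tqdm, total=self._total) as self._pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self) -> None:
        # joblib invokes this hook as tasks finish; sync its counters to the bar.
        if self._total is None:
            self._pbar.total = self.n_dispatched_tasks
        self._pbar.n = self.n_completed_tasks
        self._pbar.refresh()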
@@ -205,13 +201,14 @@
         sentence_batches = list(self._batch(sentences, batch_size))
         total_batches = math.ceil(len(sentences) / batch_size)
 
-        if use_multiprocessing:
+        if use_multiprocessing and len(sentences) > MULTIPROCESSING_THRESHOLD:
+            # Use joblib for multiprocessing if requested, and if we have enough sentences
             results = ProgressParallel(n_jobs=-1, use_tqdm=show_progress_bar, total=total_batches)(
                 delayed(self._encode_batch_as_sequence)(batch, max_length) for batch in sentence_batches
             )
             out_array: list[np.ndarray] = []
             for r in results:
                 out_array.extend(r)
         else:
             out_array = []
             for batch in tqdm(
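
Note the reduction in this branch: _encode_batch_as_sequence (not shown in this diff) returns one result per batch that is itself a list, presumably one array of per-token embeddings per sentence, so the worker outputs are flattened with extend rather than stacked. A minimal illustration of that step, with made-up shapes:

import numpy as np

# Made-up worker outputs: each batch yields a list of (num_tokens, dim)
# arrays, one per sentence, and sentences have different token counts.
results = [
    [np.zeros((3, 8)), np.zeros((5, 8))],  # batch 1: two sentences
    [np.zeros((2, 8))],                    # batch 2: one sentence
]

# Flatten the batches into one list, keeping a separate (num_tokens, dim)
# array per sentence instead of merging them into a single matrix.
out_array: list[np.ndarray] = []
for r in results:
    out_array.extend(r)
assert len(out_array) == 3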
Expand Down Expand Up @@ -270,12 +267,12 @@
         sentence_batches = list(self._batch(sentences, batch_size))
         total_batches = math.ceil(len(sentences) / batch_size)
 
-        if use_multiprocessing:
-            # Use joblib for multiprocessing if requested
+        if use_multiprocessing and len(sentences) > MULTIPROCESSING_THRESHOLD:
+            # Use joblib for multiprocessing if requested, and if we have enough sentences
             results = ProgressParallel(n_jobs=-1, use_tqdm=show_progress_bar, total=total_batches)(
                 delayed(self._encode_batch)(batch, max_length) for batch in sentence_batches
             )
             out_array = np.concatenate(results, axis=0)
         else:
             # Don't use multiprocessing
             out_arrays: list[np.ndarray] = []

Codecov / codecov/patch: added lines #L206, #L209-L211, #L272, and #L275 in model2vec/model.py were not covered by tests.
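
Taken together, the commit makes use_multiprocessing take effect only when the input is large enough to amortize the cost of spawning workers and serializing batches. A hypothetical usage sketch (the model name is only an example):

from model2vec import StaticModel

model = StaticModel.from_pretrained("minishlab/potion-base-8M")  # example model

sentences = ["it is dangerous to go alone"] * 10_000

# 100 sentences: below MULTIPROCESSING_THRESHOLD (6000), so this encodes in a
# single process even though multiprocessing was requested.
small_out = model.encode(sentences[:100], use_multiprocessing=True)

# 10,000 sentences: above the threshold, so batches are dispatched through
# ProgressParallel(n_jobs=-1), i.e. one worker per available CPU core.
large_out = model.encode(sentences, use_multiprocessing=True)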