
Add fittable #140


Merged (64 commits) on Feb 7, 2025
Changes from 1 commit

64 commits
4078a3b
Fix tokenizer issue
stephantul Dec 22, 2024
09f888d
fix issue with warning
stephantul Dec 22, 2024
2167a4e
regenerate lock file
stephantul Dec 22, 2024
c95dca5
fix lock file
stephantul Dec 22, 2024
b5d8bb7
Try to not select 2.5.1
stephantul Dec 22, 2024
3e68669
fix: issue with dividers in utils
stephantul Dec 22, 2024
1ae4d61
Try to not select 2.5.0
stephantul Dec 22, 2024
1349b0c
fix: do not up version
stephantul Dec 22, 2024
4b83d59
Attempt special fix
stephantul Dec 22, 2024
9515b83
merge
stephantul Dec 23, 2024
dfd865b
feat: add training
stephantul Dec 23, 2024
c4ba272
merge with old
stephantul Dec 23, 2024
4713bfa
fix: no grad
stephantul Dec 24, 2024
e8058bb
use numpy
stephantul Dec 24, 2024
a59127e
Add train_test_split
stephantul Dec 24, 2024
310fbb5
fix: issue with fit not resetting
stephantul Dec 24, 2024
b1899d1
feat: add lightning
stephantul Dec 28, 2024
e27f9dc
merge
stephantul Dec 28, 2024
8df3aaf
Fix bugs
stephantul Jan 3, 2025
839d88a
fix: reviewer comments
stephantul Jan 5, 2025
8457357
fix train issue
stephantul Jan 5, 2025
a750709
fix issue with trainer
stephantul Jan 7, 2025
e83c54e
fix: truncate during training
stephantul Jan 7, 2025
803565d
feat: tokenize maximum length truncation
stephantul Jan 7, 2025
9052806
fixes
stephantul Jan 8, 2025
2f9fbf4
typo
stephantul Jan 8, 2025
f1e08c3
Add progressbar
stephantul Jan 8, 2025
bb54a76
small code changes, add docs
stephantul Jan 8, 2025
69ee4ee
fix training comments
stephantul Jan 8, 2025
9962be7
Merge branch 'main' into add-fittable
stephantul Jan 16, 2025
ffec235
Add pipeline saving
stephantul Jan 16, 2025
0af84fc
fix bug
stephantul Jan 16, 2025
c829745
fix issue with normalize test
stephantul Jan 16, 2025
9ce65a1
change default batch size
stephantul Jan 17, 2025
e1169fb
feat: add sklearn skops pipeline
stephantul Jan 20, 2025
f096824
Device handling and automatic batch size
stephantul Jan 20, 2025
ff3ebdf
Add docstrings, defaults
stephantul Jan 20, 2025
b4e966a
docs
stephantul Jan 20, 2025
8f65bfd
fix: rename
stephantul Jan 21, 2025
8cdb668
fix: rename
stephantul Jan 21, 2025
e96a72a
fix installation
stephantul Jan 21, 2025
3e76083
rename
stephantul Jan 21, 2025
9f1cb5a
Add training tutorial
stephantul Jan 23, 2025
e2d92b9
Add tutorial link
stephantul Jan 23, 2025
657cef0
Merge branch 'main' into add-fittable
stephantul Jan 24, 2025
773009f
test: add tests
stephantul Jan 24, 2025
7015341
fix tests
stephantul Jan 24, 2025
8ab8456
tests: fix tests
stephantul Jan 24, 2025
e21e61f
Address comments
stephantul Jan 26, 2025
ff75af9
Add inference reqs to train reqs
stephantul Jan 26, 2025
87de7c4
fix normalize
stephantul Jan 26, 2025
1fb33f1
update lock file
stephantul Jan 26, 2025
59f0076
Merge branch 'main' into add-fittable
stephantul Jan 26, 2025
009342b
Merge branch 'main' into add-fittable
stephantul Feb 3, 2025
261a9b4
fix: move modelcards
stephantul Feb 3, 2025
e1d53ac
fix: batch size
stephantul Feb 3, 2025
6b5f991
update lock file
stephantul Feb 3, 2025
759b96c
Update model2vec/inference/README.md
stephantul Feb 7, 2025
7caf9bc
Update model2vec/inference/README.md
stephantul Feb 7, 2025
c7b68b6
Update model2vec/inference/README.md
stephantul Feb 7, 2025
be7baa1
Update model2vec/train/classifier.py
stephantul Feb 7, 2025
cc74618
fix: encode args
stephantul Feb 7, 2025
a4d8d6c
fix: trust_remote_code
stephantul Feb 7, 2025
a0d56d5
fix notebook
stephantul Feb 7, 2025
tests: fix tests
stephantul committed Jan 24, 2025
commit 8ab8456f4c37edbefcb4d89910755dabe65c85b7
39 changes: 25 additions & 14 deletions model2vec/train/classifier.py
@@ -2,6 +2,7 @@

import logging
from collections import Counter
from tempfile import TemporaryDirectory

import lightning as pl
import numpy as np
@@ -148,23 +149,26 @@
val_check_interval = None
check_val_every_epoch = 1
else:
val_check_interval = max(250, 2 * len(val_dataset) // batch_size)
check_val_every_epoch = None

[Codecov / codecov/patch warning: added lines model2vec/train/classifier.py#L152-L153 were not covered by tests]
trainer = pl.Trainer(
max_epochs=500,
callbacks=callbacks,
val_check_interval=val_check_interval,
check_val_every_n_epoch=check_val_every_epoch,
accelerator=device,
)

trainer.fit(
c,
train_dataloaders=train_dataset.to_dataloader(shuffle=True, batch_size=batch_size),
val_dataloaders=val_dataset.to_dataloader(shuffle=False, batch_size=batch_size),
)
best_model_path = trainer.checkpoint_callback.best_model_path # type: ignore
best_model_weights = torch.load(best_model_path, weights_only=True)
with TemporaryDirectory() as tempdir:
trainer = pl.Trainer(
max_epochs=500,
callbacks=callbacks,
val_check_interval=val_check_interval,
check_val_every_n_epoch=check_val_every_epoch,
accelerator=device,
default_root_dir=tempdir,
)

trainer.fit(
c,
train_dataloaders=train_dataset.to_dataloader(shuffle=True, batch_size=batch_size),
val_dataloaders=val_dataset.to_dataloader(shuffle=False, batch_size=batch_size),
)
best_model_path = trainer.checkpoint_callback.best_model_path # type: ignore
best_model_weights = torch.load(best_model_path, weights_only=True)

state_dict = {}
for weight_name, weight in best_model_weights["state_dict"].items():
@@ -181,7 +185,7 @@
self.classes_ = classes

if len(self.classes) != self.out_dim:
self.out_dim = len(self.classes)

[Codecov / codecov/patch warning: added line model2vec/train/classifier.py#L188 was not covered by tests]

self.head = self.construct_head()
self.embeddings = nn.Embedding.from_pretrained(self.vectors.clone(), freeze=False, padding_idx=self.pad_id)
@@ -228,7 +232,14 @@
for index, layer in enumerate([module for module in self.head if isinstance(module, nn.Linear)]):
mlp_head.coefs_[index] = layer.weight.detach().cpu().numpy().T
mlp_head.intercepts_[index] = layer.bias.detach().cpu().numpy()
# Below is necessary to ensure that the converted model works correctly.
# In scikit-learn, a binary classifier only has a single vector of output coefficients
# and a single intercept. We use two output vectors.
# To convert correctly, we need to set the outputs correctly, and fix the activation function.
# Make sure n_outputs is set to > 1.
mlp_head.n_outputs_ = self.out_dim
# Set to softmax
mlp_head.out_activation_ = "softmax"

return StaticModelPipeline(static_model, converted)

@@ -242,7 +253,7 @@

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Simple forward pass."""
return self.model(x)

[Codecov / codecov/patch warning: added line model2vec/train/classifier.py#L256 was not covered by tests]

def training_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
"""Simple training step using cross entropy loss."""
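The main change in the training hunk above is that the Lightning `Trainer` now runs inside a `TemporaryDirectory` passed as `default_root_dir`, so checkpoints and `lightning_logs` no longer accumulate in the caller's working directory, and the best checkpoint is loaded back into memory before the directory is cleaned up. A minimal sketch of that pattern, not the PR's exact code; it assumes the `LightningModule` logs a `val_loss` metric for the checkpoint callback to monitor:

```python
from tempfile import TemporaryDirectory

import lightning as pl
import torch
from lightning.pytorch.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader


def fit_and_recover_best_weights(
    model: pl.LightningModule, train_loader: DataLoader, val_loader: DataLoader
) -> dict:
    """Train inside a temporary directory and return the best checkpoint's state dict."""
    # Track the best checkpoint; assumes the module logs "val_loss" during validation.
    checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min")
    with TemporaryDirectory() as tempdir:
        trainer = pl.Trainer(
            max_epochs=500,
            callbacks=[checkpoint_callback],
            default_root_dir=tempdir,  # checkpoints and lightning_logs stay in the temp dir
        )
        trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
        # Read the best checkpoint back before the temporary directory is deleted.
        checkpoint = torch.load(checkpoint_callback.best_model_path, weights_only=True)
    return checkpoint["state_dict"]
```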
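The inline comments in the `to_pipeline` hunk explain the conversion constraint: scikit-learn represents a binary MLP classifier with a single output column and a logistic output activation, while the trained torch head keeps two output units, so `n_outputs_` and `out_activation_` must be patched after copying the weights. Below is a hedged, self-contained sketch of that idea; the PR's helper that builds `mlp_head` is not shown in this diff, so the sketch fits a throwaway `MLPClassifier` on dummy data instead, and `torch_head` / `X_dummy` are illustrative names, not library API:

```python
import numpy as np
import torch
from sklearn.neural_network import MLPClassifier
from torch import nn

hidden_dim, out_dim = 12, 2
torch_head = nn.Sequential(nn.Linear(8, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, out_dim))

# Fit a skeleton sklearn model with matching layer sizes on dummy data so all
# fitted attributes exist, then overwrite its parameters with the torch weights.
mlp_head = MLPClassifier(hidden_layer_sizes=(hidden_dim,), max_iter=5)
X_dummy = np.random.randn(8, 8)
mlp_head.fit(X_dummy, ["a", "b"] * 4)

for index, layer in enumerate(m for m in torch_head if isinstance(m, nn.Linear)):
    mlp_head.coefs_[index] = layer.weight.detach().cpu().numpy().T
    mlp_head.intercepts_[index] = layer.bias.detach().cpu().numpy()

# scikit-learn's binary classifier has one logistic output; the torch head has two
# softmax outputs, so declare both output columns and swap the output activation.
mlp_head.n_outputs_ = out_dim
mlp_head.out_activation_ = "softmax"

# Sanity check: the converted head reproduces the torch head's probabilities.
with torch.no_grad():
    torch_probs = torch.softmax(torch_head(torch.from_numpy(X_dummy).float()), dim=1).numpy()
assert np.allclose(mlp_head.predict_proba(X_dummy), torch_probs, atol=1e-5)
```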
19 changes: 8 additions & 11 deletions tests/conftest.py
@@ -20,7 +20,7 @@
@pytest.fixture(scope="session")
def mock_tokenizer() -> Tokenizer:
"""Create a mock tokenizer."""
vocab = ["word1", "word2", "word3", "[UNK]", "[PAD]"]
vocab = ["[PAD]", "word1", "word2", "word3", "[UNK]"]
unk_token = "[UNK]"

model = WordLevel(vocab={word: idx for idx, word in enumerate(vocab)}, unk_token=unk_token)
@@ -81,21 +81,18 @@ def mock_config() -> dict[str, str]:


@pytest.fixture(scope="session")
def mock_inference_pipeline(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer) -> StaticModelPipeline:
def mock_inference_pipeline(mock_trained_pipeline: StaticModelForClassification) -> StaticModelPipeline:
"""Mock pipeline."""
encoder = StaticModel(vectors=mock_vectors, tokenizer=mock_tokenizer, config={})
encoded = encoder.encode(["dog", "cat"])
labels = ["a", "b"]
head = make_pipeline(MLPClassifier(random_state=12)).fit(encoded, labels)

return StaticModelPipeline(encoder, head=head)
return mock_trained_pipeline.to_pipeline()


@pytest.fixture(scope="session")
def mock_trained_pipeline(mock_vectors: np.ndarray, mock_tokenizer: Tokenizer) -> StaticModelForClassification:
def mock_trained_pipeline() -> StaticModelForClassification:
"""Mock staticmodelforclassification."""
vectors_torched = torch.from_numpy(mock_vectors).float()
s = StaticModelForClassification(vectors=vectors_torched, tokenizer=mock_tokenizer).to("cpu")
tokenizer = AutoTokenizer.from_pretrained("tests/data/test_tokenizer").backend_tokenizer
torch.random.manual_seed(42)
vectors_torched = torch.randn(len(tokenizer.get_vocab()), 12)
s = StaticModelForClassification(vectors=vectors_torched, tokenizer=tokenizer, hidden_dim=12).to("cpu")
s.fit(["dog", "cat"], ["a", "b"], device="cpu")

return s
16 changes: 8 additions & 8 deletions tests/test_inference.py
@@ -10,25 +10,25 @@

def test_init_predict(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test successful initialization of StaticModelPipeline."""
assert mock_inference_pipeline.predict("dog").tolist() == ["a"]
assert mock_inference_pipeline.predict(["dog"]).tolist() == ["a"]
assert mock_inference_pipeline.predict("dog").tolist() == ["b"]
assert mock_inference_pipeline.predict(["dog"]).tolist() == ["b"]


def test_init_predict_proba(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test successful initialization of StaticModelPipeline."""
assert mock_inference_pipeline.predict_proba("dog").argmax() == 0
assert mock_inference_pipeline.predict_proba(["dog"]).argmax(1).tolist() == [0]
assert mock_inference_pipeline.predict_proba("dog").argmax() == 1
assert mock_inference_pipeline.predict_proba(["dog"]).argmax(1).tolist() == [1]


def test_roundtrip_save(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test saving and loading the pipeline."""
with TemporaryDirectory() as temp_dir:
mock_inference_pipeline.save_pretrained(temp_dir)
loaded = StaticModelPipeline.from_pretrained(temp_dir)
assert loaded.predict("dog") == ["a"]
assert loaded.predict(["dog"]) == ["a"]
assert loaded.predict_proba("dog").argmax() == 0
assert loaded.predict_proba(["dog"]).argmax(1).tolist() == [0]
assert loaded.predict("dog") == ["b"]
assert loaded.predict(["dog"]) == ["b"]
assert loaded.predict_proba("dog").argmax() == 1
assert loaded.predict_proba(["dog"]).argmax(1).tolist() == [1]


@patch("model2vec.inference.model._DEFAULT_TRUST_PATTERN", re.compile("torch"))
18 changes: 12 additions & 6 deletions tests/test_trainable.py
@@ -69,7 +69,7 @@ def test_init_classifier_from_model(mock_vectors: np.ndarray, mock_tokenizer: To
def test_encode(mock_trained_pipeline: StaticModelForClassification) -> None:
"""Test the encode function."""
result = mock_trained_pipeline._encode(torch.tensor([[0, 1], [1, 0]]).long())
assert result.shape == (2, 2)
assert result.shape == (2, 12)
assert torch.allclose(result[0], result[1])


@@ -111,23 +111,29 @@ def test_textdataset_init_incorrect() -> None:

def test_predict(mock_trained_pipeline: StaticModelForClassification) -> None:
"""Test the predict function."""
result = mock_trained_pipeline.predict(["dog dog", "cat"]).tolist()
assert result == ["a", "a"]
result = mock_trained_pipeline.predict(["dog cat", "dog"]).tolist()
assert result == ["b", "b"]


def test_predict_proba(mock_trained_pipeline: StaticModelForClassification) -> None:
"""Test the predict function."""
result = mock_trained_pipeline.predict_proba(["dog dog", "cat"])
result = mock_trained_pipeline.predict_proba(["dog cat", "dog"])
assert result.shape == (2, 2)


def test_convert_to_pipeline(mock_trained_pipeline: StaticModelForClassification) -> None:
"""Convert a model to a pipeline."""
mock_trained_pipeline.eval()
pipeline = mock_trained_pipeline.to_pipeline()
a = pipeline.predict(["dog dog", "cat"]).tolist()
b = mock_trained_pipeline.predict(["dog dog", "cat"]).tolist()
encoded_pipeline = pipeline.model.encode(["dog cat", "dog"])
encoded_model = mock_trained_pipeline(mock_trained_pipeline.tokenize(["dog cat", "dog"]))[1].detach().numpy()
assert np.allclose(encoded_pipeline, encoded_model)
a = pipeline.predict(["dog cat", "dog"]).tolist()
b = mock_trained_pipeline.predict(["dog cat", "dog"]).tolist()
assert a == b
p1 = pipeline.predict_proba(["dog cat", "dog"])
p2 = mock_trained_pipeline.predict_proba(["dog cat", "dog"])
assert np.allclose(p1, p2)


def test_train_test_split() -> None: