@@ -111,8 +111,7 @@ def _process_wordpiece(
     tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None
 ) -> dict[str, Any]:
     """Process the WordPiece tokenizer JSON."""
-    unk_token = unk_token or tokenizer_json["model"]["unk_token"]
-    tokenizer_json["model"]["unk_token"] = "[UNK]" if unk_token else None
+    tokenizer_json["model"]["unk_token"] = unk_token
     tokenizer_json["model"]["vocab"] = {token: idx for idx, token in enumerate(pre_tokenized_tokens)}
 
     return tokenizer_json
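For reference, a minimal sketch (toy dict, not from the repo) of what `_process_wordpiece` now does: the unk token is written into the model JSON verbatim from the caller rather than re-derived from the JSON itself, and the vocab is rebuilt as a token-to-index map over the pre-tokenized tokens.

```python
# Toy model dict; real tokenizer JSONs carry more fields.
tokenizer_json = {"model": {"unk_token": "<old>", "vocab": {}}}
pre_tokenized_tokens = ["[UNK]", "[PAD]", "hello", "world"]

tokenizer_json["model"]["unk_token"] = "[UNK]"  # taken verbatim from the caller
tokenizer_json["model"]["vocab"] = {token: idx for idx, token in enumerate(pre_tokenized_tokens)}

assert tokenizer_json["model"]["vocab"] == {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
```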
@@ -128,20 +127,15 @@ def _process_bpe(tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str]
     return tokenizer_json
 
 
-def _process_unigram(
-    tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None
-) -> dict[str, Any]:
+def _process_unigram(tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str) -> dict[str, Any]:
     """Process the Unigram tokenizer JSON."""
-    unk_id = tokenizer_json["model"]["unk_id"]
-    vocab = tokenizer_json["model"]["vocab"]
-    unk_token = vocab[unk_id][0] if unk_id is not None else None
     current_probas = dict(tokenizer_json["model"]["vocab"])
     avg_proba = sum(current_probas.values()) / len(current_probas)
     new_probas = {word: current_probas.get(word, avg_proba) for word in pre_tokenized_tokens}
     tokenizer_json["model"]["vocab"] = sorted(new_probas.items(), key=lambda x: x[1], reverse=True)
 
     tokens, _ = zip(*tokenizer_json["model"]["vocab"])
-    tokenizer_json["model"]["unk_id"] = list(tokens).index(unk_token) if unk_token in tokens else None
+    tokenizer_json["model"]["unk_id"] = list(tokens).index(unk_token)
 
     return tokenizer_json
 
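The Unigram path can be traced on toy values (illustrative only): tokens not in the existing vocab get the average log-probability of the current entries, the vocab is re-sorted by probability, and, since the call site now always passes "[UNK]", `unk_id` is recovered with a plain `index` lookup, which raises `ValueError` if "[UNK]" is ever absent from `pre_tokenized_tokens`.

```python
# Toy Unigram vocab: (token, log-probability) pairs; not repo data.
model = {"vocab": [["[UNK]", -2.0], ["hello", -1.0], ["world", -3.0]]}
pre_tokenized_tokens = ["[UNK]", "hello", "brand_new"]

current_probas = dict(model["vocab"])
avg_proba = sum(current_probas.values()) / len(current_probas)  # (-2 - 1 - 3) / 3 == -2.0
new_probas = {w: current_probas.get(w, avg_proba) for w in pre_tokenized_tokens}
model["vocab"] = sorted(new_probas.items(), key=lambda x: x[1], reverse=True)
# -> [("hello", -1.0), ("[UNK]", -2.0), ("brand_new", -2.0)]

tokens, _ = zip(*model["vocab"])
model["unk_id"] = list(tokens).index("[UNK]")  # ValueError if "[UNK]" is missing
assert model["unk_id"] == 1
```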
@@ -168,11 +162,11 @@ def replace_vocabulary(
     tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
 
     if model_type == "WordPiece":
-        tokenizer_json = _process_wordpiece(tokenizer_json, pre_tokenized_tokens, unk_token)
+        tokenizer_json = _process_wordpiece(tokenizer_json, pre_tokenized_tokens, "[UNK]")
     elif model_type == "BPE":
         tokenizer_json = _process_bpe(tokenizer_json, pre_tokenized_tokens)
     elif model_type == "Unigram":
-        tokenizer_json = _process_unigram(tokenizer_json, pre_tokenized_tokens, unk_token)
+        tokenizer_json = _process_unigram(tokenizer_json, pre_tokenized_tokens, "[UNK]")
     else:
         raise ValueError(f"Unknown model type {model_type}")
 
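Hardcoding "[UNK]" at both call sites is consistent with the `added_tokens` filter at the top of this hunk, which keeps only `[UNK]` and `[PAD]` among the special tokens. A toy illustration of that filter (not repo data):

```python
# Only [UNK] and [PAD] survive the added-token filter, so "[UNK]" is the
# one unk token the downstream processors can rely on.
added_tokens = [
    {"content": "[UNK]"}, {"content": "[PAD]"}, {"content": "[CLS]"}, {"content": "[SEP]"},
]
kept = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
assert [x["content"] for x in kept] == ["[UNK]", "[PAD]"]
```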