@@ -137,25 +137,25 @@ def __init__(
137137
138138 self .tokenizer = SimpleTokenizer
139139
def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
    """Tokenize documents using the tokenizer stored on ``self.tokenizer``.

    Returns a list of simple Encoding-like objects with token strings.
    Note: BM25 uses a simple word tokenizer, not a learned tokenizer.

    Args:
        documents: Strings to tokenize, one entry per document.
        **kwargs: Ignored by this implementation.

    Returns:
        One Encoding-like object per document, in input order. Each object
        exposes ``tokens``, ``ids`` (the same token strings) and
        ``attention_mask`` (a list of ``1`` with one entry per token).
    """

    class SimpleEncoding:
        """Minimal object that mimics the Encoding attributes set below."""

        def __init__(self, tokens: list[str]):
            self.tokens = tokens
            self.ids = tokens  # For BM25, tokens are the IDs
            self.attention_mask = [1] * len(tokens)

    # The stand-ins are not real Encoding instances; typing the list as
    # list[Any] keeps the declared return type checkable without resorting
    # to `# type: ignore` comments.
    result: list[Any] = [
        SimpleEncoding(self.tokenizer.tokenize(document)) for document in documents
    ]
    return result
159159
160160 @classmethod
161161 def _list_supported_models (cls ) -> list [SparseModelDescription ]:
0 commit comments