feat: mypy for all type check (#10921)

This commit is contained in:
yihong
2024-12-24 18:38:51 +08:00
committed by GitHub
parent c91e8b1737
commit 56e15d09a9
584 changed files with 3975 additions and 2826 deletions

View File

@@ -26,8 +26,8 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
def from_encoder(
cls: type[TS],
embedding_model_instance: Optional[ModelInstance],
allowed_special: Union[Literal[all], Set[str]] = set(),
disallowed_special: Union[Literal[all], Collection[str]] = "all",
allowed_special: Union[Literal["all"], Set[str]] = set(), # noqa: UP037
disallowed_special: Union[Literal["all"], Collection[str]] = "all", # noqa: UP037
**kwargs: Any,
):
def _token_encoder(text: str) -> int:

View File

@@ -92,7 +92,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
texts, metadatas = [], []
for doc in documents:
texts.append(doc.page_content)
metadatas.append(doc.metadata)
metadatas.append(doc.metadata or {})
return self.create_documents(texts, metadatas=metadatas)
def _join_docs(self, docs: list[str], separator: str) -> Optional[str]:
@@ -143,7 +143,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
"""Text splitter that uses HuggingFace tokenizer to count length."""
try:
from transformers import PreTrainedTokenizerBase
from transformers import PreTrainedTokenizerBase # type: ignore
if not isinstance(tokenizer, PreTrainedTokenizerBase):
raise ValueError("Tokenizer received was not an instance of PreTrainedTokenizerBase")