chore(api/core): apply ruff reformatting (#7624)

This commit is contained in:
Bowen Liang
2024-09-10 17:00:20 +08:00
committed by GitHub
parent 178730266d
commit 2cf1187b32
724 changed files with 21180 additions and 21123 deletions

View File

@@ -8,13 +8,12 @@ logger = logging.getLogger(__name__)
class UnstructuredWordExtractor(BaseExtractor):
"""Loader that uses unstructured to load word documents.
"""
"""Loader that uses unstructured to load word documents."""
def __init__(
self,
file_path: str,
api_url: str,
self,
file_path: str,
api_url: str,
):
"""Initialize with file path."""
self._file_path = file_path
@@ -24,9 +23,7 @@ class UnstructuredWordExtractor(BaseExtractor):
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(
int(x) for x in __unstructured_version__.split(".")
)
unstructured_version = tuple(int(x) for x in __unstructured_version__.split("."))
# check the file extension
try:
import magic # noqa: F401
@@ -53,6 +50,7 @@ class UnstructuredWordExtractor(BaseExtractor):
elements = partition_docx(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:

View File

@@ -26,6 +26,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
def extract(self) -> list[Document]:
from unstructured.partition.email import partition_email
elements = partition_email(filename=self._file_path)
# noinspection PyBroadException
@@ -34,15 +35,16 @@ class UnstructuredEmailExtractor(BaseExtractor):
element_text = element.text.strip()
padding_needed = 4 - len(element_text) % 4
element_text += '=' * padding_needed
element_text += "=" * padding_needed
element_decode = base64.b64decode(element_text)
soup = BeautifulSoup(element_decode.decode('utf-8'), 'html.parser')
soup = BeautifulSoup(element_decode.decode("utf-8"), "html.parser")
element.text = soup.get_text()
except Exception:
pass
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:

View File

@@ -28,6 +28,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:

View File

@@ -38,6 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
elements = partition_md(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:

View File

@@ -14,11 +14,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
def __init__(self, file_path: str, api_url: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
@@ -28,6 +24,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
elements = partition_msg(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:

View File

@@ -14,12 +14,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str,
api_key: str
):
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

View File

@@ -14,11 +14,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
def __init__(self, file_path: str, api_url: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

View File

@@ -14,11 +14,7 @@ class UnstructuredTextExtractor(BaseExtractor):
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
def __init__(self, file_path: str, api_url: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
@@ -28,6 +24,7 @@ class UnstructuredTextExtractor(BaseExtractor):
elements = partition_text(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:

View File

@@ -14,11 +14,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
file_path: Path to the file to load.
"""
def __init__(
self,
file_path: str,
api_url: str
):
def __init__(self, file_path: str, api_url: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
@@ -28,6 +24,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks: