ppt & pptx improve (#1790)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
Jyong
2023-12-19 18:11:27 +08:00
committed by GitHub
parent 185c2f86cd
commit df1509983c
3 changed files with 32 additions and 10 deletions

View File

@@ -30,11 +30,18 @@ class UnstructuredPPTLoader(BaseLoader):
from unstructured.partition.ppt import partition_ppt
elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
text_by_page = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -30,11 +30,19 @@ class UnstructuredPPTXLoader(BaseLoader):
from unstructured.partition.pptx import partition_pptx
elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
text_by_page = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []
for chunk in chunks:
text = chunk.text.strip()
for combined_text in combined_texts:
text = combined_text.strip()
documents.append(Document(page_content=text))
return documents