Feat/firecrawl data source (#5232)
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Co-authored-by: chenhe <guchenhe@gmail.com>
Co-authored-by: takatost <takatost@gmail.com>
This commit is contained in:
@@ -339,7 +339,7 @@ class IndexingRunner:
|
||||
def _extract(self, index_processor: BaseIndexProcessor, dataset_document: DatasetDocument, process_rule: dict) \
|
||||
-> list[Document]:
|
||||
# load file
|
||||
if dataset_document.data_source_type not in ["upload_file", "notion_import"]:
|
||||
if dataset_document.data_source_type not in ["upload_file", "notion_import", "website_crawl"]:
|
||||
return []
|
||||
|
||||
data_source_info = dataset_document.data_source_info_dict
|
||||
@@ -375,6 +375,23 @@ class IndexingRunner:
|
||||
document_model=dataset_document.doc_form
|
||||
)
|
||||
text_docs = index_processor.extract(extract_setting, process_rule_mode=process_rule['mode'])
|
||||
elif dataset_document.data_source_type == 'website_crawl':
|
||||
if (not data_source_info or 'provider' not in data_source_info
|
||||
or 'url' not in data_source_info or 'job_id' not in data_source_info):
|
||||
raise ValueError("no website import info found")
|
||||
extract_setting = ExtractSetting(
|
||||
datasource_type="website_crawl",
|
||||
website_info={
|
||||
"provider": data_source_info['provider'],
|
||||
"job_id": data_source_info['job_id'],
|
||||
"tenant_id": dataset_document.tenant_id,
|
||||
"url": data_source_info['url'],
|
||||
"mode": data_source_info['mode'],
|
||||
"only_main_content": data_source_info['only_main_content']
|
||||
},
|
||||
document_model=dataset_document.doc_form
|
||||
)
|
||||
text_docs = index_processor.extract(extract_setting, process_rule_mode=process_rule['mode'])
|
||||
# update document status to splitting
|
||||
self._update_document_index_status(
|
||||
document_id=dataset_document.id,
|
||||
|
Reference in New Issue
Block a user