Feat/firecrawl data source (#5232)
Co-authored-by: Nicolas <nicolascamara29@gmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: takatost <takatost@gmail.com>
This commit is contained in:
@@ -465,6 +465,20 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
|
||||
document_model=document.doc_form
|
||||
)
|
||||
extract_settings.append(extract_setting)
|
||||
elif document.data_source_type == 'website_crawl':
|
||||
extract_setting = ExtractSetting(
|
||||
datasource_type="website_crawl",
|
||||
website_info={
|
||||
"provider": data_source_info['provider'],
|
||||
"job_id": data_source_info['job_id'],
|
||||
"url": data_source_info['url'],
|
||||
"tenant_id": current_user.current_tenant_id,
|
||||
"mode": data_source_info['mode'],
|
||||
"only_main_content": data_source_info['only_main_content']
|
||||
},
|
||||
document_model=document.doc_form
|
||||
)
|
||||
extract_settings.append(extract_setting)
|
||||
|
||||
else:
|
||||
raise ValueError('Data source type not support')
|
||||
@@ -952,6 +966,33 @@ class DocumentRenameApi(DocumentResource):
|
||||
return document
|
||||
|
||||
|
||||
class WebsiteDocumentSyncApi(DocumentResource):
|
||||
@setup_required
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def get(self, dataset_id, document_id):
|
||||
"""sync website document."""
|
||||
dataset_id = str(dataset_id)
|
||||
dataset = DatasetService.get_dataset(dataset_id)
|
||||
if not dataset:
|
||||
raise NotFound('Dataset not found.')
|
||||
document_id = str(document_id)
|
||||
document = DocumentService.get_document(dataset.id, document_id)
|
||||
if not document:
|
||||
raise NotFound('Document not found.')
|
||||
if document.tenant_id != current_user.current_tenant_id:
|
||||
raise Forbidden('No permission.')
|
||||
if document.data_source_type != 'website_crawl':
|
||||
raise ValueError('Document is not a website document.')
|
||||
# 403 if document is archived
|
||||
if DocumentService.check_archived(document):
|
||||
raise ArchivedDocumentImmutableError()
|
||||
# sync document
|
||||
DocumentService.sync_website_document(dataset_id, document)
|
||||
|
||||
return {'result': 'success'}, 200
|
||||
|
||||
|
||||
api.add_resource(GetProcessRuleApi, '/datasets/process-rule')
|
||||
api.add_resource(DatasetDocumentListApi,
|
||||
'/datasets/<uuid:dataset_id>/documents')
|
||||
@@ -980,3 +1021,5 @@ api.add_resource(DocumentRecoverApi, '/datasets/<uuid:dataset_id>/documents/<uui
|
||||
api.add_resource(DocumentRetryApi, '/datasets/<uuid:dataset_id>/retry')
|
||||
api.add_resource(DocumentRenameApi,
|
||||
'/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename')
|
||||
|
||||
api.add_resource(WebsiteDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync')
|
||||
|
Reference in New Issue
Block a user