Feat/dataset notion import (#392)
Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
Co-authored-by: JzoNg <jzongcode@gmail.com>
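
This change adds Notion import as a document data source. Document creation now handles multiple documents per request and returns the created documents together with a batch id, and two batch endpoints are added: /datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate (token/price estimate across upload-file and Notion sources) and /datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status (per-document progress with completed/total segment counts). The document list endpoint gains a keyword search parameter and an optional fetch flag that attaches segment counts, and disabling a document now requires that its indexing has completed.
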
@@ -1,6 +1,7 @@
 # -*- coding:utf-8 -*-
 import random
 from datetime import datetime
+from typing import List
 
 from flask import request
 from flask_login import login_required, current_user
@@ -61,6 +62,29 @@ document_fields = {
     'hit_count': fields.Integer,
 }
 
+document_with_segments_fields = {
+    'id': fields.String,
+    'position': fields.Integer,
+    'data_source_type': fields.String,
+    'data_source_info': fields.Raw(attribute='data_source_info_dict'),
+    'dataset_process_rule_id': fields.String,
+    'name': fields.String,
+    'created_from': fields.String,
+    'created_by': fields.String,
+    'created_at': TimestampField,
+    'tokens': fields.Integer,
+    'indexing_status': fields.String,
+    'error': fields.String,
+    'enabled': fields.Boolean,
+    'disabled_at': TimestampField,
+    'disabled_by': fields.String,
+    'archived': fields.Boolean,
+    'display_status': fields.String,
+    'word_count': fields.Integer,
+    'hit_count': fields.Integer,
+    'completed_segments': fields.Integer,
+    'total_segments': fields.Integer
+}
 
 class DocumentResource(Resource):
     def get_document(self, dataset_id: str, document_id: str) -> Document:
@@ -83,6 +107,23 @@ class DocumentResource(Resource):
 
         return document
 
+    def get_batch_documents(self, dataset_id: str, batch: str) -> List[Document]:
+        dataset = DatasetService.get_dataset(dataset_id)
+        if not dataset:
+            raise NotFound('Dataset not found.')
+
+        try:
+            DatasetService.check_dataset_permission(dataset, current_user)
+        except services.errors.account.NoPermissionError as e:
+            raise Forbidden(str(e))
+
+        documents = DocumentService.get_batch_documents(dataset_id, batch)
+
+        if not documents:
+            raise NotFound('Documents not found.')
+
+        return documents
+
 
 class GetProcessRuleApi(Resource):
     @setup_required
@@ -132,9 +173,9 @@ class DatasetDocumentListApi(Resource):
         dataset_id = str(dataset_id)
         page = request.args.get('page', default=1, type=int)
         limit = request.args.get('limit', default=20, type=int)
-        search = request.args.get('search', default=None, type=str)
+        search = request.args.get('keyword', default=None, type=str)
         sort = request.args.get('sort', default='-created_at', type=str)
-
+        fetch = request.args.get('fetch', default=False, type=bool)
         dataset = DatasetService.get_dataset(dataset_id)
         if not dataset:
             raise NotFound('Dataset not found.')
@@ -173,9 +214,20 @@ class DatasetDocumentListApi(Resource):
         paginated_documents = query.paginate(
             page=page, per_page=limit, max_per_page=100, error_out=False)
         documents = paginated_documents.items
-
+        if fetch:
+            for document in documents:
+                completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
+                                                                  DocumentSegment.document_id == str(document.id),
+                                                                  DocumentSegment.status != 're_segment').count()
+                total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
+                                                              DocumentSegment.status != 're_segment').count()
+                document.completed_segments = completed_segments
+                document.total_segments = total_segments
+            data = marshal(documents, document_with_segments_fields)
+        else:
+            data = marshal(documents, document_fields)
         response = {
-            'data': marshal(documents, document_fields),
+            'data': data,
             'has_more': len(documents) == limit,
             'limit': limit,
             'total': paginated_documents.total,
@@ -184,10 +236,15 @@ class DatasetDocumentListApi(Resource):
 
         return response
 
+    documents_and_batch_fields = {
+        'documents': fields.List(fields.Nested(document_fields)),
+        'batch': fields.String
+    }
+
     @setup_required
     @login_required
     @account_initialization_required
-    @marshal_with(document_fields)
+    @marshal_with(documents_and_batch_fields)
     def post(self, dataset_id):
         dataset_id = str(dataset_id)
 
@@ -221,7 +278,7 @@ class DatasetDocumentListApi(Resource):
         DocumentService.document_create_args_validate(args)
 
         try:
-            document = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
+            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
         except ProviderTokenNotInitError:
             raise ProviderNotInitializeError()
         except QuotaExceededError:
@@ -229,13 +286,17 @@ class DatasetDocumentListApi(Resource):
         except ModelCurrentlyNotSupportError:
             raise ProviderModelCurrentlyNotSupportError()
 
-        return document
+        return {
+            'documents': documents,
+            'batch': batch
+        }
 
 
 class DatasetInitApi(Resource):
     dataset_and_document_fields = {
         'dataset': fields.Nested(dataset_fields),
-        'document': fields.Nested(document_fields)
+        'documents': fields.List(fields.Nested(document_fields)),
+        'batch': fields.String
     }
 
     @setup_required
@@ -258,7 +319,7 @@ class DatasetInitApi(Resource):
         DocumentService.document_create_args_validate(args)
 
         try:
-            dataset, document = DocumentService.save_document_without_dataset_id(
+            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                 tenant_id=current_user.current_tenant_id,
                 document_data=args,
                 account=current_user
@@ -272,7 +333,8 @@ class DatasetInitApi(Resource):
 
         response = {
             'dataset': dataset,
-            'document': document
+            'documents': documents,
+            'batch': batch
         }
 
         return response
@@ -317,11 +379,122 @@ class DocumentIndexingEstimateApi(DocumentResource):
             raise NotFound('File not found.')
 
         indexing_runner = IndexingRunner()
-        response = indexing_runner.indexing_estimate(file, data_process_rule_dict)
+
+        response = indexing_runner.file_indexing_estimate([file], data_process_rule_dict)
 
         return response
 
 
+class DocumentBatchIndexingEstimateApi(DocumentResource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, batch):
+        dataset_id = str(dataset_id)
+        batch = str(batch)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+        documents = self.get_batch_documents(dataset_id, batch)
+        response = {
+            "tokens": 0,
+            "total_price": 0,
+            "currency": "USD",
+            "total_segments": 0,
+            "preview": []
+        }
+        if not documents:
+            return response
+        data_process_rule = documents[0].dataset_process_rule
+        data_process_rule_dict = data_process_rule.to_dict()
+        info_list = []
+        for document in documents:
+            if document.indexing_status in ['completed', 'error']:
+                raise DocumentAlreadyFinishedError()
+            data_source_info = document.data_source_info_dict
+            # format document files info
+            if data_source_info and 'upload_file_id' in data_source_info:
+                file_id = data_source_info['upload_file_id']
+                info_list.append(file_id)
+            # format document notion info
+            elif data_source_info and 'notion_workspace_id' in data_source_info and 'notion_page_id' in data_source_info:
+                pages = []
+                page = {
+                    'page_id': data_source_info['notion_page_id'],
+                    'type': data_source_info['type']
+                }
+                pages.append(page)
+                notion_info = {
+                    'workspace_id': data_source_info['notion_workspace_id'],
+                    'pages': pages
+                }
+                info_list.append(notion_info)
+
+        if dataset.data_source_type == 'upload_file':
+            file_details = db.session.query(UploadFile).filter(
+                UploadFile.tenant_id == current_user.current_tenant_id,
+                UploadFile.id.in_(info_list)
+            ).all()
+
+            if file_details is None:
+                raise NotFound("File not found.")
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.file_indexing_estimate(file_details, data_process_rule_dict)
+        elif dataset.data_source_type == 'notion_import':
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.notion_indexing_estimate(info_list,
+                                                                data_process_rule_dict)
+        else:
+            raise ValueError('Data source type not support')
+        return response
+
+
+class DocumentBatchIndexingStatusApi(DocumentResource):
+    document_status_fields = {
+        'id': fields.String,
+        'indexing_status': fields.String,
+        'processing_started_at': TimestampField,
+        'parsing_completed_at': TimestampField,
+        'cleaning_completed_at': TimestampField,
+        'splitting_completed_at': TimestampField,
+        'completed_at': TimestampField,
+        'paused_at': TimestampField,
+        'error': fields.String,
+        'stopped_at': TimestampField,
+        'completed_segments': fields.Integer,
+        'total_segments': fields.Integer,
+    }
+
+    document_status_fields_list = {
+        'data': fields.List(fields.Nested(document_status_fields))
+    }
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, batch):
+        dataset_id = str(dataset_id)
+        batch = str(batch)
+        documents = self.get_batch_documents(dataset_id, batch)
+        documents_status = []
+        for document in documents:
+            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
+                                                              DocumentSegment.document_id == str(document.id),
+                                                              DocumentSegment.status != 're_segment').count()
+            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
+                                                          DocumentSegment.status != 're_segment').count()
+            document.completed_segments = completed_segments
+            document.total_segments = total_segments
+            documents_status.append(marshal(document, self.document_status_fields))
+        data = {
+            'data': documents_status
+        }
+        return data
+
+
 class DocumentIndexingStatusApi(DocumentResource):
     document_status_fields = {
         'id': fields.String,
@@ -408,7 +581,7 @@ class DocumentDetailApi(DocumentResource):
             'disabled_by': document.disabled_by,
             'archived': document.archived,
             'segment_count': document.segment_count,
             'average_segment_length': document.average_segment_length,
             'hit_count': document.hit_count,
             'display_status': document.display_status
         }
@@ -428,7 +601,7 @@ class DocumentDetailApi(DocumentResource):
             'created_at': document.created_at.timestamp(),
             'tokens': document.tokens,
             'indexing_status': document.indexing_status,
-            'completed_at': int(document.completed_at.timestamp())if document.completed_at else None,
+            'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
             'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
             'indexing_latency': document.indexing_latency,
             'error': document.error,
@@ -579,6 +752,8 @@ class DocumentStatusApi(DocumentResource):
             return {'result': 'success'}, 200
 
         elif action == "disable":
+            if not document.completed_at or document.indexing_status != 'completed':
+                raise InvalidActionError('Document is not completed.')
             if not document.enabled:
                 raise InvalidActionError('Document already disabled.')
 
@@ -678,6 +853,10 @@ api.add_resource(DatasetInitApi,
                  '/datasets/init')
 api.add_resource(DocumentIndexingEstimateApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
+api.add_resource(DocumentBatchIndexingEstimateApi,
+                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
+api.add_resource(DocumentBatchIndexingStatusApi,
+                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
 api.add_resource(DocumentIndexingStatusApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
 api.add_resource(DocumentDetailApi,
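
For orientation, below is a minimal client-side sketch of the batch flow this commit introduces. It is not part of the commit: the console API root, the auth header, the documents route, and the creation payload are assumptions for illustration; only the two batch routes registered above are taken from the diff.

import requests

# Hypothetical values for illustration only.
BASE = 'http://localhost:5001/console/api'      # assumed console API root
HEADERS = {'Authorization': 'Bearer <access-token>'}  # assumed auth scheme
DATASET_ID = '<dataset-uuid>'

# Payload shape is validated server-side by
# DocumentService.document_create_args_validate and is not shown in this diff.
document_args = {}

# 1. Create documents: the response now carries the documents and a batch id.
resp = requests.post(f'{BASE}/datasets/{DATASET_ID}/documents',
                     headers=HEADERS, json=document_args)
batch = resp.json()['batch']

# 2. Estimate indexing for the whole batch
#    (tokens, total_price, currency, total_segments, preview).
estimate = requests.get(
    f'{BASE}/datasets/{DATASET_ID}/batch/{batch}/indexing-estimate',
    headers=HEADERS).json()

# 3. Poll per-document indexing progress for the batch.
status = requests.get(
    f'{BASE}/datasets/{DATASET_ID}/batch/{batch}/indexing-status',
    headers=HEADERS).json()
for doc in status['data']:
    print(doc['id'], doc['indexing_status'],
          f"{doc['completed_segments']}/{doc['total_segments']}")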