fix retry document db session issue (#23046)

Jyong
2025-07-28 15:56:49 +08:00
committed by GitHub
parent beebcae0a1
commit 3248d728ac


@@ -24,79 +24,83 @@ def retry_document_indexing_task(dataset_id: str, document_ids: list[str]):
""" """
documents: list[Document] = [] documents: list[Document] = []
start_at = time.perf_counter() start_at = time.perf_counter()
try:
dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
if not dataset:
logging.info(click.style(f"Dataset not found: {dataset_id}", fg="red"))
return
tenant_id = dataset.tenant_id
for document_id in document_ids:
retry_indexing_cache_key = f"document_{document_id}_is_retried"
# check document limit
features = FeatureService.get_features(tenant_id)
try:
if features.billing.enabled:
vector_space = features.vector_space
if 0 < vector_space.limit <= vector_space.size:
raise ValueError(
"Your total number of documents plus the number of uploads have over the limit of "
"your subscription."
)
except Exception as e:
document = (
db.session.query(Document)
.where(Document.id == document_id, Document.dataset_id == dataset_id)
.first()
)
if document:
document.indexing_status = "error"
document.error = str(e)
document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
redis_client.delete(retry_indexing_cache_key)
return
dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first() logging.info(click.style(f"Start retry document: {document_id}", fg="green"))
if not dataset:
logging.info(click.style(f"Dataset not found: {dataset_id}", fg="red"))
db.session.close()
return
tenant_id = dataset.tenant_id
for document_id in document_ids:
retry_indexing_cache_key = f"document_{document_id}_is_retried"
# check document limit
features = FeatureService.get_features(tenant_id)
try:
if features.billing.enabled:
vector_space = features.vector_space
if 0 < vector_space.limit <= vector_space.size:
raise ValueError(
"Your total number of documents plus the number of uploads have over the limit of "
"your subscription."
)
except Exception as e:
document = ( document = (
db.session.query(Document).where(Document.id == document_id, Document.dataset_id == dataset_id).first() db.session.query(Document).where(Document.id == document_id, Document.dataset_id == dataset_id).first()
) )
if document: if not document:
logging.info(click.style(f"Document not found: {document_id}", fg="yellow"))
return
try:
# clean old data
index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
segments = db.session.query(DocumentSegment).where(DocumentSegment.document_id == document_id).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
indexing_runner = IndexingRunner()
indexing_runner.run([document])
redis_client.delete(retry_indexing_cache_key)
except Exception as ex:
document.indexing_status = "error" document.indexing_status = "error"
document.error = str(e) document.error = str(ex)
document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(document) db.session.add(document)
db.session.commit() db.session.commit()
redis_client.delete(retry_indexing_cache_key) logging.info(click.style(str(ex), fg="yellow"))
db.session.close() redis_client.delete(retry_indexing_cache_key)
return logging.exception("retry_document_indexing_task failed, document_id: %s", document_id)
end_at = time.perf_counter()
logging.info(click.style(f"Start retry document: {document_id}", fg="green")) logging.info(click.style(f"Retry dataset: {dataset_id} latency: {end_at - start_at}", fg="green"))
document = ( except Exception as e:
db.session.query(Document).where(Document.id == document_id, Document.dataset_id == dataset_id).first() logging.exception(
"retry_document_indexing_task failed, dataset_id: %s, document_ids: %s", dataset_id, document_ids
) )
if not document: raise e
logging.info(click.style(f"Document not found: {document_id}", fg="yellow")) finally:
db.session.close() db.session.close()
return
try:
# clean old data
index_processor = IndexProcessorFactory(document.doc_form).init_index_processor()
segments = db.session.query(DocumentSegment).where(DocumentSegment.document_id == document_id).all()
if segments:
index_node_ids = [segment.index_node_id for segment in segments]
# delete from vector index
index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
for segment in segments:
db.session.delete(segment)
db.session.commit()
document.indexing_status = "parsing"
document.processing_started_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
indexing_runner = IndexingRunner()
indexing_runner.run([document])
redis_client.delete(retry_indexing_cache_key)
except Exception as ex:
document.indexing_status = "error"
document.error = str(ex)
document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
db.session.add(document)
db.session.commit()
logging.info(click.style(str(ex), fg="yellow"))
redis_client.delete(retry_indexing_cache_key)
logging.exception("retry_document_indexing_task failed, document_id: %s", document_id)
finally:
db.session.close()
end_at = time.perf_counter()
logging.info(click.style(f"Retry dataset: {dataset_id} latency: {end_at - start_at}", fg="green"))