fix(dataset): CELERY_BROKER uses amqp rabbitmq. When adding document segments in batches and uploading large files, the status will always remain stuck at "In batch processing" #22709 (#23038)
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
import uuid
|
||||
|
||||
import pandas as pd
|
||||
from flask import request
|
||||
from flask_login import current_user
|
||||
from flask_restful import Resource, marshal, reqparse
|
||||
@@ -14,8 +13,6 @@ from controllers.console.datasets.error import (
|
||||
ChildChunkDeleteIndexError,
|
||||
ChildChunkIndexingError,
|
||||
InvalidActionError,
|
||||
NoFileUploadedError,
|
||||
TooManyFilesError,
|
||||
)
|
||||
from controllers.console.wraps import (
|
||||
account_initialization_required,
|
||||
@@ -32,6 +29,7 @@ from extensions.ext_redis import redis_client
|
||||
from fields.segment_fields import child_chunk_fields, segment_fields
|
||||
from libs.login import login_required
|
||||
from models.dataset import ChildChunk, DocumentSegment
|
||||
from models.model import UploadFile
|
||||
from services.dataset_service import DatasetService, DocumentService, SegmentService
|
||||
from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
|
||||
from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
|
||||
@@ -365,37 +363,28 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
|
||||
document = DocumentService.get_document(dataset_id, document_id)
|
||||
if not document:
|
||||
raise NotFound("Document not found.")
|
||||
# get file from request
|
||||
file = request.files["file"]
|
||||
# check file
|
||||
if "file" not in request.files:
|
||||
raise NoFileUploadedError()
|
||||
|
||||
if len(request.files) > 1:
|
||||
raise TooManyFilesError()
|
||||
parser = reqparse.RequestParser()
|
||||
parser.add_argument("upload_file_id", type=str, required=True, nullable=False, location="json")
|
||||
args = parser.parse_args()
|
||||
upload_file_id = args["upload_file_id"]
|
||||
|
||||
upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
|
||||
if not upload_file:
|
||||
raise NotFound("UploadFile not found.")
|
||||
|
||||
# check file type
|
||||
if not file.filename or not file.filename.lower().endswith(".csv"):
|
||||
if not upload_file.name or not upload_file.name.lower().endswith(".csv"):
|
||||
raise ValueError("Invalid file type. Only CSV files are allowed")
|
||||
|
||||
try:
|
||||
# Skip the first row
|
||||
df = pd.read_csv(file)
|
||||
result = []
|
||||
for index, row in df.iterrows():
|
||||
if document.doc_form == "qa_model":
|
||||
data = {"content": row.iloc[0], "answer": row.iloc[1]}
|
||||
else:
|
||||
data = {"content": row.iloc[0]}
|
||||
result.append(data)
|
||||
if len(result) == 0:
|
||||
raise ValueError("The CSV file is empty.")
|
||||
# async job
|
||||
job_id = str(uuid.uuid4())
|
||||
indexing_cache_key = f"segment_batch_import_{str(job_id)}"
|
||||
# send batch add segments task
|
||||
redis_client.setnx(indexing_cache_key, "waiting")
|
||||
batch_create_segment_to_index_task.delay(
|
||||
str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
|
||||
str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
|
||||
)
|
||||
except Exception as e:
|
||||
return {"error": str(e)}, 500
|
||||
|
@@ -1,9 +1,12 @@
|
||||
import datetime
|
||||
import logging
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import pandas as pd
|
||||
from celery import shared_task # type: ignore
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.orm import Session
|
||||
@@ -12,15 +15,17 @@ from core.model_manager import ModelManager
|
||||
from core.model_runtime.entities.model_entities import ModelType
|
||||
from extensions.ext_database import db
|
||||
from extensions.ext_redis import redis_client
|
||||
from extensions.ext_storage import storage
|
||||
from libs import helper
|
||||
from models.dataset import Dataset, Document, DocumentSegment
|
||||
from models.model import UploadFile
|
||||
from services.vector_service import VectorService
|
||||
|
||||
|
||||
@shared_task(queue="dataset")
|
||||
def batch_create_segment_to_index_task(
|
||||
job_id: str,
|
||||
content: list,
|
||||
upload_file_id: str,
|
||||
dataset_id: str,
|
||||
document_id: str,
|
||||
tenant_id: str,
|
||||
@@ -29,13 +34,13 @@ def batch_create_segment_to_index_task(
|
||||
"""
|
||||
Async batch create segment to index
|
||||
:param job_id:
|
||||
:param content:
|
||||
:param upload_file_id:
|
||||
:param dataset_id:
|
||||
:param document_id:
|
||||
:param tenant_id:
|
||||
:param user_id:
|
||||
|
||||
Usage: batch_create_segment_to_index_task.delay(job_id, content, dataset_id, document_id, tenant_id, user_id)
|
||||
Usage: batch_create_segment_to_index_task.delay(job_id, upload_file_id, dataset_id, document_id, tenant_id, user_id)
|
||||
"""
|
||||
logging.info(click.style(f"Start batch create segment jobId: {job_id}", fg="green"))
|
||||
start_at = time.perf_counter()
|
||||
@@ -58,6 +63,29 @@ def batch_create_segment_to_index_task(
|
||||
or dataset_document.indexing_status != "completed"
|
||||
):
|
||||
raise ValueError("Document is not available.")
|
||||
|
||||
upload_file = session.get(UploadFile, upload_file_id)
|
||||
if not upload_file:
|
||||
raise ValueError("UploadFile not found.")
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
suffix = Path(upload_file.key).suffix
|
||||
# FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
|
||||
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" # type: ignore
|
||||
storage.download(upload_file.key, file_path)
|
||||
|
||||
# Skip the first row
|
||||
df = pd.read_csv(file_path)
|
||||
content = []
|
||||
for index, row in df.iterrows():
|
||||
if dataset_document.doc_form == "qa_model":
|
||||
data = {"content": row.iloc[0], "answer": row.iloc[1]}
|
||||
else:
|
||||
data = {"content": row.iloc[0]}
|
||||
content.append(data)
|
||||
if len(content) == 0:
|
||||
raise ValueError("The CSV file is empty.")
|
||||
|
||||
document_segments = []
|
||||
embedding_model = None
|
||||
if dataset.indexing_technique == "high_quality":
|
||||
|
Reference in New Issue
Block a user