fix(dataset): when CELERY_BROKER uses amqp (RabbitMQ), adding document segments in batches from a large uploaded file leaves the status stuck at "In batch processing" #22709 (#23038)

Author: zhaobingshuang
Date: 2025-07-28 14:24:13 +08:00
Committed by: GitHub
Parent: 3f8fb18c89
Commit: 5c5f61b2aa

6 changed files with 166 additions and 44 deletions
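In short: with the Celery broker set to amqp (RabbitMQ), the old controller parsed the CSV inside the web request and shipped every parsed row through the broker as Celery task arguments, so a large file produced an oversized task message and, per the linked issue, the import never left "In batch processing". After this change only an upload_file_id crosses the broker; the worker downloads the file from storage and parses it there. The contrast, with argument names taken from the diffs below (a sketch of the call shape, not additional changed code):

    # Before: every parsed CSV row rides inside the Celery message.
    batch_create_segment_to_index_task.delay(
        str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
    )

    # After: only a small file reference rides in the message; the worker
    # fetches the CSV from object storage and parses it itself.
    batch_create_segment_to_index_task.delay(
        str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
    )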

View File

@@ -1,6 +1,5 @@
 import uuid
 
-import pandas as pd
 from flask import request
 from flask_login import current_user
 from flask_restful import Resource, marshal, reqparse
@@ -14,8 +13,6 @@ from controllers.console.datasets.error import (
     ChildChunkDeleteIndexError,
     ChildChunkIndexingError,
     InvalidActionError,
-    NoFileUploadedError,
-    TooManyFilesError,
 )
 from controllers.console.wraps import (
     account_initialization_required,
@@ -32,6 +29,7 @@ from extensions.ext_redis import redis_client
 from fields.segment_fields import child_chunk_fields, segment_fields
 from libs.login import login_required
 from models.dataset import ChildChunk, DocumentSegment
+from models.model import UploadFile
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
 from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
@@ -365,37 +363,28 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
         document = DocumentService.get_document(dataset_id, document_id)
         if not document:
             raise NotFound("Document not found.")
-        # get file from request
-        file = request.files["file"]
-        # check file
-        if "file" not in request.files:
-            raise NoFileUploadedError()
-        if len(request.files) > 1:
-            raise TooManyFilesError()
+        parser = reqparse.RequestParser()
+        parser.add_argument("upload_file_id", type=str, required=True, nullable=False, location="json")
+        args = parser.parse_args()
+        upload_file_id = args["upload_file_id"]
+        upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
+        if not upload_file:
+            raise NotFound("UploadFile not found.")
         # check file type
-        if not file.filename or not file.filename.lower().endswith(".csv"):
+        if not upload_file.name or not upload_file.name.lower().endswith(".csv"):
             raise ValueError("Invalid file type. Only CSV files are allowed")
 
         try:
-            # Skip the first row
-            df = pd.read_csv(file)
-            result = []
-            for index, row in df.iterrows():
-                if document.doc_form == "qa_model":
-                    data = {"content": row.iloc[0], "answer": row.iloc[1]}
-                else:
-                    data = {"content": row.iloc[0]}
-                result.append(data)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
             # async job
             job_id = str(uuid.uuid4())
             indexing_cache_key = f"segment_batch_import_{str(job_id)}"
             # send batch add segments task
             redis_client.setnx(indexing_cache_key, "waiting")
             batch_create_segment_to_index_task.delay(
-                str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
+                str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
             )
         except Exception as e:
             return {"error": str(e)}, 500

View File

@@ -1,9 +1,12 @@
 import datetime
 import logging
+import tempfile
 import time
 import uuid
+from pathlib import Path
 
 import click
+import pandas as pd
 from celery import shared_task  # type: ignore
 from sqlalchemy import func
 from sqlalchemy.orm import Session
@@ -12,15 +15,17 @@ from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
+from extensions.ext_storage import storage
 from libs import helper
 from models.dataset import Dataset, Document, DocumentSegment
+from models.model import UploadFile
 from services.vector_service import VectorService
 
 
 @shared_task(queue="dataset")
 def batch_create_segment_to_index_task(
     job_id: str,
-    content: list,
+    upload_file_id: str,
     dataset_id: str,
     document_id: str,
     tenant_id: str,
@@ -29,13 +34,13 @@ def batch_create_segment_to_index_task(
     """
     Async batch create segment to index
    :param job_id:
-    :param content:
+    :param upload_file_id:
    :param dataset_id:
    :param document_id:
    :param tenant_id:
    :param user_id:
 
-    Usage: batch_create_segment_to_index_task.delay(job_id, content, dataset_id, document_id, tenant_id, user_id)
+    Usage: batch_create_segment_to_index_task.delay(job_id, upload_file_id, dataset_id, document_id, tenant_id, user_id)
     """
     logging.info(click.style(f"Start batch create segment jobId: {job_id}", fg="green"))
     start_at = time.perf_counter()
@@ -58,6 +63,29 @@ def batch_create_segment_to_index_task(
             or dataset_document.indexing_status != "completed"
         ):
             raise ValueError("Document is not available.")
+        upload_file = session.get(UploadFile, upload_file_id)
+        if not upload_file:
+            raise ValueError("UploadFile not found.")
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            suffix = Path(upload_file.key).suffix
+            # FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
+            file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"  # type: ignore
+            storage.download(upload_file.key, file_path)
+
+            # Skip the first row
+            df = pd.read_csv(file_path)
+            content = []
+            for index, row in df.iterrows():
+                if dataset_document.doc_form == "qa_model":
+                    data = {"content": row.iloc[0], "answer": row.iloc[1]}
+                else:
+                    data = {"content": row.iloc[0]}
+                content.append(data)
+
+            if len(content) == 0:
+                raise ValueError("The CSV file is empty.")
+
         document_segments = []
         embedding_model = None
         if dataset.indexing_technique == "high_quality":
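The FIXME above flags the private tempfile._get_candidate_names() helper used to build the temporary file name. A minimal alternative sketch that sticks to public APIs (not part of this commit; uuid and Path are already imported by this task):

    # Hypothetical replacement for the flagged line: build a unique name
    # inside temp_dir without touching tempfile internals.
    file_path = str(Path(temp_dir) / f"{uuid.uuid4().hex}{Path(upload_file.key).suffix}")
    storage.download(upload_file.key, file_path)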

View File

@@ -1,6 +1,6 @@
 'use client'
 import type { FC } from 'react'
-import React, { useEffect, useRef, useState } from 'react'
+import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'
 import {
   RiDeleteBinLine,
 } from '@remixicon/react'
@@ -10,10 +10,17 @@ import cn from '@/utils/classnames'
 import { Csv as CSVIcon } from '@/app/components/base/icons/src/public/files'
 import { ToastContext } from '@/app/components/base/toast'
 import Button from '@/app/components/base/button'
+import type { FileItem } from '@/models/datasets'
+import { upload } from '@/service/base'
+import useSWR from 'swr'
+import { fetchFileUploadConfig } from '@/service/common'
+import SimplePieChart from '@/app/components/base/simple-pie-chart'
+import { Theme } from '@/types/app'
+import useTheme from '@/hooks/use-theme'
 
 export type Props = {
-  file: File | undefined
-  updateFile: (file?: File) => void
+  file: FileItem | undefined
+  updateFile: (file?: FileItem) => void
 }
 
 const CSVUploader: FC<Props> = ({
@@ -26,6 +33,68 @@ const CSVUploader: FC<Props> = ({
   const dropRef = useRef<HTMLDivElement>(null)
   const dragRef = useRef<HTMLDivElement>(null)
   const fileUploader = useRef<HTMLInputElement>(null)
+  const { data: fileUploadConfigResponse } = useSWR({ url: '/files/upload' }, fetchFileUploadConfig)
+  const fileUploadConfig = useMemo(() => fileUploadConfigResponse ?? {
+    file_size_limit: 15,
+  }, [fileUploadConfigResponse])
+
+  const fileUpload = useCallback(async (fileItem: FileItem): Promise<FileItem> => {
+    fileItem.progress = 0
+    const formData = new FormData()
+    formData.append('file', fileItem.file)
+    const onProgress = (e: ProgressEvent) => {
+      if (e.lengthComputable) {
+        const progress = Math.floor(e.loaded / e.total * 100)
+        updateFile({
+          ...fileItem,
+          progress,
+        })
+      }
+    }
+
+    return upload({
+      xhr: new XMLHttpRequest(),
+      data: formData,
+      onprogress: onProgress,
+    }, false, undefined, '?source=datasets')
+      .then((res: File) => {
+        const completeFile = {
+          fileID: fileItem.fileID,
+          file: res,
+          progress: 100,
+        }
+        updateFile(completeFile)
+        return Promise.resolve({ ...completeFile })
+      })
+      .catch((e) => {
+        notify({ type: 'error', message: e?.response?.code === 'forbidden' ? e?.response?.message : t('datasetCreation.stepOne.uploader.failed') })
+        const errorFile = {
+          ...fileItem,
+          progress: -2,
+        }
+        updateFile(errorFile)
+        return Promise.resolve({ ...errorFile })
+      })
+      .finally()
+  }, [notify, t, updateFile])
+
+  const uploadFile = useCallback(async (fileItem: FileItem) => {
+    await fileUpload(fileItem)
+  }, [fileUpload])
+
+  const initialUpload = useCallback((file?: File) => {
+    if (!file)
+      return false
+
+    const newFile: FileItem = {
+      fileID: `file0-${Date.now()}`,
+      file,
+      progress: -1,
+    }
+    updateFile(newFile)
+    uploadFile(newFile)
+  }, [updateFile, uploadFile])
 
   const handleDragEnter = (e: DragEvent) => {
     e.preventDefault()
@@ -52,7 +121,7 @@ const CSVUploader: FC<Props> = ({
       notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.count') })
       return
     }
-    updateFile(files[0])
+    initialUpload(files[0])
   }
   const selectHandle = () => {
     if (fileUploader.current)
@@ -63,11 +132,43 @@ const CSVUploader: FC<Props> = ({
       fileUploader.current.value = ''
     updateFile()
   }
+  const getFileType = (currentFile: File) => {
+    if (!currentFile)
+      return ''
+
+    const arr = currentFile.name.split('.')
+    return arr[arr.length - 1]
+  }
+
+  const isValid = useCallback((file?: File) => {
+    if (!file)
+      return false
+
+    const { size } = file
+    const ext = `.${getFileType(file)}`
+    const isValidType = ext.toLowerCase() === '.csv'
+    if (!isValidType)
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })
+
+    const isValidSize = size <= fileUploadConfig.file_size_limit * 1024 * 1024
+    if (!isValidSize)
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.size', { size: fileUploadConfig.file_size_limit }) })
+
+    return isValidType && isValidSize
+  }, [fileUploadConfig, notify, t])
+
   const fileChangeHandle = (e: React.ChangeEvent<HTMLInputElement>) => {
     const currentFile = e.target.files?.[0]
-    updateFile(currentFile)
+    if (!isValid(currentFile))
+      return
+
+    initialUpload(currentFile)
   }
+
+  const { theme } = useTheme()
+  const chartColor = useMemo(() => theme === Theme.dark ? '#5289ff' : '#296dff', [theme])
 
   useEffect(() => {
     dropRef.current?.addEventListener('dragenter', handleDragEnter)
     dropRef.current?.addEventListener('dragover', handleDragOver)
@@ -108,10 +209,16 @@ const CSVUploader: FC<Props> = ({
           <div className={cn('group flex h-20 items-center rounded-xl border border-components-panel-border bg-components-panel-bg-blur px-6 text-sm font-normal', 'hover:border-divider-subtle hover:bg-components-panel-on-panel-item-bg-hover')}>
             <CSVIcon className="shrink-0" />
             <div className='ml-2 flex w-0 grow'>
-              <span className='max-w-[calc(100%_-_30px)] overflow-hidden text-ellipsis whitespace-nowrap text-text-primary'>{file.name.replace(/.csv$/, '')}</span>
+              <span className='max-w-[calc(100%_-_30px)] overflow-hidden text-ellipsis whitespace-nowrap text-text-primary'>{file.file.name.replace(/.csv$/, '')}</span>
              <span className='shrink-0 text-text-secondary'>.csv</span>
            </div>
            <div className='hidden items-center group-hover:flex'>
+              {(file.progress < 100 && file.progress >= 0) && (
+                <>
+                  <SimplePieChart percentage={file.progress} stroke={chartColor} fill={chartColor} animationDuration={0}/>
+                  <div className='mx-2 h-4 w-px bg-text-secondary'/>
+                </>
+              )}
              <Button onClick={selectHandle}>{t('datasetCreation.stepOne.uploader.change')}</Button>
              <div className='mx-2 h-4 w-px bg-text-secondary' />
              <div className='cursor-pointer p-2' onClick={removeFile}>
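As used in this component, FileItem.progress doubles as a status flag: -1 marks a file that is queued but not yet uploading, 0-99 is the live percentage driven by the XHR progress events, 100 means the upload finished (so file.id is available for the batch import request), and -2 marks a failed upload; the pie chart is rendered only while the value sits in the 0-99 range.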

View File

@@ -7,14 +7,14 @@ import CSVUploader from './csv-uploader'
 import CSVDownloader from './csv-downloader'
 import Button from '@/app/components/base/button'
 import Modal from '@/app/components/base/modal'
-import type { ChunkingMode } from '@/models/datasets'
+import type { ChunkingMode, FileItem } from '@/models/datasets'
 import { noop } from 'lodash-es'
 
 export type IBatchModalProps = {
   isShow: boolean
   docForm: ChunkingMode
   onCancel: () => void
-  onConfirm: (file: File) => void
+  onConfirm: (file: FileItem) => void
 }
 
 const BatchModal: FC<IBatchModalProps> = ({
@@ -24,8 +24,8 @@ const BatchModal: FC<IBatchModalProps> = ({
   onConfirm,
 }) => {
   const { t } = useTranslation()
-  const [currentCSV, setCurrentCSV] = useState<File>()
-  const handleFile = (file?: File) => setCurrentCSV(file)
+  const [currentCSV, setCurrentCSV] = useState<FileItem>()
+  const handleFile = (file?: FileItem) => setCurrentCSV(file)
 
   const handleSend = () => {
     if (!currentCSV)
@@ -56,7 +56,7 @@ const BatchModal: FC<IBatchModalProps> = ({
           <Button className='mr-2' onClick={onCancel}>
             {t('datasetDocuments.list.batchModal.cancel')}
           </Button>
-          <Button variant="primary" onClick={handleSend} disabled={!currentCSV}>
+          <Button variant="primary" onClick={handleSend} disabled={!currentCSV || !currentCSV.file || !currentCSV.file.id}>
             {t('datasetDocuments.list.batchModal.run')}
           </Button>
         </div>

View File

@@ -17,7 +17,7 @@ import cn from '@/utils/classnames'
 import Divider from '@/app/components/base/divider'
 import Loading from '@/app/components/base/loading'
 import { ToastContext } from '@/app/components/base/toast'
-import type { ChunkingMode, ParentMode, ProcessMode } from '@/models/datasets'
+import type { ChunkingMode, FileItem, ParentMode, ProcessMode } from '@/models/datasets'
 import { useDatasetDetailContext } from '@/context/dataset-detail'
 import FloatRightContainer from '@/app/components/base/float-right-container'
 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
@@ -111,12 +111,10 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
   }
 
   const { mutateAsync: segmentBatchImport } = useSegmentBatchImport()
-  const runBatch = async (csv: File) => {
-    const formData = new FormData()
-    formData.append('file', csv)
+  const runBatch = async (csv: FileItem) => {
     await segmentBatchImport({
       url: `/datasets/${datasetId}/documents/${documentId}/segments/batch_import`,
-      body: formData,
+      body: { upload_file_id: csv.file.id! },
     }, {
       onSuccess: (res) => {
        setImportStatus(res.job_status)

View File

@@ -154,9 +154,9 @@ export const useUpdateChildSegment = () => {
 export const useSegmentBatchImport = () => {
   return useMutation({
     mutationKey: [NAME_SPACE, 'batchImport'],
-    mutationFn: (payload: { url: string; body: FormData }) => {
+    mutationFn: (payload: { url: string; body: { upload_file_id: string } }) => {
       const { url, body } = payload
-      return post<BatchImportResponse>(url, { body }, { bodyStringify: false, deleteContentType: true })
+      return post<BatchImportResponse>(url, { body })
     },
   })
 }