From 00f0b569cc0d5680b2ba8963434ca7776879f868 Mon Sep 17 00:00:00 2001 From: Dongyu Li <544104925@qq.com> Date: Wed, 25 Jun 2025 17:52:59 +0800 Subject: [PATCH] Feat/kb index (#20868) Co-authored-by: twwu --- api/core/indexing_runner.py | 4 +- .../processor/paragraph_index_processor.py | 2 + api/services/vector_service.py | 18 ++--- .../detail/completed/segment-detail.tsx | 66 +++++++---------- .../datasets/documents/detail/new-segment.tsx | 73 ++++++++++--------- web/eslint.config.mjs | 2 +- 6 files changed, 80 insertions(+), 85 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 848d89777..f2fe30617 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -534,7 +534,7 @@ class IndexingRunner: # chunk nodes by chunk size indexing_start_at = time.perf_counter() tokens = 0 - if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX: + if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy": # create keyword index create_keyword_thread = threading.Thread( target=self._process_keyword_index, @@ -572,7 +572,7 @@ class IndexingRunner: for future in futures: tokens += future.result() - if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX: + if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy": create_keyword_thread.join() indexing_end_at = time.perf_counter() diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index dca84b904..9b90bd2bb 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -76,6 +76,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor): if dataset.indexing_technique == "high_quality": vector = Vector(dataset) vector.create(documents) + with_keywords = False if with_keywords: keywords_list = kwargs.get("keywords_list") keyword = Keyword(dataset) @@ -91,6 +92,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor): vector.delete_by_ids(node_ids) else: vector.delete() + with_keywords = False if with_keywords: keyword = Keyword(dataset) if node_ids: diff --git a/api/services/vector_service.py b/api/services/vector_service.py index 19e37f4ee..916513919 100644 --- a/api/services/vector_service.py +++ b/api/services/vector_service.py @@ -97,16 +97,16 @@ class VectorService: vector = Vector(dataset=dataset) vector.delete_by_ids([segment.index_node_id]) vector.add_texts([document], duplicate_check=True) - - # update keyword index - keyword = Keyword(dataset) - keyword.delete_by_ids([segment.index_node_id]) - - # save keyword index - if keywords and len(keywords) > 0: - keyword.add_texts([document], keywords_list=[keywords]) else: - keyword.add_texts([document]) + # update keyword index + keyword = Keyword(dataset) + keyword.delete_by_ids([segment.index_node_id]) + + # save keyword index + if keywords and len(keywords) > 0: + keyword.add_texts([document], keywords_list=[keywords]) + else: + keyword.add_texts([document]) @classmethod def generate_child_chunks( diff --git a/web/app/components/datasets/documents/detail/completed/segment-detail.tsx b/web/app/components/datasets/documents/detail/completed/segment-detail.tsx index d3575c18e..f3f0aef6a 100644 --- a/web/app/components/datasets/documents/detail/completed/segment-detail.tsx +++ b/web/app/components/datasets/documents/detail/completed/segment-detail.tsx @@ -1,4 +1,4 @@ -import React, { type FC, useMemo, useState } from 'react' +import React, { type FC, useCallback, useMemo, useState } from 'react' import { useTranslation } from 'react-i18next' import { RiCloseLine, @@ -16,8 +16,10 @@ import { useSegmentListContext } from './index' import { ChunkingMode, type SegmentDetailModel } from '@/models/datasets' import { useEventEmitterContextContext } from '@/context/event-emitter' import { formatNumber } from '@/utils/format' -import classNames from '@/utils/classnames' +import cn from '@/utils/classnames' import Divider from '@/app/components/base/divider' +import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail' +import { IndexingType } from '../../../create/step-two' type ISegmentDetailProps = { segInfo?: Partial & { id: string } @@ -48,6 +50,7 @@ const SegmentDetail: FC = ({ const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen) const mode = useDocumentContext(s => s.mode) const parentMode = useDocumentContext(s => s.parentMode) + const indexingTechnique = useDatasetDetailContextWithSelector(s => s.dataset?.indexing_technique) eventEmitter?.useSubscription((v) => { if (v === 'update-segment') @@ -56,56 +59,41 @@ const SegmentDetail: FC = ({ setLoading(false) }) - const handleCancel = () => { + const handleCancel = useCallback(() => { onCancel() - } + }, [onCancel]) - const handleSave = () => { + const handleSave = useCallback(() => { onUpdate(segInfo?.id || '', question, answer, keywords) - } + }, [onUpdate, segInfo?.id, question, answer, keywords]) - const handleRegeneration = () => { + const handleRegeneration = useCallback(() => { setShowRegenerationModal(true) - } + }, []) - const onCancelRegeneration = () => { + const onCancelRegeneration = useCallback(() => { setShowRegenerationModal(false) - } + }, []) - const onConfirmRegeneration = () => { + const onConfirmRegeneration = useCallback(() => { onUpdate(segInfo?.id || '', question, answer, keywords, true) - } - - const isParentChildMode = useMemo(() => { - return mode === 'hierarchical' - }, [mode]) - - const isFullDocMode = useMemo(() => { - return mode === 'hierarchical' && parentMode === 'full-doc' - }, [mode, parentMode]) - - const titleText = useMemo(() => { - return isEditMode ? t('datasetDocuments.segment.editChunk') : t('datasetDocuments.segment.chunkDetail') - }, [isEditMode, t]) - - const isQAModel = useMemo(() => { - return docForm === ChunkingMode.qa - }, [docForm]) + }, [onUpdate, segInfo?.id, question, answer, keywords]) const wordCountText = useMemo(() => { - const contentLength = isQAModel ? (question.length + answer.length) : question.length + const contentLength = docForm === ChunkingMode.qa ? (question.length + answer.length) : question.length const total = formatNumber(isEditMode ? contentLength : segInfo!.word_count as number) const count = isEditMode ? contentLength : segInfo!.word_count as number return `${total} ${t('datasetDocuments.segment.characters', { count })}` - }, [isEditMode, question.length, answer.length, isQAModel, segInfo, t]) + }, [isEditMode, question.length, answer.length, docForm, segInfo, t]) - const labelPrefix = useMemo(() => { - return isParentChildMode ? t('datasetDocuments.segment.parentChunk') : t('datasetDocuments.segment.chunk') - }, [isParentChildMode, t]) + const isFullDocMode = mode === 'hierarchical' && parentMode === 'full-doc' + const titleText = isEditMode ? t('datasetDocuments.segment.editChunk') : t('datasetDocuments.segment.chunkDetail') + const labelPrefix = mode === 'hierarchical' ? t('datasetDocuments.segment.parentChunk') : t('datasetDocuments.segment.chunk') + const isECOIndexing = indexingTechnique === IndexingType.ECONOMICAL return (
-
+
{titleText}
@@ -134,12 +122,12 @@ const SegmentDetail: FC = ({
-
-
+
= ({ isEditMode={isEditMode} />
- {mode === 'custom' && void @@ -44,39 +45,37 @@ const NewSegmentModal: FC = ({ const [addAnother, setAddAnother] = useState(true) const fullScreen = useSegmentListContext(s => s.fullScreen) const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen) - const mode = useDocumentContext(s => s.mode) + const indexingTechnique = useDatasetDetailContextWithSelector(s => s.dataset?.indexing_technique) const { appSidebarExpand } = useAppStore(useShallow(state => ({ appSidebarExpand: state.appSidebarExpand, }))) const refreshTimer = useRef(null) - const CustomButton = <> - - - + const CustomButton = useMemo(() => ( + <> + + + + ), [viewNewlyAddedChunk, t]) - const isQAModel = useMemo(() => { - return docForm === ChunkingMode.qa - }, [docForm]) - - const handleCancel = (actionType: 'esc' | 'add' = 'esc') => { + const handleCancel = useCallback((actionType: 'esc' | 'add' = 'esc') => { if (actionType === 'esc' || !addAnother) onCancel() - } + }, [onCancel, addAnother]) const { mutateAsync: addSegment } = useAddSegment() - const handleSave = async () => { + const handleSave = useCallback(async () => { const params: SegmentUpdater = { content: '' } - if (isQAModel) { + if (docForm === ChunkingMode.qa) { if (!question.trim()) { return notify({ type: 'error', @@ -129,21 +128,27 @@ const NewSegmentModal: FC = ({ setLoading(false) }, }) - } + }, [docForm, keywords, addSegment, datasetId, documentId, question, answer, notify, t, appSidebarExpand, CustomButton, handleCancel, onSave]) const wordCountText = useMemo(() => { - const count = isQAModel ? (question.length + answer.length) : question.length + const count = docForm === ChunkingMode.qa ? (question.length + answer.length) : question.length return `${formatNumber(count)} ${t('datasetDocuments.segment.characters', { count })}` - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [question.length, answer.length, isQAModel]) + }, [question.length, answer.length, docForm, t]) + + const isECOIndexing = indexingTechnique === IndexingType.ECONOMICAL return (
-
+
-
{ - t('datasetDocuments.segment.addChunk') - }
+
+ {t('datasetDocuments.segment.addChunk')} +
@@ -171,8 +176,8 @@ const NewSegmentModal: FC = ({
-
-
+
+
= ({ isEditMode={true} />
- {mode === 'custom' &&