Feat/kb index (#20868)

Co-authored-by: twwu <twwu@dify.ai>
This commit is contained in:
Dongyu Li
2025-06-25 17:52:59 +08:00
committed by GitHub
parent 3acaa59885
commit 00f0b569cc
6 changed files with 80 additions and 85 deletions

View File

@@ -534,7 +534,7 @@ class IndexingRunner:
# chunk nodes by chunk size # chunk nodes by chunk size
indexing_start_at = time.perf_counter() indexing_start_at = time.perf_counter()
tokens = 0 tokens = 0
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX: if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy":
# create keyword index # create keyword index
create_keyword_thread = threading.Thread( create_keyword_thread = threading.Thread(
target=self._process_keyword_index, target=self._process_keyword_index,
@@ -572,7 +572,7 @@ class IndexingRunner:
for future in futures: for future in futures:
tokens += future.result() tokens += future.result()
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX: if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy":
create_keyword_thread.join() create_keyword_thread.join()
indexing_end_at = time.perf_counter() indexing_end_at = time.perf_counter()

View File

@@ -76,6 +76,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
if dataset.indexing_technique == "high_quality": if dataset.indexing_technique == "high_quality":
vector = Vector(dataset) vector = Vector(dataset)
vector.create(documents) vector.create(documents)
with_keywords = False
if with_keywords: if with_keywords:
keywords_list = kwargs.get("keywords_list") keywords_list = kwargs.get("keywords_list")
keyword = Keyword(dataset) keyword = Keyword(dataset)
@@ -91,6 +92,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
vector.delete_by_ids(node_ids) vector.delete_by_ids(node_ids)
else: else:
vector.delete() vector.delete()
with_keywords = False
if with_keywords: if with_keywords:
keyword = Keyword(dataset) keyword = Keyword(dataset)
if node_ids: if node_ids:

View File

@@ -97,16 +97,16 @@ class VectorService:
vector = Vector(dataset=dataset) vector = Vector(dataset=dataset)
vector.delete_by_ids([segment.index_node_id]) vector.delete_by_ids([segment.index_node_id])
vector.add_texts([document], duplicate_check=True) vector.add_texts([document], duplicate_check=True)
# update keyword index
keyword = Keyword(dataset)
keyword.delete_by_ids([segment.index_node_id])
# save keyword index
if keywords and len(keywords) > 0:
keyword.add_texts([document], keywords_list=[keywords])
else: else:
keyword.add_texts([document]) # update keyword index
keyword = Keyword(dataset)
keyword.delete_by_ids([segment.index_node_id])
# save keyword index
if keywords and len(keywords) > 0:
keyword.add_texts([document], keywords_list=[keywords])
else:
keyword.add_texts([document])
@classmethod @classmethod
def generate_child_chunks( def generate_child_chunks(

View File

@@ -1,4 +1,4 @@
import React, { type FC, useMemo, useState } from 'react' import React, { type FC, useCallback, useMemo, useState } from 'react'
import { useTranslation } from 'react-i18next' import { useTranslation } from 'react-i18next'
import { import {
RiCloseLine, RiCloseLine,
@@ -16,8 +16,10 @@ import { useSegmentListContext } from './index'
import { ChunkingMode, type SegmentDetailModel } from '@/models/datasets' import { ChunkingMode, type SegmentDetailModel } from '@/models/datasets'
import { useEventEmitterContextContext } from '@/context/event-emitter' import { useEventEmitterContextContext } from '@/context/event-emitter'
import { formatNumber } from '@/utils/format' import { formatNumber } from '@/utils/format'
import classNames from '@/utils/classnames' import cn from '@/utils/classnames'
import Divider from '@/app/components/base/divider' import Divider from '@/app/components/base/divider'
import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
import { IndexingType } from '../../../create/step-two'
type ISegmentDetailProps = { type ISegmentDetailProps = {
segInfo?: Partial<SegmentDetailModel> & { id: string } segInfo?: Partial<SegmentDetailModel> & { id: string }
@@ -48,6 +50,7 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen) const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen)
const mode = useDocumentContext(s => s.mode) const mode = useDocumentContext(s => s.mode)
const parentMode = useDocumentContext(s => s.parentMode) const parentMode = useDocumentContext(s => s.parentMode)
const indexingTechnique = useDatasetDetailContextWithSelector(s => s.dataset?.indexing_technique)
eventEmitter?.useSubscription((v) => { eventEmitter?.useSubscription((v) => {
if (v === 'update-segment') if (v === 'update-segment')
@@ -56,56 +59,41 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
setLoading(false) setLoading(false)
}) })
const handleCancel = () => { const handleCancel = useCallback(() => {
onCancel() onCancel()
} }, [onCancel])
const handleSave = () => { const handleSave = useCallback(() => {
onUpdate(segInfo?.id || '', question, answer, keywords) onUpdate(segInfo?.id || '', question, answer, keywords)
} }, [onUpdate, segInfo?.id, question, answer, keywords])
const handleRegeneration = () => { const handleRegeneration = useCallback(() => {
setShowRegenerationModal(true) setShowRegenerationModal(true)
} }, [])
const onCancelRegeneration = () => { const onCancelRegeneration = useCallback(() => {
setShowRegenerationModal(false) setShowRegenerationModal(false)
} }, [])
const onConfirmRegeneration = () => { const onConfirmRegeneration = useCallback(() => {
onUpdate(segInfo?.id || '', question, answer, keywords, true) onUpdate(segInfo?.id || '', question, answer, keywords, true)
} }, [onUpdate, segInfo?.id, question, answer, keywords])
const isParentChildMode = useMemo(() => {
return mode === 'hierarchical'
}, [mode])
const isFullDocMode = useMemo(() => {
return mode === 'hierarchical' && parentMode === 'full-doc'
}, [mode, parentMode])
const titleText = useMemo(() => {
return isEditMode ? t('datasetDocuments.segment.editChunk') : t('datasetDocuments.segment.chunkDetail')
}, [isEditMode, t])
const isQAModel = useMemo(() => {
return docForm === ChunkingMode.qa
}, [docForm])
const wordCountText = useMemo(() => { const wordCountText = useMemo(() => {
const contentLength = isQAModel ? (question.length + answer.length) : question.length const contentLength = docForm === ChunkingMode.qa ? (question.length + answer.length) : question.length
const total = formatNumber(isEditMode ? contentLength : segInfo!.word_count as number) const total = formatNumber(isEditMode ? contentLength : segInfo!.word_count as number)
const count = isEditMode ? contentLength : segInfo!.word_count as number const count = isEditMode ? contentLength : segInfo!.word_count as number
return `${total} ${t('datasetDocuments.segment.characters', { count })}` return `${total} ${t('datasetDocuments.segment.characters', { count })}`
}, [isEditMode, question.length, answer.length, isQAModel, segInfo, t]) }, [isEditMode, question.length, answer.length, docForm, segInfo, t])
const labelPrefix = useMemo(() => { const isFullDocMode = mode === 'hierarchical' && parentMode === 'full-doc'
return isParentChildMode ? t('datasetDocuments.segment.parentChunk') : t('datasetDocuments.segment.chunk') const titleText = isEditMode ? t('datasetDocuments.segment.editChunk') : t('datasetDocuments.segment.chunkDetail')
}, [isParentChildMode, t]) const labelPrefix = mode === 'hierarchical' ? t('datasetDocuments.segment.parentChunk') : t('datasetDocuments.segment.chunk')
const isECOIndexing = indexingTechnique === IndexingType.ECONOMICAL
return ( return (
<div className={'flex h-full flex-col'}> <div className={'flex h-full flex-col'}>
<div className={classNames('flex items-center justify-between', fullScreen ? 'py-3 pr-4 pl-6 border border-divider-subtle' : 'pt-3 pr-3 pl-4')}> <div className={cn('flex items-center justify-between', fullScreen ? 'border border-divider-subtle py-3 pl-6 pr-4' : 'pl-4 pr-3 pt-3')}>
<div className='flex flex-col'> <div className='flex flex-col'>
<div className='system-xl-semibold text-text-primary'>{titleText}</div> <div className='system-xl-semibold text-text-primary'>{titleText}</div>
<div className='flex items-center gap-x-2'> <div className='flex items-center gap-x-2'>
@@ -134,12 +122,12 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
</div> </div>
</div> </div>
</div> </div>
<div className={classNames( <div className={cn(
'flex grow', 'flex grow',
fullScreen ? 'w-full flex-row justify-center px-6 pt-6 gap-x-8' : 'flex-col gap-y-1 py-3 px-4', fullScreen ? 'w-full flex-row justify-center gap-x-8 px-6 pt-6' : 'flex-col gap-y-1 px-4 py-3',
!isEditMode && 'pb-0 overflow-hidden', !isEditMode && 'overflow-hidden pb-0',
)}> )}>
<div className={classNames(isEditMode ? 'break-all whitespace-pre-line overflow-hidden' : 'overflow-y-auto', fullScreen ? 'w-1/2' : 'grow')}> <div className={cn(isEditMode ? 'overflow-hidden whitespace-pre-line break-all' : 'overflow-y-auto', fullScreen ? 'w-1/2' : 'grow')}>
<ChunkContent <ChunkContent
docForm={docForm} docForm={docForm}
question={question} question={question}
@@ -149,7 +137,7 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
isEditMode={isEditMode} isEditMode={isEditMode}
/> />
</div> </div>
{mode === 'custom' && <Keywords {isECOIndexing && <Keywords
className={fullScreen ? 'w-1/5' : ''} className={fullScreen ? 'w-1/5' : ''}
actionType={isEditMode ? 'edit' : 'view'} actionType={isEditMode ? 'edit' : 'view'}
segInfo={segInfo} segInfo={segInfo}

View File

@@ -1,4 +1,4 @@
import { memo, useMemo, useRef, useState } from 'react' import { memo, useCallback, useMemo, useRef, useState } from 'react'
import type { FC } from 'react' import type { FC } from 'react'
import { useTranslation } from 'react-i18next' import { useTranslation } from 'react-i18next'
import { useContext } from 'use-context-selector' import { useContext } from 'use-context-selector'
@@ -12,7 +12,6 @@ import Keywords from './completed/common/keywords'
import ChunkContent from './completed/common/chunk-content' import ChunkContent from './completed/common/chunk-content'
import AddAnother from './completed/common/add-another' import AddAnother from './completed/common/add-another'
import Dot from './completed/common/dot' import Dot from './completed/common/dot'
import { useDocumentContext } from './index'
import { useStore as useAppStore } from '@/app/components/app/store' import { useStore as useAppStore } from '@/app/components/app/store'
import { ToastContext } from '@/app/components/base/toast' import { ToastContext } from '@/app/components/base/toast'
import { ChunkingMode, type SegmentUpdater } from '@/models/datasets' import { ChunkingMode, type SegmentUpdater } from '@/models/datasets'
@@ -20,6 +19,8 @@ import classNames from '@/utils/classnames'
import { formatNumber } from '@/utils/format' import { formatNumber } from '@/utils/format'
import Divider from '@/app/components/base/divider' import Divider from '@/app/components/base/divider'
import { useAddSegment } from '@/service/knowledge/use-segment' import { useAddSegment } from '@/service/knowledge/use-segment'
import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
import { IndexingType } from '../../create/step-two'
type NewSegmentModalProps = { type NewSegmentModalProps = {
onCancel: () => void onCancel: () => void
@@ -44,39 +45,37 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
const [addAnother, setAddAnother] = useState(true) const [addAnother, setAddAnother] = useState(true)
const fullScreen = useSegmentListContext(s => s.fullScreen) const fullScreen = useSegmentListContext(s => s.fullScreen)
const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen) const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen)
const mode = useDocumentContext(s => s.mode) const indexingTechnique = useDatasetDetailContextWithSelector(s => s.dataset?.indexing_technique)
const { appSidebarExpand } = useAppStore(useShallow(state => ({ const { appSidebarExpand } = useAppStore(useShallow(state => ({
appSidebarExpand: state.appSidebarExpand, appSidebarExpand: state.appSidebarExpand,
}))) })))
const refreshTimer = useRef<any>(null) const refreshTimer = useRef<any>(null)
const CustomButton = <> const CustomButton = useMemo(() => (
<Divider type='vertical' className='mx-1 h-3 bg-divider-regular' /> <>
<button <Divider type='vertical' className='mx-1 h-3 bg-divider-regular' />
type='button' <button
className='system-xs-semibold text-text-accent' type='button'
onClick={() => { className='system-xs-semibold text-text-accent'
clearTimeout(refreshTimer.current) onClick={() => {
viewNewlyAddedChunk() clearTimeout(refreshTimer.current)
}}> viewNewlyAddedChunk()
{t('common.operation.view')} }}>
</button> {t('common.operation.view')}
</> </button>
</>
), [viewNewlyAddedChunk, t])
const isQAModel = useMemo(() => { const handleCancel = useCallback((actionType: 'esc' | 'add' = 'esc') => {
return docForm === ChunkingMode.qa
}, [docForm])
const handleCancel = (actionType: 'esc' | 'add' = 'esc') => {
if (actionType === 'esc' || !addAnother) if (actionType === 'esc' || !addAnother)
onCancel() onCancel()
} }, [onCancel, addAnother])
const { mutateAsync: addSegment } = useAddSegment() const { mutateAsync: addSegment } = useAddSegment()
const handleSave = async () => { const handleSave = useCallback(async () => {
const params: SegmentUpdater = { content: '' } const params: SegmentUpdater = { content: '' }
if (isQAModel) { if (docForm === ChunkingMode.qa) {
if (!question.trim()) { if (!question.trim()) {
return notify({ return notify({
type: 'error', type: 'error',
@@ -129,21 +128,27 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
setLoading(false) setLoading(false)
}, },
}) })
} }, [docForm, keywords, addSegment, datasetId, documentId, question, answer, notify, t, appSidebarExpand, CustomButton, handleCancel, onSave])
const wordCountText = useMemo(() => { const wordCountText = useMemo(() => {
const count = isQAModel ? (question.length + answer.length) : question.length const count = docForm === ChunkingMode.qa ? (question.length + answer.length) : question.length
return `${formatNumber(count)} ${t('datasetDocuments.segment.characters', { count })}` return `${formatNumber(count)} ${t('datasetDocuments.segment.characters', { count })}`
// eslint-disable-next-line react-hooks/exhaustive-deps }, [question.length, answer.length, docForm, t])
}, [question.length, answer.length, isQAModel])
const isECOIndexing = indexingTechnique === IndexingType.ECONOMICAL
return ( return (
<div className={'flex h-full flex-col'}> <div className={'flex h-full flex-col'}>
<div className={classNames('flex items-center justify-between', fullScreen ? 'py-3 pr-4 pl-6 border border-divider-subtle' : 'pt-3 pr-3 pl-4')}> <div
className={classNames(
'flex items-center justify-between',
fullScreen ? 'border border-divider-subtle py-3 pl-6 pr-4' : 'pl-4 pr-3 pt-3',
)}
>
<div className='flex flex-col'> <div className='flex flex-col'>
<div className='system-xl-semibold text-text-primary'>{ <div className='system-xl-semibold text-text-primary'>
t('datasetDocuments.segment.addChunk') {t('datasetDocuments.segment.addChunk')}
}</div> </div>
<div className='flex items-center gap-x-2'> <div className='flex items-center gap-x-2'>
<SegmentIndexTag label={t('datasetDocuments.segment.newChunk')!} /> <SegmentIndexTag label={t('datasetDocuments.segment.newChunk')!} />
<Dot /> <Dot />
@@ -171,8 +176,8 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
</div> </div>
</div> </div>
</div> </div>
<div className={classNames('flex grow', fullScreen ? 'w-full flex-row justify-center px-6 pt-6 gap-x-8' : 'flex-col gap-y-1 py-3 px-4')}> <div className={classNames('flex grow', fullScreen ? 'w-full flex-row justify-center gap-x-8 px-6 pt-6' : 'flex-col gap-y-1 px-4 py-3')}>
<div className={classNames('break-all overflow-hidden whitespace-pre-line', fullScreen ? 'w-1/2' : 'grow')}> <div className={classNames('overflow-hidden whitespace-pre-line break-all', fullScreen ? 'w-1/2' : 'grow')}>
<ChunkContent <ChunkContent
docForm={docForm} docForm={docForm}
question={question} question={question}
@@ -182,7 +187,7 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
isEditMode={true} isEditMode={true}
/> />
</div> </div>
{mode === 'custom' && <Keywords {isECOIndexing && <Keywords
className={fullScreen ? 'w-1/5' : ''} className={fullScreen ? 'w-1/5' : ''}
actionType='add' actionType='add'
keywords={keywords} keywords={keywords}

View File

@@ -213,7 +213,7 @@ export default combine(
settings: { settings: {
tailwindcss: { tailwindcss: {
// These are the default values but feel free to customize // These are the default values but feel free to customize
callees: ['classnames', 'clsx', 'ctl', 'cn'], callees: ['classnames', 'clsx', 'ctl', 'cn', 'classNames'],
config: 'tailwind.config.js', // returned from `loadConfig()` utility if not provided config: 'tailwind.config.js', // returned from `loadConfig()` utility if not provided
cssFiles: [ cssFiles: [
'**/*.css', '**/*.css',