Feat/kb index (#20868)

Co-authored-by: twwu <twwu@dify.ai>
This commit is contained in:
Dongyu Li
2025-06-25 17:52:59 +08:00
committed by GitHub
parent 3acaa59885
commit 00f0b569cc
6 changed files with 80 additions and 85 deletions

View File

@@ -534,7 +534,7 @@ class IndexingRunner:
# chunk nodes by chunk size
indexing_start_at = time.perf_counter()
tokens = 0
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy":
# create keyword index
create_keyword_thread = threading.Thread(
target=self._process_keyword_index,
@@ -572,7 +572,7 @@ class IndexingRunner:
for future in futures:
tokens += future.result()
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy":
create_keyword_thread.join()
indexing_end_at = time.perf_counter()

View File

@@ -76,6 +76,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
if dataset.indexing_technique == "high_quality":
vector = Vector(dataset)
vector.create(documents)
with_keywords = False
if with_keywords:
keywords_list = kwargs.get("keywords_list")
keyword = Keyword(dataset)
@@ -91,6 +92,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
vector.delete_by_ids(node_ids)
else:
vector.delete()
with_keywords = False
if with_keywords:
keyword = Keyword(dataset)
if node_ids:

View File

@@ -97,16 +97,16 @@ class VectorService:
vector = Vector(dataset=dataset)
vector.delete_by_ids([segment.index_node_id])
vector.add_texts([document], duplicate_check=True)
# update keyword index
keyword = Keyword(dataset)
keyword.delete_by_ids([segment.index_node_id])
# save keyword index
if keywords and len(keywords) > 0:
keyword.add_texts([document], keywords_list=[keywords])
else:
keyword.add_texts([document])
# update keyword index
keyword = Keyword(dataset)
keyword.delete_by_ids([segment.index_node_id])
# save keyword index
if keywords and len(keywords) > 0:
keyword.add_texts([document], keywords_list=[keywords])
else:
keyword.add_texts([document])
@classmethod
def generate_child_chunks(

View File

@@ -1,4 +1,4 @@
import React, { type FC, useMemo, useState } from 'react'
import React, { type FC, useCallback, useMemo, useState } from 'react'
import { useTranslation } from 'react-i18next'
import {
RiCloseLine,
@@ -16,8 +16,10 @@ import { useSegmentListContext } from './index'
import { ChunkingMode, type SegmentDetailModel } from '@/models/datasets'
import { useEventEmitterContextContext } from '@/context/event-emitter'
import { formatNumber } from '@/utils/format'
import classNames from '@/utils/classnames'
import cn from '@/utils/classnames'
import Divider from '@/app/components/base/divider'
import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
import { IndexingType } from '../../../create/step-two'
type ISegmentDetailProps = {
segInfo?: Partial<SegmentDetailModel> & { id: string }
@@ -48,6 +50,7 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen)
const mode = useDocumentContext(s => s.mode)
const parentMode = useDocumentContext(s => s.parentMode)
const indexingTechnique = useDatasetDetailContextWithSelector(s => s.dataset?.indexing_technique)
eventEmitter?.useSubscription((v) => {
if (v === 'update-segment')
@@ -56,56 +59,41 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
setLoading(false)
})
const handleCancel = () => {
const handleCancel = useCallback(() => {
onCancel()
}
}, [onCancel])
const handleSave = () => {
const handleSave = useCallback(() => {
onUpdate(segInfo?.id || '', question, answer, keywords)
}
}, [onUpdate, segInfo?.id, question, answer, keywords])
const handleRegeneration = () => {
const handleRegeneration = useCallback(() => {
setShowRegenerationModal(true)
}
}, [])
const onCancelRegeneration = () => {
const onCancelRegeneration = useCallback(() => {
setShowRegenerationModal(false)
}
}, [])
const onConfirmRegeneration = () => {
const onConfirmRegeneration = useCallback(() => {
onUpdate(segInfo?.id || '', question, answer, keywords, true)
}
const isParentChildMode = useMemo(() => {
return mode === 'hierarchical'
}, [mode])
const isFullDocMode = useMemo(() => {
return mode === 'hierarchical' && parentMode === 'full-doc'
}, [mode, parentMode])
const titleText = useMemo(() => {
return isEditMode ? t('datasetDocuments.segment.editChunk') : t('datasetDocuments.segment.chunkDetail')
}, [isEditMode, t])
const isQAModel = useMemo(() => {
return docForm === ChunkingMode.qa
}, [docForm])
}, [onUpdate, segInfo?.id, question, answer, keywords])
const wordCountText = useMemo(() => {
const contentLength = isQAModel ? (question.length + answer.length) : question.length
const contentLength = docForm === ChunkingMode.qa ? (question.length + answer.length) : question.length
const total = formatNumber(isEditMode ? contentLength : segInfo!.word_count as number)
const count = isEditMode ? contentLength : segInfo!.word_count as number
return `${total} ${t('datasetDocuments.segment.characters', { count })}`
}, [isEditMode, question.length, answer.length, isQAModel, segInfo, t])
}, [isEditMode, question.length, answer.length, docForm, segInfo, t])
const labelPrefix = useMemo(() => {
return isParentChildMode ? t('datasetDocuments.segment.parentChunk') : t('datasetDocuments.segment.chunk')
}, [isParentChildMode, t])
const isFullDocMode = mode === 'hierarchical' && parentMode === 'full-doc'
const titleText = isEditMode ? t('datasetDocuments.segment.editChunk') : t('datasetDocuments.segment.chunkDetail')
const labelPrefix = mode === 'hierarchical' ? t('datasetDocuments.segment.parentChunk') : t('datasetDocuments.segment.chunk')
const isECOIndexing = indexingTechnique === IndexingType.ECONOMICAL
return (
<div className={'flex h-full flex-col'}>
<div className={classNames('flex items-center justify-between', fullScreen ? 'py-3 pr-4 pl-6 border border-divider-subtle' : 'pt-3 pr-3 pl-4')}>
<div className={cn('flex items-center justify-between', fullScreen ? 'border border-divider-subtle py-3 pl-6 pr-4' : 'pl-4 pr-3 pt-3')}>
<div className='flex flex-col'>
<div className='system-xl-semibold text-text-primary'>{titleText}</div>
<div className='flex items-center gap-x-2'>
@@ -134,12 +122,12 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
</div>
</div>
</div>
<div className={classNames(
<div className={cn(
'flex grow',
fullScreen ? 'w-full flex-row justify-center px-6 pt-6 gap-x-8' : 'flex-col gap-y-1 py-3 px-4',
!isEditMode && 'pb-0 overflow-hidden',
fullScreen ? 'w-full flex-row justify-center gap-x-8 px-6 pt-6' : 'flex-col gap-y-1 px-4 py-3',
!isEditMode && 'overflow-hidden pb-0',
)}>
<div className={classNames(isEditMode ? 'break-all whitespace-pre-line overflow-hidden' : 'overflow-y-auto', fullScreen ? 'w-1/2' : 'grow')}>
<div className={cn(isEditMode ? 'overflow-hidden whitespace-pre-line break-all' : 'overflow-y-auto', fullScreen ? 'w-1/2' : 'grow')}>
<ChunkContent
docForm={docForm}
question={question}
@@ -149,7 +137,7 @@ const SegmentDetail: FC<ISegmentDetailProps> = ({
isEditMode={isEditMode}
/>
</div>
{mode === 'custom' && <Keywords
{isECOIndexing && <Keywords
className={fullScreen ? 'w-1/5' : ''}
actionType={isEditMode ? 'edit' : 'view'}
segInfo={segInfo}

View File

@@ -1,4 +1,4 @@
import { memo, useMemo, useRef, useState } from 'react'
import { memo, useCallback, useMemo, useRef, useState } from 'react'
import type { FC } from 'react'
import { useTranslation } from 'react-i18next'
import { useContext } from 'use-context-selector'
@@ -12,7 +12,6 @@ import Keywords from './completed/common/keywords'
import ChunkContent from './completed/common/chunk-content'
import AddAnother from './completed/common/add-another'
import Dot from './completed/common/dot'
import { useDocumentContext } from './index'
import { useStore as useAppStore } from '@/app/components/app/store'
import { ToastContext } from '@/app/components/base/toast'
import { ChunkingMode, type SegmentUpdater } from '@/models/datasets'
@@ -20,6 +19,8 @@ import classNames from '@/utils/classnames'
import { formatNumber } from '@/utils/format'
import Divider from '@/app/components/base/divider'
import { useAddSegment } from '@/service/knowledge/use-segment'
import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
import { IndexingType } from '../../create/step-two'
type NewSegmentModalProps = {
onCancel: () => void
@@ -44,39 +45,37 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
const [addAnother, setAddAnother] = useState(true)
const fullScreen = useSegmentListContext(s => s.fullScreen)
const toggleFullScreen = useSegmentListContext(s => s.toggleFullScreen)
const mode = useDocumentContext(s => s.mode)
const indexingTechnique = useDatasetDetailContextWithSelector(s => s.dataset?.indexing_technique)
const { appSidebarExpand } = useAppStore(useShallow(state => ({
appSidebarExpand: state.appSidebarExpand,
})))
const refreshTimer = useRef<any>(null)
const CustomButton = <>
<Divider type='vertical' className='mx-1 h-3 bg-divider-regular' />
<button
type='button'
className='system-xs-semibold text-text-accent'
onClick={() => {
clearTimeout(refreshTimer.current)
viewNewlyAddedChunk()
}}>
{t('common.operation.view')}
</button>
</>
const CustomButton = useMemo(() => (
<>
<Divider type='vertical' className='mx-1 h-3 bg-divider-regular' />
<button
type='button'
className='system-xs-semibold text-text-accent'
onClick={() => {
clearTimeout(refreshTimer.current)
viewNewlyAddedChunk()
}}>
{t('common.operation.view')}
</button>
</>
), [viewNewlyAddedChunk, t])
const isQAModel = useMemo(() => {
return docForm === ChunkingMode.qa
}, [docForm])
const handleCancel = (actionType: 'esc' | 'add' = 'esc') => {
const handleCancel = useCallback((actionType: 'esc' | 'add' = 'esc') => {
if (actionType === 'esc' || !addAnother)
onCancel()
}
}, [onCancel, addAnother])
const { mutateAsync: addSegment } = useAddSegment()
const handleSave = async () => {
const handleSave = useCallback(async () => {
const params: SegmentUpdater = { content: '' }
if (isQAModel) {
if (docForm === ChunkingMode.qa) {
if (!question.trim()) {
return notify({
type: 'error',
@@ -129,21 +128,27 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
setLoading(false)
},
})
}
}, [docForm, keywords, addSegment, datasetId, documentId, question, answer, notify, t, appSidebarExpand, CustomButton, handleCancel, onSave])
const wordCountText = useMemo(() => {
const count = isQAModel ? (question.length + answer.length) : question.length
const count = docForm === ChunkingMode.qa ? (question.length + answer.length) : question.length
return `${formatNumber(count)} ${t('datasetDocuments.segment.characters', { count })}`
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [question.length, answer.length, isQAModel])
}, [question.length, answer.length, docForm, t])
const isECOIndexing = indexingTechnique === IndexingType.ECONOMICAL
return (
<div className={'flex h-full flex-col'}>
<div className={classNames('flex items-center justify-between', fullScreen ? 'py-3 pr-4 pl-6 border border-divider-subtle' : 'pt-3 pr-3 pl-4')}>
<div
className={classNames(
'flex items-center justify-between',
fullScreen ? 'border border-divider-subtle py-3 pl-6 pr-4' : 'pl-4 pr-3 pt-3',
)}
>
<div className='flex flex-col'>
<div className='system-xl-semibold text-text-primary'>{
t('datasetDocuments.segment.addChunk')
}</div>
<div className='system-xl-semibold text-text-primary'>
{t('datasetDocuments.segment.addChunk')}
</div>
<div className='flex items-center gap-x-2'>
<SegmentIndexTag label={t('datasetDocuments.segment.newChunk')!} />
<Dot />
@@ -171,8 +176,8 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
</div>
</div>
</div>
<div className={classNames('flex grow', fullScreen ? 'w-full flex-row justify-center px-6 pt-6 gap-x-8' : 'flex-col gap-y-1 py-3 px-4')}>
<div className={classNames('break-all overflow-hidden whitespace-pre-line', fullScreen ? 'w-1/2' : 'grow')}>
<div className={classNames('flex grow', fullScreen ? 'w-full flex-row justify-center gap-x-8 px-6 pt-6' : 'flex-col gap-y-1 px-4 py-3')}>
<div className={classNames('overflow-hidden whitespace-pre-line break-all', fullScreen ? 'w-1/2' : 'grow')}>
<ChunkContent
docForm={docForm}
question={question}
@@ -182,7 +187,7 @@ const NewSegmentModal: FC<NewSegmentModalProps> = ({
isEditMode={true}
/>
</div>
{mode === 'custom' && <Keywords
{isECOIndexing && <Keywords
className={fullScreen ? 'w-1/5' : ''}
actionType='add'
keywords={keywords}

View File

@@ -213,7 +213,7 @@ export default combine(
settings: {
tailwindcss: {
// These are the default values but feel free to customize
callees: ['classnames', 'clsx', 'ctl', 'cn'],
callees: ['classnames', 'clsx', 'ctl', 'cn', 'classNames'],
config: 'tailwind.config.js', // returned from `loadConfig()` utility if not provided
cssFiles: [
'**/*.css',