fix: Fix parent child retrieval issues (#12206)

Co-authored-by: NFish <douxc512@gmail.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
This commit is contained in:
Wu Tianwei
2025-01-02 16:07:21 +08:00
committed by GitHub
parent 68757950ce
commit 09d759d196
34 changed files with 446 additions and 387 deletions

View File

@@ -31,17 +31,17 @@ import LanguageSelect from './language-select'
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
import cn from '@/utils/classnames'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
import Button from '@/app/components/base/button'
import FloatRightContainer from '@/app/components/base/float-right-container'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
import { type RetrievalConfig } from '@/types/app'
import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import Toast from '@/app/components/base/toast'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { ChunkingMode, DataSourceType, RerankingModeEnum } from '@/models/datasets'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { RETRIEVE_METHOD } from '@/types/app'
@@ -90,17 +90,13 @@ type StepTwoProps = {
onCancel?: () => void
}
export enum SegmentType {
AUTO = 'automatic',
CUSTOM = 'custom',
}
export enum IndexingType {
QUALIFIED = 'high_quality',
ECONOMICAL = 'economy',
}
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
const DEFAULT_MAXMIMUM_CHUNK_LENGTH = 500
const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
const DEFAULT_OVERLAP = 50
type ParentChildConfig = {
@@ -131,7 +127,6 @@ const StepTwo = ({
isSetting,
documentDetail,
isAPIKeySet,
onSetting,
datasetId,
indexingType,
dataSourceType: inCreatePageDataSourceType,
@@ -162,12 +157,12 @@ const StepTwo = ({
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.CUSTOM)
const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
}, [])
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXMIMUM_CHUNK_LENGTH) // default chunk length
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
const [rules, setRules] = useState<PreProcessingRule[]>([])
@@ -198,7 +193,6 @@ const StepTwo = ({
)
// QA Related
const [isLanguageSelectDisabled, _setIsLanguageSelectDisabled] = useState(false)
const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
const [docForm, setDocForm] = useState<ChunkingMode>(
(datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
@@ -348,7 +342,7 @@ const StepTwo = ({
}
const updatePreview = () => {
if (segmentationType === SegmentType.CUSTOM && maxChunkLength > 4000) {
if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
return
}
@@ -373,13 +367,42 @@ const StepTwo = ({
model: defaultEmbeddingModel?.model || '',
},
)
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: false,
reranking_model: {
reranking_provider_name: '',
reranking_model_name: '',
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
} as RetrievalConfig)
useEffect(() => {
if (currentDataset?.retrieval_model_dict)
return
setRetrievalConfig({
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: !!isRerankDefaultModelValid,
reranking_model: {
reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
})
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [rerankDefaultModel, isRerankDefaultModelValid])
const getCreationParams = () => {
let params
if (segmentationType === SegmentType.CUSTOM && overlap > maxChunkLength) {
if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
return
}
if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
return
}
@@ -389,7 +412,6 @@ const StepTwo = ({
doc_form: currentDocForm,
doc_language: docLanguage,
process_rule: getProcessRule(),
// eslint-disable-next-line @typescript-eslint/no-use-before-define
retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
embedding_model: embeddingModel.model, // Readonly
embedding_model_provider: embeddingModel.provider, // Readonly
@@ -400,10 +422,7 @@ const StepTwo = ({
const indexMethod = getIndexing_technique()
if (
!isReRankModelSelected({
rerankDefaultModel,
isRerankDefaultModelValid: !!isRerankDefaultModelValid,
rerankModelList,
// eslint-disable-next-line @typescript-eslint/no-use-before-define
retrievalConfig,
indexMethod: indexMethod as string,
})
@@ -411,16 +430,6 @@ const StepTwo = ({
Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
return
}
const postRetrievalConfig = ensureRerankModelSelected({
rerankDefaultModel: rerankDefaultModel!,
retrievalConfig: {
// eslint-disable-next-line @typescript-eslint/no-use-before-define
...retrievalConfig,
// eslint-disable-next-line @typescript-eslint/no-use-before-define
reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel,
},
indexMethod: indexMethod as string,
})
params = {
data_source: {
type: dataSourceType,
@@ -432,8 +441,7 @@ const StepTwo = ({
process_rule: getProcessRule(),
doc_form: currentDocForm,
doc_language: docLanguage,
retrieval_model: postRetrievalConfig,
retrieval_model: retrievalConfig,
embedding_model: embeddingModel.model,
embedding_model_provider: embeddingModel.provider,
} as CreateDocumentReq
@@ -490,7 +498,6 @@ const StepTwo = ({
const getDefaultMode = () => {
if (documentDetail)
// @ts-expect-error fix after api refactored
setSegmentationType(documentDetail.dataset_process_rule.mode)
}
@@ -525,7 +532,6 @@ const StepTwo = ({
onSuccess(data) {
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
updateResultCache && updateResultCache(data)
// eslint-disable-next-line @typescript-eslint/no-use-before-define
updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
},
},
@@ -545,14 +551,6 @@ const StepTwo = ({
isSetting && onSave && onSave()
}
const changeToEconomicalType = () => {
if (docForm !== ChunkingMode.text)
return
if (!hasSetIndexType)
setIndexType(IndexingType.ECONOMICAL)
}
useEffect(() => {
// fetch rules
if (!isSetting) {
@@ -574,18 +572,6 @@ const StepTwo = ({
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
}, [isAPIKeySet, indexingType, datasetId])
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: false,
reranking_model: {
reranking_provider_name: rerankDefaultModel?.provider.provider,
reranking_model_name: rerankDefaultModel?.model,
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
} as RetrievalConfig)
const economyDomRef = useRef<HTMLDivElement>(null)
const isHoveringEconomy = useHover(economyDomRef)
@@ -984,12 +970,14 @@ const StepTwo = ({
getIndexing_technique() === IndexingType.QUALIFIED
? (
<RetrievalMethodConfig
disabled={!!datasetId}
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
)
: (
<EconomicalRetrievalMethodConfig
disabled={!!datasetId}
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
@@ -1010,7 +998,7 @@ const StepTwo = ({
)
: (
<div className='flex items-center mt-8 py-2'>
<Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
{!datasetId && <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>}
<Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
</div>
)}
@@ -1081,11 +1069,11 @@ const StepTwo = ({
}
{
currentDocForm !== ChunkingMode.qa
&& <Badge text={t(
'datasetCreation.stepTwo.previewChunkCount', {
count: estimate?.total_segments || 0,
}) as string}
/>
&& <Badge text={t(
'datasetCreation.stepTwo.previewChunkCount', {
count: estimate?.total_segments || 0,
}) as string}
/>
}
</div>
</PreviewHeader>}