Feat: chunk overlap supported (#2209)
Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
}
|
||||
|
||||
.form .label {
|
||||
@apply pt-6 pb-2;
|
||||
@apply pt-6 pb-2 flex items-center;
|
||||
font-weight: 500;
|
||||
font-size: 16px;
|
||||
line-height: 24px;
|
||||
|
@@ -33,13 +33,14 @@ import { DataSourceType, DocForm } from '@/models/datasets'
|
||||
import NotionIcon from '@/app/components/base/notion-icon'
|
||||
import Switch from '@/app/components/base/switch'
|
||||
import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
|
||||
import { XClose } from '@/app/components/base/icons/src/vender/line/general'
|
||||
import { HelpCircle, XClose } from '@/app/components/base/icons/src/vender/line/general'
|
||||
import { useDatasetDetailContext } from '@/context/dataset-detail'
|
||||
import I18n from '@/context/i18n'
|
||||
import { IS_CE_EDITION } from '@/config'
|
||||
import { RETRIEVE_METHOD } from '@/types/app'
|
||||
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
|
||||
import Tooltip from '@/app/components/base/tooltip'
|
||||
import TooltipPlus from '@/app/components/base/tooltip-plus'
|
||||
import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
||||
import { LanguagesSupportedUnderscore, getModelRuntimeSupported } from '@/utils/language'
|
||||
|
||||
@@ -99,7 +100,8 @@ const StepTwo = ({
|
||||
const [previewScrolled, setPreviewScrolled] = useState(false)
|
||||
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
|
||||
const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
|
||||
const [max, setMax] = useState(1000)
|
||||
const [max, setMax] = useState(500)
|
||||
const [overlap, setOverlap] = useState(50)
|
||||
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
||||
const [defaultConfig, setDefaultConfig] = useState<Rules>()
|
||||
const hasSetIndexType = !!indexingType
|
||||
@@ -171,6 +173,7 @@ const StepTwo = ({
|
||||
if (defaultConfig) {
|
||||
setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
|
||||
setMax(defaultConfig.segmentation.max_tokens)
|
||||
setOverlap(defaultConfig.segmentation.chunk_overlap)
|
||||
setRules(defaultConfig.pre_processing_rules)
|
||||
}
|
||||
}
|
||||
@@ -207,6 +210,7 @@ const StepTwo = ({
|
||||
segmentation: {
|
||||
separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
|
||||
max_tokens: max,
|
||||
chunk_overlap: overlap,
|
||||
},
|
||||
}
|
||||
processRule.rules = ruleObj
|
||||
@@ -275,6 +279,10 @@ const StepTwo = ({
|
||||
} = useModelListAndDefaultModelAndCurrentProviderAndModel(3)
|
||||
const getCreationParams = () => {
|
||||
let params
|
||||
if (segmentationType === SegmentType.CUSTOM && overlap > max) {
|
||||
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
|
||||
return
|
||||
}
|
||||
if (isSetting) {
|
||||
params = {
|
||||
original_document_id: documentDetail?.id,
|
||||
@@ -337,6 +345,7 @@ const StepTwo = ({
|
||||
const separator = res.rules.segmentation.separator
|
||||
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
|
||||
setMax(res.rules.segmentation.max_tokens)
|
||||
setOverlap(res.rules.segmentation.chunk_overlap)
|
||||
setRules(res.rules.pre_processing_rules)
|
||||
setDefaultConfig(res.rules)
|
||||
}
|
||||
@@ -350,8 +359,10 @@ const StepTwo = ({
|
||||
const rules = documentDetail.dataset_process_rule.rules
|
||||
const separator = rules.segmentation.separator
|
||||
const max = rules.segmentation.max_tokens
|
||||
const overlap = rules.segmentation.chunk_overlap
|
||||
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
|
||||
setMax(max)
|
||||
setOverlap(overlap)
|
||||
setRules(rules.pre_processing_rules)
|
||||
setDefaultConfig(rules)
|
||||
}
|
||||
@@ -569,13 +580,35 @@ const StepTwo = ({
|
||||
<input
|
||||
type="number"
|
||||
className={s.input}
|
||||
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
|
||||
placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
|
||||
value={max}
|
||||
min={1}
|
||||
onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className={s.formRow}>
|
||||
<div className='w-full'>
|
||||
<div className={s.label}>
|
||||
{t('datasetCreation.stepTwo.overlap')}
|
||||
<TooltipPlus popupContent={
|
||||
<div className='max-w-[200px]'>
|
||||
{t('datasetCreation.stepTwo.overlapTip')}
|
||||
</div>
|
||||
}>
|
||||
<HelpCircle className='ml-1 w-3.5 h-3.5 text-gray-400' />
|
||||
</TooltipPlus>
|
||||
</div>
|
||||
<input
|
||||
type="number"
|
||||
className={s.input}
|
||||
placeholder={t('datasetCreation.stepTwo.overlap') || ''}
|
||||
value={overlap}
|
||||
min={1}
|
||||
onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className={s.formRow}>
|
||||
<div className='w-full flex flex-col gap-1'>
|
||||
<div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
|
||||
|
Reference in New Issue
Block a user