chore: improve delimiter (#8552)
This commit is contained in:
18
web/app/components/datasets/create/step-two/escape.ts
Normal file
18
web/app/components/datasets/create/step-two/escape.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
/**
 * Printable spelling for every character that `escape` rewrites:
 * the backslash itself, the control characters NUL, backspace, form feed,
 * line feed, carriage return, tab and vertical tab, and the single quote.
 */
const ESCAPE_REPLACEMENTS: Record<string, string> = {
  '\\': '\\\\',
  '\0': '\\0',
  '\b': '\\b',
  '\f': '\\f',
  '\n': '\\n',
  '\r': '\\r',
  '\t': '\\t',
  '\v': '\\v',
  '\'': '\\\'',
}

// Character class listing exactly the keys of ESCAPE_REPLACEMENTS.
const ESCAPE_TARGETS = /[\\\x00\x08\f\n\r\t\v']/g

/**
 * Rewrites special characters in `input` as visible backslash sequences,
 * e.g. a real line feed becomes the two characters `\` and `n`.
 *
 * @param input - Raw text to make printable.
 * @returns The escaped text, or `''` when `input` is empty or not a string.
 */
function escape(input: string): string {
  if (typeof input !== 'string' || input.length === 0)
    return ''

  // One pass is enough: no replacement text contains a character that is
  // itself a replacement target other than the backslash it just emitted.
  return input.replace(ESCAPE_TARGETS, ch => ESCAPE_REPLACEMENTS[ch] ?? ch)
}
|
||||
|
||||
export default escape
|
@@ -1,5 +1,5 @@
|
||||
'use client'
|
||||
import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
|
||||
import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { useContext } from 'use-context-selector'
|
||||
import { useBoolean } from 'ahooks'
|
||||
@@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es'
|
||||
import PreviewItem, { PreviewType } from './preview-item'
|
||||
import LanguageSelect from './language-select'
|
||||
import s from './index.module.css'
|
||||
import unescape from './unescape'
|
||||
import escape from './escape'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
||||
import {
|
||||
@@ -78,6 +80,8 @@ enum IndexingType {
|
||||
ECONOMICAL = 'economy',
|
||||
}
|
||||
|
||||
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
||||
|
||||
const StepTwo = ({
|
||||
isSetting,
|
||||
documentDetail,
|
||||
@@ -110,8 +114,11 @@ const StepTwo = ({
|
||||
const previewScrollRef = useRef<HTMLDivElement>(null)
|
||||
const [previewScrolled, setPreviewScrolled] = useState(false)
|
||||
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
|
||||
const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
|
||||
const [max, setMax] = useState(5000) // default chunk length
|
||||
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
|
||||
const setSegmentIdentifier = useCallback((value: string) => {
|
||||
doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
|
||||
}, [])
|
||||
const [max, setMax] = useState(4000) // default chunk length
|
||||
const [overlap, setOverlap] = useState(50)
|
||||
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
||||
const [defaultConfig, setDefaultConfig] = useState<Rules>()
|
||||
@@ -183,7 +190,7 @@ const StepTwo = ({
|
||||
}
|
||||
const resetRules = () => {
|
||||
if (defaultConfig) {
|
||||
setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
|
||||
setSegmentIdentifier(defaultConfig.segmentation.separator)
|
||||
setMax(defaultConfig.segmentation.max_tokens)
|
||||
setOverlap(defaultConfig.segmentation.chunk_overlap)
|
||||
setRules(defaultConfig.pre_processing_rules)
|
||||
@@ -217,7 +224,7 @@ const StepTwo = ({
|
||||
const ruleObj = {
|
||||
pre_processing_rules: rules,
|
||||
segmentation: {
|
||||
separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
|
||||
separator: unescape(segmentIdentifier),
|
||||
max_tokens: max,
|
||||
chunk_overlap: overlap,
|
||||
},
|
||||
@@ -394,7 +401,7 @@ const StepTwo = ({
|
||||
try {
|
||||
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
|
||||
const separator = res.rules.segmentation.separator
|
||||
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
|
||||
setSegmentIdentifier(separator)
|
||||
setMax(res.rules.segmentation.max_tokens)
|
||||
setOverlap(res.rules.segmentation.chunk_overlap)
|
||||
setRules(res.rules.pre_processing_rules)
|
||||
@@ -411,7 +418,7 @@ const StepTwo = ({
|
||||
const separator = rules.segmentation.separator
|
||||
const max = rules.segmentation.max_tokens
|
||||
const overlap = rules.segmentation.chunk_overlap
|
||||
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
|
||||
setSegmentIdentifier(separator)
|
||||
setMax(max)
|
||||
setOverlap(overlap)
|
||||
setRules(rules.pre_processing_rules)
|
||||
@@ -616,12 +623,22 @@ const StepTwo = ({
|
||||
<div className={s.typeFormBody}>
|
||||
<div className={s.formRow}>
|
||||
<div className='w-full'>
|
||||
<div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div>
|
||||
<div className={s.label}>
|
||||
{t('datasetCreation.stepTwo.separator')}
|
||||
<Tooltip
|
||||
popupContent={
|
||||
<div className='max-w-[200px]'>
|
||||
{t('datasetCreation.stepTwo.separatorTip')}
|
||||
</div>
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
<input
|
||||
type="text"
|
||||
className={s.input}
|
||||
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
|
||||
onChange={e => setSegmentIdentifier(e.target.value)}
|
||||
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
|
||||
value={segmentIdentifier}
|
||||
onChange={e => doSetSegmentIdentifier(e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
54
web/app/components/datasets/create/step-two/unescape.ts
Normal file
54
web/app/components/datasets/create/step-two/unescape.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
// https://github.com/iamakulov/unescape-js/blob/master/src/index.js
|
||||
|
||||
/**
|
||||
* \\ - matches the backslash which indicates the beginning of an escape sequence
|
||||
* (
|
||||
* u\{([0-9A-Fa-f]+)\} - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
|
||||
* |
|
||||
* u([0-9A-Fa-f]{4}) - second alternative; matches the 4-digit hexadecimal escape sequence (\uABCD)
|
||||
* |
|
||||
* x([0-9A-Fa-f]{2}) - third alternative; matches the 2-digit hexadecimal escape sequence (\xA5)
|
||||
* |
|
||||
* ([1-7][0-7]{0,2}|[0-7]{2,3}) - fourth alternative; matches the up-to-3-digit octal escape sequence (\5 or \512)
|
||||
* |
|
||||
* (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
|
||||
* |
|
||||
* \U([0-9A-Fa-f]{8}) - sixth alternative (written outside the group above, with its own leading backslash); matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
|
||||
* )
|
||||
*/
|
||||
// Matches one backslash escape sequence. Capture groups, in order:
//   1 - everything after the backslash for the JS-style alternatives
//   2 - hex digits of \u{...}      3 - hex digits of \uXXXX
//   4 - hex digits of \xXX         5 - octal digits
//   6 - single special character   7 - hex digits of python-style \UXXXXXXXX
const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g

// Character following the backslash -> the character it stands for.
const usualEscapeSequences: Record<string, string> = {
  '0': '\0',
  'b': '\b',
  'f': '\f',
  'n': '\n',
  'r': '\r',
  't': '\t',
  'v': '\v',
  '\'': '\'',
  '"': '"',
  '\\': '\\',
}

// Largest value String.fromCodePoint accepts; anything above throws RangeError.
const MAX_CODE_POINT = 0x10FFFF

/**
 * Converts a run of digits in the given radix to the matching character.
 * Returns `undefined` when the value is not a valid Unicode code point,
 * so callers can decide what to do instead of catching a RangeError.
 */
const fromDigits = (digits: string, radix: number): string | undefined => {
  const codePoint = parseInt(digits, radix)
  // The comparison is also false for NaN and Infinity.
  return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : undefined
}

const fromHex = (str: string) => fromDigits(str, 16)
const fromOct = (str: string) => fromDigits(str, 8)

/**
 * Replaces backslash escape sequences in `str` with the characters they denote.
 *
 * Supported forms: `\u{H...}`, `\uHHHH`, `\xHH`, octal (`\5`, `\512`),
 * the single-character escapes listed in `usualEscapeSequences`, and the
 * python-style `\UHHHHHHHH`.
 *
 * A numeric escape whose value exceeds U+10FFFF is left in the output
 * unchanged rather than throwing.
 *
 * @param str - Text that may contain escape sequences.
 * @returns The text with every recognised, valid escape sequence decoded.
 */
const unescape = (str: string) => {
  return str.replace(jsEscapeRegex, (match, __, varHex, longHex, shortHex, octal, specialCharacter, python) => {
    if (varHex !== undefined)
      return fromHex(varHex) ?? match
    else if (longHex !== undefined)
      return fromHex(longHex) ?? match
    else if (shortHex !== undefined)
      return fromHex(shortHex) ?? match
    else if (octal !== undefined)
      return fromOct(octal) ?? match
    else if (python !== undefined)
      return fromHex(python) ?? match
    else
      return usualEscapeSequences[specialCharacter] ?? match
  })
}
|
||||
|
||||
export default unescape
|
Reference in New Issue
Block a user