chore: improve delimiter (#8552)

2024-09-19 17:40:20 +08:00
parent d96f5ba1ca
commit 7411bcf167
5 changed files with 103 additions and 12 deletions
--- a/web/app/components/datasets/create/step-two/escape.ts
+++ b/web/app/components/datasets/create/step-two/escape.ts
@@ -0,0 +1,18 @@
+function escape(input: string): string { // rewrites backslashes, control characters and single quotes as visible escape sequences
+  if (!input || typeof input !== 'string') // empty or non-string input
+    return '' // ...yields an empty string
+
+  const res = input
+    .replaceAll('\\', '\\\\') // backslashes go first, so the ones added by the replacements below are not doubled
+    .replaceAll('\0', '\\0') // NUL
+    .replaceAll('\b', '\\b') // backspace
+    .replaceAll('\f', '\\f') // form feed
+    .replaceAll('\n', '\\n') // line feed
+    .replaceAll('\r', '\\r') // carriage return
+    .replaceAll('\t', '\\t') // horizontal tab
+    .replaceAll('\v', '\\v') // vertical tab
+    .replaceAll('\'', '\\\'') // single quote becomes \'
+  return res
+}
+
+export default escape
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -1,5 +1,5 @@
 'use client'
-import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
+import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { useContext } from 'use-context-selector'
 import { useBoolean } from 'ahooks'
@@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es'
 import PreviewItem, { PreviewType } from './preview-item'
 import LanguageSelect from './language-select'
 import s from './index.module.css'
+import unescape from './unescape'
+import escape from './escape'
 import cn from '@/utils/classnames'
 import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
 import {
@@ -78,6 +80,8 @@ enum IndexingType {
  ECONOMICAL = 'economy',
 }

+const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
+
 const StepTwo = ({
  isSetting,
  documentDetail,
@@ -110,8 +114,11 @@ const StepTwo = ({
  const previewScrollRef = useRef<HTMLDivElement>(null)
  const [previewScrolled, setPreviewScrolled] = useState(false)
  const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
-  const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
-  const [max, setMax] = useState(5000) // default chunk length
+  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
+  const setSegmentIdentifier = useCallback((value: string) => {
+    doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
+  }, [])
+  const [max, setMax] = useState(4000) // default chunk length
  const [overlap, setOverlap] = useState(50)
  const [rules, setRules] = useState<PreProcessingRule[]>([])
  const [defaultConfig, setDefaultConfig] = useState<Rules>()
@@ -183,7 +190,7 @@ const StepTwo = ({
  }
  const resetRules = () => {
    if (defaultConfig) {
-      setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
+      setSegmentIdentifier(defaultConfig.segmentation.separator)
      setMax(defaultConfig.segmentation.max_tokens)
      setOverlap(defaultConfig.segmentation.chunk_overlap)
      setRules(defaultConfig.pre_processing_rules)
@@ -217,7 +224,7 @@ const StepTwo = ({
      const ruleObj = {
        pre_processing_rules: rules,
        segmentation: {
-          separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
+          separator: unescape(segmentIdentifier),
          max_tokens: max,
          chunk_overlap: overlap,
        },
@@ -394,7 +401,7 @@ const StepTwo = ({
    try {
      const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const separator = res.rules.segmentation.separator
-      setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
+      setSegmentIdentifier(separator)
      setMax(res.rules.segmentation.max_tokens)
      setOverlap(res.rules.segmentation.chunk_overlap)
      setRules(res.rules.pre_processing_rules)
@@ -411,7 +418,7 @@ const StepTwo = ({
      const separator = rules.segmentation.separator
      const max = rules.segmentation.max_tokens
      const overlap = rules.segmentation.chunk_overlap
-      setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
+      setSegmentIdentifier(separator)
      setMax(max)
      setOverlap(overlap)
      setRules(rules.pre_processing_rules)
@@ -616,12 +623,22 @@ const StepTwo = ({
                <div className={s.typeFormBody}>
                  <div className={s.formRow}>
                    <div className='w-full'>
-                      <div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div>
+                      <div className={s.label}>
+                        {t('datasetCreation.stepTwo.separator')}
+                        <Tooltip
+                          popupContent={
+                            <div className='max-w-[200px]'>
+                              {t('datasetCreation.stepTwo.separatorTip')}
+                            </div>
+                          }
+                        />
+                      </div>
                      <input
                        type="text"
                        className={s.input}
-                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
-                        onChange={e => setSegmentIdentifier(e.target.value)}
+                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
+                        value={segmentIdentifier}
+                        onChange={e => doSetSegmentIdentifier(e.target.value)}
                      />
                    </div>
                  </div>
--- a/web/app/components/datasets/create/step-two/unescape.ts
+++ b/web/app/components/datasets/create/step-two/unescape.ts
@@ -0,0 +1,54 @@
+// https://github.com/iamakulov/unescape-js/blob/master/src/index.js
+
+/**
+ * \\ - matches the backslash which indicates the beginning of an escape sequence
+ * (
+ *   u\{([0-9A-Fa-f]+)\} - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
+ * |
+ *   u([0-9A-Fa-f]{4}) - second alternative; matches the 4-digit hexadecimal escape sequence (\uABCD)
+ * |
+ *   x([0-9A-Fa-f]{2}) - third alternative; matches the 2-digit hexadecimal escape sequence (\xA5)
+ * |
+ *   ([1-7][0-7]{0,2}|[0-7]{2,3}) - fourth alternative; matches the up-to-3-digit octal escape sequence (\5 or \512)
+ * |
+ *   (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
+ * )
+ * |
+ *   \\U([0-9A-Fa-f]{8}) - sixth alternative, outside the group above and carrying its own leading backslash; matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
+ */
+const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g // capture groups: 1 = body of the first five alternatives, 2 = \u{...} digits, 3 = \uXXXX digits, 4 = \xXX digits, 5 = octal digits, 6 = special character, 7 = \UXXXXXXXX digits
+
+const usualEscapeSequences: Record<string, string> = { // maps the character that follows the backslash to the character it denotes
+  '0': '\0', // NUL
+  'b': '\b', // backspace
+  'f': '\f', // form feed
+  'n': '\n', // line feed
+  'r': '\r', // carriage return
+  't': '\t', // horizontal tab
+  'v': '\v', // vertical tab
+  '\'': '\'', // quotes and the backslash map to themselves
+  '"': '"',
+  '\\': '\\',
+}
+
+const fromHex = (str: string) => String.fromCodePoint(parseInt(str, 16)) // hex digits -> character; NOTE(review): String.fromCodePoint throws RangeError above 0x10FFFF (e.g. \u{110000}) - confirm callers tolerate this
+const fromOct = (str: string) => String.fromCodePoint(parseInt(str, 8)) // octal digits -> character
+
+const unescape = (str: string) => { // replaces every escape sequence matched by jsEscapeRegex with the character it denotes
+  return str.replace(jsEscapeRegex, (_, __, varHex, longHex, shortHex, octal, specialCharacter, python) => { // args: full match, outer group, then one capture per alternative
+    if (varHex !== undefined) // \u{...}
+      return fromHex(varHex)
+    else if (longHex !== undefined) // \uXXXX
+      return fromHex(longHex)
+    else if (shortHex !== undefined) // \xXX
+      return fromHex(shortHex)
+    else if (octal !== undefined) // \N, \NN or \NNN
+      return fromOct(octal)
+    else if (python !== undefined) // \UXXXXXXXX
+      return fromHex(python)
+    else // only the special-character alternative is left at this point
+      return usualEscapeSequences[specialCharacter]
+  })
+}
+
+export default unescape