feat(website-crawl): add jina reader as additional alternative for website crawling (#8761)
This commit is contained in:
BIN
web/app/components/datasets/create/assets/jina.png
Normal file
BIN
web/app/components/datasets/create/assets/jina.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.7 KiB |
@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
|
||||
import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
|
||||
import { fetchDataSource } from '@/service/common'
|
||||
import { fetchDatasetDetail } from '@/service/datasets'
|
||||
import type { NotionPage } from '@/models/common'
|
||||
import { DataSourceProvider, type NotionPage } from '@/models/common'
|
||||
import { useModalContext } from '@/context/modal-context'
|
||||
import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
||||
|
||||
@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
|
||||
excludes: '',
|
||||
limit: 10,
|
||||
max_depth: '',
|
||||
use_sitemap: true,
|
||||
}
|
||||
|
||||
const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
||||
@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
||||
const updateFileList = (preparedFiles: FileItem[]) => {
|
||||
setFiles(preparedFiles)
|
||||
}
|
||||
const [fireCrawlJobId, setFireCrawlJobId] = useState('')
|
||||
const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
|
||||
const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')
|
||||
|
||||
const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
|
||||
const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
|
||||
@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
||||
onStepChange={nextStep}
|
||||
websitePages={websitePages}
|
||||
updateWebsitePages={setWebsitePages}
|
||||
onFireCrawlJobIdChange={setFireCrawlJobId}
|
||||
onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
|
||||
onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={setCrawlOptions}
|
||||
/>
|
||||
@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
||||
files={fileList.map(file => file.file)}
|
||||
notionPages={notionPages}
|
||||
websitePages={websitePages}
|
||||
fireCrawlJobId={fireCrawlJobId}
|
||||
websiteCrawlProvider={websiteCrawlProvider}
|
||||
websiteCrawlJobId={websiteCrawlJobId}
|
||||
onStepChange={changeStep}
|
||||
updateIndexingTypeCache={updateIndexingTypeCache}
|
||||
updateResultCache={updateResultCache}
|
||||
|
@@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview'
|
||||
import s from './index.module.css'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
|
||||
import type { NotionPage } from '@/models/common'
|
||||
import type { DataSourceProvider, NotionPage } from '@/models/common'
|
||||
import { DataSourceType } from '@/models/datasets'
|
||||
import Button from '@/app/components/base/button'
|
||||
import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
|
||||
@@ -33,7 +33,8 @@ type IStepOneProps = {
|
||||
changeType: (type: DataSourceType) => void
|
||||
websitePages?: CrawlResultItem[]
|
||||
updateWebsitePages: (value: CrawlResultItem[]) => void
|
||||
onFireCrawlJobIdChange: (jobId: string) => void
|
||||
onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void
|
||||
onWebsiteCrawlJobIdChange: (jobId: string) => void
|
||||
crawlOptions: CrawlOptions
|
||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||
}
|
||||
@@ -69,7 +70,8 @@ const StepOne = ({
|
||||
updateNotionPages,
|
||||
websitePages = [],
|
||||
updateWebsitePages,
|
||||
onFireCrawlJobIdChange,
|
||||
onWebsiteCrawlProviderChange,
|
||||
onWebsiteCrawlJobIdChange,
|
||||
crawlOptions,
|
||||
onCrawlOptionsChange,
|
||||
}: IStepOneProps) => {
|
||||
@@ -229,7 +231,8 @@ const StepOne = ({
|
||||
onPreview={setCurrentWebsite}
|
||||
checkedCrawlResult={websitePages}
|
||||
onCheckedCrawlResultChange={updateWebsitePages}
|
||||
onJobIdChange={onFireCrawlJobIdChange}
|
||||
onCrawlProviderChange={onWebsiteCrawlProviderChange}
|
||||
onJobIdChange={onWebsiteCrawlJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
|
@@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
|
||||
import Toast from '@/app/components/base/toast'
|
||||
import { formatNumber } from '@/utils/format'
|
||||
import type { NotionPage } from '@/models/common'
|
||||
import { DataSourceProvider } from '@/models/common'
|
||||
import { DataSourceType, DocForm } from '@/models/datasets'
|
||||
import NotionIcon from '@/app/components/base/notion-icon'
|
||||
import Switch from '@/app/components/base/switch'
|
||||
@@ -63,7 +64,8 @@ type StepTwoProps = {
|
||||
notionPages?: NotionPage[]
|
||||
websitePages?: CrawlResultItem[]
|
||||
crawlOptions?: CrawlOptions
|
||||
fireCrawlJobId?: string
|
||||
websiteCrawlProvider?: DataSourceProvider
|
||||
websiteCrawlJobId?: string
|
||||
onStepChange?: (delta: number) => void
|
||||
updateIndexingTypeCache?: (type: string) => void
|
||||
updateResultCache?: (res: createDocumentResponse) => void
|
||||
@@ -94,7 +96,8 @@ const StepTwo = ({
|
||||
notionPages = [],
|
||||
websitePages = [],
|
||||
crawlOptions,
|
||||
fireCrawlJobId = '',
|
||||
websiteCrawlProvider = DataSourceProvider.fireCrawl,
|
||||
websiteCrawlJobId = '',
|
||||
onStepChange,
|
||||
updateIndexingTypeCache,
|
||||
updateResultCache,
|
||||
@@ -260,8 +263,8 @@ const StepTwo = ({
|
||||
|
||||
const getWebsiteInfo = () => {
|
||||
return {
|
||||
provider: 'firecrawl',
|
||||
job_id: fireCrawlJobId,
|
||||
provider: websiteCrawlProvider,
|
||||
job_id: websiteCrawlJobId,
|
||||
urls: websitePages.map(page => page.source_url),
|
||||
only_main_content: crawlOptions?.only_main_content,
|
||||
}
|
||||
|
@@ -3,6 +3,7 @@ import type { FC } from 'react'
|
||||
import React from 'react'
|
||||
import cn from '@/utils/classnames'
|
||||
import Checkbox from '@/app/components/base/checkbox'
|
||||
import Tooltip from '@/app/components/base/tooltip'
|
||||
|
||||
type Props = {
|
||||
className?: string
|
||||
@@ -10,6 +11,7 @@ type Props = {
|
||||
onChange: (isChecked: boolean) => void
|
||||
label: string
|
||||
labelClassName?: string
|
||||
tooltip?: string
|
||||
}
|
||||
|
||||
const CheckboxWithLabel: FC<Props> = ({
|
||||
@@ -18,11 +20,20 @@ const CheckboxWithLabel: FC<Props> = ({
|
||||
onChange,
|
||||
label,
|
||||
labelClassName,
|
||||
tooltip,
|
||||
}) => {
|
||||
return (
|
||||
<label className={cn(className, 'flex items-center h-7 space-x-2')}>
|
||||
<Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} />
|
||||
<div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div>
|
||||
{tooltip && (
|
||||
<Tooltip
|
||||
popupContent={
|
||||
<div className='w-[200px]'>{tooltip}</div>
|
||||
}
|
||||
triggerClassName='ml-0.5 w-4 h-4'
|
||||
/>
|
||||
)}
|
||||
</label>
|
||||
)
|
||||
}
|
@@ -2,7 +2,7 @@
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import CheckboxWithLabel from './base/checkbox-with-label'
|
||||
import CheckboxWithLabel from './checkbox-with-label'
|
||||
import CrawledResultItem from './crawled-result-item'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlResultItem } from '@/models/datasets'
|
@@ -2,13 +2,13 @@
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback, useEffect, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import UrlInput from '../base/url-input'
|
||||
import OptionsWrap from '../base/options-wrap'
|
||||
import CrawledResult from '../base/crawled-result'
|
||||
import Crawling from '../base/crawling'
|
||||
import ErrorMessage from '../base/error-message'
|
||||
import Header from './header'
|
||||
import UrlInput from './base/url-input'
|
||||
import OptionsWrap from './base/options-wrap'
|
||||
import Options from './options'
|
||||
import CrawledResult from './crawled-result'
|
||||
import Crawling from './crawling'
|
||||
import ErrorMessage from './base/error-message'
|
||||
import cn from '@/utils/classnames'
|
||||
import { useModalContext } from '@/context/modal-context'
|
||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||
|
@@ -2,8 +2,8 @@
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import CheckboxWithLabel from './base/checkbox-with-label'
|
||||
import Field from './base/field'
|
||||
import CheckboxWithLabel from '../base/checkbox-with-label'
|
||||
import Field from '../base/field'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions } from '@/models/datasets'
|
||||
|
||||
|
@@ -0,0 +1,6 @@
|
||||
.jinaLogo {
|
||||
@apply w-4 h-4 bg-center bg-no-repeat inline-block;
|
||||
background-color: #F5FAFF;
|
||||
background-image: url(../assets/jina.png);
|
||||
background-size: 16px;
|
||||
}
|
@@ -1,8 +1,12 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback, useEffect, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import s from './index.module.css'
|
||||
import NoData from './no-data'
|
||||
import Firecrawl from './firecrawl'
|
||||
import JinaReader from './jina-reader'
|
||||
import cn from '@/utils/classnames'
|
||||
import { useModalContext } from '@/context/modal-context'
|
||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||
import { fetchDataSources } from '@/service/datasets'
|
||||
@@ -12,6 +16,7 @@ type Props = {
|
||||
onPreview: (payload: CrawlResultItem) => void
|
||||
checkedCrawlResult: CrawlResultItem[]
|
||||
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
|
||||
onCrawlProviderChange: (provider: DataSourceProvider) => void
|
||||
onJobIdChange: (jobId: string) => void
|
||||
crawlOptions: CrawlOptions
|
||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||
@@ -21,17 +26,32 @@ const Website: FC<Props> = ({
|
||||
onPreview,
|
||||
checkedCrawlResult,
|
||||
onCheckedCrawlResultChange,
|
||||
onCrawlProviderChange,
|
||||
onJobIdChange,
|
||||
crawlOptions,
|
||||
onCrawlOptionsChange,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
const { setShowAccountSettingModal } = useModalContext()
|
||||
const [isLoaded, setIsLoaded] = useState(false)
|
||||
const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false)
|
||||
const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader)
|
||||
const [sources, setSources] = useState<DataSourceItem[]>([])
|
||||
|
||||
useEffect(() => {
|
||||
onCrawlProviderChange(selectedProvider)
|
||||
}, [selectedProvider, onCrawlProviderChange])
|
||||
|
||||
const checkSetApiKey = useCallback(async () => {
|
||||
const res = await fetchDataSources() as any
|
||||
const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl)
|
||||
setIsSetFirecrawlApiKey(isFirecrawlSet)
|
||||
setSources(res.sources)
|
||||
|
||||
// If users have configured one of the providers, select it.
|
||||
const availableProviders = res.sources.filter((item: DataSourceItem) =>
|
||||
[DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
|
||||
)
|
||||
|
||||
if (availableProviders.length > 0)
|
||||
setSelectedProvider(availableProviders[0].provider)
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
@@ -52,20 +72,66 @@ const Website: FC<Props> = ({
|
||||
|
||||
return (
|
||||
<div>
|
||||
{isSetFirecrawlApiKey
|
||||
? (
|
||||
<Firecrawl
|
||||
onPreview={onPreview}
|
||||
checkedCrawlResult={checkedCrawlResult}
|
||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||
onJobIdChange={onJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
)
|
||||
: (
|
||||
<NoData onConfig={handleOnConfig} />
|
||||
)}
|
||||
<div className="mb-4">
|
||||
<div className="font-medium text-gray-700 mb-2 h-6">
|
||||
{t('datasetCreation.stepOne.website.chooseProvider')}
|
||||
</div>
|
||||
<div className="flex space-x-2">
|
||||
<button
|
||||
className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
|
||||
selectedProvider === DataSourceProvider.jinaReader
|
||||
? 'bg-primary-50 text-primary-600'
|
||||
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
|
||||
}`}
|
||||
onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
|
||||
>
|
||||
<span className={cn(s.jinaLogo, 'mr-2')} />
|
||||
<span>Jina Reader</span>
|
||||
</button>
|
||||
<button
|
||||
className={`px-4 py-2 text-sm font-medium rounded-md ${
|
||||
selectedProvider === DataSourceProvider.fireCrawl
|
||||
? 'bg-primary-50 text-primary-600'
|
||||
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
|
||||
}`}
|
||||
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
|
||||
>
|
||||
🔥 Firecrawl
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{
|
||||
selectedProvider === DataSourceProvider.fireCrawl
|
||||
? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
|
||||
? (
|
||||
<Firecrawl
|
||||
onPreview={onPreview}
|
||||
checkedCrawlResult={checkedCrawlResult}
|
||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||
onJobIdChange={onJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
)
|
||||
: (
|
||||
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
|
||||
)
|
||||
: sources.find(source => source.provider === DataSourceProvider.jinaReader)
|
||||
? (
|
||||
<JinaReader
|
||||
onPreview={onPreview}
|
||||
checkedCrawlResult={checkedCrawlResult}
|
||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||
onJobIdChange={onJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
)
|
||||
: (
|
||||
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
|
||||
)
|
||||
}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
@@ -0,0 +1,42 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
|
||||
import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education'
|
||||
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
onSetting: () => void
|
||||
}
|
||||
|
||||
const Header: FC<Props> = ({
|
||||
onSetting,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
|
||||
return (
|
||||
<div className='flex h-6 items-center justify-between'>
|
||||
<div className='flex items-center'>
|
||||
<div className='text-base font-medium text-gray-700'>{t(`${I18N_PREFIX}.jinaReaderTitle`)}</div>
|
||||
<div className='ml-2 mr-1 w-px h-3.5 bg-gray-200'></div>
|
||||
<div
|
||||
className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
|
||||
onClick={onSetting}
|
||||
>
|
||||
<Settings01 className='w-3.5 h-3.5 text-gray-500' />
|
||||
</div>
|
||||
</div>
|
||||
<a
|
||||
href='https://jina.ai/reader'
|
||||
target='_blank' rel='noopener noreferrer'
|
||||
className='flex items-center text-xs text-primary-600'
|
||||
>
|
||||
<BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
|
||||
{t(`${I18N_PREFIX}.jinaReaderDoc`)}
|
||||
</a>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
export default React.memo(Header)
|
232
web/app/components/datasets/create/website/jina-reader/index.tsx
Normal file
232
web/app/components/datasets/create/website/jina-reader/index.tsx
Normal file
@@ -0,0 +1,232 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback, useEffect, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import UrlInput from '../base/url-input'
|
||||
import OptionsWrap from '../base/options-wrap'
|
||||
import CrawledResult from '../base/crawled-result'
|
||||
import Crawling from '../base/crawling'
|
||||
import ErrorMessage from '../base/error-message'
|
||||
import Header from './header'
|
||||
import Options from './options'
|
||||
import cn from '@/utils/classnames'
|
||||
import { useModalContext } from '@/context/modal-context'
|
||||
import Toast from '@/app/components/base/toast'
|
||||
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
|
||||
import { sleep } from '@/utils'
|
||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||
|
||||
const ERROR_I18N_PREFIX = 'common.errorMsg'
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
onPreview: (payload: CrawlResultItem) => void
|
||||
checkedCrawlResult: CrawlResultItem[]
|
||||
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
|
||||
onJobIdChange: (jobId: string) => void
|
||||
crawlOptions: CrawlOptions
|
||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||
}
|
||||
|
||||
enum Step {
|
||||
init = 'init',
|
||||
running = 'running',
|
||||
finished = 'finished',
|
||||
}
|
||||
|
||||
const JinaReader: FC<Props> = ({
|
||||
onPreview,
|
||||
checkedCrawlResult,
|
||||
onCheckedCrawlResultChange,
|
||||
onJobIdChange,
|
||||
crawlOptions,
|
||||
onCrawlOptionsChange,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
const [step, setStep] = useState<Step>(Step.init)
|
||||
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
|
||||
useEffect(() => {
|
||||
if (step !== Step.init)
|
||||
setControlFoldOptions(Date.now())
|
||||
}, [step])
|
||||
const { setShowAccountSettingModal } = useModalContext()
|
||||
const handleSetting = useCallback(() => {
|
||||
setShowAccountSettingModal({
|
||||
payload: 'data-source',
|
||||
})
|
||||
}, [setShowAccountSettingModal])
|
||||
|
||||
const checkValid = useCallback((url: string) => {
|
||||
let errorMsg = ''
|
||||
if (!url) {
|
||||
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
|
||||
field: 'url',
|
||||
})
|
||||
}
|
||||
|
||||
if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
|
||||
errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
|
||||
|
||||
if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
|
||||
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
|
||||
field: t(`${I18N_PREFIX}.limit`),
|
||||
})
|
||||
}
|
||||
|
||||
return {
|
||||
isValid: !errorMsg,
|
||||
errorMsg,
|
||||
}
|
||||
}, [crawlOptions, t])
|
||||
|
||||
const isInit = step === Step.init
|
||||
const isCrawlFinished = step === Step.finished
|
||||
const isRunning = step === Step.running
|
||||
const [crawlResult, setCrawlResult] = useState<{
|
||||
current: number
|
||||
total: number
|
||||
data: CrawlResultItem[]
|
||||
time_consuming: number | string
|
||||
} | undefined>(undefined)
|
||||
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
|
||||
const showError = isCrawlFinished && crawlErrorMessage
|
||||
|
||||
const waitForCrawlFinished = useCallback(async (jobId: string) => {
|
||||
try {
|
||||
const res = await checkJinaReaderTaskStatus(jobId) as any
|
||||
console.log('res', res)
|
||||
if (res.status === 'completed') {
|
||||
return {
|
||||
isError: false,
|
||||
data: {
|
||||
...res,
|
||||
total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
|
||||
},
|
||||
}
|
||||
}
|
||||
if (res.status === 'failed' || !res.status) {
|
||||
return {
|
||||
isError: true,
|
||||
errorMessage: res.message,
|
||||
data: {
|
||||
data: [],
|
||||
},
|
||||
}
|
||||
}
|
||||
// update the progress
|
||||
setCrawlResult({
|
||||
...res,
|
||||
total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
|
||||
})
|
||||
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
|
||||
await sleep(2500)
|
||||
return await waitForCrawlFinished(jobId)
|
||||
}
|
||||
catch (e: any) {
|
||||
const errorBody = await e.json()
|
||||
return {
|
||||
isError: true,
|
||||
errorMessage: errorBody.message,
|
||||
data: {
|
||||
data: [],
|
||||
},
|
||||
}
|
||||
}
|
||||
}, [crawlOptions.limit])
|
||||
|
||||
const handleRun = useCallback(async (url: string) => {
|
||||
const { isValid, errorMsg } = checkValid(url)
|
||||
if (!isValid) {
|
||||
Toast.notify({
|
||||
message: errorMsg!,
|
||||
type: 'error',
|
||||
})
|
||||
return
|
||||
}
|
||||
setStep(Step.running)
|
||||
try {
|
||||
const startTime = Date.now()
|
||||
const res = await createJinaReaderTask({
|
||||
url,
|
||||
options: crawlOptions,
|
||||
}) as any
|
||||
|
||||
if (res.data) {
|
||||
const data = {
|
||||
current: 1,
|
||||
total: 1,
|
||||
data: [{
|
||||
title: res.data.title,
|
||||
markdown: res.data.content,
|
||||
description: res.data.description,
|
||||
source_url: res.data.url,
|
||||
}],
|
||||
time_consuming: (Date.now() - startTime) / 1000,
|
||||
}
|
||||
setCrawlResult(data)
|
||||
onCheckedCrawlResultChange(data.data || [])
|
||||
setCrawlErrorMessage('')
|
||||
}
|
||||
else if (res.job_id) {
|
||||
const jobId = res.job_id
|
||||
onJobIdChange(jobId)
|
||||
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
|
||||
if (isError) {
|
||||
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
|
||||
}
|
||||
else {
|
||||
setCrawlResult(data)
|
||||
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
|
||||
setCrawlErrorMessage('')
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
|
||||
console.log(e)
|
||||
}
|
||||
finally {
|
||||
setStep(Step.finished)
|
||||
}
|
||||
}, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])
|
||||
|
||||
return (
|
||||
<div>
|
||||
<Header onSetting={handleSetting} />
|
||||
<div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}>
|
||||
<UrlInput onRun={handleRun} isRunning={isRunning} />
|
||||
<OptionsWrap
|
||||
className={cn('mt-4')}
|
||||
controlFoldOptions={controlFoldOptions}
|
||||
>
|
||||
<Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
|
||||
</OptionsWrap>
|
||||
|
||||
{!isInit && (
|
||||
<div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
|
||||
{isRunning
|
||||
&& <Crawling
|
||||
className='mt-2'
|
||||
crawledNum={crawlResult?.current || 0}
|
||||
totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0}
|
||||
/>}
|
||||
{showError && (
|
||||
<ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
|
||||
)}
|
||||
{isCrawlFinished && !showError
|
||||
&& <CrawledResult
|
||||
className='mb-2'
|
||||
list={crawlResult?.data || []}
|
||||
checkedList={checkedCrawlResult}
|
||||
onSelectedChange={onCheckedCrawlResultChange}
|
||||
onPreview={onPreview}
|
||||
usedTime={parseFloat(crawlResult?.time_consuming as string) || 0}
|
||||
/>
|
||||
}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
export default React.memo(JinaReader)
|
@@ -0,0 +1,59 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import CheckboxWithLabel from '../base/checkbox-with-label'
|
||||
import Field from '../base/field'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions } from '@/models/datasets'
|
||||
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
className?: string
|
||||
payload: CrawlOptions
|
||||
onChange: (payload: CrawlOptions) => void
|
||||
}
|
||||
|
||||
const Options: FC<Props> = ({
|
||||
className = '',
|
||||
payload,
|
||||
onChange,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
|
||||
const handleChange = useCallback((key: keyof CrawlOptions) => {
|
||||
return (value: any) => {
|
||||
onChange({
|
||||
...payload,
|
||||
[key]: value,
|
||||
})
|
||||
}
|
||||
}, [payload, onChange])
|
||||
return (
|
||||
<div className={cn(className, ' space-y-2')}>
|
||||
<CheckboxWithLabel
|
||||
label={t(`${I18N_PREFIX}.crawlSubPage`)}
|
||||
isChecked={payload.crawl_sub_pages}
|
||||
onChange={handleChange('crawl_sub_pages')}
|
||||
/>
|
||||
<CheckboxWithLabel
|
||||
label={t(`${I18N_PREFIX}.useSitemap`)}
|
||||
isChecked={payload.use_sitemap}
|
||||
onChange={handleChange('use_sitemap')}
|
||||
tooltip={t(`${I18N_PREFIX}.useSitemapTooltip`) as string}
|
||||
/>
|
||||
<div className='flex justify-between space-x-4'>
|
||||
<Field
|
||||
className='grow shrink-0'
|
||||
label={t(`${I18N_PREFIX}.limit`)}
|
||||
value={payload.limit}
|
||||
onChange={handleChange('limit')}
|
||||
isNumber
|
||||
isRequired
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
export default React.memo(Options)
|
@@ -2,35 +2,56 @@
|
||||
import type { FC } from 'react'
|
||||
import React from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import s from './index.module.css'
|
||||
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
|
||||
import Button from '@/app/components/base/button'
|
||||
import { DataSourceProvider } from '@/models/common'
|
||||
|
||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||
|
||||
type Props = {
|
||||
onConfig: () => void
|
||||
provider: DataSourceProvider
|
||||
}
|
||||
|
||||
const NoData: FC<Props> = ({
|
||||
onConfig,
|
||||
provider,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
|
||||
const providerConfig = {
|
||||
[DataSourceProvider.jinaReader]: {
|
||||
emoji: <span className={s.jinaLogo} />,
|
||||
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
|
||||
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
|
||||
},
|
||||
[DataSourceProvider.fireCrawl]: {
|
||||
emoji: '🔥',
|
||||
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
|
||||
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
|
||||
},
|
||||
}
|
||||
|
||||
const currentProvider = providerConfig[provider]
|
||||
|
||||
return (
|
||||
<div className='max-w-[640px] p-6 rounded-2xl bg-gray-50'>
|
||||
<div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
|
||||
🔥
|
||||
</div>
|
||||
<div className='my-2'>
|
||||
<span className='text-gray-700 font-semibold'>{t(`${I18N_PREFIX}.fireCrawlNotConfigured`)}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
|
||||
<div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
|
||||
{t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)}
|
||||
<>
|
||||
<div className='max-w-[640px] p-6 rounded-2xl bg-gray-50 mt-4'>
|
||||
<div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
|
||||
{currentProvider.emoji}
|
||||
</div>
|
||||
<div className='my-2'>
|
||||
<span className='text-gray-700 font-semibold'>{currentProvider.title}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
|
||||
<div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
|
||||
{currentProvider.description}
|
||||
</div>
|
||||
</div>
|
||||
<Button variant='primary' onClick={onConfig}>
|
||||
{t(`${I18N_PREFIX}.configure`)}
|
||||
</Button>
|
||||
</div>
|
||||
<Button variant='primary' onClick={onConfig}>
|
||||
{t(`${I18N_PREFIX}.configure`)}
|
||||
</Button>
|
||||
</div>
|
||||
</>
|
||||
)
|
||||
}
|
||||
export default React.memo(NoData)
|
||||
|
Reference in New Issue
Block a user