feat(website-crawl): add Jina Reader as an alternative provider for website crawling (#8761)

This commit is contained in:
Zhaofeng Miao
2024-09-30 09:57:19 +08:00
committed by GitHub
parent fb49413a41
commit 369e1e6f58
38 changed files with 927 additions and 75 deletions

View File

@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
import { fetchDataSource } from '@/service/common'
import { fetchDatasetDetail } from '@/service/datasets'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider, type NotionPage } from '@/models/common'
import { useModalContext } from '@/context/modal-context'
import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
excludes: '',
limit: 10,
max_depth: '',
use_sitemap: true,
}
const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
const updateFileList = (preparedFiles: FileItem[]) => {
setFiles(preparedFiles)
}
const [fireCrawlJobId, setFireCrawlJobId] = useState('')
const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')
const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
onStepChange={nextStep}
websitePages={websitePages}
updateWebsitePages={setWebsitePages}
onFireCrawlJobIdChange={setFireCrawlJobId}
onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
files={fileList.map(file => file.file)}
notionPages={notionPages}
websitePages={websitePages}
fireCrawlJobId={fireCrawlJobId}
websiteCrawlProvider={websiteCrawlProvider}
websiteCrawlJobId={websiteCrawlJobId}
onStepChange={changeStep}
updateIndexingTypeCache={updateIndexingTypeCache}
updateResultCache={updateResultCache}