feat(website-crawl): add jina reader as additional alternative for website crawling (#8761)
This commit is contained in:
@@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource):
|
|||||||
@account_initialization_required
|
@account_initialization_required
|
||||||
def post(self):
|
def post(self):
|
||||||
parser = reqparse.RequestParser()
|
parser = reqparse.RequestParser()
|
||||||
parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json")
|
parser.add_argument(
|
||||||
|
"provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
|
||||||
|
)
|
||||||
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
|
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
|
||||||
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
|
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource):
|
|||||||
@account_initialization_required
|
@account_initialization_required
|
||||||
def get(self, job_id: str):
|
def get(self, job_id: str):
|
||||||
parser = reqparse.RequestParser()
|
parser = reqparse.RequestParser()
|
||||||
parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args")
|
parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
# get crawl status
|
# get crawl status
|
||||||
try:
|
try:
|
||||||
|
@@ -12,6 +12,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting
|
|||||||
from core.rag.extractor.excel_extractor import ExcelExtractor
|
from core.rag.extractor.excel_extractor import ExcelExtractor
|
||||||
from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
|
from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
|
||||||
from core.rag.extractor.html_extractor import HtmlExtractor
|
from core.rag.extractor.html_extractor import HtmlExtractor
|
||||||
|
from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
|
||||||
from core.rag.extractor.markdown_extractor import MarkdownExtractor
|
from core.rag.extractor.markdown_extractor import MarkdownExtractor
|
||||||
from core.rag.extractor.notion_extractor import NotionExtractor
|
from core.rag.extractor.notion_extractor import NotionExtractor
|
||||||
from core.rag.extractor.pdf_extractor import PdfExtractor
|
from core.rag.extractor.pdf_extractor import PdfExtractor
|
||||||
@@ -171,6 +172,15 @@ class ExtractProcessor:
|
|||||||
only_main_content=extract_setting.website_info.only_main_content,
|
only_main_content=extract_setting.website_info.only_main_content,
|
||||||
)
|
)
|
||||||
return extractor.extract()
|
return extractor.extract()
|
||||||
|
elif extract_setting.website_info.provider == "jinareader":
|
||||||
|
extractor = JinaReaderWebExtractor(
|
||||||
|
url=extract_setting.website_info.url,
|
||||||
|
job_id=extract_setting.website_info.job_id,
|
||||||
|
tenant_id=extract_setting.website_info.tenant_id,
|
||||||
|
mode=extract_setting.website_info.mode,
|
||||||
|
only_main_content=extract_setting.website_info.only_main_content,
|
||||||
|
)
|
||||||
|
return extractor.extract()
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
|
raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
|
||||||
else:
|
else:
|
||||||
|
35
api/core/rag/extractor/jina_reader_extractor.py
Normal file
35
api/core/rag/extractor/jina_reader_extractor.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
|
from core.rag.models.document import Document
|
||||||
|
from services.website_service import WebsiteService
|
||||||
|
|
||||||
|
|
||||||
|
class JinaReaderWebExtractor(BaseExtractor):
    """
    Extractor that turns a page fetched through the "jinareader" website
    provider into a single Document.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
        """Store the page url, crawl job id, tenant id, mode and main-content flag."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """
        Look up the crawl result for this url and wrap it in a Document.

        Returns an empty list when the mode is not "crawl" or when
        WebsiteService has no data for the url; otherwise a one-element list.
        """
        # Only the "crawl" mode produces documents.
        if self.mode != "crawl":
            return []

        page = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
        if page is None:
            return []

        return [
            Document(
                page_content=page.get("content", ""),
                metadata={
                    "source_url": page.get("url"),
                    "description": page.get("description"),
                    "title": page.get("title"),
                },
            )
        ]
|
@@ -1,10 +1,13 @@
|
|||||||
from services.auth.firecrawl import FirecrawlAuth
|
from services.auth.firecrawl import FirecrawlAuth
|
||||||
|
from services.auth.jina import JinaAuth
|
||||||
|
|
||||||
|
|
||||||
class ApiKeyAuthFactory:
|
class ApiKeyAuthFactory:
|
||||||
def __init__(self, provider: str, credentials: dict):
|
def __init__(self, provider: str, credentials: dict):
|
||||||
if provider == "firecrawl":
|
if provider == "firecrawl":
|
||||||
self.auth = FirecrawlAuth(credentials)
|
self.auth = FirecrawlAuth(credentials)
|
||||||
|
elif provider == "jinareader":
|
||||||
|
self.auth = JinaAuth(credentials)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid provider")
|
raise ValueError("Invalid provider")
|
||||||
|
|
||||||
|
44
api/services/auth/jina.py
Normal file
44
api/services/auth/jina.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from services.auth.api_key_auth_base import ApiKeyAuthBase
|
||||||
|
|
||||||
|
|
||||||
|
class JinaAuth(ApiKeyAuthBase):
    """API-key (bearer) authentication for the Jina Reader provider."""

    def __init__(self, credentials: dict):
        """
        Read the API key out of ``credentials``.

        :param credentials: dict with ``auth_type`` (must be ``"bearer"``) and
            ``config`` (a dict holding ``api_key``).
        :raises ValueError: if the auth type is not ``"bearer"`` or no API key
            is present.
        """
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "bearer":
            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")

        # A missing or null "config" must end in the ValueError below rather
        # than an AttributeError from calling .get() on None.
        config = credentials.get("config") or {}
        self.api_key = config.get("api_key", None)

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        """
        Check the API key by POSTing a probe request to https://r.jina.ai.

        :return: True when the service answers with HTTP 200.
        :raises Exception: for any other status code (see ``_handle_error``).
        """
        headers = self._prepare_headers()
        options = {
            "url": "https://example.com",
        }
        response = self._post_request("https://r.jina.ai", options, headers)
        if response.status_code == 200:
            return True
        else:
            self._handle_error(response)

    def _prepare_headers(self):
        """Build the JSON content-type and bearer authorization headers."""
        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

    def _post_request(self, url, data, headers):
        """POST ``data`` as a JSON body to ``url`` and return the response."""
        return requests.post(url, headers=headers, json=data)

    @staticmethod
    def _extract_error_message(response):
        """Return the ``error`` field of a JSON error body, or a generic message."""
        default_message = "Unknown error occurred"
        try:
            payload = response.json()
        except ValueError:
            # The body is not JSON (e.g. an HTML error page or empty text);
            # keep the HTTP status visible instead of failing with a decode error.
            return default_message
        if isinstance(payload, dict):
            return payload.get("error", default_message)
        return default_message

    def _handle_error(self, response):
        """
        Raise an exception describing a failed authorization response.

        :raises Exception: always; the message carries the HTTP status code
            and, when the body provides one, the service's error text.
        """
        if response.status_code in {402, 409, 500} or response.text:
            error_message = self._extract_error_message(response)
            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
        # Any other status with an empty body gives nothing more to report.
        raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
|
@@ -1,6 +1,7 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
import requests
|
||||||
from flask_login import current_user
|
from flask_login import current_user
|
||||||
|
|
||||||
from core.helper import encrypter
|
from core.helper import encrypter
|
||||||
@@ -65,6 +66,35 @@ class WebsiteService:
|
|||||||
time = str(datetime.datetime.now().timestamp())
|
time = str(datetime.datetime.now().timestamp())
|
||||||
redis_client.setex(website_crawl_time_cache_key, 3600, time)
|
redis_client.setex(website_crawl_time_cache_key, 3600, time)
|
||||||
return {"status": "active", "job_id": job_id}
|
return {"status": "active", "job_id": job_id}
|
||||||
|
elif provider == "jinareader":
|
||||||
|
api_key = encrypter.decrypt_token(
|
||||||
|
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
|
||||||
|
)
|
||||||
|
crawl_sub_pages = options.get("crawl_sub_pages", False)
|
||||||
|
if not crawl_sub_pages:
|
||||||
|
response = requests.get(
|
||||||
|
f"https://r.jina.ai/{url}",
|
||||||
|
headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
|
||||||
|
)
|
||||||
|
if response.json().get("code") != 200:
|
||||||
|
raise ValueError("Failed to crawl")
|
||||||
|
return {"status": "active", "data": response.json().get("data")}
|
||||||
|
else:
|
||||||
|
response = requests.post(
|
||||||
|
"https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
|
||||||
|
json={
|
||||||
|
"url": url,
|
||||||
|
"maxPages": options.get("limit", 1),
|
||||||
|
"useSitemap": options.get("use_sitemap", True),
|
||||||
|
},
|
||||||
|
headers={
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"Authorization": f"Bearer {api_key}",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if response.json().get("code") != 200:
|
||||||
|
raise ValueError("Failed to crawl")
|
||||||
|
return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid provider")
|
raise ValueError("Invalid provider")
|
||||||
|
|
||||||
@@ -93,6 +123,42 @@ class WebsiteService:
|
|||||||
time_consuming = abs(end_time - float(start_time))
|
time_consuming = abs(end_time - float(start_time))
|
||||||
crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
|
crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
|
||||||
redis_client.delete(website_crawl_time_cache_key)
|
redis_client.delete(website_crawl_time_cache_key)
|
||||||
|
elif provider == "jinareader":
|
||||||
|
api_key = encrypter.decrypt_token(
|
||||||
|
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
|
||||||
|
)
|
||||||
|
response = requests.post(
|
||||||
|
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
|
||||||
|
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
|
||||||
|
json={"taskId": job_id},
|
||||||
|
)
|
||||||
|
data = response.json().get("data", {})
|
||||||
|
crawl_status_data = {
|
||||||
|
"status": data.get("status", "active"),
|
||||||
|
"job_id": job_id,
|
||||||
|
"total": len(data.get("urls", [])),
|
||||||
|
"current": len(data.get("processed", [])) + len(data.get("failed", [])),
|
||||||
|
"data": [],
|
||||||
|
"time_consuming": data.get("duration", 0) / 1000,
|
||||||
|
}
|
||||||
|
|
||||||
|
if crawl_status_data["status"] == "completed":
|
||||||
|
response = requests.post(
|
||||||
|
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
|
||||||
|
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
|
||||||
|
json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
|
||||||
|
)
|
||||||
|
data = response.json().get("data", {})
|
||||||
|
formatted_data = [
|
||||||
|
{
|
||||||
|
"title": item.get("data", {}).get("title"),
|
||||||
|
"source_url": item.get("data", {}).get("url"),
|
||||||
|
"description": item.get("data", {}).get("description"),
|
||||||
|
"markdown": item.get("data", {}).get("content"),
|
||||||
|
}
|
||||||
|
for item in data.get("processed", {}).values()
|
||||||
|
]
|
||||||
|
crawl_status_data["data"] = formatted_data
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid provider")
|
raise ValueError("Invalid provider")
|
||||||
return crawl_status_data
|
return crawl_status_data
|
||||||
@@ -119,6 +185,40 @@ class WebsiteService:
|
|||||||
if item.get("source_url") == url:
|
if item.get("source_url") == url:
|
||||||
return item
|
return item
|
||||||
return None
|
return None
|
||||||
|
elif provider == "jinareader":
|
||||||
|
file_key = "website_files/" + job_id + ".txt"
|
||||||
|
if storage.exists(file_key):
|
||||||
|
data = storage.load_once(file_key)
|
||||||
|
if data:
|
||||||
|
data = json.loads(data.decode("utf-8"))
|
||||||
|
elif not job_id:
|
||||||
|
response = requests.get(
|
||||||
|
f"https://r.jina.ai/{url}",
|
||||||
|
headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
|
||||||
|
)
|
||||||
|
if response.json().get("code") != 200:
|
||||||
|
raise ValueError("Failed to crawl")
|
||||||
|
return response.json().get("data")
|
||||||
|
else:
|
||||||
|
api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
|
||||||
|
response = requests.post(
|
||||||
|
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
|
||||||
|
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
|
||||||
|
json={"taskId": job_id},
|
||||||
|
)
|
||||||
|
data = response.json().get("data", {})
|
||||||
|
if data.get("status") != "completed":
|
||||||
|
raise ValueError("Crawl job is not completed")
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
|
||||||
|
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
|
||||||
|
json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
|
||||||
|
)
|
||||||
|
data = response.json().get("data", {})
|
||||||
|
for item in data.get("processed", {}).values():
|
||||||
|
if item.get("data", {}).get("url") == url:
|
||||||
|
return item.get("data", {})
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid provider")
|
raise ValueError("Invalid provider")
|
||||||
|
|
||||||
|
BIN
web/app/components/datasets/create/assets/jina.png
Normal file
BIN
web/app/components/datasets/create/assets/jina.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.7 KiB |
@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
|
|||||||
import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
|
import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
|
||||||
import { fetchDataSource } from '@/service/common'
|
import { fetchDataSource } from '@/service/common'
|
||||||
import { fetchDatasetDetail } from '@/service/datasets'
|
import { fetchDatasetDetail } from '@/service/datasets'
|
||||||
import type { NotionPage } from '@/models/common'
|
import { DataSourceProvider, type NotionPage } from '@/models/common'
|
||||||
import { useModalContext } from '@/context/modal-context'
|
import { useModalContext } from '@/context/modal-context'
|
||||||
import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
||||||
|
|
||||||
@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
|
|||||||
excludes: '',
|
excludes: '',
|
||||||
limit: 10,
|
limit: 10,
|
||||||
max_depth: '',
|
max_depth: '',
|
||||||
|
use_sitemap: true,
|
||||||
}
|
}
|
||||||
|
|
||||||
const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
||||||
@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
|||||||
const updateFileList = (preparedFiles: FileItem[]) => {
|
const updateFileList = (preparedFiles: FileItem[]) => {
|
||||||
setFiles(preparedFiles)
|
setFiles(preparedFiles)
|
||||||
}
|
}
|
||||||
const [fireCrawlJobId, setFireCrawlJobId] = useState('')
|
const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
|
||||||
|
const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')
|
||||||
|
|
||||||
const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
|
const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
|
||||||
const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
|
const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
|
||||||
@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
|||||||
onStepChange={nextStep}
|
onStepChange={nextStep}
|
||||||
websitePages={websitePages}
|
websitePages={websitePages}
|
||||||
updateWebsitePages={setWebsitePages}
|
updateWebsitePages={setWebsitePages}
|
||||||
onFireCrawlJobIdChange={setFireCrawlJobId}
|
onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
|
||||||
|
onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
|
||||||
crawlOptions={crawlOptions}
|
crawlOptions={crawlOptions}
|
||||||
onCrawlOptionsChange={setCrawlOptions}
|
onCrawlOptionsChange={setCrawlOptions}
|
||||||
/>
|
/>
|
||||||
@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
|
|||||||
files={fileList.map(file => file.file)}
|
files={fileList.map(file => file.file)}
|
||||||
notionPages={notionPages}
|
notionPages={notionPages}
|
||||||
websitePages={websitePages}
|
websitePages={websitePages}
|
||||||
fireCrawlJobId={fireCrawlJobId}
|
websiteCrawlProvider={websiteCrawlProvider}
|
||||||
|
websiteCrawlJobId={websiteCrawlJobId}
|
||||||
onStepChange={changeStep}
|
onStepChange={changeStep}
|
||||||
updateIndexingTypeCache={updateIndexingTypeCache}
|
updateIndexingTypeCache={updateIndexingTypeCache}
|
||||||
updateResultCache={updateResultCache}
|
updateResultCache={updateResultCache}
|
||||||
|
@@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview'
|
|||||||
import s from './index.module.css'
|
import s from './index.module.css'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
|
import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
|
||||||
import type { NotionPage } from '@/models/common'
|
import type { DataSourceProvider, NotionPage } from '@/models/common'
|
||||||
import { DataSourceType } from '@/models/datasets'
|
import { DataSourceType } from '@/models/datasets'
|
||||||
import Button from '@/app/components/base/button'
|
import Button from '@/app/components/base/button'
|
||||||
import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
|
import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
|
||||||
@@ -33,7 +33,8 @@ type IStepOneProps = {
|
|||||||
changeType: (type: DataSourceType) => void
|
changeType: (type: DataSourceType) => void
|
||||||
websitePages?: CrawlResultItem[]
|
websitePages?: CrawlResultItem[]
|
||||||
updateWebsitePages: (value: CrawlResultItem[]) => void
|
updateWebsitePages: (value: CrawlResultItem[]) => void
|
||||||
onFireCrawlJobIdChange: (jobId: string) => void
|
onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void
|
||||||
|
onWebsiteCrawlJobIdChange: (jobId: string) => void
|
||||||
crawlOptions: CrawlOptions
|
crawlOptions: CrawlOptions
|
||||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||||
}
|
}
|
||||||
@@ -69,7 +70,8 @@ const StepOne = ({
|
|||||||
updateNotionPages,
|
updateNotionPages,
|
||||||
websitePages = [],
|
websitePages = [],
|
||||||
updateWebsitePages,
|
updateWebsitePages,
|
||||||
onFireCrawlJobIdChange,
|
onWebsiteCrawlProviderChange,
|
||||||
|
onWebsiteCrawlJobIdChange,
|
||||||
crawlOptions,
|
crawlOptions,
|
||||||
onCrawlOptionsChange,
|
onCrawlOptionsChange,
|
||||||
}: IStepOneProps) => {
|
}: IStepOneProps) => {
|
||||||
@@ -229,7 +231,8 @@ const StepOne = ({
|
|||||||
onPreview={setCurrentWebsite}
|
onPreview={setCurrentWebsite}
|
||||||
checkedCrawlResult={websitePages}
|
checkedCrawlResult={websitePages}
|
||||||
onCheckedCrawlResultChange={updateWebsitePages}
|
onCheckedCrawlResultChange={updateWebsitePages}
|
||||||
onJobIdChange={onFireCrawlJobIdChange}
|
onCrawlProviderChange={onWebsiteCrawlProviderChange}
|
||||||
|
onJobIdChange={onWebsiteCrawlJobIdChange}
|
||||||
crawlOptions={crawlOptions}
|
crawlOptions={crawlOptions}
|
||||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||||
/>
|
/>
|
||||||
|
@@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
|
|||||||
import Toast from '@/app/components/base/toast'
|
import Toast from '@/app/components/base/toast'
|
||||||
import { formatNumber } from '@/utils/format'
|
import { formatNumber } from '@/utils/format'
|
||||||
import type { NotionPage } from '@/models/common'
|
import type { NotionPage } from '@/models/common'
|
||||||
|
import { DataSourceProvider } from '@/models/common'
|
||||||
import { DataSourceType, DocForm } from '@/models/datasets'
|
import { DataSourceType, DocForm } from '@/models/datasets'
|
||||||
import NotionIcon from '@/app/components/base/notion-icon'
|
import NotionIcon from '@/app/components/base/notion-icon'
|
||||||
import Switch from '@/app/components/base/switch'
|
import Switch from '@/app/components/base/switch'
|
||||||
@@ -63,7 +64,8 @@ type StepTwoProps = {
|
|||||||
notionPages?: NotionPage[]
|
notionPages?: NotionPage[]
|
||||||
websitePages?: CrawlResultItem[]
|
websitePages?: CrawlResultItem[]
|
||||||
crawlOptions?: CrawlOptions
|
crawlOptions?: CrawlOptions
|
||||||
fireCrawlJobId?: string
|
websiteCrawlProvider?: DataSourceProvider
|
||||||
|
websiteCrawlJobId?: string
|
||||||
onStepChange?: (delta: number) => void
|
onStepChange?: (delta: number) => void
|
||||||
updateIndexingTypeCache?: (type: string) => void
|
updateIndexingTypeCache?: (type: string) => void
|
||||||
updateResultCache?: (res: createDocumentResponse) => void
|
updateResultCache?: (res: createDocumentResponse) => void
|
||||||
@@ -94,7 +96,8 @@ const StepTwo = ({
|
|||||||
notionPages = [],
|
notionPages = [],
|
||||||
websitePages = [],
|
websitePages = [],
|
||||||
crawlOptions,
|
crawlOptions,
|
||||||
fireCrawlJobId = '',
|
websiteCrawlProvider = DataSourceProvider.fireCrawl,
|
||||||
|
websiteCrawlJobId = '',
|
||||||
onStepChange,
|
onStepChange,
|
||||||
updateIndexingTypeCache,
|
updateIndexingTypeCache,
|
||||||
updateResultCache,
|
updateResultCache,
|
||||||
@@ -260,8 +263,8 @@ const StepTwo = ({
|
|||||||
|
|
||||||
const getWebsiteInfo = () => {
|
const getWebsiteInfo = () => {
|
||||||
return {
|
return {
|
||||||
provider: 'firecrawl',
|
provider: websiteCrawlProvider,
|
||||||
job_id: fireCrawlJobId,
|
job_id: websiteCrawlJobId,
|
||||||
urls: websitePages.map(page => page.source_url),
|
urls: websitePages.map(page => page.source_url),
|
||||||
only_main_content: crawlOptions?.only_main_content,
|
only_main_content: crawlOptions?.only_main_content,
|
||||||
}
|
}
|
||||||
|
@@ -3,6 +3,7 @@ import type { FC } from 'react'
|
|||||||
import React from 'react'
|
import React from 'react'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
import Checkbox from '@/app/components/base/checkbox'
|
import Checkbox from '@/app/components/base/checkbox'
|
||||||
|
import Tooltip from '@/app/components/base/tooltip'
|
||||||
|
|
||||||
type Props = {
|
type Props = {
|
||||||
className?: string
|
className?: string
|
||||||
@@ -10,6 +11,7 @@ type Props = {
|
|||||||
onChange: (isChecked: boolean) => void
|
onChange: (isChecked: boolean) => void
|
||||||
label: string
|
label: string
|
||||||
labelClassName?: string
|
labelClassName?: string
|
||||||
|
tooltip?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
const CheckboxWithLabel: FC<Props> = ({
|
const CheckboxWithLabel: FC<Props> = ({
|
||||||
@@ -18,11 +20,20 @@ const CheckboxWithLabel: FC<Props> = ({
|
|||||||
onChange,
|
onChange,
|
||||||
label,
|
label,
|
||||||
labelClassName,
|
labelClassName,
|
||||||
|
tooltip,
|
||||||
}) => {
|
}) => {
|
||||||
return (
|
return (
|
||||||
<label className={cn(className, 'flex items-center h-7 space-x-2')}>
|
<label className={cn(className, 'flex items-center h-7 space-x-2')}>
|
||||||
<Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} />
|
<Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} />
|
||||||
<div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div>
|
<div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div>
|
||||||
|
{tooltip && (
|
||||||
|
<Tooltip
|
||||||
|
popupContent={
|
||||||
|
<div className='w-[200px]'>{tooltip}</div>
|
||||||
|
}
|
||||||
|
triggerClassName='ml-0.5 w-4 h-4'
|
||||||
|
/>
|
||||||
|
)}
|
||||||
</label>
|
</label>
|
||||||
)
|
)
|
||||||
}
|
}
|
@@ -2,7 +2,7 @@
|
|||||||
import type { FC } from 'react'
|
import type { FC } from 'react'
|
||||||
import React, { useCallback } from 'react'
|
import React, { useCallback } from 'react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
import CheckboxWithLabel from './base/checkbox-with-label'
|
import CheckboxWithLabel from './checkbox-with-label'
|
||||||
import CrawledResultItem from './crawled-result-item'
|
import CrawledResultItem from './crawled-result-item'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
import type { CrawlResultItem } from '@/models/datasets'
|
import type { CrawlResultItem } from '@/models/datasets'
|
@@ -2,13 +2,13 @@
|
|||||||
import type { FC } from 'react'
|
import type { FC } from 'react'
|
||||||
import React, { useCallback, useEffect, useState } from 'react'
|
import React, { useCallback, useEffect, useState } from 'react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import UrlInput from '../base/url-input'
|
||||||
|
import OptionsWrap from '../base/options-wrap'
|
||||||
|
import CrawledResult from '../base/crawled-result'
|
||||||
|
import Crawling from '../base/crawling'
|
||||||
|
import ErrorMessage from '../base/error-message'
|
||||||
import Header from './header'
|
import Header from './header'
|
||||||
import UrlInput from './base/url-input'
|
|
||||||
import OptionsWrap from './base/options-wrap'
|
|
||||||
import Options from './options'
|
import Options from './options'
|
||||||
import CrawledResult from './crawled-result'
|
|
||||||
import Crawling from './crawling'
|
|
||||||
import ErrorMessage from './base/error-message'
|
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
import { useModalContext } from '@/context/modal-context'
|
import { useModalContext } from '@/context/modal-context'
|
||||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||||
|
@@ -2,8 +2,8 @@
|
|||||||
import type { FC } from 'react'
|
import type { FC } from 'react'
|
||||||
import React, { useCallback } from 'react'
|
import React, { useCallback } from 'react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
import CheckboxWithLabel from './base/checkbox-with-label'
|
import CheckboxWithLabel from '../base/checkbox-with-label'
|
||||||
import Field from './base/field'
|
import Field from '../base/field'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
import type { CrawlOptions } from '@/models/datasets'
|
import type { CrawlOptions } from '@/models/datasets'
|
||||||
|
|
||||||
|
@@ -0,0 +1,6 @@
|
|||||||
|
/* Inline-block logo icon: jina.png drawn once, centered, at 16px on a light background. */
.jinaLogo {
|
||||||
|
@apply w-4 h-4 bg-center bg-no-repeat inline-block;
|
||||||
|
background-color: #F5FAFF;
|
||||||
|
background-image: url(../assets/jina.png);
|
||||||
|
background-size: 16px;
|
||||||
|
}
|
@@ -1,8 +1,12 @@
|
|||||||
'use client'
|
'use client'
|
||||||
import type { FC } from 'react'
|
import type { FC } from 'react'
|
||||||
import React, { useCallback, useEffect, useState } from 'react'
|
import React, { useCallback, useEffect, useState } from 'react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import s from './index.module.css'
|
||||||
import NoData from './no-data'
|
import NoData from './no-data'
|
||||||
import Firecrawl from './firecrawl'
|
import Firecrawl from './firecrawl'
|
||||||
|
import JinaReader from './jina-reader'
|
||||||
|
import cn from '@/utils/classnames'
|
||||||
import { useModalContext } from '@/context/modal-context'
|
import { useModalContext } from '@/context/modal-context'
|
||||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||||
import { fetchDataSources } from '@/service/datasets'
|
import { fetchDataSources } from '@/service/datasets'
|
||||||
@@ -12,6 +16,7 @@ type Props = {
|
|||||||
onPreview: (payload: CrawlResultItem) => void
|
onPreview: (payload: CrawlResultItem) => void
|
||||||
checkedCrawlResult: CrawlResultItem[]
|
checkedCrawlResult: CrawlResultItem[]
|
||||||
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
|
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
|
||||||
|
onCrawlProviderChange: (provider: DataSourceProvider) => void
|
||||||
onJobIdChange: (jobId: string) => void
|
onJobIdChange: (jobId: string) => void
|
||||||
crawlOptions: CrawlOptions
|
crawlOptions: CrawlOptions
|
||||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||||
@@ -21,17 +26,32 @@ const Website: FC<Props> = ({
|
|||||||
onPreview,
|
onPreview,
|
||||||
checkedCrawlResult,
|
checkedCrawlResult,
|
||||||
onCheckedCrawlResultChange,
|
onCheckedCrawlResultChange,
|
||||||
|
onCrawlProviderChange,
|
||||||
onJobIdChange,
|
onJobIdChange,
|
||||||
crawlOptions,
|
crawlOptions,
|
||||||
onCrawlOptionsChange,
|
onCrawlOptionsChange,
|
||||||
}) => {
|
}) => {
|
||||||
|
const { t } = useTranslation()
|
||||||
const { setShowAccountSettingModal } = useModalContext()
|
const { setShowAccountSettingModal } = useModalContext()
|
||||||
const [isLoaded, setIsLoaded] = useState(false)
|
const [isLoaded, setIsLoaded] = useState(false)
|
||||||
const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false)
|
const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader)
|
||||||
|
const [sources, setSources] = useState<DataSourceItem[]>([])
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
onCrawlProviderChange(selectedProvider)
|
||||||
|
}, [selectedProvider, onCrawlProviderChange])
|
||||||
|
|
||||||
const checkSetApiKey = useCallback(async () => {
|
const checkSetApiKey = useCallback(async () => {
|
||||||
const res = await fetchDataSources() as any
|
const res = await fetchDataSources() as any
|
||||||
const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl)
|
setSources(res.sources)
|
||||||
setIsSetFirecrawlApiKey(isFirecrawlSet)
|
|
||||||
|
// If users have configured one of the providers, select it.
|
||||||
|
const availableProviders = res.sources.filter((item: DataSourceItem) =>
|
||||||
|
[DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
|
||||||
|
)
|
||||||
|
|
||||||
|
if (availableProviders.length > 0)
|
||||||
|
setSelectedProvider(availableProviders[0].provider)
|
||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -52,20 +72,66 @@ const Website: FC<Props> = ({
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
{isSetFirecrawlApiKey
|
<div className="mb-4">
|
||||||
? (
|
<div className="font-medium text-gray-700 mb-2 h-6">
|
||||||
<Firecrawl
|
{t('datasetCreation.stepOne.website.chooseProvider')}
|
||||||
onPreview={onPreview}
|
</div>
|
||||||
checkedCrawlResult={checkedCrawlResult}
|
<div className="flex space-x-2">
|
||||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
<button
|
||||||
onJobIdChange={onJobIdChange}
|
className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
|
||||||
crawlOptions={crawlOptions}
|
selectedProvider === DataSourceProvider.jinaReader
|
||||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
? 'bg-primary-50 text-primary-600'
|
||||||
/>
|
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
|
||||||
)
|
}`}
|
||||||
: (
|
onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
|
||||||
<NoData onConfig={handleOnConfig} />
|
>
|
||||||
)}
|
<span className={cn(s.jinaLogo, 'mr-2')} />
|
||||||
|
<span>Jina Reader</span>
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
className={`px-4 py-2 text-sm font-medium rounded-md ${
|
||||||
|
selectedProvider === DataSourceProvider.fireCrawl
|
||||||
|
? 'bg-primary-50 text-primary-600'
|
||||||
|
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
|
||||||
|
}`}
|
||||||
|
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
|
||||||
|
>
|
||||||
|
🔥 Firecrawl
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{
|
||||||
|
selectedProvider === DataSourceProvider.fireCrawl
|
||||||
|
? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
|
||||||
|
? (
|
||||||
|
<Firecrawl
|
||||||
|
onPreview={onPreview}
|
||||||
|
checkedCrawlResult={checkedCrawlResult}
|
||||||
|
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||||
|
onJobIdChange={onJobIdChange}
|
||||||
|
crawlOptions={crawlOptions}
|
||||||
|
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||||
|
/>
|
||||||
|
)
|
||||||
|
: (
|
||||||
|
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
|
||||||
|
)
|
||||||
|
: sources.find(source => source.provider === DataSourceProvider.jinaReader)
|
||||||
|
? (
|
||||||
|
<JinaReader
|
||||||
|
onPreview={onPreview}
|
||||||
|
checkedCrawlResult={checkedCrawlResult}
|
||||||
|
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||||
|
onJobIdChange={onJobIdChange}
|
||||||
|
crawlOptions={crawlOptions}
|
||||||
|
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||||
|
/>
|
||||||
|
)
|
||||||
|
: (
|
||||||
|
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
|
||||||
|
)
|
||||||
|
}
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,42 @@
|
|||||||
|
'use client'
|
||||||
|
import type { FC } from 'react'
|
||||||
|
import React from 'react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
|
||||||
|
import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education'
|
||||||
|
|
||||||
|
// Common i18n key prefix for every label rendered by this header.
const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  // Invoked when the user clicks the settings (gear) icon.
  onSetting: () => void
}

/**
 * Title row of the Jina Reader crawl panel.
 *
 * Left side: the provider title, a thin divider and a clickable settings icon.
 * Right side: a link to the Jina Reader documentation, opened in a new tab.
 */
const Header: FC<Props> = (props) => {
  const { onSetting } = props
  const { t } = useTranslation()

  // Resolve the translated labels up front so the markup below stays flat.
  const titleText = t(`${I18N_PREFIX}.jinaReaderTitle`)
  const docLinkText = t(`${I18N_PREFIX}.jinaReaderDoc`)

  const titleWithSetting = (
    <div className='flex items-center'>
      <div className='text-base font-medium text-gray-700'>{titleText}</div>
      <div className='ml-2 mr-1 w-px h-3.5 bg-gray-200'></div>
      <div
        className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
        onClick={onSetting}
      >
        <Settings01 className='w-3.5 h-3.5 text-gray-500' />
      </div>
    </div>
  )

  const docLink = (
    <a
      href='https://jina.ai/reader'
      target='_blank' rel='noopener noreferrer'
      className='flex items-center text-xs text-primary-600'
    >
      <BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
      {docLinkText}
    </a>
  )

  return (
    <div className='flex h-6 items-center justify-between'>
      {titleWithSetting}
      {docLink}
    </div>
  )
}

export default React.memo(Header)
|
232
web/app/components/datasets/create/website/jina-reader/index.tsx
Normal file
232
web/app/components/datasets/create/website/jina-reader/index.tsx
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
'use client'
|
||||||
|
import type { FC } from 'react'
|
||||||
|
import React, { useCallback, useEffect, useState } from 'react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import UrlInput from '../base/url-input'
|
||||||
|
import OptionsWrap from '../base/options-wrap'
|
||||||
|
import CrawledResult from '../base/crawled-result'
|
||||||
|
import Crawling from '../base/crawling'
|
||||||
|
import ErrorMessage from '../base/error-message'
|
||||||
|
import Header from './header'
|
||||||
|
import Options from './options'
|
||||||
|
import cn from '@/utils/classnames'
|
||||||
|
import { useModalContext } from '@/context/modal-context'
|
||||||
|
import Toast from '@/app/components/base/toast'
|
||||||
|
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
|
||||||
|
import { sleep } from '@/utils'
|
||||||
|
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||||
|
|
||||||
|
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
// Pause between two consecutive status polls of a running crawl job (value passed to `sleep`).
const POLL_INTERVAL_MS = 2500

type Props = {
  // Forwarded to the result list; called with the item the user wants to preview.
  onPreview: (payload: CrawlResultItem) => void
  // Items currently checked in the result list (owned by the parent).
  checkedCrawlResult: CrawlResultItem[]
  // Called with the new selection; also used here to pre-select fetched results.
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  // Called with the job id when the service answers with an asynchronous crawl job.
  onJobIdChange: (jobId: string) => void
  // Current crawl options (owned by the parent).
  crawlOptions: CrawlOptions
  // Called when the user edits the crawl options.
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}

// Lifecycle of one crawl run as shown in the UI.
enum Step {
  init = 'init',
  running = 'running',
  finished = 'finished',
}

// What polling a crawl job resolves to: either the final payload or an error description.
type CrawlOutcome = {
  isError: boolean
  errorMessage?: string
  data: any
}

/**
 * Crawl panel for the Jina Reader provider.
 *
 * Validates the URL and options, starts a crawl through `createJinaReaderTask`
 * and then either shows the single page returned directly or polls
 * `checkJinaReaderTaskStatus` until the returned job completes or fails.
 */
const JinaReader: FC<Props> = ({
  onPreview,
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)

  // Bump the control value handed to OptionsWrap whenever a run starts or ends.
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])

  const { setShowAccountSettingModal } = useModalContext()
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  /**
   * Validate the user input before a run.
   * Checks, in order: URL present, URL starts with http:// or https://, limit filled in.
   * Returns the first failure as a translated message.
   */
  const checkValid = useCallback((url: string) => {
    let errorMsg = ''
    if (!url) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: 'url',
      })
    }

    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)

    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: t(`${I18N_PREFIX}.limit`),
      })
    }

    return {
      isValid: !errorMsg,
      errorMsg,
    }
  }, [crawlOptions, t])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  /**
   * Poll the status of a crawl job until it is `completed` or `failed`.
   *
   * While the job is still running, the intermediate payload is pushed into
   * the progress state and the pages fetched so far are pre-selected.
   * Never rejects: request failures are reported as `isError: true`.
   */
  const waitForCrawlFinished = useCallback(async (jobId: string): Promise<CrawlOutcome> => {
    const limit = parseFloat(crawlOptions.limit as string)
    try {
      for (;;) {
        const res = await checkJinaReaderTaskStatus(jobId) as any
        if (res.status === 'completed') {
          return {
            isError: false,
            data: {
              ...res,
              // Never report more pages than the user asked for.
              total: Math.min(res.total, limit),
            },
          }
        }
        // A response without any status is treated like a failure.
        if (res.status === 'failed' || !res.status) {
          return {
            isError: true,
            errorMessage: res.message,
            data: {
              data: [],
            },
          }
        }
        // update the progress
        setCrawlResult({
          ...res,
          total: Math.min(res.total, limit),
        })
        onCheckedCrawlResultChange(res.data || []) // default select the crawl result
        await sleep(POLL_INTERVAL_MS)
      }
    }
    catch (e: any) {
      // Only a rejection that exposes `json()` carries a message body; anything
      // else (or an unreadable body) falls back to the caller's generic error text.
      const errorBody = typeof e?.json === 'function'
        ? await e.json().catch(() => undefined)
        : undefined
      return {
        isError: true,
        errorMessage: errorBody?.message,
        data: {
          data: [],
        },
      }
    }
  }, [crawlOptions.limit, onCheckedCrawlResultChange])

  /**
   * Run a crawl for `url`.
   * Shows a toast and aborts when validation fails; otherwise moves to
   * `running`, performs the crawl and always ends in `finished`.
   */
  const handleRun = useCallback(async (url: string) => {
    const { isValid, errorMsg } = checkValid(url)
    if (!isValid) {
      Toast.notify({
        message: errorMsg!,
        type: 'error',
      })
      return
    }
    setStep(Step.running)
    try {
      const startTime = Date.now()
      const res = await createJinaReaderTask({
        url,
        options: crawlOptions,
      }) as any

      if (res.data) {
        // The page content came back directly: wrap it as a one-item result.
        const data = {
          current: 1,
          total: 1,
          data: [{
            title: res.data.title,
            markdown: res.data.content,
            description: res.data.description,
            source_url: res.data.url,
          }],
          // Elapsed wall-clock time of the request, in seconds.
          time_consuming: (Date.now() - startTime) / 1000,
        }
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || [])
        setCrawlErrorMessage('')
      }
      else if (res.job_id) {
        // The service started a job: remember its id and poll until it ends.
        const jobId = res.job_id
        onJobIdChange(jobId)
        const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
        if (isError) {
          setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
        }
        else {
          setCrawlResult(data)
          onCheckedCrawlResultChange(data.data || []) // default select the crawl result
          setCrawlErrorMessage('')
        }
      }
      else {
        // Neither content nor a job id: report it instead of silently
        // leaving the previous run's result on screen.
        setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.error(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [checkValid, crawlOptions, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header onSetting={handleSetting} />
      <div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}>
        <UrlInput onRun={handleRun} isRunning={isRunning} />
        <OptionsWrap
          className={cn('mt-4')}
          controlFoldOptions={controlFoldOptions}
        >
          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
        </OptionsWrap>

        {!isInit && (
          <div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
            {isRunning
              && <Crawling
                className='mt-2'
                crawledNum={crawlResult?.current || 0}
                totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0}
              />}
            {showError && (
              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
            )}
            {isCrawlFinished && !showError
              && <CrawledResult
                className='mb-2'
                list={crawlResult?.data || []}
                checkedList={checkedCrawlResult}
                onSelectedChange={onCheckedCrawlResultChange}
                onPreview={onPreview}
                usedTime={parseFloat(crawlResult?.time_consuming as string) || 0}
              />
            }
          </div>
        )}
      </div>
    </div>
  )
}

export default React.memo(JinaReader)
|
@@ -0,0 +1,59 @@
|
|||||||
|
'use client'
|
||||||
|
import type { FC } from 'react'
|
||||||
|
import React, { useCallback } from 'react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import CheckboxWithLabel from '../base/checkbox-with-label'
|
||||||
|
import Field from '../base/field'
|
||||||
|
import cn from '@/utils/classnames'
|
||||||
|
import type { CrawlOptions } from '@/models/datasets'
|
||||||
|
|
||||||
|
// Common i18n key prefix for the option labels.
const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  // Extra class names for the wrapper element.
  className?: string
  // Current crawl options to display.
  payload: CrawlOptions
  // Called with a full copy of the options whenever one field changes.
  onChange: (payload: CrawlOptions) => void
}

/**
 * Crawl options form for the Jina Reader provider:
 * two checkboxes (crawl sub-pages, use sitemap) and a required numeric limit.
 */
const Options: FC<Props> = (props) => {
  const { className = '', payload, onChange } = props
  const { t } = useTranslation()

  // Build the change handler for a single option key. The handler emits a
  // fresh options object in which only that key is replaced.
  const handleChange = useCallback((key: keyof CrawlOptions) => {
    const emitChange = (value: any) => {
      const nextPayload = {
        ...payload,
        [key]: value,
      }
      onChange(nextPayload)
    }
    return emitChange
  }, [payload, onChange])

  const wrapperClassName = cn(className, ' space-y-2')

  return (
    <div className={wrapperClassName}>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.crawlSubPage`)}
        isChecked={payload.crawl_sub_pages}
        onChange={handleChange('crawl_sub_pages')}
      />
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.useSitemap`)}
        isChecked={payload.use_sitemap}
        onChange={handleChange('use_sitemap')}
        tooltip={t(`${I18N_PREFIX}.useSitemapTooltip`) as string}
      />
      <div className='flex justify-between space-x-4'>
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.limit`)}
          value={payload.limit}
          onChange={handleChange('limit')}
          isNumber
          isRequired
        />
      </div>
    </div>
  )
}

export default React.memo(Options)
|
@@ -2,35 +2,56 @@
|
|||||||
import type { FC } from 'react'
|
import type { FC } from 'react'
|
||||||
import React from 'react'
|
import React from 'react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import s from './index.module.css'
|
||||||
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
|
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
|
||||||
import Button from '@/app/components/base/button'
|
import Button from '@/app/components/base/button'
|
||||||
|
import { DataSourceProvider } from '@/models/common'
|
||||||
|
|
||||||
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
const I18N_PREFIX = 'datasetCreation.stepOne.website'
|
||||||
|
|
||||||
type Props = {
|
type Props = {
|
||||||
onConfig: () => void
|
onConfig: () => void
|
||||||
|
provider: DataSourceProvider
|
||||||
}
|
}
|
||||||
|
|
||||||
const NoData: FC<Props> = ({
|
const NoData: FC<Props> = ({
|
||||||
onConfig,
|
onConfig,
|
||||||
|
provider,
|
||||||
}) => {
|
}) => {
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
|
|
||||||
|
const providerConfig = {
|
||||||
|
[DataSourceProvider.jinaReader]: {
|
||||||
|
emoji: <span className={s.jinaLogo} />,
|
||||||
|
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
|
||||||
|
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
|
||||||
|
},
|
||||||
|
[DataSourceProvider.fireCrawl]: {
|
||||||
|
emoji: '🔥',
|
||||||
|
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
|
||||||
|
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
const currentProvider = providerConfig[provider]
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className='max-w-[640px] p-6 rounded-2xl bg-gray-50'>
|
<>
|
||||||
<div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
|
<div className='max-w-[640px] p-6 rounded-2xl bg-gray-50 mt-4'>
|
||||||
🔥
|
<div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
|
||||||
</div>
|
{currentProvider.emoji}
|
||||||
<div className='my-2'>
|
|
||||||
<span className='text-gray-700 font-semibold'>{t(`${I18N_PREFIX}.fireCrawlNotConfigured`)}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
|
|
||||||
<div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
|
|
||||||
{t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)}
|
|
||||||
</div>
|
</div>
|
||||||
|
<div className='my-2'>
|
||||||
|
<span className='text-gray-700 font-semibold'>{currentProvider.title}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
|
||||||
|
<div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
|
||||||
|
{currentProvider.description}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<Button variant='primary' onClick={onConfig}>
|
||||||
|
{t(`${I18N_PREFIX}.configure`)}
|
||||||
|
</Button>
|
||||||
</div>
|
</div>
|
||||||
<Button variant='primary' onClick={onConfig}>
|
</>
|
||||||
{t(`${I18N_PREFIX}.configure`)}
|
|
||||||
</Button>
|
|
||||||
</div>
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
export default React.memo(NoData)
|
export default React.memo(NoData)
|
||||||
|
@@ -9,7 +9,7 @@ import {
|
|||||||
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
|
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
|
||||||
import Button from '@/app/components/base/button'
|
import Button from '@/app/components/base/button'
|
||||||
import type { FirecrawlConfig } from '@/models/common'
|
import type { FirecrawlConfig } from '@/models/common'
|
||||||
import Field from '@/app/components/datasets/create/website/firecrawl/base/field'
|
import Field from '@/app/components/datasets/create/website/base/field'
|
||||||
import Toast from '@/app/components/base/toast'
|
import Toast from '@/app/components/base/toast'
|
||||||
import { createDataSourceApiKeyBinding } from '@/service/datasets'
|
import { createDataSourceApiKeyBinding } from '@/service/datasets'
|
||||||
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
|
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
|
||||||
|
@@ -0,0 +1,140 @@
|
|||||||
|
'use client'
|
||||||
|
import type { FC } from 'react'
|
||||||
|
import React, { useCallback, useState } from 'react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import {
|
||||||
|
PortalToFollowElem,
|
||||||
|
PortalToFollowElemContent,
|
||||||
|
} from '@/app/components/base/portal-to-follow-elem'
|
||||||
|
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
|
||||||
|
import Button from '@/app/components/base/button'
|
||||||
|
import { DataSourceProvider } from '@/models/common'
|
||||||
|
import Field from '@/app/components/datasets/create/website/base/field'
|
||||||
|
import Toast from '@/app/components/base/toast'
|
||||||
|
import { createDataSourceApiKeyBinding } from '@/service/datasets'
|
||||||
|
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
|
||||||
|
type Props = {
  // Called when the user dismisses the modal without saving.
  onCancel: () => void
  // Called after the API key binding has been created successfully.
  onSaved: () => void
}

// Common i18n key prefix for the labels specific to this modal.
const I18N_PREFIX = 'datasetCreation.jinaReader'

/**
 * Modal that asks for a Jina Reader API key and stores it as a
 * `website` data source binding via `createDataSourceApiKeyBinding`.
 */
const ConfigJinaReaderModal: FC<Props> = ({
  onCancel,
  onSaved,
}) => {
  const { t } = useTranslation()
  const [isSaving, setIsSaving] = useState(false)
  const [apiKey, setApiKey] = useState('')

  /**
   * Validate the key and persist it.
   * Does nothing while a save is already in flight; shows an error toast when
   * the key is empty. `onSaved` runs only if the request did not throw.
   */
  const handleSave = useCallback(async () => {
    if (isSaving)
      return

    if (!apiKey) {
      Toast.notify({
        type: 'error',
        message: t('common.errorMsg.fieldRequired', {
          field: 'API Key',
        }),
      })
      return
    }

    const postData = {
      category: 'website',
      provider: DataSourceProvider.jinaReader,
      credentials: {
        auth_type: 'bearer',
        config: {
          api_key: apiKey,
        },
      },
    }
    try {
      setIsSaving(true)
      await createDataSourceApiKeyBinding(postData)
      Toast.notify({
        type: 'success',
        message: t('common.api.success'),
      })
    }
    finally {
      // Always clear the loading state, even when the request rejects.
      setIsSaving(false)
    }

    onSaved()
  }, [apiKey, onSaved, t, isSaving])

  return (
    <PortalToFollowElem open>
      <PortalToFollowElemContent className='w-full h-full z-[60]'>
        <div className='fixed inset-0 flex items-center justify-center bg-black/[.25]'>
          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-white shadow-xl rounded-2xl overflow-y-auto'>
            <div className='px-8 pt-8'>
              <div className='flex justify-between items-center mb-4'>
                <div className='text-xl font-semibold text-gray-900'>{t(`${I18N_PREFIX}.configJinaReader`)}</div>
              </div>

              <div className='space-y-4'>
                <Field
                  label='API Key'
                  labelClassName='!text-sm'
                  isRequired
                  value={apiKey}
                  onChange={(value: string | number) => setApiKey(value as string)}
                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
                />
              </div>
              <div className='my-8 flex justify-between items-center h-8'>
                {/* rel='noopener noreferrer' keeps the new tab from reaching back into this window. */}
                <a
                  className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-[#155EEF]'
                  target='_blank' rel='noopener noreferrer'
                  href='https://jina.ai/reader/'
                >
                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
                  <LinkExternal02 className='w-3 h-3' />
                </a>
                <div className='flex'>
                  <Button
                    size='large'
                    className='mr-2'
                    onClick={onCancel}
                  >
                    {t('common.operation.cancel')}
                  </Button>
                  <Button
                    variant='primary'
                    size='large'
                    onClick={handleSave}
                    loading={isSaving}
                  >
                    {t('common.operation.save')}
                  </Button>
                </div>

              </div>
            </div>
            <div className='border-t-[0.5px] border-t-black/5'>
              <div className='flex justify-center items-center py-3 bg-gray-50 text-xs text-gray-500'>
                <Lock01 className='mr-1 w-3 h-3 text-gray-500' />
                {t('common.modelProvider.encrypted.front')}
                <a
                  className='text-primary-600 mx-1'
                  target='_blank' rel='noopener noreferrer'
                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
                >
                  PKCS1_OAEP
                </a>
                {t('common.modelProvider.encrypted.back')}
              </div>
            </div>
          </div>
        </div>
      </PortalToFollowElemContent>
    </PortalToFollowElem>
  )
}

export default React.memo(ConfigJinaReaderModal)
|
@@ -2,11 +2,12 @@
|
|||||||
import type { FC } from 'react'
|
import type { FC } from 'react'
|
||||||
import React, { useCallback, useEffect, useState } from 'react'
|
import React, { useCallback, useEffect, useState } from 'react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
import { useBoolean } from 'ahooks'
|
|
||||||
import Panel from '../panel'
|
import Panel from '../panel'
|
||||||
import { DataSourceType } from '../panel/types'
|
import { DataSourceType } from '../panel/types'
|
||||||
import ConfigFirecrawlModal from './config-firecrawl-modal'
|
import ConfigFirecrawlModal from './config-firecrawl-modal'
|
||||||
|
import ConfigJinaReaderModal from './config-jina-reader-modal'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
|
import s from '@/app/components/datasets/create/website/index.module.css'
|
||||||
import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
|
import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
|
||||||
|
|
||||||
import type {
|
import type {
|
||||||
@@ -19,9 +20,11 @@ import {
|
|||||||
} from '@/models/common'
|
} from '@/models/common'
|
||||||
import Toast from '@/app/components/base/toast'
|
import Toast from '@/app/components/base/toast'
|
||||||
|
|
||||||
type Props = {}
|
type Props = {
|
||||||
|
provider: DataSourceProvider
|
||||||
|
}
|
||||||
|
|
||||||
const DataSourceWebsite: FC<Props> = () => {
|
const DataSourceWebsite: FC<Props> = ({ provider }) => {
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
const { isCurrentWorkspaceManager } = useAppContext()
|
const { isCurrentWorkspaceManager } = useAppContext()
|
||||||
const [sources, setSources] = useState<DataSourceItem[]>([])
|
const [sources, setSources] = useState<DataSourceItem[]>([])
|
||||||
@@ -36,22 +39,26 @@ const DataSourceWebsite: FC<Props> = () => {
|
|||||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
const [isShowConfig, {
|
const [configTarget, setConfigTarget] = useState<DataSourceProvider | null>(null)
|
||||||
setTrue: showConfig,
|
const showConfig = useCallback((provider: DataSourceProvider) => {
|
||||||
setFalse: hideConfig,
|
setConfigTarget(provider)
|
||||||
}] = useBoolean(false)
|
}, [setConfigTarget])
|
||||||
|
|
||||||
|
const hideConfig = useCallback(() => {
|
||||||
|
setConfigTarget(null)
|
||||||
|
}, [setConfigTarget])
|
||||||
|
|
||||||
const handleAdded = useCallback(() => {
|
const handleAdded = useCallback(() => {
|
||||||
checkSetApiKey()
|
checkSetApiKey()
|
||||||
hideConfig()
|
hideConfig()
|
||||||
}, [checkSetApiKey, hideConfig])
|
}, [checkSetApiKey, hideConfig])
|
||||||
|
|
||||||
const getIdByProvider = (provider: string): string | undefined => {
|
const getIdByProvider = (provider: DataSourceProvider): string | undefined => {
|
||||||
const source = sources.find(item => item.provider === provider)
|
const source = sources.find(item => item.provider === provider)
|
||||||
return source?.id
|
return source?.id
|
||||||
}
|
}
|
||||||
|
|
||||||
const handleRemove = useCallback((provider: string) => {
|
const handleRemove = useCallback((provider: DataSourceProvider) => {
|
||||||
return async () => {
|
return async () => {
|
||||||
const dataSourceId = getIdByProvider(provider)
|
const dataSourceId = getIdByProvider(provider)
|
||||||
if (dataSourceId) {
|
if (dataSourceId) {
|
||||||
@@ -69,22 +76,34 @@ const DataSourceWebsite: FC<Props> = () => {
|
|||||||
<>
|
<>
|
||||||
<Panel
|
<Panel
|
||||||
type={DataSourceType.website}
|
type={DataSourceType.website}
|
||||||
isConfigured={sources.length > 0}
|
provider={provider}
|
||||||
onConfigure={showConfig}
|
isConfigured={sources.find(item => item.provider === provider) !== undefined}
|
||||||
|
onConfigure={() => showConfig(provider)}
|
||||||
readOnly={!isCurrentWorkspaceManager}
|
readOnly={!isCurrentWorkspaceManager}
|
||||||
configuredList={sources.map(item => ({
|
configuredList={sources.filter(item => item.provider === provider).map(item => ({
|
||||||
id: item.id,
|
id: item.id,
|
||||||
logo: ({ className }: { className: string }) => (
|
logo: ({ className }: { className: string }) => (
|
||||||
<div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>🔥</div>
|
item.provider === DataSourceProvider.fireCrawl
|
||||||
|
? (
|
||||||
|
<div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>🔥</div>
|
||||||
|
)
|
||||||
|
: (
|
||||||
|
<div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>
|
||||||
|
<span className={s.jinaLogo} />
|
||||||
|
</div>
|
||||||
|
)
|
||||||
),
|
),
|
||||||
name: 'Firecrawl',
|
name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader',
|
||||||
isActive: true,
|
isActive: true,
|
||||||
}))}
|
}))}
|
||||||
onRemove={handleRemove(DataSourceProvider.fireCrawl)}
|
onRemove={handleRemove(provider)}
|
||||||
/>
|
/>
|
||||||
{isShowConfig && (
|
{configTarget === DataSourceProvider.fireCrawl && (
|
||||||
<ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
|
<ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
|
||||||
)}
|
)}
|
||||||
|
{configTarget === DataSourceProvider.jinaReader && (
|
||||||
|
<ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} />
|
||||||
|
)}
|
||||||
</>
|
</>
|
||||||
|
|
||||||
)
|
)
|
||||||
|
@@ -3,6 +3,7 @@ import { useTranslation } from 'react-i18next'
|
|||||||
import DataSourceNotion from './data-source-notion'
|
import DataSourceNotion from './data-source-notion'
|
||||||
import DataSourceWebsite from './data-source-website'
|
import DataSourceWebsite from './data-source-website'
|
||||||
import { fetchDataSource } from '@/service/common'
|
import { fetchDataSource } from '@/service/common'
|
||||||
|
import { DataSourceProvider } from '@/models/common'
|
||||||
|
|
||||||
export default function DataSourcePage() {
|
export default function DataSourcePage() {
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
@@ -13,7 +14,8 @@ export default function DataSourcePage() {
|
|||||||
<div className='mb-8'>
|
<div className='mb-8'>
|
||||||
<div className='mb-2 text-sm font-medium text-gray-900'>{t('common.dataSource.add')}</div>
|
<div className='mb-2 text-sm font-medium text-gray-900'>{t('common.dataSource.add')}</div>
|
||||||
<DataSourceNotion workspaces={notionWorkspaces} />
|
<DataSourceNotion workspaces={notionWorkspaces} />
|
||||||
<DataSourceWebsite />
|
<DataSourceWebsite provider={DataSourceProvider.jinaReader} />
|
||||||
|
<DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@@ -8,10 +8,12 @@ import ConfigItem from './config-item'
|
|||||||
|
|
||||||
import s from './style.module.css'
|
import s from './style.module.css'
|
||||||
import { DataSourceType } from './types'
|
import { DataSourceType } from './types'
|
||||||
|
import { DataSourceProvider } from '@/models/common'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
|
|
||||||
type Props = {
|
type Props = {
|
||||||
type: DataSourceType
|
type: DataSourceType
|
||||||
|
provider: DataSourceProvider
|
||||||
isConfigured: boolean
|
isConfigured: boolean
|
||||||
onConfigure: () => void
|
onConfigure: () => void
|
||||||
readOnly: boolean
|
readOnly: boolean
|
||||||
@@ -25,6 +27,7 @@ type Props = {
|
|||||||
|
|
||||||
const Panel: FC<Props> = ({
|
const Panel: FC<Props> = ({
|
||||||
type,
|
type,
|
||||||
|
provider,
|
||||||
isConfigured,
|
isConfigured,
|
||||||
onConfigure,
|
onConfigure,
|
||||||
readOnly,
|
readOnly,
|
||||||
@@ -46,7 +49,7 @@ const Panel: FC<Props> = ({
|
|||||||
<div className='text-sm font-medium text-gray-800'>{t(`common.dataSource.${type}.title`)}</div>
|
<div className='text-sm font-medium text-gray-800'>{t(`common.dataSource.${type}.title`)}</div>
|
||||||
{isWebsite && (
|
{isWebsite && (
|
||||||
<div className='ml-1 leading-[18px] px-1.5 rounded-md bg-white border border-gray-100 text-xs font-medium text-gray-700'>
|
<div className='ml-1 leading-[18px] px-1.5 rounded-md bg-white border border-gray-100 text-xs font-medium text-gray-700'>
|
||||||
<span className='text-gray-500'>{t('common.dataSource.website.with')}</span> 🔥 Firecrawl
|
<span className='text-gray-500'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
@@ -16,6 +16,11 @@ const translation = {
|
|||||||
apiKeyPlaceholder: 'API key from firecrawl.dev',
|
apiKeyPlaceholder: 'API key from firecrawl.dev',
|
||||||
getApiKeyLinkText: 'Get your API key from firecrawl.dev',
|
getApiKeyLinkText: 'Get your API key from firecrawl.dev',
|
||||||
},
|
},
|
||||||
|
jinaReader: {
|
||||||
|
configJinaReader: 'Configure Jina Reader',
|
||||||
|
apiKeyPlaceholder: 'API key from jina.ai',
|
||||||
|
getApiKeyLinkText: 'Get your free API key at jina.ai',
|
||||||
|
},
|
||||||
stepOne: {
|
stepOne: {
|
||||||
filePreview: 'File Preview',
|
filePreview: 'File Preview',
|
||||||
pagePreview: 'Page Preview',
|
pagePreview: 'Page Preview',
|
||||||
@@ -56,13 +61,21 @@ const translation = {
|
|||||||
failed: 'Creation failed',
|
failed: 'Creation failed',
|
||||||
},
|
},
|
||||||
website: {
|
website: {
|
||||||
|
chooseProvider: 'Select a provider',
|
||||||
fireCrawlNotConfigured: 'Firecrawl is not configured',
|
fireCrawlNotConfigured: 'Firecrawl is not configured',
|
||||||
fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
|
fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
|
||||||
|
jinaReaderNotConfigured: 'Jina Reader is not configured',
|
||||||
|
jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
|
||||||
configure: 'Configure',
|
configure: 'Configure',
|
||||||
run: 'Run',
|
run: 'Run',
|
||||||
firecrawlTitle: 'Extract web content with 🔥Firecrawl',
|
firecrawlTitle: 'Extract web content with 🔥Firecrawl',
|
||||||
firecrawlDoc: 'Firecrawl docs',
|
firecrawlDoc: 'Firecrawl docs',
|
||||||
firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
|
firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
|
||||||
|
jinaReaderTitle: 'Convert the entire site to Markdown',
|
||||||
|
jinaReaderDoc: 'Learn more about Jina Reader',
|
||||||
|
jinaReaderDocLink: 'https://jina.ai/reader',
|
||||||
|
useSitemap: 'Use sitemap',
|
||||||
|
useSitemapTooltip: 'Follow the sitemap to crawl the site. If not, Jina Reader will crawl iteratively based on page relevance, yielding fewer but higher-quality pages.',
|
||||||
options: 'Options',
|
options: 'Options',
|
||||||
crawlSubPage: 'Crawl sub-pages',
|
crawlSubPage: 'Crawl sub-pages',
|
||||||
limit: 'Limit',
|
limit: 'Limit',
|
||||||
@@ -70,7 +83,7 @@ const translation = {
|
|||||||
excludePaths: 'Exclude paths',
|
excludePaths: 'Exclude paths',
|
||||||
includeOnlyPaths: 'Include only paths',
|
includeOnlyPaths: 'Include only paths',
|
||||||
extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)',
|
extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)',
|
||||||
exceptionErrorTitle: 'An exception occurred while running Firecrawl job:',
|
exceptionErrorTitle: 'An exception occurred while running crawling job:',
|
||||||
unknownError: 'Unknown error',
|
unknownError: 'Unknown error',
|
||||||
totalPageScraped: 'Total pages scraped:',
|
totalPageScraped: 'Total pages scraped:',
|
||||||
selectAll: 'Select All',
|
selectAll: 'Select All',
|
||||||
|
@@ -16,6 +16,11 @@ const translation = {
|
|||||||
apiKeyPlaceholder: '从 firecrawl.dev 获取 API Key',
|
apiKeyPlaceholder: '从 firecrawl.dev 获取 API Key',
|
||||||
getApiKeyLinkText: '从 firecrawl.dev 获取您的 API Key',
|
getApiKeyLinkText: '从 firecrawl.dev 获取您的 API Key',
|
||||||
},
|
},
|
||||||
|
jinaReader: {
|
||||||
|
configJinaReader: '配置 Jina Reader',
|
||||||
|
apiKeyPlaceholder: '从 jina.ai 获取 API Key',
|
||||||
|
getApiKeyLinkText: '从 jina.ai 获取您的免费 API Key',
|
||||||
|
},
|
||||||
stepOne: {
|
stepOne: {
|
||||||
filePreview: '文件预览',
|
filePreview: '文件预览',
|
||||||
pagePreview: '页面预览',
|
pagePreview: '页面预览',
|
||||||
@@ -56,13 +61,21 @@ const translation = {
|
|||||||
failed: '创建失败',
|
failed: '创建失败',
|
||||||
},
|
},
|
||||||
website: {
|
website: {
|
||||||
|
chooseProvider: '选择工具',
|
||||||
fireCrawlNotConfigured: 'Firecrawl 未配置',
|
fireCrawlNotConfigured: 'Firecrawl 未配置',
|
||||||
fireCrawlNotConfiguredDescription: '请配置 Firecrawl 的 API 密钥以使用它。',
|
fireCrawlNotConfiguredDescription: '请配置 Firecrawl 的 API 密钥以使用它。',
|
||||||
|
jinaReaderNotConfigured: 'Jina Reader 未配置',
|
||||||
|
jinaReaderNotConfiguredDescription: '请配置 Jina Reader 的免费 API 密钥以访问它。',
|
||||||
configure: '配置',
|
configure: '配置',
|
||||||
run: '运行',
|
run: '运行',
|
||||||
firecrawlTitle: '使用 🔥Firecrawl 提取网页内容',
|
firecrawlTitle: '使用 🔥Firecrawl 提取网页内容',
|
||||||
firecrawlDoc: 'Firecrawl 文档',
|
firecrawlDoc: 'Firecrawl 文档',
|
||||||
firecrawlDocLink: 'https://docs.dify.ai/v/zh-hans/guides/knowledge-base/sync-from-website',
|
firecrawlDocLink: 'https://docs.dify.ai/v/zh-hans/guides/knowledge-base/sync-from-website',
|
||||||
|
jinaReaderTitle: '将整个站点内容转换为 Markdown 格式',
|
||||||
|
jinaReaderDoc: '了解更多关于 Jina Reader',
|
||||||
|
jinaReaderDocLink: 'https://jina.ai/reader',
|
||||||
|
useSitemap: '使用 sitemap',
|
||||||
|
useSitemapTooltip: '根据 sitemap 爬取站点。否则,Jina Reader 将基于页面相关性迭代爬取,抓取较少的页面,但质量更高。',
|
||||||
options: '选项',
|
options: '选项',
|
||||||
crawlSubPage: '爬取子页面',
|
crawlSubPage: '爬取子页面',
|
||||||
limit: '限制数量',
|
limit: '限制数量',
|
||||||
@@ -70,7 +83,7 @@ const translation = {
|
|||||||
excludePaths: '排除路径',
|
excludePaths: '排除路径',
|
||||||
includeOnlyPaths: '仅包含路径',
|
includeOnlyPaths: '仅包含路径',
|
||||||
extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)',
|
extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)',
|
||||||
exceptionErrorTitle: '运行 Firecrawl 时发生异常:',
|
exceptionErrorTitle: '运行时发生异常:',
|
||||||
unknownError: '未知错误',
|
unknownError: '未知错误',
|
||||||
totalPageScraped: '抓取页面总数:',
|
totalPageScraped: '抓取页面总数:',
|
||||||
selectAll: '全选',
|
selectAll: '全选',
|
||||||
|
@@ -177,6 +177,7 @@ export enum DataSourceCategory {
|
|||||||
}
|
}
|
||||||
export enum DataSourceProvider {
|
export enum DataSourceProvider {
|
||||||
fireCrawl = 'firecrawl',
|
fireCrawl = 'firecrawl',
|
||||||
|
jinaReader = 'jinareader',
|
||||||
}
|
}
|
||||||
|
|
||||||
export type FirecrawlConfig = {
|
export type FirecrawlConfig = {
|
||||||
|
@@ -49,6 +49,7 @@ export type CrawlOptions = {
|
|||||||
excludes: string
|
excludes: string
|
||||||
limit: number | string
|
limit: number | string
|
||||||
max_depth: number | string
|
max_depth: number | string
|
||||||
|
use_sitemap: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
export type CrawlResultItem = {
|
export type CrawlResultItem = {
|
||||||
|
@@ -23,7 +23,7 @@ import type {
|
|||||||
SegmentsResponse,
|
SegmentsResponse,
|
||||||
createDocumentResponse,
|
createDocumentResponse,
|
||||||
} from '@/models/datasets'
|
} from '@/models/datasets'
|
||||||
import type { CommonResponse, DataSourceNotionWorkspace } from '@/models/common'
|
import { type CommonResponse, type DataSourceNotionWorkspace, DataSourceProvider } from '@/models/common'
|
||||||
import type {
|
import type {
|
||||||
ApiKeysListResponse,
|
ApiKeysListResponse,
|
||||||
CreateApiKeyResponse,
|
CreateApiKeyResponse,
|
||||||
@@ -253,7 +253,7 @@ export const createFirecrawlTask: Fetcher<CommonResponse, Record<string, any>> =
|
|||||||
return post<CommonResponse>('website/crawl', {
|
return post<CommonResponse>('website/crawl', {
|
||||||
body: {
|
body: {
|
||||||
...body,
|
...body,
|
||||||
provider: 'firecrawl',
|
provider: DataSourceProvider.fireCrawl,
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -261,7 +261,26 @@ export const createFirecrawlTask: Fetcher<CommonResponse, Record<string, any>> =
|
|||||||
export const checkFirecrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
|
export const checkFirecrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
|
||||||
return get<CommonResponse>(`website/crawl/status/${jobId}`, {
|
return get<CommonResponse>(`website/crawl/status/${jobId}`, {
|
||||||
params: {
|
params: {
|
||||||
provider: 'firecrawl',
|
provider: DataSourceProvider.fireCrawl,
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
silent: true,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
export const createJinaReaderTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
|
||||||
|
return post<CommonResponse>('website/crawl', {
|
||||||
|
body: {
|
||||||
|
...body,
|
||||||
|
provider: DataSourceProvider.jinaReader,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
|
||||||
|
return get<CommonResponse>(`website/crawl/status/${jobId}`, {
|
||||||
|
params: {
|
||||||
|
provider: 'jinareader',
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
silent: true,
|
silent: true,
|
||||||
|
Reference in New Issue
Block a user