diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 3e9ca8e1f..627d7c3ae 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -30,7 +30,7 @@ class AbstractVectorFactory(ABC): class Vector: def __init__(self, dataset: Dataset, attributes: list = None): if attributes is None: - attributes = ['doc_id', 'dataset_id', 'document_id', 'doc_hash'] + attributes = ['doc_id', 'dataset_id', 'document_id', 'doc_hash', 'page'] self._dataset = dataset self._embeddings = self._get_embeddings() self._attributes = attributes @@ -107,6 +107,7 @@ class Vector: def add_texts(self, documents: list[Document], **kwargs): if kwargs.get('duplicate_check', False): documents = self._filter_duplicate_texts(documents) + embeddings = self._embeddings.embed_documents([document.page_content for document in documents]) self._vector_processor.create( texts=documents, diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 01bf6e16e..5bfdd0389 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -173,9 +173,13 @@ class KnowledgeRetrievalNode(BaseNode): context_list = [] if all_documents: document_score_list = {} + page_number_list = {} for item in all_documents: if item.metadata.get('score'): document_score_list[item.metadata['doc_id']] = item.metadata['score'] + # both 'page' and 'score' are metadata fields + if item.metadata.get('page'): + page_number_list[item.metadata['doc_id']] = item.metadata['page'] index_node_ids = [document.metadata['doc_id'] for document in all_documents] segments = DocumentSegment.query.filter( @@ -199,9 +203,9 @@ class KnowledgeRetrievalNode(BaseNode): Document.enabled == True, Document.archived == False, ).first() + resource_number = 1 if dataset and document: - source = { 'metadata': { '_source': 'knowledge', @@ -211,6 +215,7 @@ class KnowledgeRetrievalNode(BaseNode): 'document_id': document.id, 'document_name': document.name, 'document_data_source_type': document.data_source_type, + 'page': page_number_list.get(segment.index_node_id, None), 'segment_id': segment.id, 'retriever_from': 'workflow', 'score': document_score_list.get(segment.index_node_id, None), diff --git a/api/core/workflow/nodes/llm/llm_node.py b/api/core/workflow/nodes/llm/llm_node.py index 49f61bd59..737b1af14 100644 --- a/api/core/workflow/nodes/llm/llm_node.py +++ b/api/core/workflow/nodes/llm/llm_node.py @@ -402,6 +402,7 @@ class LLMNode(BaseNode): if ('metadata' in context_dict and '_source' in context_dict['metadata'] and context_dict['metadata']['_source'] == 'knowledge'): metadata = context_dict.get('metadata', {}) + source = { 'position': metadata.get('position'), 'dataset_id': metadata.get('dataset_id'), @@ -417,6 +418,7 @@ class LLMNode(BaseNode): 'segment_position': metadata.get('segment_position'), 'index_node_hash': metadata.get('segment_index_node_hash'), 'content': context_dict.get('content'), + 'page': metadata.get('page'), } return source