Feat/dify rag (#2528)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
Jyong
2024-02-22 23:31:57 +08:00
committed by GitHub
parent 97fe817186
commit 6c4e6bf1d6
119 changed files with 3181 additions and 5892 deletions

View File

@@ -4,7 +4,7 @@ import time
import click
from celery import shared_task
from core.index.index import IndexBuilder
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import (
AppDatasetJoin,
@@ -18,7 +18,7 @@ from models.dataset import (
@shared_task(queue='dataset')
def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
index_struct: str, collection_binding_id: str):
index_struct: str, collection_binding_id: str, doc_form: str):
"""
Clean dataset when dataset deleted.
:param dataset_id: dataset id
@@ -26,6 +26,7 @@ def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
:param indexing_technique: indexing technique
:param index_struct: index struct dict
:param collection_binding_id: collection binding id
:param doc_form: document form of the dataset, used to select the index processor that cleans its indexes
Usage: clean_dataset_task.delay(dataset_id, tenant_id, indexing_technique, index_struct, collection_binding_id, doc_form)
"""
@@ -38,26 +39,14 @@ def clean_dataset_task(dataset_id: str, tenant_id: str, indexing_technique: str,
tenant_id=tenant_id,
indexing_technique=indexing_technique,
index_struct=index_struct,
collection_binding_id=collection_binding_id
collection_binding_id=collection_binding_id,
doc_form=doc_form
)
documents = db.session.query(Document).filter(Document.dataset_id == dataset_id).all()
segments = db.session.query(DocumentSegment).filter(DocumentSegment.dataset_id == dataset_id).all()
kw_index = IndexBuilder.get_index(dataset, 'economy')
# delete from vector index
if dataset.indexing_technique == 'high_quality':
vector_index = IndexBuilder.get_default_high_quality_index(dataset)
try:
vector_index.delete_by_group_id(dataset.id)
except Exception:
logging.exception("Delete doc index failed when dataset deleted.")
# delete from keyword index
try:
kw_index.delete()
except Exception:
logging.exception("Delete nodes index failed when dataset deleted.")
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
index_processor.clean(dataset, None)
for document in documents:
db.session.delete(document)