Fix/create document by api with metadata (#16307)

Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
This commit is contained in:
Jyong
2025-03-20 14:33:32 +08:00
committed by GitHub
parent c1f3d968bf
commit 2c9af712a2
7 changed files with 75 additions and 527 deletions

View File

@@ -20,7 +20,7 @@ from libs.helper import email as email_validate
from libs.password import hash_password, password_pattern, valid_password
from libs.rsa import generate_key_pair
from models import Tenant
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
from models.provider import Provider, ProviderModel
@@ -483,14 +483,11 @@ def convert_to_agent_apps():
click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
@click.command("add-qdrant-index", help="Add Qdrant index.")
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
def add_qdrant_doc_id_index(field: str):
click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
vector_type = dify_config.VECTOR_STORE
if vector_type != "qdrant":
click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
return
def add_qdrant_index(field: str):
click.echo(click.style("Starting Qdrant index creation.", fg="green"))
create_count = 0
try:
@@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
@click.command("old-metadata-migration", help="Old metadata migration.")
def old_metadata_migration():
"""
Old metadata migration.
"""
click.echo(click.style("Starting old metadata migration.", fg="green"))
page = 1
while True:
try:
documents = (
DatasetDocument.query.filter(DatasetDocument.doc_metadata is not None)
.order_by(DatasetDocument.created_at.desc())
.paginate(page=page, per_page=50)
)
except NotFound:
break
if not documents:
break
for document in documents:
if document.doc_metadata:
doc_metadata = document.doc_metadata
for key, value in doc_metadata.items():
dataset_metadata = (
db.session.query(DatasetMetadata)
.filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
.first()
)
if not dataset_metadata:
dataset_metadata = DatasetMetadata(
tenant_id=document.tenant_id,
dataset_id=document.dataset_id,
name=key,
type="string",
created_by=document.created_by,
)
db.session.add(dataset_metadata)
db.session.flush()
dataset_metadata_binding = DatasetMetadataBinding(
tenant_id=document.tenant_id,
dataset_id=document.dataset_id,
metadata_id=dataset_metadata.id,
document_id=document.id,
created_by=document.created_by,
)
db.session.add(dataset_metadata_binding)
else:
dataset_metadata_binding = DatasetMetadataBinding.query.filter(
DatasetMetadataBinding.dataset_id == document.dataset_id,
DatasetMetadataBinding.document_id == document.id,
DatasetMetadataBinding.metadata_id == dataset_metadata.id,
).first()
if not dataset_metadata_binding:
dataset_metadata_binding = DatasetMetadataBinding(
tenant_id=document.tenant_id,
dataset_id=document.dataset_id,
metadata_id=dataset_metadata.id,
document_id=document.id,
created_by=document.created_by,
)
db.session.add(dataset_metadata_binding)
db.session.commit()
page += 1
click.echo(click.style("Old metadata migration completed.", fg="green"))
@click.command("create-tenant", help="Create account and tenant.")
@click.option("--email", prompt=True, help="Tenant account email.")
@click.option("--name", prompt=True, help="Workspace name.")

View File

@@ -18,7 +18,6 @@ from controllers.service_api.app.error import (
from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError,
DocumentIndexingError,
InvalidMetadataError,
)
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError
@@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource):
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
@@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource):
if not dataset.indexing_technique and not args["indexing_technique"]:
raise ValueError("indexing_technique is required.")
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
text = args.get("text")
name = args.get("name")
if text is None or name is None:
@@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
# indexing_technique is already set in dataset since this is an update
args["indexing_technique"] = dataset.indexing_technique
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
if args["text"]:
text = args.get("text")
name = args.get("name")
@@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource):
if "doc_language" not in args:
args["doc_language"] = "English"
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource):
if "doc_language" not in args:
args["doc_language"] = "English"
# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
if not args.get("doc_type") or not args.get("doc_metadata"):
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
raise InvalidMetadataError(
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
)
if not isinstance(args["doc_metadata"], dict):
raise InvalidMetadataError("doc_metadata must be a dictionary")
# Validate metadata schema based on doc_type
if args["doc_type"] != "others":
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
for key, value in args["doc_metadata"].items():
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
# set to MetaDataConfig
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)

View File

@@ -3,7 +3,7 @@ from dify_app import DifyApp
def init_app(app: DifyApp):
from commands import (
add_qdrant_doc_id_index,
add_qdrant_index,
convert_to_agent_apps,
create_tenant,
extract_plugins,
@@ -11,6 +11,7 @@ def init_app(app: DifyApp):
fix_app_site_missing,
install_plugins,
migrate_data_for_plugin,
old_metadata_migration,
reset_email,
reset_encrypt_key_pair,
reset_password,
@@ -24,7 +25,7 @@ def init_app(app: DifyApp):
reset_encrypt_key_pair,
vdb_migrate,
convert_to_agent_apps,
add_qdrant_doc_id_index,
add_qdrant_index,
create_tenant,
upgrade_db,
fix_app_site_missing,
@@ -32,6 +33,7 @@ def init_app(app: DifyApp):
extract_plugins,
extract_unique_plugins,
install_plugins,
old_metadata_migration,
]
for cmd in cmds_to_register:
app.cli.add_command(cmd)

View File

@@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding
from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs,
KnowledgeConfig,
MetaDataConfig,
RerankingModel,
RetrievalModel,
SegmentUpdateArgs,
@@ -999,9 +998,6 @@ class DocumentService:
document.data_source_info = json.dumps(data_source_info)
document.batch = batch
document.indexing_status = "waiting"
if knowledge_config.metadata:
document.doc_type = knowledge_config.metadata.doc_type
document.metadata = knowledge_config.metadata.doc_metadata
db.session.add(document)
documents.append(document)
duplicate_document_ids.append(document.id)
@@ -1018,7 +1014,6 @@ class DocumentService:
account,
file_name,
batch,
knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1076,7 +1071,6 @@ class DocumentService:
account,
truncated_page_name,
batch,
knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1117,7 +1111,6 @@ class DocumentService:
account,
document_name,
batch,
knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1155,7 +1148,6 @@ class DocumentService:
account: Account,
name: str,
batch: str,
metadata: Optional[MetaDataConfig] = None,
):
document = Document(
tenant_id=dataset.tenant_id,
@@ -1180,9 +1172,6 @@ class DocumentService:
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
BuiltInField.source: data_source_type,
}
if metadata is not None:
doc_metadata.update(metadata.doc_metadata)
document.doc_type = metadata.doc_type
if doc_metadata:
document.doc_metadata = doc_metadata
return document
@@ -1297,10 +1286,6 @@ class DocumentService:
# update document name
if document_data.name:
document.name = document_data.name
# update doc_type and doc_metadata if provided
if document_data.metadata is not None:
document.doc_metadata = document_data.metadata.doc_metadata
document.doc_type = document_data.metadata.doc_type
# update document to be waiting
document.indexing_status = "waiting"
document.completed_at = None

View File

@@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
embedding_model: Optional[str] = None
embedding_model_provider: Optional[str] = None
name: Optional[str] = None
metadata: Optional[MetaDataConfig] = None
class SegmentUpdateArgs(BaseModel):