Add embedding cache and a job to clean the embedding cache (#3087)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
Jyong
2024-04-02 20:46:24 +08:00
committed by GitHub
parent 7f55ea0c53
commit 5e66a60f1c
4 changed files with 91 additions and 30 deletions

View File

@@ -123,6 +123,7 @@ class Dataset(db.Model):
normalized_dataset_id = dataset_id.replace("-", "_")
return f'Vector_index_{normalized_dataset_id}_Node'
class DatasetProcessRule(db.Model):
__tablename__ = 'dataset_process_rules'
__table_args__ = (
@@ -443,7 +444,8 @@ class DatasetKeywordTable(db.Model):
id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
dataset_id = db.Column(UUID, nullable=False, unique=True)
keyword_table = db.Column(db.Text, nullable=False)
data_source_type = db.Column(db.String(255), nullable=False, server_default=db.text("'database'::character varying"))
data_source_type = db.Column(db.String(255), nullable=False,
server_default=db.text("'database'::character varying"))
@property
def keyword_table_dict(self):
@@ -457,6 +459,7 @@ class DatasetKeywordTable(db.Model):
if isinstance(node_idxs, list):
dct[keyword] = set(node_idxs)
return dct
# get dataset
dataset = Dataset.query.filter_by(
id=self.dataset_id
@@ -481,7 +484,7 @@ class Embedding(db.Model):
__tablename__ = 'embeddings'
__table_args__ = (
db.PrimaryKeyConstraint('id', name='embedding_pkey'),
db.UniqueConstraint('model_name', 'hash', name='embedding_hash_idx')
db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx')
)
id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
@@ -490,6 +493,8 @@ class Embedding(db.Model):
hash = db.Column(db.String(64), nullable=False)
embedding = db.Column(db.LargeBinary, nullable=False)
created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
provider_name = db.Column(db.String(40), nullable=False,
server_default=db.text("''::character varying"))
def set_embedding(self, embedding_data: list[float]):
self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)