Initial commit
api/core/index/index_builder.py (normal file, 45 lines added)
@@ -0,0 +1,45 @@
from langchain.callbacks import CallbackManager
from llama_index import ServiceContext, PromptHelper, LLMPredictor
from core.callback_handler.std_out_callback_handler import DifyStdOutCallbackHandler
from core.embedding.openai_embedding import OpenAIEmbedding
from core.llm.llm_builder import LLMBuilder


class IndexBuilder:
    @classmethod
    def get_default_service_context(cls, tenant_id: str) -> ServiceContext:
        # set the number of output tokens
        num_output = 512

        # only used for verbose output
        callback_manager = CallbackManager([DifyStdOutCallbackHandler()])

        llm = LLMBuilder.to_llm(
            tenant_id=tenant_id,
            model_name='text-davinci-003',
            temperature=0,
            max_tokens=num_output,
            callback_manager=callback_manager,
        )

        llm_predictor = LLMPredictor(llm=llm)

        # These parameters affect how the final synthesized response is segmented.
        # The number of refinement iterations in the synthesis process depends
        # on whether the length of the segmented output exceeds max_input_size.
        prompt_helper = PromptHelper(
            max_input_size=3500,
            num_output=num_output,
            max_chunk_overlap=20
        )

        model_credentials = LLMBuilder.get_model_credentials(
            tenant_id=tenant_id,
            model_name='text-embedding-ada-002'
        )

        return ServiceContext.from_defaults(
            llm_predictor=llm_predictor,
            prompt_helper=prompt_helper,
            embed_model=OpenAIEmbedding(**model_credentials),
        )
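For context, the sketch below shows how a caller might obtain the tenant-scoped ServiceContext this method returns. It is a minimal, hypothetical usage example: the tenant id is a placeholder, and the import path simply mirrors the file location in this diff.

from core.index.index_builder import IndexBuilder

# Build the shared ServiceContext for one tenant. The result bundles the
# text-davinci-003 completion model, the PromptHelper sizing rules, and the
# tenant's text-embedding-ada-002 credentials configured above.
service_context = IndexBuilder.get_default_service_context(tenant_id='example-tenant-id')

# The returned ServiceContext would then typically be passed as the
# service_context argument when constructing or loading a llama_index index,
# so every index for the tenant shares the same model configuration.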