def _get_notion_database_data(self, database_id: str, query_dict: dict[str, Any] = {}) -> list[Document]:
    """Get all the pages from a Notion database.

    The database is queried page by page: each response's ``next_cursor`` is
    sent back as ``start_cursor`` for as long as the response reports
    ``has_more``. Every returned row is flattened into ``"<property>:<value>"``
    lines, and all rows are merged into one document.

    Args:
        database_id: Database ID substituted into ``DATABASE_URL_TMPL``.
        query_dict: JSON body sent with every query. It is copied for each
            request and never mutated, so the shared ``{}`` default is safe.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is the
        rows joined by newlines, or an empty list when no row was read (which
        includes a first response that carries no ``results``).

    Raises:
        AssertionError: If no Notion access token is set on the extractor.
    """
    assert self._notion_access_token is not None, "Notion access token is required"

    # Loop-invariant request parts are built once instead of once per page.
    url = DATABASE_URL_TMPL.format(database_id=database_id)
    headers = {
        "Authorization": "Bearer " + self._notion_access_token,
        "Content-Type": "application/json",
        "Notion-Version": "2022-06-28",
    }

    database_content: list[str] = []
    next_cursor = None

    while True:
        current_query = query_dict.copy()
        if next_cursor:
            current_query["start_cursor"] = next_cursor

        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # this call indefinitely - consider adding one.
        res = requests.post(url, headers=headers, json=current_query)
        response_data = res.json()

        results = response_data.get("results")
        if results is None:
            # No usable payload (e.g. an error body): keep what was collected.
            break

        for result in results:
            row: dict[str, Any] = {}
            for property_name, property_value in result["properties"].items():
                property_type = property_value["type"]
                payload = property_value[property_type]
                parsed: Any
                if property_type == "multi_select":
                    parsed = [option["name"] for option in payload]
                elif property_type in {"rich_text", "title"}:
                    # Only the first text fragment is used.
                    parsed = payload[0]["plain_text"] if payload else ""
                elif property_type in {"select", "status"}:
                    parsed = payload["name"] if payload else ""
                else:
                    parsed = payload
                # Empty/falsy properties are left out of the row text.
                if parsed:
                    row[property_name] = parsed

            row_lines = []
            for key, item in row.items():
                if isinstance(item, dict):
                    # Nested objects are rendered as space-separated "k:v " pairs.
                    item = "".join(f"{k}:{v} " for k, v in item.items() if v)
                row_lines.append(f"{key}:{item}\n")
            database_content.append("".join(row_lines))

        next_cursor = response_data.get("next_cursor")
        # Stop on the last page. Also stop if has_more is set without a cursor:
        # re-sending the query without start_cursor would return the first page
        # again and the loop would never terminate.
        if not response_data.get("has_more", False) or not next_cursor:
            break

    if not database_content:
        return []

    return [Document(page_content="\n".join(database_content))]