feat: add pagination to notion extractor (#20919)
This commit is contained in:
@@ -79,6 +79,16 @@ class NotionExtractor(BaseExtractor):
|
|||||||
def _get_notion_database_data(self, database_id: str, query_dict: dict[str, Any] = {}) -> list[Document]:
|
def _get_notion_database_data(self, database_id: str, query_dict: dict[str, Any] = {}) -> list[Document]:
|
||||||
"""Get all the pages from a Notion database."""
|
"""Get all the pages from a Notion database."""
|
||||||
assert self._notion_access_token is not None, "Notion access token is required"
|
assert self._notion_access_token is not None, "Notion access token is required"
|
||||||
|
|
||||||
|
database_content = []
|
||||||
|
next_cursor = None
|
||||||
|
has_more = True
|
||||||
|
|
||||||
|
while has_more:
|
||||||
|
current_query = query_dict.copy()
|
||||||
|
if next_cursor:
|
||||||
|
current_query["start_cursor"] = next_cursor
|
||||||
|
|
||||||
res = requests.post(
|
res = requests.post(
|
||||||
DATABASE_URL_TMPL.format(database_id=database_id),
|
DATABASE_URL_TMPL.format(database_id=database_id),
|
||||||
headers={
|
headers={
|
||||||
@@ -86,15 +96,15 @@ class NotionExtractor(BaseExtractor):
|
|||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Notion-Version": "2022-06-28",
|
"Notion-Version": "2022-06-28",
|
||||||
},
|
},
|
||||||
json=query_dict,
|
json=current_query,
|
||||||
)
|
)
|
||||||
|
|
||||||
data = res.json()
|
response_data = res.json()
|
||||||
|
|
||||||
database_content = []
|
if "results" not in response_data or response_data["results"] is None:
|
||||||
if "results" not in data or data["results"] is None:
|
break
|
||||||
return []
|
|
||||||
for result in data["results"]:
|
for result in response_data["results"]:
|
||||||
properties = result["properties"]
|
properties = result["properties"]
|
||||||
data = {}
|
data = {}
|
||||||
value: Any
|
value: Any
|
||||||
@@ -129,6 +139,12 @@ class NotionExtractor(BaseExtractor):
|
|||||||
row_content = row_content + f"{key}:{value}\n"
|
row_content = row_content + f"{key}:{value}\n"
|
||||||
database_content.append(row_content)
|
database_content.append(row_content)
|
||||||
|
|
||||||
|
has_more = response_data.get("has_more", False)
|
||||||
|
next_cursor = response_data.get("next_cursor")
|
||||||
|
|
||||||
|
if not database_content:
|
||||||
|
return []
|
||||||
|
|
||||||
return [Document(page_content="\n".join(database_content))]
|
return [Document(page_content="\n".join(database_content))]
|
||||||
|
|
||||||
def _get_notion_block_data(self, page_id: str) -> list[str]:
|
def _get_notion_block_data(self, page_id: str) -> list[str]:
|
||||||
|
Reference in New Issue
Block a user