I’m trying to build an automated data extraction tool for Notion database entries using Selenium. My goal is to collect information from pages linked within database rows.
Here’s my current workflow:
- Navigate to the database view
- Iterate through each database row
- Hover over entries to reveal action buttons
- Click the preview button to open page content
- Extract text data from the opened page
- Continue to next entry
The main issue I’m facing is that Selenium only detects around 26-28 visible rows out of 47 total entries in the database. Even after scrolling to load more content, my script can’t locate the remaining rows.
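To illustrate the symptom, a plain find_elements call against the row XPath only returns the rows Notion has actually rendered (DATABASE_ROW_XPATH here is just a placeholder for the row selector I pass to count_database_entries further down):

```python
rows = browser.find_elements(By.XPATH, DATABASE_ROW_XPATH)
print(len(rows))  # reports roughly 26-28, even though the database holds 47 entries
```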
Here’s my function for processing individual entries (along with the imports it relies on):

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def process_database_entry(browser: webdriver.Chrome, entry_index: int) -> str:
    """
    Processes a single database entry and extracts its content.
    """
    print(f"Working on entry {entry_index}...")
    entry_selector = f"//*[@id='notion-app']/div/div[1]/div/div[1]/main/div/div/div[3]/div[2]/div/div/div/div[3]/div[2]/div[{entry_index}]/div/div[1]/div/div[2]/div/div"
    print(f"Finding entry {entry_index}...")
    try:
        entry_item = WebDriverWait(browser, 15).until(
            EC.presence_of_element_located((By.XPATH, entry_selector))
        )
        print(f"Entry {entry_index} found successfully.")
    except Exception as error:
        print(f"Failed to find entry {entry_index}: {error}")
        return ""

    # Scroll the container for entries beyond the 16th position.
    if entry_index > 16:
        for attempt in range(10):  # retry scrolling up to 10 times, 40px each
            try:
                scroll_database_view(browser, entry_item, 40)
                print(f"Scrolled entry {entry_index} into view.")
                break  # exit the loop once scrolling succeeds
            except Exception as error:
                print(f"Failed to scroll entry {entry_index} into view: {error}")

    # Hover over the entry so its action buttons appear.
    move_to_element(browser, entry_item)

    # Find and click the preview button.
    print(f"Looking for preview button on entry {entry_index}...")
    try:
        preview_btn = WebDriverWait(browser, 15).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@aria-label='Open in side peek']")
            )
        )
        print(f"Clicking preview button for entry {entry_index}...")
        preview_btn.click()
    except Exception as error:
        print(f"Preview button not found for entry {entry_index}, error: {error}")
        return ""

    time.sleep(4)

    # Get text from the preview pane.
    print(f"Getting content from preview pane for entry {entry_index}...")
    try:
        preview_content = WebDriverWait(browser, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "notion-page-content"))
        )
        extracted_text = preview_content.text
        print(f"Content extracted for entry {entry_index}.")
        return extracted_text
    except Exception as error:
        print(f"Failed to extract content from entry {entry_index}: {error}")
        return ""
And here’s my function to count total entries:
def count_database_entries(browser: webdriver.Chrome, database_selector: str) -> int:
    """
    Counts the total number of entries in the Notion database.
    """
    print("Counting total database entries...")
    entry_elements = browser.find_elements(By.XPATH, database_selector)
    entry_count = len(entry_elements)
    print(f"Found {entry_count} entries in database")
    return entry_count
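I call these two functions from a simple driver loop, roughly like this (simplified; DATABASE_ROW_XPATH again stands in for the row selector, which I've elided here):

```python
DATABASE_ROW_XPATH = "..."  # placeholder for the row selector

total = count_database_entries(browser, DATABASE_ROW_XPATH)
results = []
for index in range(1, total + 1):  # XPath positions start at 1
    results.append(process_database_entry(browser, index))
```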
The problem seems to be that my script can’t locate entries that aren’t initially visible. I need this to work for much larger databases with hundreds of entries. Any suggestions on how to handle lazy-loaded content in Notion databases?
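For reference, the kind of incremental scroll-and-recount loop I have in mind looks roughly like the sketch below, though I'm not sure it's the right approach if Notion unloads rows as they scroll out of view (the ".notion-scroller.vertical" selector is a guess on my part):

```python
def load_all_entries(browser: webdriver.Chrome, row_xpath: str, max_idle_rounds: int = 5) -> int:
    """Scroll the database view step by step and recount rows until the count stops growing."""
    scroller = browser.find_element(By.CSS_SELECTOR, ".notion-scroller.vertical")
    seen = 0
    idle_rounds = 0
    while idle_rounds < max_idle_rounds:
        # Scroll one viewport height further down the database view.
        browser.execute_script("arguments[0].scrollTop += arguments[0].clientHeight;", scroller)
        time.sleep(1)  # give Notion a moment to render newly loaded rows
        current = len(browser.find_elements(By.XPATH, row_xpath))
        if current > seen:
            seen, idle_rounds = current, 0
        else:
            idle_rounds += 1
    return seen
```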