I’m building a web scraper with Puppeteer to extract news articles, but I’m running into a strange problem. The target site has 10 pages with 10 articles each, so there should be 100 articles in total, yet the scraper keeps missing the later pages. Sometimes I only get 39 articles, other times around 90; it never reaches the full 100.
Here’s my current approach:
// Load the listing page and wait for the pagination and content containers to render
await page.goto(targetUrl, { timeout: 90000 })
const articles = []
await page.waitForSelector('div.pagination-container', { timeout: 90000 })
let navButtons = await page.$$('div.pagination-container')
await page.waitForSelector('div.content-wrapper', { timeout: 90000 })

for (let pageIndex = 0; pageIndex < navButtons.length; pageIndex++) {
  const currentButton = navButtons[pageIndex]

  // Page 1 is already loaded; for the rest, click the nav button and wait for the content wrapper
  if (pageIndex !== 0) {
    await page.evaluate((btn) => {
      btn.click()
    }, currentButton)
    await page.waitForSelector('div.content-wrapper', { timeout: 90000 }).catch(error => {
      return // swallow the timeout and keep going
    })
  }

  // Re-query the nav buttons, then collect all article links on the current page
  navButtons = await page.$$('div.pagination-container')
  const linkElements = await page.$$('div.content-wrapper > div > div div > div.thumbnail-area > div > a')

  for (const linkEl of linkElements) {
    try {
      // Open each article in its own tab and pull out the headline and body text
      const articleUrl = await page.evaluate((el) => el.href, linkEl)
      const newTab = await browser.newPage()
      await newTab.goto(articleUrl, { waitUntil: 'load', timeout: 90000 })
      await newTab.waitForSelector('h1.headline', { timeout: 90000 }).catch(err => {
        return // swallow the timeout and keep going
      })
      const headline = await newTab.$eval('h1.headline', (el) => el.textContent.trim())
      const content = await newTab.$$eval('div.story-content p', (paragraphs) =>
        paragraphs.map((p) => p.textContent.replace(/\n/g, ' ').replace(/\s+/g, ' '))
      )
      articles.push({ headline, content: content.join(' ') })
      await newTab.close()
    } catch (err) {
      console.log('Failed to process article:', err)
    }
  }
}

return { query: searchTerm, total: articles.length, data: articles }
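
In case it matters, page and browser come from a pretty standard Puppeteer setup that runs before this snippet (simplified here; the real options aren’t anything exotic):

const puppeteer = require('puppeteer')

// One browser instance; the pagination loop above drives a single tab (page),
// and each article is opened in its own extra tab via browser.newPage().
const browser = await puppeteer.launch({ headless: true })
const page = await browser.newPage()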
What could be causing this inconsistent behavior? Any suggestions to make it scrape all pages reliably?
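
One idea I’ve been toying with, though I’m not sure it’s the right fix, is to wait for the article links themselves to change after each click instead of waiting for div.content-wrapper (which already exists before the click). Something like the helper below, where linkSelector would just be the long article-link selector from the snippet above:

// Hypothetical helper: click a pagination button, then wait until the first
// article link's href differs from what it was before the click.
async function clickAndWaitForNewLinks(page, button, linkSelector) {
  const previousHref = await page.$eval(linkSelector, (a) => a.href)
  await page.evaluate((btn) => btn.click(), button)
  await page.waitForFunction(
    (selector, oldHref) => {
      const first = document.querySelector(selector)
      return first && first.href !== oldHref
    },
    { timeout: 90000 },
    linkSelector,
    previousHref
  )
}

Would something along those lines be more reliable, or is there a better pattern for scraping paginated results like this?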