I’m attempting to scrape news articles from a certain website using the Apify Puppeteer crawler and I need assistance in developing a page function in JavaScript. The site utilizes pagination, and I want to navigate through multiple pages to gather articles published after ‘2024-08-15’. I’m receiving an error message indicating:
TypeError: context.enqueueRequest is not a function
As a beginner in JavaScript, I would greatly appreciate any guidance to correct the page function. Below is the initial code I crafted for the page function:
async function pageHandler(ctx) {
const { req, cheerio, logger, jQuery } = ctx;
const articles = [];
const currentUrl = ctx.req['url'];
const navigateOption = 'homepage';
const paginationSetting = ctx.customData.navigateOption;
const cutOffDate = new Date('2024-08-15');
const { page } = ctx;
await page.goto(currentUrl, { waitUntil: 'domcontentloaded' });
let listItems = null;
try {
await page.waitForSelector('ul.newsList li');
listItems = await page.$$('ul.newsList li');
} catch (err) {
console.error('Error occurred: ', err);
}
const articleLinks = [];
let articleDate = null;
for (let item of listItems) {
articleDate = new Date(await item.$eval('time', el => el.getAttribute('datetime')));
const anchor = await item.$('a');
const link = anchor ? await (await anchor.getProperty('href')).jsonValue() : null;
if (articleDate >= cutOffDate) {
articleLinks.push(link);
}
}
if (articleDate >= cutOffDate) {
const nextUrl = await page.$eval('div.paginationNext a', el => el.href);
ctx.customData.navigateOption = 'False';
await ctx.enqueueRequest({ 'url': nextUrl });
}
for (let link of articleLinks) {
await page.goto(link, { waitUntil: 'domcontentloaded' });
let heading = await page.$eval('title', el => el.textContent.trim().replace(/\s+/g, ' '));
const uniqueId = link.concat(heading);
const hashVal = await page.evaluate(async (uniqueId) => {
const dataEncoder = new TextEncoder();
const data = dataEncoder.encode(uniqueId);
const hashBuffer = await window.crypto.subtle.digest('SHA-256', data);
return Array.from(new Uint8Array(hashBuffer)).map(b => b.toString(16).padStart(2, '0')).join('');
}, uniqueId);
const pubDate = new Date(await page.$eval('div.release-timestamp time', el => el.getAttribute('datetime')));
const paragraphs = await page.$$('p');
let articleContent = '';
for (let p of paragraphs) {
articleContent += await page.evaluate(el => el.textContent, p);
}
articleContent = articleContent.replace(/\s+/g, ' ');
articles.push({
'id': hashVal,
'link': link,
'heading': heading,
'body': articleContent,
'date': pubDate,
'source': 'BW'
});
}
return articles;
}
The error I encountered is:
ERROR PuppeteerCrawler: Request failed and reached maximum retries. TypeError: context.enqueueRequest is not a function
at CrawlerSetup.pageHandler [as evaledPageFunction] (evalmachine.<anonymous>:57:23)
I would be grateful for any assistance.