Server crashed when running multiple Puppeteer instances
I deployed my web scraping script today and it completely overloaded my server. The whole API went down when I tried to run multiple browser instances at once.
I need advice on what kind of server specifications would work best for handling around 1,000 automated requests per hour, which works out to roughly one request every 3.6 seconds. Right now I’m using a basic DigitalOcean droplet, but it clearly can’t handle the load.
What happened:
Tried running 5 concurrent browser sessions on a cheap cloud server and everything crashed. (A simplified version of how I launch them is sketched after the sample code below.)
Current setup:
- Latest Puppeteer version
- Ubuntu 16.04 server
- Node.js version 10
Sample code that’s causing issues:
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

async function scrapeWebsite(targetUrl) {
  console.log('Starting browser session');
  // Every call launches its own Chromium instance
  const chromeBrowser = await puppeteer.launch({
    headless: true,
    args: ['--window-size=1920,1080', '--no-sandbox', '--disable-dev-shm-usage']
  });
  const newPage = await chromeBrowser.newPage();
  await newPage.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 30000 });

  // Progress logging while waiting for the page to settle
  let counter = 0;
  const intervalTimer = setInterval(() => {
    console.log(`Waiting... ${counter++}`);
    if (counter > 8) clearInterval(intervalTimer);
  }, 1500);

  await newPage.waitForTimeout(8000);
  // Note: chromeBrowser is never closed here
  return await newPage.content();
}
async function parseResults(htmlContent, sourceUrl) {
  console.log('Processing scraped data');
  const $ = cheerio.load(htmlContent);
  const propertyList = [];

  // Expected result count, parsed from the page header
  const foundItems = parseInt($('.listing-count .total-results').text().trim(), 10);
  if (!foundItems || foundItems < 5) {
    return {
      success: false,
      error: 'Not enough sample data found',
      itemsFound: foundItems,
      sourceLink: sourceUrl
    };
  }

  $('.property-listings .listing-item').each(function () {
    if (propertyList.length < foundItems) {
      const priceText = $(this)
        .find('.price-container .listing-price')
        .text()
        .replace('$', '')
        .replace(/,/g, '')
        .trim();
      const price = parseInt(priceText, 10);
      const itemUrl = $(this).find('.listing-header a').attr('href');
      if (!Number.isNaN(price)) {
        propertyList.push({
          price: price,
          url: 'https://example-realestate.com' + itemUrl
        });
      }
    }
  });

  return {
    success: true,
    properties: propertyList,
    totalFound: propertyList.length,
    expectedCount: foundItems,
    sourceUrl: sourceUrl
  };
}
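
For context, this is roughly how I kick off the five concurrent sessions (simplified; the URLs here are placeholders for the real search pages):

async function main() {
  const urls = [
    'https://example-realestate.com/search?page=1',
    'https://example-realestate.com/search?page=2',
    'https://example-realestate.com/search?page=3',
    'https://example-realestate.com/search?page=4',
    'https://example-realestate.com/search?page=5'
  ];

  // All five scrapeWebsite calls run at once,
  // so five Chromium instances launch together
  const results = await Promise.all(
    urls.map(async (url) => {
      const html = await scrapeWebsite(url);
      return parseResults(html, url);
    })
  );
  console.log(results);
}

main().catch(console.error);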
Anyone know what kind of RAM and CPU I should be looking at for this type of workload?
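
For what it’s worth, one idea I’m considering is reusing a single browser and capping how many pages are open at once, instead of launching a fresh Chromium per URL. A rough, untested sketch (the function name and the batch size are just placeholders):

// Untested sketch: one shared browser, at most maxConcurrent open pages
async function scrapeWithLimit(urls, maxConcurrent) {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-dev-shm-usage']
  });
  const results = [];
  try {
    // Work through the URLs in small batches so at most
    // maxConcurrent pages exist at any one time
    for (let i = 0; i < urls.length; i += maxConcurrent) {
      const batch = urls.slice(i, i + maxConcurrent);
      const batchResults = await Promise.all(
        batch.map(async (url) => {
          const page = await browser.newPage();
          try {
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
            return await page.content();
          } finally {
            await page.close(); // always release the tab
          }
        })
      );
      results.push(...batchResults);
    }
  } finally {
    await browser.close(); // shut Chromium down when done
  }
  return results;
}

Would something like that cut the memory footprint enough, or do I still need a bigger droplet either way?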