My web scraping script crashed my API server when I deployed it to production. I think I need a dedicated server to handle the workload. What kind of hardware configuration would work for processing around 1000 requests per hour?
What I tried
I set up a basic DigitalOcean droplet and load-tested it by running 5 concurrent instances of the script.
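For reference, the load test was roughly this (simplified; testUrls is just a stand-in for the result-page URLs I hit, not real data):

(async () => {
  const testUrls = [/* stand-in list of result-page URLs */];
  // fire 5 scrapes at once, each launching its own Chrome
  await Promise.all(
    testUrls.slice(0, 5).map(async (url) => {
      const html = await fetchPageData(url);
      return parseResults(html, url);
    })
  );
})();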
Current setup:
- Puppeteer version: latest
- Platform / OS version: Ubuntu 16
- Node.js version: 10
Sample code I'm using:
const puppeteer = require('puppeteer');

async function fetchPageData(targetUrl) {
  console.log('fetchPageData');
  const browserInstance = await puppeteer.launch({
    headless: true,
    args: ['--window-size=1920,1080', '--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const newPage = await browserInstance.newPage();
    // timeout: 0 disables the navigation timeout, so a dead page can hang forever
    await newPage.goto(targetUrl, { waitUntil: 'load', timeout: 0 });
    // progress log while the fixed wait below runs
    let counter = 1;
    const intervalTimer = setInterval(() => {
      console.log(counter++);
      if (counter > 10) clearInterval(intervalTimer);
    }, 1000);
    await newPage.waitFor(10000); // fixed 10 s wait for late-loading content
    return await newPage.content();
  } finally {
    // without this, every call leaked a whole Chrome process
    await browserInstance.close();
  }
}
const cheerio = require('cheerio');

async function parseResults(htmlContent, sourceUrl) {
  console.log('parseResults');
  const $ = cheerio.load(htmlContent);
  const properties = [];
  let totalPages; // not assigned anywhere in this excerpt
  const foundCount = parseInt($('.js-title .js-total-records').text().trim(), 10);
  if (!foundCount || foundCount < 7) {
    return {
      success: false,
      error: 'Fewer than 7 samples found',
      foundItems: foundCount,
      url: sourceUrl,
    };
  }
  $('.results-list > div').each(function () {
    if (properties.length >= foundCount) return;
    const propertyLink = $(this).find('.property-card__header a').attr('href');
    // strip the "R$" prefix and thousands separators: "R$ 1.234.567" -> "1234567"
    const rawPrice = $(this)
      .find('.property-card__values .property-card__price')
      .text()
      .replace('R$', '')
      .trim()
      .replace(/\./g, '');
    const price = parseInt(rawPrice, 10);
    if (price) {
      properties.push({
        price,
        url: 'https://www.vivareal.com.br' + propertyLink,
      });
    }
  });
  return {
    success: true,
    data: properties,
    samplesFound: properties.length,
    pages: totalPages,
    totalFound: foundCount,
    sourceUrl,
  };
}
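For what it's worth, 1000 requests per hour is only about one every 3.6 seconds, and each fetchPageData call blocks for at least 10 seconds, so on average only ~3 pages are in flight at once. Even after closing the browser on every call, though, launching a full Chrome per request is heavy. Here is a minimal sketch of what I'm considering instead: one shared browser with a cap on concurrent pages. MAX_CONCURRENT_PAGES, getBrowser, and the slot helpers are my own placeholders, not from any library.

const puppeteer = require('puppeteer');

const MAX_CONCURRENT_PAGES = 5; // placeholder cap; tune to the droplet's RAM

let browserPromise = null;
let activePages = 0;
const waiting = [];

// lazily launch a single shared browser instead of one per request
function getBrowser() {
  if (!browserPromise) {
    browserPromise = puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
  }
  return browserPromise;
}

// tiny hand-rolled semaphore: no more than MAX_CONCURRENT_PAGES open tabs
function acquireSlot() {
  if (activePages < MAX_CONCURRENT_PAGES) {
    activePages++;
    return Promise.resolve();
  }
  return new Promise((resolve) => waiting.push(resolve));
}

function releaseSlot() {
  const next = waiting.shift();
  if (next) next(); // hand the slot to the next waiter
  else activePages--;
}

async function scrape(targetUrl) {
  await acquireSlot();
  const browser = await getBrowser();
  const page = await browser.newPage();
  try {
    await page.goto(targetUrl, { waitUntil: 'load', timeout: 60000 });
    const html = await page.content();
    return parseResults(html, targetUrl);
  } finally {
    await page.close(); // close the tab, keep the browser alive
    releaseSlot();
  }
}

Would this kind of pooling be enough on a small droplet, or do I really need dedicated hardware for this workload?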