I’m using Puppeteer in my Node.js app to scrape a lyrics website, but I’m only getting the HTML header instead of the full page content. Here’s a sample URL I’m working with: https://shironet.mako.co.il/search?q=fire. The site appears to be built with an SPA framework: the response contains only a header filled with compressed JS functions and an empty HTML body. However, I can see the complete HTML in Chrome DevTools. This is the scraping code I’m using:
'use strict'
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
// Search URL prefix. It already ends with `q=`, so the search term belongs
// immediately after it.
const baseUrl = 'https://shironet.mako.co.il/search?q=';
/**
 * Opens the search page for `songName` in headless Chromium and logs the text
 * of every `a.search_link_name` element found in the rendered HTML.
 *
 * The returned promise settles only after scraping has finished and the
 * browser has been closed, so callers can `await` it and catch failures.
 *
 * @param {string} songName - Search term; it is URL-encoded before use.
 * @returns {Promise<string|undefined>} Resolves to 'No song name specified'
 *   when `songName` is falsy, otherwise to `undefined` once logging is done.
 * @throws Rejects with any Puppeteer launch/navigation error other than the
 *   selector wait timing out.
 */
async function fetchLyrics(songName) {
  if (!songName) {
    return 'No song name specified';
  }
  console.log(`Fetching lyrics for: ${songName}`);

  // baseUrl already ends with "?q=", so the term is appended directly.
  // Inserting a "/" here would make the query value "/<term>".
  const searchUrl = `${baseUrl}${encodeURIComponent(songName)}`;

  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(searchUrl, { waitUntil: 'networkidle2' });

    // Give client-side rendering up to 10 s to produce result links. This
    // replaces the fixed page.waitForTimeout(10000) sleep, which newer
    // Puppeteer releases no longer provide.
    try {
      await page.waitForSelector('a.search_link_name', { timeout: 10000 });
    } catch (err) {
      if (err.name !== 'TimeoutError') {
        throw err;
      }
      // Best effort: no results appeared in time; continue with whatever
      // HTML the page currently has, as the fixed sleep used to do.
      console.warn('Timed out waiting for a.search_link_name');
    }

    const html = await page.content();
    const $ = cheerio.load(html);
    $('a.search_link_name').each((i, el) => {
      console.log($(el).text());
    });
  } finally {
    // Always release the Chromium process, even when a step above throws.
    await browser.close();
  }
}
// Public API of this module: only fetchLyrics is exported.
module.exports = { fetchLyrics };
When I run with `headless: false` and inspect the page in DevTools, the body is empty and the header is filled with functions, which stops the page from loading. This is some of the HTML response I’m getting in both headless and non-headless modes:
<html><head><meta charset="utf-8"><script>function i700(){}i700.F20=function (){return typeof i700.O20.p60==='function'?i700.O20.p60.apply(i700.O20,arguments):i700.O20.p60;};i700.X70=function (){return typeof i700.v70.p60==='function'?i700.v70.p60.apply(i700.v70,arguments):i700.v70.p60;};i700.Z20=function (){return typeof i700.O20.P20==='function'?i700.O20.P20.apply(i700.O20,arguments):i700.O20.P20;};i700.Q60=function (){return typeof i700.Y60.P20==='function'?i700.Y60.P20.apply(i700.Y60,arguments):i700.Y60.P20;};...;winsocks();</script></head><body></body></html>
What might I be doing wrong? Cheerio fails without body content. Even the `waitFor` and `waitUntil` tricks don’t work for me. Also, tools like Axios and Insomnia return an empty body, but Postman retrieves the correct HTML. Any idea why this happens? Thanks for any help!