I’m having trouble extracting job listing links from a website using a web crawler. I’ve tried various settings but can’t get the desired results. Here’s my code:
import asyncio
from web_crawler import AsyncCrawler, BrowserSettings, CrawlConfig, CachePolicy
async def crawl_job_site():
    """Crawl a job-listings page and print the internal/external links found.

    Opens a visible (non-headless) browser so dynamic content can render,
    waits for the page's own ``window.pageLoaded`` flag, and restricts link
    extraction to the ``<main>`` element.
    """
    browser_options = BrowserSettings(
        visible_mode=True,  # show the browser window while crawling
        text_only=False,    # keep full rendering (JS, CSS, images) enabled
    )

    # JS intended to wait 5 s, then scroll to the bottom so lazily-loaded
    # job cards render before links are extracted.
    # TODO(review): this snippet is defined but NEVER passed to the crawl —
    # that is the most likely reason only the always-present privacy-policy
    # link is found. Wire it into CrawlConfig via the library's parameter
    # for injected JS (verify its name in the web_crawler docs), or delete it.
    scroll_and_wait = '''
await new Promise(r => setTimeout(r, 5000));
window.scrollTo(0, document.body.scrollHeight);
'''

    crawl_options = CrawlConfig(
        full_page_scan=True,              # scan the whole page, not just the viewport
        load_delay=2.5,                   # extra settle time after load (seconds)
        # NOTE(review): confirm the target site actually sets this flag;
        # if it never becomes true the crawl may time out or fire too early.
        wait_condition='js:() => window.pageLoaded === true',
        target_element='main',            # job links outside <main> are dropped — confirm the listings live there
        cache_policy=CachePolicy.IGNORE,  # always fetch fresh content
        remove_popups=True,
        ignore_external=True,             # NOTE(review): with this set, the external-link count below is expected to be 0
        ignore_social=True,
    )

    async with AsyncCrawler(settings=browser_options) as spider:
        outcome = await spider.crawl(
            'https://example-jobs.com/listings?page=1&radius=30&unit=km&country=de#',
            options=crawl_options,
        )
        if outcome.ok:
            print(f'[SUCCESS] Crawled: {outcome.url}')
            print(f'Internal links found: {len(outcome.links["internal"])}')
            print(f'External links found: {len(outcome.links["external"])}')
            for link in outcome.links['internal']:
                print(f'Internal Link: {link["url"]} - {link["anchor"]}')
        else:
            print(f'[FAILED] {outcome.error}')


asyncio.run(crawl_job_site())
I’ve tried different browser and crawler settings, but I only get one link (privacy policy) instead of job listings. Any ideas on what I’m doing wrong or how to fix this?