Learn to extract data from websites using selectors and the evaluate family of functions, and to handle dynamically loaded content
Selectors are the foundation of web scraping. Learn how to target specific elements on a webpage.
// By ID
await page.$('#unique-id');
// By Class
await page.$('.class-name');
// By Tag
await page.$('div');
// By Attribute
await page.$('[data-test="button"]');
// Descendant
await page.$('div .child-class');
// Child
await page.$('ul > li');
// Pseudo-selectors
await page.$('li:first-child');
await page.$('p:nth-child(2)');
// Basic XPath
await page.$x('//div[@class="content"]');
// Text contains
await page.$x('//button[contains(text(), "Submit")]');
// Following sibling
await page.$x('//h1/following-sibling::p');
// Parent element
await page.$x('//span[@class="price"]/parent::div');
// Multiple conditions
await page.$x('//a[@class="link" and @href="/home"]');
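Note that newer Puppeteer releases (v22 and later) removed page.$x(); there, XPath expressions are passed through the xpath/ selector prefix instead. A minimal equivalent of the "text contains" lookup above:
// XPath on newer Puppeteer versions: prefix the expression with xpath/
await page.$('xpath///button[contains(text(), "Submit")]');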
Use browser DevTools to test selectors. Right-click → Inspect → Console, then try document.querySelector('your-selector')
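For example, to confirm a selector matches what you expect before putting it in a script, run checks like these in the DevTools console ($x is a console-only helper, not part of Puppeteer):
// Run in the DevTools console, not in Node
document.querySelector('.quote .text');        // first matching element, or null
document.querySelectorAll('.quote').length;    // how many elements match
$x('//small[@class="author"]');                // XPath via the console's $x helper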
Learn different methods to extract data from web pages.
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://quotes.toscrape.com/');

  // Get text content
  const firstQuote = await page.$eval('.quote .text',
    element => element.textContent
  );

  // Get attribute value
  const authorLink = await page.$eval('.quote .author + a',
    element => element.href
  );

  // Get multiple properties
  const quoteData = await page.$eval('.quote', element => ({
    text: element.querySelector('.text').textContent,
    author: element.querySelector('.author').textContent,
    tags: Array.from(element.querySelectorAll('.tag'))
      .map(tag => tag.textContent)
  }));

  console.log('First Quote:', firstQuote);
  console.log('Author Link:', authorLink);
  console.log('Quote Data:', quoteData);

  await browser.close();
})();
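Keep in mind that page.$eval() rejects if nothing matches the selector. When an element is optional, one simple pattern is to fall back to null instead of letting the error propagate (a sketch; adapt the selector to your page):
// Returns null instead of throwing when the selector matches nothing
const maybeText = await page
  .$eval('.quote .text', element => element.textContent)
  .catch(() => null);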
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://quotes.toscrape.com/');

  // Extract all quotes
  const quotes = await page.$$eval('.quote', elements => {
    return elements.map(element => ({
      text: element.querySelector('.text').textContent,
      author: element.querySelector('.author').textContent,
      tags: Array.from(element.querySelectorAll('.tag'))
        .map(tag => tag.textContent)
    }));
  });

  // Extract navigation links
  const navLinks = await page.$$eval('nav a', links => {
    return links.map(link => ({
      text: link.textContent.trim(),
      href: link.href
    }));
  });

  console.log(`Found ${quotes.length} quotes`);
  console.log('Sample quote:', quotes[0]);
  console.log('Navigation links:', navLinks);

  await browser.close();
})();
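Once the data is in plain JavaScript objects, ordinary array methods handle any post-processing. For instance, counting quotes per author from the quotes array above:
// Count how many quotes each author has
const quotesPerAuthor = quotes.reduce((counts, quote) => {
  counts[quote.author] = (counts[quote.author] || 0) + 1;
  return counts;
}, {});
console.log(quotesPerAuthor);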
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://example.com/table');

  // Extract table data
  const tableData = await page.evaluate(() => {
    const table = document.querySelector('table');
    const rows = Array.from(table.querySelectorAll('tr'));

    // Get headers
    const headers = rows[0].querySelectorAll('th');
    const headerTexts = Array.from(headers).map(th => th.textContent.trim());

    // Get data rows
    const dataRows = rows.slice(1).map(row => {
      const cells = Array.from(row.querySelectorAll('td'));
      const rowData = {};
      cells.forEach((cell, index) => {
        rowData[headerTexts[index]] = cell.textContent.trim();
      });
      return rowData;
    });

    return {
      headers: headerTexts,
      data: dataRows
    };
  });

  console.log('Table Headers:', tableData.headers);
  console.log('Table Data:', tableData.data);

  await browser.close();
})();
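This version assumes the page always contains a table with a <th> header row. If that is not guaranteed, a small guard inside the evaluate callback keeps the script from throwing; a minimal sketch:
// Returns an empty result instead of throwing when no table exists
const safeTableData = await page.evaluate(() => {
  const table = document.querySelector('table');
  if (!table || table.rows.length === 0) {
    return { headers: [], data: [] };
  }
  const rows = Array.from(table.querySelectorAll('tr'));
  const headerTexts = Array.from(rows[0].querySelectorAll('th')).map(th => th.textContent.trim());
  const dataRows = rows.slice(1).map(row =>
    Object.fromEntries(Array.from(row.querySelectorAll('td'))
      .map((cell, i) => [headerTexts[i], cell.textContent.trim()]))
  );
  return { headers: headerTexts, data: dataRows };
});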
Learn to scrape content that loads dynamically with JavaScript.
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://example.com/dynamic');

  // Wait for a specific element to appear in the DOM
  await page.waitForSelector('.dynamic-content', { timeout: 5000 });

  // Wait for an element to be visible
  await page.waitForSelector('.modal', { visible: true });

  // Wait for an element to be hidden
  await page.waitForSelector('.loading', { hidden: true });

  // Wait for a function to return a truthy value
  await page.waitForFunction(() => {
    return document.querySelector('.data-loaded') !== null;
  });

  // Wait for network activity to settle
  await page.waitForNetworkIdle();

  // Now extract the dynamic content
  const dynamicData = await page.$eval('.dynamic-content',
    element => element.textContent
  );

  console.log('Dynamic content:', dynamicData);

  await browser.close();
})();
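A related pattern: when a click triggers a full page navigation rather than an in-place update, start waiting for the navigation together with the click so the event is not missed. A minimal sketch, assuming a .next-page link exists on the page:
// Wait for the navigation and perform the click together
await Promise.all([
  page.waitForNavigation({ waitUntil: 'networkidle2' }),
  page.click('a.next-page')
]);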
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://example.com/infinite-scroll');

  let items = [];
  let previousHeight = 0;

  while (true) {
    // Get current scroll height
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);

    // Break if no new content loaded
    if (currentHeight === previousHeight) {
      break;
    }

    // Scroll to bottom
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });

    // Wait for new content to load
    await new Promise(resolve => setTimeout(resolve, 2000));

    // Extract only items not yet marked as extracted
    const newItems = await page.$$eval('.item:not(.extracted)', elements => {
      return elements.map(element => {
        element.classList.add('extracted'); // Mark as extracted
        return {
          title: element.querySelector('.title').textContent,
          description: element.querySelector('.description').textContent
        };
      });
    });

    items.push(...newItems);
    previousHeight = currentHeight;
    console.log(`Loaded ${items.length} items so far...`);
  }

  console.log(`Final count: ${items.length} items`);

  await browser.close();
})();
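On pages that keep loading content indefinitely, it is worth capping the number of scroll iterations so the loop is guaranteed to terminate. A sketch of the same scrolling logic with an upper bound (MAX_SCROLLS is an assumed value, tune it to the site):
// Same scrolling logic, but bounded
const MAX_SCROLLS = 50;
for (let scrolls = 0; scrolls < MAX_SCROLLS; scrolls++) {
  const currentHeight = await page.evaluate(() => document.body.scrollHeight);
  if (currentHeight === previousHeight) break;
  await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
  await new Promise(resolve => setTimeout(resolve, 2000));
  previousHeight = currentHeight;
}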
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Listen for network requests
  page.on('response', response => {
    if (response.url().includes('/api/data')) {
      console.log('API call detected:', response.url());
    }
  });

  await page.goto('https://example.com/ajax-content');

  // Start waiting for the API response before clicking, so the response
  // cannot arrive in the gap between the click and the wait
  const responsePromise = page.waitForResponse(response =>
    response.url().includes('/api/data') && response.status() === 200
  );

  // Trigger the AJAX request
  await page.click('#load-more-btn');

  // Wait for the API response...
  await responsePromise;

  // ...and for the DOM to update
  await page.waitForSelector('.new-content');

  // Extract the loaded content
  const ajaxContent = await page.$$eval('.ajax-item', elements => {
    return elements.map(element => ({
      id: element.dataset.id,
      title: element.querySelector('.title').textContent,
      content: element.querySelector('.content').textContent
    }));
  });

  console.log('AJAX content loaded:', ajaxContent);

  await browser.close();
})();
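If the data you need is already in the API response itself, it is often simpler to capture the JSON payload directly instead of re-reading it from the DOM. A sketch, reusing the /api/data endpoint from the example above:
// Capture the API response and read its JSON body directly
const apiResponse = await page.waitForResponse(response =>
  response.url().includes('/api/data') && response.status() === 200
);
const payload = await apiResponse.json();
console.log('Raw API payload:', payload);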
Always respect website terms of service and robots.txt. Add delays between requests and don't overload servers.
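A simple way to space out requests is a small helper that pauses between page loads (a sketch; the 1500 ms value is an assumption, tune it to the target site):
// Pause between requests to avoid hammering the server
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

await page.goto('https://example.com/page/1');
// ...scrape the page...
await delay(1500);
await page.goto('https://example.com/page/2');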
Master advanced techniques for complex scraping scenarios.
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  let allData = [];
  let currentPage = 1;

  while (true) {
    console.log(`Scraping page ${currentPage}...`);
    await page.goto(`https://example.com/products?page=${currentPage}`);

    // Check if the page has content
    const hasContent = (await page.$('.product-item')) !== null;
    if (!hasContent) {
      console.log('No more pages to scrape');
      break;
    }

    // Extract page data
    const pageData = await page.$$eval('.product-item', elements => {
      return elements.map(element => ({
        name: element.querySelector('.product-name').textContent,
        price: element.querySelector('.price').textContent,
        image: element.querySelector('img').src
      }));
    });
    allData.push(...pageData);

    // Check for a next page button
    const nextButton = await page.$('.pagination .next');
    if (!nextButton) {
      console.log('No next page button found');
      break;
    }

    const isDisabled = await page.evaluate(btn =>
      btn.classList.contains('disabled'), nextButton
    );
    if (isDisabled) {
      console.log('Next button is disabled');
      break;
    }

    currentPage++;
  }

  console.log(`Total products scraped: ${allData.length}`);

  await browser.close();
})();
const puppeteer = require('puppeteer');

async function scrapeWithRetry(url, maxRetries = 3) {
  const browser = await puppeteer.launch();

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    let page;
    try {
      page = await browser.newPage();

      // Set a default timeout for all page operations
      page.setDefaultTimeout(30000);

      // Navigate and wait for the network to settle
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 30000
      });

      // Wait for content to load
      await page.waitForSelector('.content', { timeout: 10000 });

      // Extract data
      const data = await page.evaluate(() => {
        const elements = document.querySelectorAll('.item');
        if (elements.length === 0) {
          throw new Error('No items found on page');
        }
        return Array.from(elements).map(element => ({
          title: element.querySelector('.title')?.textContent || '',
          description: element.querySelector('.description')?.textContent || ''
        }));
      });

      await page.close();
      await browser.close();
      return data;
    } catch (error) {
      console.log(`Attempt ${attempt} failed:`, error.message);

      // Close the failed page so retries don't leak tabs
      if (page) {
        await page.close().catch(() => {});
      }

      if (attempt === maxRetries) {
        await browser.close();
        throw new Error(`Scraping failed after ${maxRetries} attempts`);
      }

      // Back off before retrying
      await new Promise(resolve => setTimeout(resolve, 2000 * attempt));
    }
  }
}

// Usage
(async () => {
  try {
    const data = await scrapeWithRetry('https://example.com/data');
    console.log('Scraped data:', data);
  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
})();
const puppeteer = require('puppeteer');
const fs = require('fs');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://example.com/data');

  // Extract data
  const scrapedData = await page.$$eval('.data-item', elements => {
    return elements.map(element => ({
      id: element.dataset.id,
      title: element.querySelector('.title').textContent,
      description: element.querySelector('.description').textContent,
      price: element.querySelector('.price').textContent,
      scrapedAt: new Date().toISOString()
    }));
  });

  // Save as JSON
  const jsonData = JSON.stringify(scrapedData, null, 2);
  fs.writeFileSync('scraped-data.json', jsonData);

  // Save as CSV (escape embedded quotes so the file stays valid)
  const csvHeaders = Object.keys(scrapedData[0]).join(',');
  const csvRows = scrapedData.map(item =>
    Object.values(item).map(value =>
      `"${String(value).replace(/"/g, '""')}"`
    ).join(',')
  );
  const csvContent = [csvHeaders, ...csvRows].join('\n');
  fs.writeFileSync('scraped-data.csv', csvContent);

  // Save as HTML report
  const htmlReport = `
    <!DOCTYPE html>
    <html>
      <head><title>Scraping Report</title></head>
      <body>
        <h1>Scraping Report</h1>
        <p>Total items: ${scrapedData.length}</p>
        <p>Generated: ${new Date().toLocaleString()}</p>
        <table border="1">
          <tr>
            <th>ID</th>
            <th>Title</th>
            <th>Description</th>
            <th>Price</th>
          </tr>
          ${scrapedData.map(item => `
            <tr>
              <td>${item.id}</td>
              <td>${item.title}</td>
              <td>${item.description}</td>
              <td>${item.price}</td>
            </tr>
          `).join('')}
        </table>
      </body>
    </html>
  `;
  fs.writeFileSync('report.html', htmlReport);

  console.log('Data exported to:');
  console.log('- scraped-data.json');
  console.log('- scraped-data.csv');
  console.log('- report.html');

  await browser.close();
})();
Add delays between requests and check robots.txt
// Add delays between requests
await new Promise(resolve => setTimeout(resolve, 1000));

// Check robots.txt before scraping (baseUrl is the site's root URL)
const robotsUrl = new URL('/robots.txt', baseUrl);
const robotsResponse = await fetch(robotsUrl);
const robotsTxt = await robotsResponse.text();
Use different user agents to avoid detection
const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
];
await page.setUserAgent(userAgents[Math.floor(Math.random() * userAgents.length)]);
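Set the user agent before calling page.goto() so the very first request already carries it.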
Always validate scraped data before processing
function validateData(item) {
  return item.title &&
    item.title.length > 0 &&
    item.price &&
    !isNaN(parseFloat(item.price.replace('$', '')));
}

const validData = scrapedData.filter(validateData);