Web Scraping with Puppeteer

Learn to extract data from websites using selectors and evaluate functions, and to handle dynamically loaded content

Step 1: Understanding Selectors

Selectors are the foundation of web scraping. Learn how to target specific elements on a webpage.

CSS Selectors

CSS Selector Examples
// By ID
await page.$('#unique-id');

// By Class
await page.$('.class-name');

// By Tag
await page.$('div');

// By Attribute
await page.$('[data-test="button"]');

// Descendant
await page.$('div .child-class');

// Child
await page.$('ul > li');

// Pseudo-selectors
await page.$('li:first-child');
await page.$('p:nth-child(2)');

XPath Selectors

XPath Examples
// Basic XPath
await page.$x('//div[@class="content"]');

// Text contains
await page.$x('//button[contains(text(), "Submit")]');

// Following sibling
await page.$x('//h1/following-sibling::p');

// Parent element
await page.$x('//span[@class="price"]/parent::div');

// Multiple conditions
await page.$x('//a[@class="link" and @href="/home"]');
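Note: page.$x() was deprecated and has been removed in recent Puppeteer releases. If you are on a newer version, the same queries can be run through the regular selector methods using the xpath/ prefix; a minimal sketch:

// Newer Puppeteer: XPath via the xpath/ prefix instead of page.$x()
const [contentDiv] = await page.$$('xpath///div[@class="content"]');
await page.waitForSelector('xpath///button[contains(text(), "Submit")]');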

Pro Tip

Use browser DevTools to test selectors. Right-click → Inspect → Console, then try document.querySelector('your-selector')
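For quick checks, the DevTools console also provides shorthand helpers, so you can try both selector styles before writing any Puppeteer code:

// In the DevTools console
document.querySelector('.quote .text');              // CSS
$x('//button[contains(text(), "Submit")]');          // XPath (console-only helper)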

Step 2: Data Extraction Methods

Learn different methods to extract data from web pages.

Single Element Extraction

single-element.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  await page.goto('https://quotes.toscrape.com/');
  
  // Get text content
  const firstQuote = await page.$eval('.quote .text', 
    element => element.textContent
  );
  
  // Get attribute value
  const authorLink = await page.$eval('.quote .author + a', 
    element => element.href
  );
  
  // Get multiple properties
  const quoteData = await page.$eval('.quote', element => ({
    text: element.querySelector('.text').textContent,
    author: element.querySelector('.author').textContent,
    tags: Array.from(element.querySelectorAll('.tag'))
           .map(tag => tag.textContent)
  }));
  
  console.log('First Quote:', firstQuote);
  console.log('Author Link:', authorLink);
  console.log('Quote Data:', quoteData);
  
  await browser.close();
})();
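Keep in mind that page.$eval throws if the selector matches nothing. When an element may legitimately be absent, one option is to fetch a handle with page.$ first and only evaluate it when it exists; a sketch (the .subtitle selector is just a placeholder):

// page.$ returns null instead of throwing when nothing matches
const subtitleHandle = await page.$('.quote .subtitle'); // hypothetical selector
const subtitle = subtitleHandle
  ? await subtitleHandle.evaluate(el => el.textContent.trim())
  : null;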

Multiple Elements Extraction

multiple-elements.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  await page.goto('https://quotes.toscrape.com/');
  
  // Extract all quotes
  const quotes = await page.$$eval('.quote', elements => {
    return elements.map(element => ({
      text: element.querySelector('.text').textContent,
      author: element.querySelector('.author').textContent,
      tags: Array.from(element.querySelectorAll('.tag'))
             .map(tag => tag.textContent)
    }));
  });
  
  // Extract navigation links
  const navLinks = await page.$$eval('nav a', links => {
    return links.map(link => ({
      text: link.textContent.trim(),
      href: link.href
    }));
  });
  
  console.log(`Found ${quotes.length} quotes`);
  console.log('Sample quote:', quotes[0]);
  console.log('Navigation links:', navLinks);
  
  await browser.close();
})();
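Unlike page.$eval, page.$$eval simply returns an empty array when nothing matches, so a length check is usually all you need to detect a selector that no longer works:

if (quotes.length === 0) {
  console.warn('No quotes found - the selector or page layout may have changed');
}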

Table Data Extraction

table-scraping.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  await page.goto('https://example.com/table');
  
  // Extract table data
  const tableData = await page.evaluate(() => {
    const table = document.querySelector('table');
    const rows = Array.from(table.querySelectorAll('tr'));
    
    // Get headers
    const headers = rows[0].querySelectorAll('th');
    const headerTexts = Array.from(headers).map(th => th.textContent.trim());
    
    // Get data rows
    const dataRows = rows.slice(1).map(row => {
      const cells = Array.from(row.querySelectorAll('td'));
      const rowData = {};
      
      cells.forEach((cell, index) => {
        rowData[headerTexts[index]] = cell.textContent.trim();
      });
      
      return rowData;
    });
    
    return {
      headers: headerTexts,
      data: dataRows
    };
  });
  
  console.log('Table Headers:', tableData.headers);
  console.log('Table Data:', tableData.data);
  
  await browser.close();
})();
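The snippet above assumes the header cells sit in the first tr. Many tables wrap their headers in thead and data rows in tbody; in that case a small adjustment inside page.evaluate keeps the header/data split correct (a sketch):

// Variant for tables that use <thead>/<tbody>
const headerTexts = Array.from(document.querySelectorAll('table thead th'))
  .map(th => th.textContent.trim());
const rows = Array.from(document.querySelectorAll('table tbody tr'));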

Step 3: Handling Dynamic Content

Learn to scrape content that loads dynamically with JavaScript.

Waiting for Elements

waiting-elements.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  await page.goto('https://example.com/dynamic');
  
  // Wait for specific element to appear
  await page.waitForSelector('.dynamic-content', { timeout: 5000 });
  
  // Wait for element to be visible
  await page.waitForSelector('.modal', { visible: true });
  
  // Wait for element to be hidden
  await page.waitForSelector('.loading', { hidden: true });
  
  // Wait for function to return true
  await page.waitForFunction(() => {
    return document.querySelector('.data-loaded') !== null;
  });
  
  // Wait for network activity to settle
  // (waitForLoadState is a Playwright API; Puppeteer uses waitForNetworkIdle)
  await page.waitForNetworkIdle();
  
  // Now extract the dynamic content
  const dynamicData = await page.$eval('.dynamic-content', 
    element => element.textContent
  );
  
  console.log('Dynamic content:', dynamicData);
  
  await browser.close();
})();
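A related pattern: when an interaction triggers a full page navigation rather than an in-place DOM update, start waiting for the navigation before clicking so the event is not missed. A minimal sketch (the selector is a placeholder):

// Wait for the navigation the click is about to trigger
await Promise.all([
  page.waitForNavigation({ waitUntil: 'networkidle2' }),
  page.click('a.next-page') // placeholder selector
]);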

Infinite Scroll

infinite-scroll.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  await page.goto('https://example.com/infinite-scroll');
  
  let items = [];
  let previousHeight = 0;
  
  while (true) {
    // Get current scroll height
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    
    // Break if no new content loaded
    if (currentHeight === previousHeight) {
      break;
    }
    
    // Scroll to bottom
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });
    
    // Wait for new content to load (plain setTimeout, since
    // page.waitForTimeout was removed in recent Puppeteer versions)
    await new Promise(resolve => setTimeout(resolve, 2000));
    
    // Extract new items
    const newItems = await page.$$eval('.item:not(.extracted)', elements => {
      return elements.map(element => {
        element.classList.add('extracted'); // Mark as extracted
        return {
          title: element.querySelector('.title').textContent,
          description: element.querySelector('.description').textContent
        };
      });
    });
    
    items.push(...newItems);
    previousHeight = currentHeight;
    
    console.log(`Loaded ${items.length} items so far...`);
  }
  
  console.log(`Final count: ${items.length} items`);
  
  await browser.close();
})();
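On feeds that never stop loading, the height comparison alone will not end the loop. A simple safeguard is to cap the number of scroll passes; a sketch of a bounded helper under that assumption:

// Scroll at most maxScrolls times, stopping early if the page stops growing
async function autoScroll(page, maxScrolls = 50) {
  let previousHeight = 0;
  for (let i = 0; i < maxScrolls; i++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    if (currentHeight === previousHeight) break;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await new Promise(resolve => setTimeout(resolve, 2000));
    previousHeight = currentHeight;
  }
}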

AJAX Content

ajax-content.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  // Listen for network requests
  page.on('response', response => {
    if (response.url().includes('/api/data')) {
      console.log('API call detected:', response.url());
    }
  });
  
  await page.goto('https://example.com/ajax-content');
  
  // Trigger AJAX request
  await page.click('#load-more-btn');
  
  // Wait for API response and DOM update
  await page.waitForResponse(response => 
    response.url().includes('/api/data') && response.status() === 200
  );
  
  // Wait for DOM to update
  await page.waitForSelector('.new-content');
  
  // Extract the loaded content
  const ajaxContent = await page.$$eval('.ajax-item', elements => {
    return elements.map(element => ({
      id: element.dataset.id,
      title: element.querySelector('.title').textContent,
      content: element.querySelector('.content').textContent
    }));
  });
  
  console.log('AJAX content loaded:', ajaxContent);
  
  await browser.close();
})();
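Because page.waitForResponse resolves with the matching response object, it is often easier to read the API payload directly instead of re-parsing the DOM, assuming the endpoint returns JSON:

// Capture the response and parse its JSON body directly
const apiResponse = await page.waitForResponse(response =>
  response.url().includes('/api/data') && response.status() === 200
);
const payload = await apiResponse.json();
console.log('Raw API payload:', payload);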

Important Note

Always respect website terms of service and robots.txt. Add delays between requests and don't overload servers.

Step 4: Advanced Scraping Techniques

Master advanced techniques for complex scraping scenarios.

Pagination

pagination.js
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  let allData = [];
  let currentPage = 1;
  
  while (true) {
    console.log(`Scraping page ${currentPage}...`);
    
    await page.goto(`https://example.com/products?page=${currentPage}`);
    
    // Check if page has content
    const hasContent = await page.$('.product-item') !== null;
    if (!hasContent) {
      console.log('No more pages to scrape');
      break;
    }
    
    // Extract page data
    const pageData = await page.$$eval('.product-item', elements => {
      return elements.map(element => ({
        name: element.querySelector('.product-name').textContent,
        price: element.querySelector('.price').textContent,
        image: element.querySelector('img').src
      }));
    });
    
    allData.push(...pageData);
    
    // Check for next page button
    const nextButton = await page.$('.pagination .next');
    if (!nextButton) {
      console.log('No next page button found');
      break;
    }
    
    const isDisabled = await page.evaluate(btn => 
      btn.classList.contains('disabled'), nextButton
    );
    
    if (isDisabled) {
      console.log('Next button is disabled');
      break;
    }
    
    currentPage++;
  }
  
  console.log(`Total products scraped: ${allData.length}`);
  
  await browser.close();
})();
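If the site does not expose the page number in the URL, the same loop can drive the Next button instead of calling page.goto; a sketch of that navigation step (selectors are placeholders):

// Click "Next" and wait for the resulting navigation before scraping again
const nextLink = await page.$('.pagination .next a'); // placeholder selector
if (nextLink) {
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle2' }),
    nextLink.click()
  ]);
}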

Error Handling & Retries

error-handling.js
const puppeteer = require('puppeteer');

async function scrapeWithRetry(url, maxRetries = 3) {
  const browser = await puppeteer.launch();
  
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const page = await browser.newPage();
      
      // Set timeout
      await page.setDefaultTimeout(30000);
      
      // Navigate with retry logic
      await page.goto(url, { 
        waitUntil: 'networkidle2',
        timeout: 30000 
      });
      
      // Wait for content to load
      await page.waitForSelector('.content', { timeout: 10000 });
      
      // Extract data
      const data = await page.evaluate(() => {
        const elements = document.querySelectorAll('.item');
        if (elements.length === 0) {
          throw new Error('No items found on page');
        }
        
        return Array.from(elements).map(element => ({
          title: element.querySelector('.title')?.textContent || '',
          description: element.querySelector('.description')?.textContent || ''
        }));
      });
      
      await page.close();
      await browser.close();
      
      return data;
      
    } catch (error) {
      console.log(`Attempt ${attempt} failed:`, error.message);
      
      if (attempt === maxRetries) {
        await browser.close();
        throw new Error(`Scraping failed after ${maxRetries} attempts`);
      }
      
      // Wait before retry
      await new Promise(resolve => setTimeout(resolve, 2000 * attempt));
    }
  }
}

// Usage
(async () => {
  try {
    const data = await scrapeWithRetry('https://example.com/data');
    console.log('Scraped data:', data);
  } catch (error) {
    console.error('Scraping failed:', error.message);
  }
})();

Data Export

data-export.js
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  
  await page.goto('https://example.com/data');
  
  // Extract data
  const scrapedData = await page.$$eval('.data-item', elements => {
    return elements.map(element => ({
      id: element.dataset.id,
      title: element.querySelector('.title').textContent,
      description: element.querySelector('.description').textContent,
      price: element.querySelector('.price').textContent,
      scrapedAt: new Date().toISOString()
    }));
  });
  
  // Save as JSON
  const jsonData = JSON.stringify(scrapedData, null, 2);
  fs.writeFileSync('scraped-data.json', jsonData);
  
  // Save as CSV (escape embedded double quotes so values can't break the format)
  const csvHeaders = Object.keys(scrapedData[0]).join(',');
  const csvRows = scrapedData.map(item => 
    Object.values(item).map(value => 
      `"${String(value).replace(/"/g, '""')}"`
    ).join(',')
  );
  const csvContent = [csvHeaders, ...csvRows].join('\n');
  fs.writeFileSync('scraped-data.csv', csvContent);
  
  // Save as HTML report
  const htmlReport = `<!DOCTYPE html>
<html>
  <head>
    <title>Scraping Report</title>
  </head>
  <body>
    <h1>Scraping Report</h1>
    <p>Total items: ${scrapedData.length}</p>
    <p>Generated: ${new Date().toLocaleString()}</p>
    <table border="1">
      <tr>
        <th>ID</th><th>Title</th><th>Description</th><th>Price</th>
      </tr>
      ${scrapedData.map(item => `
      <tr>
        <td>${item.id}</td>
        <td>${item.title}</td>
        <td>${item.description}</td>
        <td>${item.price}</td>
      </tr>`).join('')}
    </table>
  </body>
</html>`;
  fs.writeFileSync('report.html', htmlReport);
  
  console.log('Data exported to:');
  console.log('- scraped-data.json');
  console.log('- scraped-data.csv');
  console.log('- report.html');
  
  await browser.close();
})();

Best Practices for Web Scraping

Respectful Scraping

Add delays between requests and check robots.txt

// Add delays between requests (plain setTimeout, since
// page.waitForTimeout was removed in recent Puppeteer versions)
await new Promise(resolve => setTimeout(resolve, 1000));

// Check robots.txt before scraping
const robotsUrl = new URL('/robots.txt', baseUrl);
const robotsResponse = await fetch(robotsUrl);
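
Fetching robots.txt is only half the job; you still need to honour its rules. A deliberately naive check against Disallow lines (a real project should use a proper robots.txt parser, and the /products path below is just an example):

const robotsText = await robotsResponse.text();

// Rough check: does any Disallow rule prefix-match the path we plan to scrape?
const disallowed = robotsText
  .split('\n')
  .filter(line => line.trim().toLowerCase().startsWith('disallow:'))
  .map(line => line.slice(line.indexOf(':') + 1).trim())
  .some(rule => rule && '/products'.startsWith(rule));

if (disallowed) {
  console.warn('Path appears to be disallowed by robots.txt - skipping');
}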

User Agent Rotation

Use different user agents to avoid detection

const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
];

await page.setUserAgent(userAgents[Math.floor(Math.random() * userAgents.length)]);

Data Validation

Always validate scraped data before processing

function validateData(item) {
  return item.title && 
         item.title.length > 0 && 
         item.price && 
         !isNaN(parseFloat(item.price.replace('$', '')));
}

const validData = scrapedData.filter(validateData);