Python Web Scraping with CAPTCHA Handling
Learn how to build production-ready Python web scrapers that handle CAPTCHAs without manual intervention.
This tutorial covers both synchronous and asynchronous approaches, plus a Scrapy integration for large-scale projects.
Setup and Installation
First, let's install the required packages:
# Core scraping libraries
pip install requests beautifulsoup4 lxml
# For JavaScript-heavy sites
pip install selenium playwright
# Async support (asyncio ships with the standard library)
pip install aiohttp
# For Scrapy integration
pip install scrapy scrapy-splash
# Additional utilities
pip install pillow python-anticaptcha
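Playwright additionally needs its browser binaries, which are downloaded with a separate command after the pip install:

playwright install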
Basic Web Scraper with CAPTCHA Support
Let's start with a simple example using requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup
import base64
import time
from urllib.parse import urljoin
from PIL import Image
from io import BytesIO

class CaptchaWebScraper:
    def __init__(self, api_key):
        self.api_key = api_key
        self.session = requests.Session()
        self.api_url = "https://api.ai4cap.com/v1"

    def solve_captcha(self, captcha_url):
        """Download and solve CAPTCHA image"""
        # Download CAPTCHA image
        response = self.session.get(captcha_url)
        img = Image.open(BytesIO(response.content))

        # Convert to base64
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Send to AI4CAP API
        task_response = self.session.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        )
        task_id = task_response.json()["taskId"]

        # Poll for result
        while True:
            result = self.session.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ).json()
            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise Exception("CAPTCHA solving failed")
            time.sleep(2)

    def scrape_with_captcha(self, url):
        """Scrape a page that requires CAPTCHA solving"""
        # Initial request
        response = self.session.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if CAPTCHA is present
        captcha_img = soup.find('img', {'class': 'captcha-image'})
        if captcha_img:
            # Resolve the CAPTCHA image URL relative to the page
            captcha_url = urljoin(url, captcha_img['src'])

            # Solve CAPTCHA
            captcha_solution = self.solve_captcha(captcha_url)

            # Collect the form's existing name/value pairs
            form = soup.find('form')
            form_data = {
                input_tag['name']: input_tag.get('value', '')
                for input_tag in form.find_all('input')
                if input_tag.get('name')
            }

            # Add CAPTCHA solution
            form_data['captcha'] = captcha_solution

            # Submit form
            action_url = urljoin(url, form.get('action', url))
            response = self.session.post(action_url, data=form_data)
            soup = BeautifulSoup(response.content, 'html.parser')

        # Extract data
        return self.extract_data(soup)

    def extract_data(self, soup):
        """Extract required data from page"""
        data = []
        # Example: Extract product information
        for product in soup.find_all('div', class_='product'):
            item = {
                'title': product.find('h2').text.strip(),
                'price': product.find('span', class_='price').text.strip(),
                'description': product.find('p').text.strip(),
                'url': product.find('a')['href']
            }
            data.append(item)
        return data

# Usage
scraper = CaptchaWebScraper("YOUR_API_KEY")
results = scraper.scrape_with_captcha("https://example.com/products")
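Hosted solvers occasionally return a wrong answer, so production scrapers usually retry a rejected submission. A minimal sketch, where both the helper and the empty-result heuristic are illustrative rather than part of the class above:

import time

def scrape_with_retries(scraper, url, max_attempts=3):
    """Retry the scrape when the CAPTCHA submission appears to be rejected."""
    for attempt in range(1, max_attempts + 1):
        try:
            results = scraper.scrape_with_captcha(url)
            if results:  # assume an empty result list means the CAPTCHA was rejected
                return results
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
        time.sleep(2 ** attempt)  # exponential backoff between attempts
    return []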
Advanced Async Scraping
For high-performance scraping, use async/await with aiohttp:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import base64
from urllib.parse import urljoin
from typing import List, Dict

class AsyncCaptchaScraper:
    def __init__(self, api_key: str, max_concurrent: int = 10):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def solve_captcha_async(self, session: aiohttp.ClientSession,
                                  captcha_url: str) -> str:
        """Asynchronously solve CAPTCHA"""
        async with session.get(captcha_url) as response:
            image_data = await response.read()

        # Convert to base64
        img_base64 = base64.b64encode(image_data).decode()

        # Submit CAPTCHA task
        async with session.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        ) as response:
            result = await response.json()
            task_id = result["taskId"]

        # Poll for result
        while True:
            async with session.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ) as response:
                result = await response.json()
                if result["status"] == "ready":
                    return result["solution"]["text"]
                elif result["status"] == "failed":
                    raise Exception("CAPTCHA solving failed")
            await asyncio.sleep(2)

    async def scrape_page(self, session: aiohttp.ClientSession,
                          url: str) -> List[Dict]:
        """Scrape a single page with CAPTCHA handling"""
        async with self.semaphore:
            try:
                # Initial request
                async with session.get(url) as response:
                    html = await response.text()
                soup = BeautifulSoup(html, 'html.parser')

                # Check for CAPTCHA
                captcha_img = soup.find('img', {'class': 'captcha-image'})
                if captcha_img:
                    captcha_url = urljoin(url, captcha_img['src'])
                    solution = await self.solve_captcha_async(session, captcha_url)

                    # Submit form with CAPTCHA solution
                    form_data = self.extract_form_data(soup)
                    form_data['captcha'] = solution
                    async with session.post(url, data=form_data) as response:
                        html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')

                return self.extract_data(soup)
            except Exception as e:
                print(f"Error scraping {url}: {e}")
                return []

    async def scrape_multiple(self, urls: List[str]) -> List[Dict]:
        """Scrape multiple URLs concurrently"""
        async with aiohttp.ClientSession() as session:
            tasks = [self.scrape_page(session, url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Flatten the per-page lists into one list of items
            return [item for sublist in results for item in sublist]

    def extract_form_data(self, soup):
        """Extract form data from page"""
        form = soup.find('form')
        return {
            tag['name']: tag.get('value', '')
            for tag in form.find_all(['input', 'select', 'textarea'])
            if tag.get('name')
        }

    def extract_data(self, soup):
        """Extract required data (find() does not accept CSS selectors, so use select)"""
        return [{
            'title': item.find('h2').text.strip(),
            'price': item.select_one('.price').text.strip(),
            'url': item.find('a')['href']
        } for item in soup.select('.product-item')]

# Usage
async def main():
    scraper = AsyncCaptchaScraper("YOUR_API_KEY", max_concurrent=20)
    urls = [
        f"https://example.com/page/{i}"
        for i in range(1, 101)
    ]
    results = await scraper.scrape_multiple(urls)
    print(f"Scraped {len(results)} items")

# Run
asyncio.run(main())
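A single slow page can occupy a semaphore slot indefinitely, so it is worth giving the session a deadline. Here is a minimal variant of scrape_multiple using aiohttp's built-in ClientTimeout (the 30-second budget is an arbitrary choice, not part of the class above):

import asyncio
import aiohttp

async def scrape_all(scraper, urls):
    # Same flattening logic as scrape_multiple, but every request
    # is abandoned once it exceeds 30 seconds in total
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        results = await asyncio.gather(
            *(scraper.scrape_page(session, url) for url in urls)
        )
    return [item for sublist in results for item in sublist]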
Scrapy Integration
For large-scale projects, integrate CAPTCHA solving into Scrapy:
# captcha_middleware.py
import base64
import time
import requests
from urllib.parse import urlencode
from scrapy import signals
from scrapy.exceptions import IgnoreRequest

class CaptchaMiddleware:
    def __init__(self, api_key):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"

    @classmethod
    def from_crawler(cls, crawler):
        api_key = crawler.settings.get('CAPTCHA_API_KEY')
        if not api_key:
            raise ValueError("CAPTCHA_API_KEY setting is required")
        middleware = cls(api_key)
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened
        )
        return middleware

    def process_response(self, request, response, spider):
        # Check if response contains CAPTCHA
        if self.has_captcha(response):
            spider.logger.info(f"CAPTCHA detected on {request.url}")

            # Solve CAPTCHA
            captcha_solution = self.solve_captcha(response)

            # Re-issue the request as a form submission that includes the solution
            form_data = self.extract_form_data(response)
            form_data['captcha'] = captcha_solution

            return request.replace(
                method='POST',
                body=urlencode(form_data),
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                dont_filter=True
            )
        return response

    def has_captcha(self, response):
        """Check if page contains CAPTCHA"""
        return bool(response.css('img.captcha-image'))

    def extract_form_data(self, response):
        """Collect name/value pairs from the form's input elements"""
        form_data = {}
        for input_el in response.css('form input'):
            name = input_el.attrib.get('name')
            if name:
                form_data[name] = input_el.attrib.get('value', '')
        return form_data

    def solve_captcha(self, response):
        """Extract and solve CAPTCHA from response"""
        # Extract CAPTCHA image URL
        captcha_url = response.urljoin(
            response.css('img.captcha-image::attr(src)').get()
        )

        # Download image
        img_response = requests.get(captcha_url)
        img_base64 = base64.b64encode(img_response.content).decode()

        # Send to API
        task_response = requests.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        )
        task_id = task_response.json()["taskId"]

        # Wait for solution
        return self.wait_for_solution(task_id)

    def wait_for_solution(self, task_id):
        """Poll API for CAPTCHA solution (blocks the downloader thread while waiting)"""
        while True:
            result = requests.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ).json()
            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise IgnoreRequest("CAPTCHA solving failed")
            time.sleep(2)

    def spider_opened(self, spider):
        spider.logger.info('CAPTCHA middleware activated')
# spider.py
import scrapy
from scrapy import Request

class ProductSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['https://example.com/products']

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'captcha_scraper.middlewares.CaptchaMiddleware': 560,
        }
    }

    def parse(self, response):
        # Extract product links
        for product_url in response.css('.product-link::attr(href)').getall():
            yield Request(
                url=response.urljoin(product_url),
                callback=self.parse_product
            )

        # Follow pagination
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield Request(
                url=response.urljoin(next_page),
                callback=self.parse
            )

    def parse_product(self, response):
        # Extract product data
        yield {
            'title': response.css('h1.product-title::text').get(),
            'price': response.css('.price::text').re_first(r'[\d.]+'),
            'description': response.css('.description::text').get(),
            'image_url': response.css('.product-image::attr(src)').get(),
            'availability': response.css('.availability::text').get(),
            'url': response.url
        }
# settings.py
BOT_NAME = 'captcha_scraper'
SPIDER_MODULES = ['captcha_scraper.spiders']
NEWSPIDER_MODULE = 'captcha_scraper.spiders'
# CAPTCHA solving configuration
CAPTCHA_API_KEY = 'YOUR_API_KEY'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure concurrent requests
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Download delay
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True
# Middleware configuration
DOWNLOADER_MIDDLEWARES = {
'captcha_scraper.middlewares.CaptchaMiddleware': 560,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
}
# Retry configuration
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# User agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
# Auto throttle
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 8.0
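With the middleware registered in settings.py, run the spider like any other Scrapy project and export the scraped items:

scrapy crawl products -o products.json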
Handling Different CAPTCHA Types
| CAPTCHA Type | Detection Method | Solving Approach |
|---|---|---|
| Image Text | img.captcha-image | ImageToTextTask |
| reCAPTCHA v2 | div.g-recaptcha | RecaptchaV2Task |
| reCAPTCHA v3 | grecaptcha.execute | RecaptchaV3Task |
| hCaptcha | div.h-captcha | HCaptchaTask |
| FunCaptcha | div#funcaptcha | FunCaptchaTask |
import re

def detect_captcha_type(soup):
    """Detect CAPTCHA type on page"""
    if soup.find('div', class_='g-recaptcha'):
        site_key = soup.find('div', class_='g-recaptcha')['data-sitekey']
        return 'recaptcha_v2', site_key
    elif soup.find('div', class_='h-captcha'):
        site_key = soup.find('div', class_='h-captcha')['data-sitekey']
        return 'hcaptcha', site_key
    elif soup.find('img', class_='captcha-image'):
        img_url = soup.find('img', class_='captcha-image')['src']
        return 'image_captcha', img_url
    elif 'grecaptcha.execute' in str(soup):
        # Extract the site key from the inline JavaScript
        match = re.search(r"grecaptcha\.execute\('([^']+)'", str(soup))
        if match:
            return 'recaptcha_v3', match.group(1)
    return None, None
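Token-based CAPTCHAs follow the same submit-and-poll flow as the image tasks above, but you send the page URL and site key instead of an image. Here is a sketch for reCAPTCHA v2, assuming the task payload mirrors the ImageToTextTask requests in this tutorial (the websiteURL, websiteKey, and gRecaptchaResponse field names are assumptions; verify them against the API documentation):

import time
import requests

def solve_recaptcha_v2(api_key, page_url, site_key):
    """Submit a RecaptchaV2Task and poll for the response token."""
    api_url = "https://api.ai4cap.com/v1"
    task = requests.post(
        f"{api_url}/tasks",
        headers={"API-Key": api_key},
        json={
            "type": "RecaptchaV2Task",
            "websiteURL": page_url,  # assumed field name
            "websiteKey": site_key   # assumed field name
        }
    ).json()
    while True:
        result = requests.get(
            f"{api_url}/tasks/{task['taskId']}",
            headers={"API-Key": api_key}
        ).json()
        if result["status"] == "ready":
            # The token is submitted as the g-recaptcha-response form field
            return result["solution"]["gRecaptchaResponse"]  # assumed field name
        elif result["status"] == "failed":
            raise Exception("reCAPTCHA solving failed")
        time.sleep(3)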
Performance Optimization
1. Connection Pooling
# Reuse connections for better performance
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(
pool_connections=100,
pool_maxsize=100,
max_retries=3
)
session.mount('http://', adapter)
session.mount('https://', adapter)
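If you use the async scraper from earlier, the aiohttp counterpart of this adapter is a TCPConnector with a connection cap:

import aiohttp

async def pooled_session() -> aiohttp.ClientSession:
    # Mirror the requests adapter above: at most 100 live connections
    return aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=100))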
2. Caching CAPTCHA Solutions
import hashlib

# In-memory cache keyed by image hash; identical images are solved only once
_solution_cache = {}

def solve_with_cache(image_data):
    """Cache CAPTCHA solutions so repeated images are not paid for twice"""
    # Calculate image hash
    image_hash = hashlib.md5(image_data).hexdigest()

    # Check cache
    if image_hash in _solution_cache:
        return _solution_cache[image_hash]

    # Solve and cache (solve_captcha stands for any solver that accepts raw image bytes)
    solution = solve_captcha(image_data)
    _solution_cache[image_hash] = solution
    return solution
3. Batch Processing
async def batch_scrape(urls, batch_size=50):
    """Process URLs in batches for optimal performance"""
    all_results = []
    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]

        # Process the batch concurrently; scrape_url stands for any
        # coroutine that fetches a single URL (e.g. scrape_page above)
        batch_results = await asyncio.gather(*[
            scrape_url(url) for url in batch
        ])
        all_results.extend(batch_results)

        # Pause between batches to avoid hammering the target
        await asyncio.sleep(1)
    return all_results
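From synchronous code, drive the batch runner with asyncio.run (URL list reused from the async example above):

urls = [f"https://example.com/page/{i}" for i in range(1, 101)]
results = asyncio.run(batch_scrape(urls, batch_size=50))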
Pro Tip: Monitor Your Scraping
Always implement logging, error tracking, and performance monitoring to ensure your scrapers run reliably in production.
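As a starting point, here is a minimal logging setup using only the standard library (the logger name, format, and file path are illustrative choices):

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    handlers=[
        logging.FileHandler("scraper.log"),  # persist logs for later analysis
        logging.StreamHandler()              # echo to the console as well
    ]
)
logger = logging.getLogger("captcha_scraper")

logger.info("Scrape started")
logger.warning("CAPTCHA solve took %.1fs", 4.2)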