
Python Web Scraping with CAPTCHA Handling

Learn how to build production-ready Python web scrapers that handle CAPTCHAs automatically, without manual intervention.

Setup and Installation

First, let's install the required packages:

# Core scraping libraries
pip install requests beautifulsoup4 lxml

# For JavaScript-heavy sites
pip install selenium playwright

# Async support (asyncio itself ships with Python)
pip install aiohttp

# For Scrapy integration
pip install scrapy scrapy-splash

# Additional utilities
pip install pillow python-anticaptcha
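If you plan to use Playwright, it also needs browser binaries, which are downloaded with a separate command:

playwright install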

Basic Web Scraper with CAPTCHA Support

Let's start with a simple example using requests and BeautifulSoup:

import base64
import time
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PIL import Image


class CaptchaWebScraper:
    def __init__(self, api_key):
        self.api_key = api_key
        self.session = requests.Session()
        self.api_url = "https://api.ai4cap.com/v1"

    def solve_captcha(self, captcha_url):
        """Download and solve a CAPTCHA image."""
        # Download the CAPTCHA image
        response = self.session.get(captcha_url)
        img = Image.open(BytesIO(response.content))

        # Convert to base64
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Send to the AI4CAP API
        task_response = self.session.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        )
        task_id = task_response.json()["taskId"]

        # Poll for the result
        while True:
            result = self.session.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ).json()

            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise Exception("CAPTCHA solving failed")

            time.sleep(2)

    def scrape_with_captcha(self, url):
        """Scrape a page that requires CAPTCHA solving."""
        # Initial request
        response = self.session.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if a CAPTCHA is present
        captcha_img = soup.find('img', {'class': 'captcha-image'})
        if captcha_img:
            # Get the CAPTCHA URL
            captcha_url = urljoin(url, captcha_img['src'])

            # Solve the CAPTCHA
            captcha_solution = self.solve_captcha(captcha_url)

            # Collect the existing form data
            form = soup.find('form')
            form_data = {
                input_tag['name']: input_tag.get('value', '')
                for input_tag in form.find_all('input')
                if input_tag.get('name')
            }

            # Add the CAPTCHA solution
            form_data['captcha'] = captcha_solution

            # Submit the form
            action_url = urljoin(url, form.get('action', url))
            response = self.session.post(action_url, data=form_data)
            soup = BeautifulSoup(response.content, 'html.parser')

        # Extract data
        return self.extract_data(soup)

    def extract_data(self, soup):
        """Extract the required data from the page."""
        data = []

        # Example: extract product information
        for product in soup.find_all('div', class_='product'):
            item = {
                'title': product.find('h2').text.strip(),
                'price': product.find('span', class_='price').text.strip(),
                'description': product.find('p').text.strip(),
                'url': product.find('a')['href']
            }
            data.append(item)

        return data


# Usage
scraper = CaptchaWebScraper("YOUR_API_KEY")
results = scraper.scrape_with_captcha("https://example.com/products")
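Even a correct-looking solution is occasionally rejected by the target site, so a production scraper should retry rather than give up on the first attempt. Here is a minimal retry wrapper around the class above; the failure check is an assumption (it treats an empty result as a failed attempt) and should be adapted to your target site:

def scrape_with_retries(scraper, url, max_attempts=3):
    """Retry when a CAPTCHA solution is rejected (illustrative helper)."""
    for attempt in range(1, max_attempts + 1):
        try:
            results = scraper.scrape_with_captcha(url)
            if results:  # assumption: an empty list means the submit failed
                return results
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
        time.sleep(2 ** attempt)  # exponential backoff between attempts
    raise RuntimeError(f"Giving up on {url} after {max_attempts} attempts")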

Advanced Async Scraping

For high-performance scraping, use async/await with aiohttp:

import asyncio
import base64
from typing import Dict, List
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup


class AsyncCaptchaScraper:
    def __init__(self, api_key: str, max_concurrent: int = 10):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def solve_captcha_async(self, session: aiohttp.ClientSession,
                                  captcha_url: str) -> str:
        """Asynchronously solve a CAPTCHA."""
        async with session.get(captcha_url) as response:
            image_data = await response.read()

        # Convert to base64
        img_base64 = base64.b64encode(image_data).decode()

        # Submit the CAPTCHA task
        async with session.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        ) as response:
            result = await response.json()
            task_id = result["taskId"]

        # Poll for the result
        while True:
            async with session.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ) as response:
                result = await response.json()

            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise Exception("CAPTCHA solving failed")

            await asyncio.sleep(2)

    async def scrape_page(self, session: aiohttp.ClientSession,
                          url: str) -> List[Dict]:
        """Scrape a single page with CAPTCHA handling."""
        async with self.semaphore:
            try:
                # Initial request
                async with session.get(url) as response:
                    html = await response.text()

                soup = BeautifulSoup(html, 'html.parser')

                # Check for a CAPTCHA
                captcha_img = soup.find('img', {'class': 'captcha-image'})
                if captcha_img:
                    captcha_url = urljoin(url, captcha_img['src'])
                    solution = await self.solve_captcha_async(session, captcha_url)

                    # Submit the form with the CAPTCHA solution
                    form_data = self.extract_form_data(soup)
                    form_data['captcha'] = solution

                    async with session.post(url, data=form_data) as response:
                        html = await response.text()
                        soup = BeautifulSoup(html, 'html.parser')

                return self.extract_data(soup)

            except Exception as e:
                print(f"Error scraping {url}: {e}")
                return []

    async def scrape_multiple(self, urls: List[str]) -> List[Dict]:
        """Scrape multiple URLs concurrently."""
        async with aiohttp.ClientSession() as session:
            tasks = [self.scrape_page(session, url) for url in urls]
            results = await asyncio.gather(*tasks)

        # Flatten the results
        return [item for sublist in results for item in sublist]

    def extract_form_data(self, soup):
        """Extract form data from the page."""
        form = soup.find('form')
        return {
            tag['name']: tag.get('value', '')
            for tag in form.find_all(['input', 'select', 'textarea'])
            if tag.get('name')
        }

    def extract_data(self, soup):
        """Extract the required data."""
        # Note: CSS selectors need select()/select_one(), not find()
        return [{
            'title': item.find('h2').text.strip(),
            'price': item.select_one('.price').text.strip(),
            'url': item.find('a')['href']
        } for item in soup.select('.product-item')]


# Usage
async def main():
    scraper = AsyncCaptchaScraper("YOUR_API_KEY", max_concurrent=20)

    urls = [f"https://example.com/page/{i}" for i in range(1, 101)]

    results = await scraper.scrape_multiple(urls)
    print(f"Scraped {len(results)} items")


asyncio.run(main())
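With 20 requests in flight, a single hung connection can hold its semaphore slot indefinitely, so bound every request with a timeout. aiohttp supports this directly through ClientTimeout; the 30-second budget below is an arbitrary example value:

import asyncio

import aiohttp


async def fetch_with_timeout(url: str) -> str:
    # total= caps the combined connect + send + read time per request
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as response:
            return await response.text()


# Example: asyncio.run(fetch_with_timeout("https://example.com"))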

Scrapy Integration

For large-scale projects, integrate CAPTCHA solving into Scrapy:

# captcha_middleware.py
import base64
import time
from urllib.parse import urlencode

import requests
from scrapy import signals
from scrapy.exceptions import IgnoreRequest


class CaptchaMiddleware:
    def __init__(self, api_key):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"

    @classmethod
    def from_crawler(cls, crawler):
        api_key = crawler.settings.get('CAPTCHA_API_KEY')
        if not api_key:
            raise ValueError("CAPTCHA_API_KEY setting is required")

        middleware = cls(api_key)
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened
        )
        return middleware

    def process_response(self, request, response, spider):
        # Check if the response contains a CAPTCHA
        if self.has_captcha(response):
            spider.logger.info(f"CAPTCHA detected on {request.url}")

            # Solve the CAPTCHA
            captcha_solution = self.solve_captcha(response)

            # Create a new request carrying the solution
            form_data = self.extract_form_data(response)
            form_data['captcha'] = captcha_solution

            return request.replace(
                method='POST',
                body=urlencode(form_data),
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                dont_filter=True
            )

        return response

    def has_captcha(self, response):
        """Check if the page contains a CAPTCHA."""
        return bool(response.css('img.captcha-image'))

    def extract_form_data(self, response):
        """Collect the existing form fields so they can be re-submitted."""
        form_data = {}
        for input_el in response.css('form input'):
            name = input_el.attrib.get('name')
            if name:
                form_data[name] = input_el.attrib.get('value', '')
        return form_data

    def solve_captcha(self, response):
        """Extract and solve the CAPTCHA from the response."""
        # Extract the CAPTCHA image URL
        captcha_url = response.urljoin(
            response.css('img.captcha-image::attr(src)').get()
        )

        # Download the image
        img_response = requests.get(captcha_url)
        img_base64 = base64.b64encode(img_response.content).decode()

        # Send it to the API
        task_response = requests.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        )
        task_id = task_response.json()["taskId"]

        # Wait for the solution
        return self.wait_for_solution(task_id)

    def wait_for_solution(self, task_id):
        """Poll the API for the CAPTCHA solution."""
        while True:
            result = requests.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ).json()

            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise IgnoreRequest("CAPTCHA solving failed")

            time.sleep(2)

    def spider_opened(self, spider):
        spider.logger.info('CAPTCHA middleware activated')
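To activate the middleware, register it in your project's settings.py. The module path and the 560 priority below are assumptions for a typical project layout; pick a priority that doesn't clash with the other middlewares in your project:

# settings.py
CAPTCHA_API_KEY = 'YOUR_API_KEY'

DOWNLOADER_MIDDLEWARES = {
    # Assumes captcha_middleware.py lives inside the "myproject" package
    'myproject.captcha_middleware.CaptchaMiddleware': 560,
}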

Handling Different CAPTCHA Types

CAPTCHA Type   Detection Method     Solving Approach
Image Text     img.captcha-image    ImageToTextTask
reCAPTCHA v2   div.g-recaptcha      RecaptchaV2Task
reCAPTCHA v3   grecaptcha.execute   RecaptchaV3Task
hCaptcha       div.h-captcha        HCaptchaTask
FunCaptcha     div#funcaptcha       FunCaptchaTask
import re


def detect_captcha_type(soup):
    """Detect the CAPTCHA type on a page."""
    if soup.find('div', class_='g-recaptcha'):
        site_key = soup.find('div', class_='g-recaptcha')['data-sitekey']
        return 'recaptcha_v2', site_key

    elif soup.find('div', class_='h-captcha'):
        site_key = soup.find('div', class_='h-captcha')['data-sitekey']
        return 'hcaptcha', site_key

    elif soup.find('img', class_='captcha-image'):
        img_url = soup.find('img', class_='captcha-image')['src']
        return 'image_captcha', img_url

    elif 'grecaptcha.execute' in str(soup):
        # Extract the site key from the JavaScript
        match = re.search(r"grecaptcha\.execute\('([^']+)'", str(soup))
        if match:
            return 'recaptcha_v3', match.group(1)

    return None, None
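Token-based CAPTCHAs such as reCAPTCHA v2 are solved from the site key rather than an image. The sketch below follows the same task-submission pattern used throughout this post, but the request and solution field names (websiteURL, websiteKey, gRecaptchaResponse) are assumptions; confirm them against the current API reference:

import time

import requests

API_URL = "https://api.ai4cap.com/v1"


def solve_recaptcha_v2(api_key, page_url, site_key):
    """Submit a RecaptchaV2Task and poll until a token comes back."""
    task = requests.post(
        f"{API_URL}/tasks",
        headers={"API-Key": api_key},
        json={
            "type": "RecaptchaV2Task",
            "websiteURL": page_url,   # assumed field name
            "websiteKey": site_key,   # assumed field name
        },
    ).json()

    while True:
        result = requests.get(
            f"{API_URL}/tasks/{task['taskId']}",
            headers={"API-Key": api_key},
        ).json()
        if result["status"] == "ready":
            # The token is submitted as the g-recaptcha-response form field
            return result["solution"]["gRecaptchaResponse"]
        if result["status"] == "failed":
            raise RuntimeError("reCAPTCHA solving failed")
        time.sleep(2)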

Performance Optimization

1. Connection Pooling

import requests

# Reuse connections for better performance
session = requests.Session()

adapter = requests.adapters.HTTPAdapter(
    pool_connections=100,
    pool_maxsize=100,
    max_retries=3
)

session.mount('http://', adapter)
session.mount('https://', adapter)
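Every request made through this session now draws from the shared pool. Pair it with an explicit timeout so a slow host can't hold a pooled connection indefinitely (the 10-second value is arbitrary):

# The pooled connection is reused; the timeout bounds the wait
response = session.get('https://example.com/products', timeout=10)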

2. Caching CAPTCHA Solutions

import hashlib

# Simple in-memory cache of solutions, keyed by image hash
_solution_cache = {}


def solve_with_cache(image_data):
    """Cache CAPTCHA solutions so repeated images are only solved once."""
    # Calculate the image hash
    image_hash = hashlib.md5(image_data).hexdigest()

    # Check the cache
    cached = _solution_cache.get(image_hash)
    if cached:
        return cached

    # Solve and cache (solve_captcha is the image solver defined earlier)
    solution = solve_captcha(image_data)
    _solution_cache[image_hash] = solution
    return solution

3. Batch Processing

async def batch_scrape(urls, batch_size=50):
    """Process URLs in batches for optimal performance."""
    all_results = []

    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]

        # Process the batch concurrently
        # (scrape_url is a placeholder for your own page-scraping coroutine)
        batch_results = await asyncio.gather(*[
            scrape_url(url) for url in batch
        ])
        all_results.extend(batch_results)

        # Pause between batches to avoid hammering the target
        await asyncio.sleep(1)

    return all_results

Start Scraping with Confidence

Build powerful Python scrapers that handle any CAPTCHA automatically with our reliable API.