Python Web Scraping with CAPTCHA Handling
Learn how to build production-ready Python web scrapers that handle CAPTCHAs without manual intervention.
This tutorial covers both synchronous and asynchronous approaches, plus a Scrapy integration for large-scale projects.
Setup and Installation
First, let's install the required packages:
# Core scraping libraries
pip install requests beautifulsoup4 lxml
# For JavaScript-heavy sites
pip install selenium playwright
# Async support (asyncio ships with the standard library)
pip install aiohttp
# For Scrapy integration
pip install scrapy scrapy-splash
# Additional utilities
pip install pillow python-anticaptcha
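Playwright additionally needs its browser binaries, which are downloaded with a separate command after the pip install:

playwright install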
Basic Web Scraper with CAPTCHA Support
Let's start with a simple example using requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup
import base64
import time
from urllib.parse import urljoin
from PIL import Image
from io import BytesIO

class CaptchaWebScraper:
    def __init__(self, api_key):
        self.api_key = api_key
        self.session = requests.Session()
        self.api_url = "https://api.ai4cap.com/v1"

    def solve_captcha(self, captcha_url):
        """Download and solve CAPTCHA image"""
        # Download CAPTCHA image
        response = self.session.get(captcha_url)
        img = Image.open(BytesIO(response.content))

        # Convert to base64
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Send to AI4CAP API
        task_response = self.session.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        )
        task_id = task_response.json()["taskId"]

        # Poll for result
        while True:
            result = self.session.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ).json()
            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise Exception("CAPTCHA solving failed")
            time.sleep(2)

    def scrape_with_captcha(self, url):
        """Scrape a page that requires CAPTCHA solving"""
        # Initial request
        response = self.session.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if CAPTCHA is present
        captcha_img = soup.find('img', {'class': 'captcha-image'})
        if captcha_img:
            # Resolve the CAPTCHA image URL relative to the page
            captcha_url = urljoin(url, captcha_img['src'])

            # Solve CAPTCHA
            captcha_solution = self.solve_captcha(captcha_url)

            # Collect the form's existing name/value pairs
            form = soup.find('form')
            form_data = {
                input_tag['name']: input_tag.get('value', '')
                for input_tag in form.find_all('input')
                if input_tag.get('name')
            }

            # Add CAPTCHA solution
            form_data['captcha'] = captcha_solution

            # Submit form
            action_url = urljoin(url, form.get('action', url))
            response = self.session.post(action_url, data=form_data)
            soup = BeautifulSoup(response.content, 'html.parser')

        # Extract data
        return self.extract_data(soup)

    def extract_data(self, soup):
        """Extract required data from page"""
        data = []
        # Example: Extract product information
        for product in soup.find_all('div', class_='product'):
            item = {
                'title': product.find('h2').text.strip(),
                'price': product.find('span', class_='price').text.strip(),
                'description': product.find('p').text.strip(),
                'url': product.find('a')['href']
            }
            data.append(item)
        return data

# Usage
scraper = CaptchaWebScraper("YOUR_API_KEY")
results = scraper.scrape_with_captcha("https://example.com/products")
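Hosted solvers occasionally return a wrong answer, so production scrapers usually retry a rejected submission. A minimal sketch, where both the helper and the empty-result heuristic are illustrative rather than part of the class above:

import time

def scrape_with_retries(scraper, url, max_attempts=3):
    """Retry the scrape when the CAPTCHA submission appears to be rejected."""
    for attempt in range(1, max_attempts + 1):
        try:
            results = scraper.scrape_with_captcha(url)
            if results:  # assume an empty result list means the CAPTCHA was rejected
                return results
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
        time.sleep(2 ** attempt)  # exponential backoff between attempts
    return []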
Advanced Async Scraping
For high-performance scraping, use async/await with aiohttp:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import base64
from urllib.parse import urljoin
from typing import List, Dict

class AsyncCaptchaScraper:
    def __init__(self, api_key: str, max_concurrent: int = 10):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def solve_captcha_async(self, session: aiohttp.ClientSession,
                                  captcha_url: str) -> str:
        """Asynchronously solve CAPTCHA"""
        async with session.get(captcha_url) as response:
            image_data = await response.read()

        # Convert to base64
        img_base64 = base64.b64encode(image_data).decode()

        # Submit CAPTCHA task
        async with session.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        ) as response:
            result = await response.json()
            task_id = result["taskId"]

        # Poll for result
        while True:
            async with session.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ) as response:
                result = await response.json()
                if result["status"] == "ready":
                    return result["solution"]["text"]
                elif result["status"] == "failed":
                    raise Exception("CAPTCHA solving failed")
            await asyncio.sleep(2)

    async def scrape_page(self, session: aiohttp.ClientSession,
                          url: str) -> List[Dict]:
        """Scrape a single page with CAPTCHA handling"""
        async with self.semaphore:
            try:
                # Initial request
                async with session.get(url) as response:
                    html = await response.text()
                soup = BeautifulSoup(html, 'html.parser')

                # Check for CAPTCHA
                captcha_img = soup.find('img', {'class': 'captcha-image'})
                if captcha_img:
                    captcha_url = urljoin(url, captcha_img['src'])
                    solution = await self.solve_captcha_async(session, captcha_url)

                    # Submit form with CAPTCHA solution
                    form_data = self.extract_form_data(soup)
                    form_data['captcha'] = solution
                    async with session.post(url, data=form_data) as response:
                        html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')

                return self.extract_data(soup)
            except Exception as e:
                print(f"Error scraping {url}: {e}")
                return []

    async def scrape_multiple(self, urls: List[str]) -> List[Dict]:
        """Scrape multiple URLs concurrently"""
        async with aiohttp.ClientSession() as session:
            tasks = [self.scrape_page(session, url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Flatten the per-page lists into one list of items
            return [item for sublist in results for item in sublist]

    def extract_form_data(self, soup):
        """Extract form data from page"""
        form = soup.find('form')
        return {
            tag['name']: tag.get('value', '')
            for tag in form.find_all(['input', 'select', 'textarea'])
            if tag.get('name')
        }

    def extract_data(self, soup):
        """Extract required data (find() does not accept CSS selectors, so use select)"""
        return [{
            'title': item.find('h2').text.strip(),
            'price': item.select_one('.price').text.strip(),
            'url': item.find('a')['href']
        } for item in soup.select('.product-item')]

# Usage
async def main():
    scraper = AsyncCaptchaScraper("YOUR_API_KEY", max_concurrent=20)
    urls = [
        f"https://example.com/page/{i}"
        for i in range(1, 101)
    ]
    results = await scraper.scrape_multiple(urls)
    print(f"Scraped {len(results)} items")

# Run
asyncio.run(main())
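A single slow page can occupy a semaphore slot indefinitely, so it is worth giving the session a deadline. Here is a minimal variant of scrape_multiple using aiohttp's built-in ClientTimeout (the 30-second budget is an arbitrary choice, not part of the class above):

import asyncio
import aiohttp

async def scrape_all(scraper, urls):
    # Same flattening logic as scrape_multiple, but every request
    # is abandoned once it exceeds 30 seconds in total
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        results = await asyncio.gather(
            *(scraper.scrape_page(session, url) for url in urls)
        )
    return [item for sublist in results for item in sublist]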
Scrapy Integration
For large-scale projects, integrate CAPTCHA solving into Scrapy:
# captcha_middleware.py
import base64
import time
import requests
from urllib.parse import urlencode
from scrapy import signals
from scrapy.exceptions import IgnoreRequest

class CaptchaMiddleware:
    def __init__(self, api_key):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"

    @classmethod
    def from_crawler(cls, crawler):
        api_key = crawler.settings.get('CAPTCHA_API_KEY')
        if not api_key:
            raise ValueError("CAPTCHA_API_KEY setting is required")
        middleware = cls(api_key)
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened
        )
        return middleware

    def process_response(self, request, response, spider):
        # Check if response contains CAPTCHA
        if self.has_captcha(response):
            spider.logger.info(f"CAPTCHA detected on {request.url}")

            # Solve CAPTCHA
            captcha_solution = self.solve_captcha(response)

            # Re-issue the request as a form submission that includes the solution
            form_data = self.extract_form_data(response)
            form_data['captcha'] = captcha_solution

            return request.replace(
                method='POST',
                body=urlencode(form_data),
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                dont_filter=True
            )
        return response

    def has_captcha(self, response):
        """Check if page contains CAPTCHA"""
        return bool(response.css('img.captcha-image'))

    def extract_form_data(self, response):
        """Collect name/value pairs from the form's input elements"""
        form_data = {}
        for input_el in response.css('form input'):
            name = input_el.attrib.get('name')
            if name:
                form_data[name] = input_el.attrib.get('value', '')
        return form_data

    def solve_captcha(self, response):
        """Extract and solve CAPTCHA from response"""
        # Extract CAPTCHA image URL
        captcha_url = response.urljoin(
            response.css('img.captcha-image::attr(src)').get()
        )

        # Download image
        img_response = requests.get(captcha_url)
        img_base64 = base64.b64encode(img_response.content).decode()

        # Send to API
        task_response = requests.post(
            f"{self.api_url}/tasks",
            headers={"API-Key": self.api_key},
            json={
                "type": "ImageToTextTask",
                "body": img_base64
            }
        )
        task_id = task_response.json()["taskId"]

        # Wait for solution
        return self.wait_for_solution(task_id)

    def wait_for_solution(self, task_id):
        """Poll API for CAPTCHA solution (blocks the downloader thread while waiting)"""
        while True:
            result = requests.get(
                f"{self.api_url}/tasks/{task_id}",
                headers={"API-Key": self.api_key}
            ).json()
            if result["status"] == "ready":
                return result["solution"]["text"]
            elif result["status"] == "failed":
                raise IgnoreRequest("CAPTCHA solving failed")
            time.sleep(2)

    def spider_opened(self, spider):
        spider.logger.info('CAPTCHA middleware activated')
# spider.py
import scrapy
from scrapy import Request

class ProductSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['https://example.com/products']

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'captcha_scraper.middlewares.CaptchaMiddleware': 560,
        }
    }

    def parse(self, response):
        # Extract product links
        for product_url in response.css('.product-link::attr(href)').getall():
            yield Request(
                url=response.urljoin(product_url),
                callback=self.parse_product
            )

        # Follow pagination
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield Request(
                url=response.urljoin(next_page),
                callback=self.parse
            )

    def parse_product(self, response):
        # Extract product data
        yield {
            'title': response.css('h1.product-title::text').get(),
            'price': response.css('.price::text').re_first(r'[\d.]+'),
            'description': response.css('.description::text').get(),
            'image_url': response.css('.product-image::attr(src)').get(),
            'availability': response.css('.availability::text').get(),
            'url': response.url
        }
# settings.py
BOT_NAME = 'captcha_scraper'
SPIDER_MODULES = ['captcha_scraper.spiders']
NEWSPIDER_MODULE = 'captcha_scraper.spiders'
# CAPTCHA solving configuration
CAPTCHA_API_KEY = 'YOUR_API_KEY'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure concurrent requests
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Download delay
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True
# Middleware configuration
DOWNLOADER_MIDDLEWARES = {
'captcha_scraper.middlewares.CaptchaMiddleware': 560,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
}
# Retry configuration
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# User agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
# Auto throttle
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 8.0
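With the middleware registered in settings.py, run the spider like any other Scrapy project and export the scraped items:

scrapy crawl products -o products.json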
Handling Different CAPTCHA Types
| CAPTCHA Type | Detection Method | Solving Approach |
|---|---|---|
| Image Text | img.captcha-image | ImageToTextTask |
| reCAPTCHA v2 | div.g-recaptcha | RecaptchaV2Task |
| reCAPTCHA v3 | grecaptcha.execute | RecaptchaV3Task |
| hCaptcha | div.h-captcha | HCaptchaTask |
| FunCaptcha | div#funcaptcha | FunCaptchaTask |
import re

def detect_captcha_type(soup):
    """Detect CAPTCHA type on page"""
    if soup.find('div', class_='g-recaptcha'):
        site_key = soup.find('div', class_='g-recaptcha')['data-sitekey']
        return 'recaptcha_v2', site_key
    elif soup.find('div', class_='h-captcha'):
        site_key = soup.find('div', class_='h-captcha')['data-sitekey']
        return 'hcaptcha', site_key
    elif soup.find('img', class_='captcha-image'):
        img_url = soup.find('img', class_='captcha-image')['src']
        return 'image_captcha', img_url
    elif 'grecaptcha.execute' in str(soup):
        # Extract the site key from the inline JavaScript
        match = re.search(r"grecaptcha\.execute\('([^']+)'", str(soup))
        if match:
            return 'recaptcha_v3', match.group(1)
    return None, None
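Token-based CAPTCHAs follow the same submit-and-poll flow as the image tasks above, but you send the page URL and site key instead of an image. Here is a sketch for reCAPTCHA v2, assuming the task payload mirrors the ImageToTextTask requests in this tutorial (the websiteURL, websiteKey, and gRecaptchaResponse field names are assumptions; verify them against the API documentation):

import time
import requests

def solve_recaptcha_v2(api_key, page_url, site_key):
    """Submit a RecaptchaV2Task and poll for the response token."""
    api_url = "https://api.ai4cap.com/v1"
    task = requests.post(
        f"{api_url}/tasks",
        headers={"API-Key": api_key},
        json={
            "type": "RecaptchaV2Task",
            "websiteURL": page_url,  # assumed field name
            "websiteKey": site_key   # assumed field name
        }
    ).json()
    while True:
        result = requests.get(
            f"{api_url}/tasks/{task['taskId']}",
            headers={"API-Key": api_key}
        ).json()
        if result["status"] == "ready":
            # The token is submitted as the g-recaptcha-response form field
            return result["solution"]["gRecaptchaResponse"]  # assumed field name
        elif result["status"] == "failed":
            raise Exception("reCAPTCHA solving failed")
        time.sleep(3)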
Performance Optimization
1. Connection Pooling
# Reuse connections for better performance
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(
pool_connections=100,
pool_maxsize=100,
max_retries=3
)
session.mount('http://', adapter)
session.mount('https://', adapter)
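If you use the async scraper from earlier, the aiohttp counterpart of this adapter is a TCPConnector with a connection cap:

import aiohttp

async def pooled_session() -> aiohttp.ClientSession:
    # Mirror the requests adapter above: at most 100 live connections
    return aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=100))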
2. Caching CAPTCHA Solutions
import hashlib

# In-memory cache keyed by image hash; identical images are solved only once
_solution_cache = {}

def solve_with_cache(image_data):
    """Cache CAPTCHA solutions so repeated images are not paid for twice"""
    # Calculate image hash
    image_hash = hashlib.md5(image_data).hexdigest()

    # Check cache
    if image_hash in _solution_cache:
        return _solution_cache[image_hash]

    # Solve and cache (solve_captcha stands for any solver that accepts raw image bytes)
    solution = solve_captcha(image_data)
    _solution_cache[image_hash] = solution
    return solution
3. Batch Processing
async def batch_scrape(urls, batch_size=50):
    """Process URLs in batches for optimal performance"""
    all_results = []
    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]

        # Process the batch concurrently; scrape_url stands for any
        # coroutine that fetches a single URL (e.g. scrape_page above)
        batch_results = await asyncio.gather(*[
            scrape_url(url) for url in batch
        ])
        all_results.extend(batch_results)

        # Pause between batches to avoid hammering the target
        await asyncio.sleep(1)
    return all_results
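From synchronous code, drive the batch runner with asyncio.run (URL list reused from the async example above):

urls = [f"https://example.com/page/{i}" for i in range(1, 101)]
results = asyncio.run(batch_scrape(urls, batch_size=50))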
Pro Tip: Monitor Your Scraping
Always implement logging, error tracking, and performance monitoring to ensure your scrapers run reliably in production.
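As a starting point, here is a minimal logging setup using only the standard library (the logger name, format, and file path are illustrative choices):

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    handlers=[
        logging.FileHandler("scraper.log"),  # persist logs for later analysis
        logging.StreamHandler()              # echo to the console as well
    ]
)
logger = logging.getLogger("captcha_scraper")

logger.info("Scrape started")
logger.warning("CAPTCHA solve took %.1fs", 4.2)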