13 min read
Setting Up Selenium with Automatic CAPTCHA Solving
Learn how to seamlessly integrate AI-powered CAPTCHA solving into your Selenium automation workflows.
This tutorial assumes basic knowledge of Selenium WebDriver. If you're new to Selenium, read the official Selenium documentation first.
Prerequisites
- Selenium WebDriver installed for your preferred language
- ChromeDriver or GeckoDriver (for Firefox)
- AI4CAP.COM API key (get one here)
- Basic understanding of web automation
Installation
# Install required packages
pip install selenium requests pillow
# For async support (recommended). asyncio ships with Python 3 itself, so
# only aiohttp has to be installed; the PyPI "asyncio" package is an obsolete
# backport and should not be installed on Python 3.
pip install aiohttp
<!-- Add to your pom.xml -->
<dependencies>
    <!-- Selenium WebDriver bindings -->
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-java</artifactId>
        <version>4.15.0</version>
    </dependency>
    <!-- HTTP client used to call the CAPTCHA-solving API -->
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>4.11.0</version>
    </dependency>
    <!-- JSON parsing: the Java example imports com.google.gson.* -->
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.10.1</version>
    </dependency>
</dependencies>
# Install required packages: the Selenium bindings and the axios HTTP client
npm install selenium-webdriver axios
# For TypeScript support: type declarations for selenium-webdriver (dev dependency)
npm install --save-dev @types/selenium-webdriver
Basic Integration
Here's how to create a Selenium automation that automatically solves CAPTCHAs:
import time
import base64
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class SeleniumCaptchaSolver:
    """Drive a Chrome session and solve image CAPTCHAs through the AI4CAP API.

    Args:
        api_key: Value sent in the ``API-Key`` header of every API request.
        poll_interval: Seconds to sleep between two task-status polls.
        max_wait: Maximum number of seconds to keep polling one task before
            giving up with ``TimeoutError``.
        request_timeout: Per-request timeout, in seconds, for HTTP calls.
    """

    def __init__(self, api_key, poll_interval=2, max_wait=120,
                 request_timeout=30):
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"
        self.poll_interval = poll_interval
        self.max_wait = max_wait
        self.request_timeout = request_timeout
        self.driver = webdriver.Chrome()

    def solve_captcha(self, captcha_element):
        """Solve a CAPTCHA element automatically.

        Screenshots the element, submits it as an ``ImageToTextTask`` and
        polls the task until it is ``ready`` or ``failed``.

        Args:
            captcha_element: Selenium element showing the CAPTCHA image.

        Returns:
            The recognised text (``solution.text`` of the finished task).

        Raises:
            requests.HTTPError: An API call returned an error status.
            TimeoutError: The task was not finished within ``max_wait``.
            Exception: The API reported the task as ``failed``.
        """
        # Take screenshot of CAPTCHA
        captcha_image = captcha_element.screenshot_as_base64
        headers = {"API-Key": self.api_key}

        # Send to AI4CAP API
        response = requests.post(
            f"{self.api_url}/tasks",
            headers=headers,
            json={
                "type": "ImageToTextTask",
                "body": captcha_image,
            },
            timeout=self.request_timeout,
        )
        # Fail with the HTTP error rather than a KeyError on "taskId".
        response.raise_for_status()
        task_id = response.json()["taskId"]

        # Poll for result, but never forever: a task that stays pending would
        # otherwise hang the whole automation run.
        deadline = time.monotonic() + self.max_wait
        while True:
            poll = requests.get(
                f"{self.api_url}/tasks/{task_id}",
                headers=headers,
                timeout=self.request_timeout,
            )
            poll.raise_for_status()
            result = poll.json()

            if result["status"] == "ready":
                return result["solution"]["text"]
            if result["status"] == "failed":
                raise Exception(
                    f"CAPTCHA solving failed: {result.get('error')}"
                )
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"CAPTCHA task {task_id} not finished after "
                    f"{self.max_wait} seconds"
                )
            time.sleep(self.poll_interval)

    def automate_with_captcha(self, url):
        """Example automation with CAPTCHA handling.

        Opens ``url``, waits for the CAPTCHA image, types the solution into
        the ``captcha-input`` field and clicks ``submit``. Errors are printed
        rather than raised, and the browser is always closed afterwards.

        Args:
            url: Address of the page containing the CAPTCHA form.
        """
        self.driver.get(url)
        try:
            # Wait for CAPTCHA to appear
            captcha_img = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "captcha-image"))
            )
            # Solve the CAPTCHA
            solution = self.solve_captcha(captcha_img)
            # Enter solution
            captcha_input = self.driver.find_element(By.ID, "captcha-input")
            captcha_input.send_keys(solution)
            # Submit form
            submit_btn = self.driver.find_element(By.ID, "submit")
            submit_btn.click()
            print("CAPTCHA solved successfully!")
        except Exception as e:
            # Top-level boundary of the example: report and fall through to
            # the cleanup below.
            print(f"Error: {e}")
        finally:
            self.driver.quit()
# Usage: build the solver with your API key, then run the example flow
solver = SeleniumCaptchaSolver(api_key="YOUR_API_KEY")
solver.automate_with_captcha(url="https://example.com/form")
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.support.ui.ExpectedConditions;
import okhttp3.*;
import com.google.gson.*;
import java.util.Base64;
import java.time.Duration;
/**
 * Drives a Chrome session and solves image CAPTCHAs through the AI4CAP API.
 */
public class SeleniumCaptchaSolver {
    /** Delay between two task-status polls, in milliseconds. */
    private static final long POLL_INTERVAL_MS = 2000;
    /** Upper bound on the total time spent polling one task, in milliseconds. */
    private static final long MAX_WAIT_MS = 120_000;

    private final String apiKey;
    private final String apiUrl = "https://api.ai4cap.com/v1";
    private final WebDriver driver;
    private final OkHttpClient client;
    private final Gson gson;

    /**
     * @param apiKey value sent in the {@code API-Key} header of every request
     */
    public SeleniumCaptchaSolver(String apiKey) {
        this.apiKey = apiKey;
        this.driver = new ChromeDriver();
        this.client = new OkHttpClient();
        this.gson = new Gson();
    }

    /**
     * Screenshots the element, submits it as an {@code ImageToTextTask} and
     * polls the task until it is {@code ready} or {@code failed}.
     *
     * @param captchaElement element showing the CAPTCHA image
     * @return the recognised text ({@code solution.text} of the finished task)
     * @throws Exception if an HTTP call fails, the task is reported as failed,
     *                   or the task does not finish within the polling limit
     */
    public String solveCaptcha(WebElement captchaElement) throws Exception {
        // Take screenshot of CAPTCHA
        String base64Image = ((TakesScreenshot) captchaElement)
                .getScreenshotAs(OutputType.BASE64);

        // Create request body
        JsonObject requestBody = new JsonObject();
        requestBody.addProperty("type", "ImageToTextTask");
        requestBody.addProperty("body", base64Image);

        // Send to API
        Request request = new Request.Builder()
                .url(apiUrl + "/tasks")
                .header("API-Key", apiKey)
                .post(RequestBody.create(
                        requestBody.toString(),
                        MediaType.parse("application/json")
                ))
                .build();
        String taskId = fetchJson(request).get("taskId").getAsString();

        // The poll request never changes, so build it once.
        Request pollRequest = new Request.Builder()
                .url(apiUrl + "/tasks/" + taskId)
                .header("API-Key", apiKey)
                .get()
                .build();

        // Poll for result, but never forever: a task that stays pending
        // would otherwise hang the whole automation run.
        long deadline = System.currentTimeMillis() + MAX_WAIT_MS;
        while (true) {
            JsonObject pollResult = fetchJson(pollRequest);
            String status = pollResult.get("status").getAsString();
            if ("ready".equals(status)) {
                return pollResult.getAsJsonObject("solution")
                        .get("text").getAsString();
            } else if ("failed".equals(status)) {
                throw new Exception("CAPTCHA solving failed");
            }
            if (System.currentTimeMillis() >= deadline) {
                throw new java.util.concurrent.TimeoutException(
                        "CAPTCHA task " + taskId + " not finished after "
                                + MAX_WAIT_MS + " ms");
            }
            Thread.sleep(POLL_INTERVAL_MS);
        }
    }

    /**
     * Executes the request and parses the response body as a JSON object.
     * The response is closed on every path so connections are not leaked.
     */
    private JsonObject fetchJson(Request request) throws java.io.IOException {
        try (Response response = client.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new java.io.IOException(
                        "Unexpected HTTP status " + response.code()
                                + " from " + request.url());
            }
            return gson.fromJson(response.body().string(), JsonObject.class);
        }
    }

    /**
     * Example flow: opens the page, waits for the CAPTCHA image, types the
     * solution into {@code captcha-input} and clicks {@code submit}. Errors
     * are printed rather than thrown, and the browser is always closed.
     *
     * @param url address of the page containing the CAPTCHA form
     */
    public void automateWithCaptcha(String url) {
        driver.get(url);
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
        try {
            // Wait for CAPTCHA
            WebElement captchaImg = wait.until(
                    ExpectedConditions.presenceOfElementLocated(
                            By.className("captcha-image")
                    )
            );
            // Solve CAPTCHA
            String solution = solveCaptcha(captchaImg);
            // Enter solution
            WebElement input = driver.findElement(By.id("captcha-input"));
            input.sendKeys(solution);
            // Submit
            driver.findElement(By.id("submit")).click();
            System.out.println("CAPTCHA solved successfully!");
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller's thread management.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            driver.quit();
        }
    }
}
const { Builder, By, until } = require('selenium-webdriver');
const axios = require('axios');
/**
 * Drives a Chrome session and solves image CAPTCHAs through the AI4CAP API.
 */
class SeleniumCaptchaSolver {
  /**
   * @param {string} apiKey Value sent in the `API-Key` header of every request.
   * @param {object} [options]
   * @param {number} [options.pollIntervalMs=2000] Delay between status polls.
   * @param {number} [options.maxWaitMs=120000] Maximum total polling time per task.
   * @param {number} [options.requestTimeoutMs=30000] Timeout for each HTTP call.
   */
  constructor(
    apiKey,
    { pollIntervalMs = 2000, maxWaitMs = 120000, requestTimeoutMs = 30000 } = {}
  ) {
    this.apiKey = apiKey;
    this.apiUrl = 'https://api.ai4cap.com/v1';
    this.pollIntervalMs = pollIntervalMs;
    this.maxWaitMs = maxWaitMs;
    this.requestTimeoutMs = requestTimeoutMs;
  }

  /**
   * Screenshots the element, submits it as an `ImageToTextTask` and polls the
   * task until it is `ready` or `failed`.
   *
   * @param {object} driver WebDriver instance (kept for interface compatibility; not used here).
   * @param {object} captchaElement Element showing the CAPTCHA image.
   * @returns {Promise<string>} The recognised text (`solution.text`).
   * @throws {Error} If the task fails or does not finish within `maxWaitMs`.
   */
  async solveCaptcha(driver, captchaElement) {
    // Take screenshot of CAPTCHA
    const base64Image = await captchaElement.takeScreenshot();
    const requestConfig = {
      headers: { 'API-Key': this.apiKey },
      timeout: this.requestTimeoutMs
    };

    // Send to AI4CAP API
    const response = await axios.post(
      `${this.apiUrl}/tasks`,
      {
        type: 'ImageToTextTask',
        body: base64Image
      },
      requestConfig
    );
    const taskId = response.data.taskId;

    // Poll for result, but never forever: a task that stays pending would
    // otherwise hang the whole automation run.
    const deadline = Date.now() + this.maxWaitMs;
    while (true) {
      const result = await axios.get(
        `${this.apiUrl}/tasks/${taskId}`,
        requestConfig
      );
      if (result.data.status === 'ready') {
        return result.data.solution.text;
      } else if (result.data.status === 'failed') {
        throw new Error('CAPTCHA solving failed');
      }
      if (Date.now() >= deadline) {
        throw new Error(
          `CAPTCHA task ${taskId} not finished after ${this.maxWaitMs} ms`
        );
      }
      await new Promise(resolve => setTimeout(resolve, this.pollIntervalMs));
    }
  }

  /**
   * Example flow: opens the page, waits for the CAPTCHA image, types the
   * solution into `captcha-input` and clicks `submit`. Errors are logged
   * rather than thrown, and the browser is always closed.
   *
   * @param {string} url Address of the page containing the CAPTCHA form.
   */
  async automateWithCaptcha(url) {
    let driver;
    try {
      // Build inside the try block so a failed browser start is reported
      // like any other error instead of escaping as an unhandled rejection.
      driver = await new Builder()
        .forBrowser('chrome')
        .build();
      await driver.get(url);
      // Wait for CAPTCHA
      const captchaImg = await driver.wait(
        until.elementLocated(By.className('captcha-image')),
        10000
      );
      // Solve CAPTCHA
      const solution = await this.solveCaptcha(driver, captchaImg);
      // Enter solution
      const input = await driver.findElement(By.id('captcha-input'));
      await input.sendKeys(solution);
      // Submit
      await driver.findElement(By.id('submit')).click();
      console.log('CAPTCHA solved successfully!');
    } catch (error) {
      console.error('Error:', error);
    } finally {
      // The driver is undefined when the browser never started.
      if (driver) {
        await driver.quit();
      }
    }
  }
}
// Usage
const solver = new SeleniumCaptchaSolver('YOUR_API_KEY');
// Attach a rejection handler so a failure that escapes the method (for
// example, the browser failing to start) is reported instead of becoming an
// unhandled promise rejection.
solver.automateWithCaptcha('https://example.com/form').catch((error) => {
  console.error('Automation failed:', error);
  process.exitCode = 1;
});
Advanced Techniques
1. Handling reCAPTCHA v2
For Google reCAPTCHA v2, you need to handle the iframe and token injection:
def solve_recaptcha_v2(driver, site_key, page_url):
    """Solve reCAPTCHA v2 and inject token.

    Submits a ``RecaptchaV2TaskProxyless`` task, waits for its solution via
    ``wait_for_solution`` and writes the token into the page's
    ``g-recaptcha-response`` element, then calls any client callback found in
    ``___grecaptcha_cfg``.

    Args:
        driver: Selenium WebDriver currently showing the protected page.
        site_key: The site's reCAPTCHA key, sent as ``websiteKey``.
        page_url: URL of the protected page, sent as ``websiteURL``.

    Returns:
        The solved reCAPTCHA token.

    Raises:
        requests.HTTPError: The task-creation call returned an error status.
    """
    # Get token from API
    response = requests.post(
        f"{API_URL}/tasks",
        headers={"API-Key": API_KEY},
        json={
            "type": "RecaptchaV2TaskProxyless",
            "websiteURL": page_url,
            "websiteKey": site_key,
        },
        timeout=30,
    )
    # Fail with the HTTP error rather than a KeyError on "taskId".
    response.raise_for_status()
    task_id = response.json()["taskId"]
    token = wait_for_solution(task_id)

    # Inject token into page. The token is passed as a script argument
    # instead of being formatted into the JavaScript source: a quote or
    # backslash in the value can then neither break the script nor inject code.
    driver.execute_script(
        """
        const token = arguments[0];
        const field = document.getElementById('g-recaptcha-response');
        field.innerHTML = token;
        field.style.display = 'none';
        if (typeof ___grecaptcha_cfg !== 'undefined') {
            Object.entries(___grecaptcha_cfg.clients).forEach(([key, client]) => {
                if (client.callback) {
                    client.callback(token);
                }
            });
        }
        """,
        token,
    )
    return token
2. Handling Dynamic CAPTCHAs
For CAPTCHAs that load dynamically or change:
class DynamicCaptchaHandler:
    """Retry-based handler for CAPTCHAs that load dynamically or change.

    Subclasses are expected to provide ``solve_captcha`` and
    ``verify_solution``; the defaults here raise ``NotImplementedError``.

    Args:
        driver: Selenium WebDriver showing the page with the CAPTCHA.
        api_key: API key made available to the solving hook.
    """

    def __init__(self, driver, api_key):
        self.driver = driver
        self.api_key = api_key
        # ``src`` of the last CAPTCHA image handed to the solver; ``None``
        # until the first attempt. Initialised here so the attribute always
        # exists instead of being probed with ``hasattr``.
        self.last_src = None

    def solve_captcha(self, captcha_element):
        """Return the solution for ``captcha_element`` (subclass hook)."""
        raise NotImplementedError("solve_captcha must be implemented")

    def verify_solution(self, solution):
        """Return a truthy value when ``solution`` was accepted (subclass hook)."""
        raise NotImplementedError("verify_solution must be implemented")

    def _image_loaded(self, driver, captcha_selector):
        """Ask the browser whether the CAPTCHA image has finished loading."""
        element = driver.find_element(By.CSS_SELECTOR, captcha_selector)
        # Evaluate the DOM properties in the browser rather than comparing
        # get_attribute('complete') with the string 'true'.
        return driver.execute_script(
            "return arguments[0].complete && arguments[0].naturalWidth > 0",
            element,
        )

    def wait_and_solve(self, captcha_selector, max_retries=3):
        """Handle dynamic CAPTCHAs with retries.

        Args:
            captcha_selector: CSS selector of the CAPTCHA image element.
            max_retries: Maximum number of attempts.

        Returns:
            The first solution accepted by ``verify_solution``.

        Raises:
            NotImplementedError: A required hook has not been implemented.
            Exception: No attempt produced an accepted solution; the last
                error seen, if any, is attached as the cause.
        """
        last_error = None
        for attempt in range(max_retries):
            try:
                # Wait for CAPTCHA to be fully loaded
                WebDriverWait(self.driver, 20).until(
                    lambda d: self._image_loaded(d, captcha_selector)
                )
                captcha_element = self.driver.find_element(
                    By.CSS_SELECTOR, captcha_selector
                )
                # Check if CAPTCHA has changed: an image that was already
                # submitted is skipped, giving the page time to swap it.
                current_src = captcha_element.get_attribute('src')
                if self.last_src is not None and self.last_src == current_src:
                    time.sleep(1)
                    continue
                self.last_src = current_src
                # Solve CAPTCHA
                solution = self.solve_captcha(captcha_element)
                # Verify solution was accepted
                if self.verify_solution(solution):
                    return solution
            except NotImplementedError:
                # A missing hook is a programming error; retrying cannot help.
                raise
            except Exception as e:
                last_error = e
                print(f"Attempt {attempt + 1} failed: {e}")
        raise Exception("Failed to solve dynamic CAPTCHA") from last_error
3. Parallel Processing
Speed up automation by solving multiple CAPTCHAs concurrently:
import asyncio
from concurrent.futures import ThreadPoolExecutor
class ParallelCaptchaSolver:
    """Solve several CAPTCHAs concurrently on a thread pool.

    Subclasses are expected to provide the blocking ``solve_captcha_sync``;
    the default here raises ``NotImplementedError``.

    Args:
        api_key: API key made available to the solving hook.
        max_workers: Size of the thread pool running the blocking solves.
    """

    def __init__(self, api_key, max_workers=5):
        self.api_key = api_key
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    def solve_captcha_sync(self, element):
        """Solve one CAPTCHA, blocking until done (subclass hook)."""
        raise NotImplementedError("solve_captcha_sync must be implemented")

    def close(self):
        """Shut down the thread pool once no more CAPTCHAs will be solved."""
        self.executor.shutdown(wait=True)

    async def solve_multiple_captchas(self, captcha_elements):
        """Solve multiple CAPTCHAs in parallel.

        Args:
            captcha_elements: Iterable of elements to solve.

        Returns:
            A list of solutions in the same order as ``captcha_elements``.
        """
        return await asyncio.gather(
            *(self.solve_captcha_async(element) for element in captcha_elements)
        )

    async def solve_captcha_async(self, element):
        """Async wrapper for CAPTCHA solving.

        Runs ``solve_captcha_sync(element)`` on the thread pool so the event
        loop is not blocked while the solve is in progress.
        """
        # get_running_loop() is the supported way to reach the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            self.solve_captcha_sync,
            element,
        )
Best Practices
Implement Retry Logic
CAPTCHA solving can fail because of network issues or because the challenge changes before the answer is submitted. Always implement retry mechanisms.
Handle Different CAPTCHA Types
Detect the CAPTCHA type and use appropriate solving methods.
Use Explicit Waits
Always wait for CAPTCHAs to fully load before attempting to solve them.
Monitor Success Rates
Track your solving success rates and adjust strategies accordingly.
Respect Rate Limits
Don't overwhelm target websites or the API with too many requests.
Common Issues and Solutions
Issue: CAPTCHA image not loading properly
# Solution: Wait for image to be fully loaded
wait = WebDriverWait(driver, 20)
captcha = wait.until(
    EC.presence_of_element_located((By.ID, "captcha-image"))
)
# Additional check for image loading. Presence in the DOM does not mean the
# image data has arrived, so poll the check until it passes. Calling
# execute_script once and discarding its result would wait for nothing.
wait.until(
    lambda d: d.execute_script(
        "return arguments[0].complete && "
        "typeof arguments[0].naturalWidth != 'undefined' && "
        "arguments[0].naturalWidth > 0",
        captcha,
    )
)
Issue: Solution not being accepted
# Solution: empty the field first, then assign the value from JavaScript
input_element = driver.find_element(By.ID, "captcha-input")
input_element.clear()

# Set the value in the page and fire a bubbling 'input' event on the field
driver.execute_script(
    (
        "arguments[0].value = arguments[1]; "
        "arguments[0].dispatchEvent(new Event('input', {bubbles: true}));"
    ),
    input_element,
    solution,
)
Pro Tip: Use Page Object Model
Organize your CAPTCHA handling code using the Page Object Model pattern for better maintainability and reusability across your test suite.