Setting Up Selenium with Automatic CAPTCHA Solving

Learn how to seamlessly integrate AI-powered CAPTCHA solving into your Selenium automation workflows.

This tutorial assumes basic knowledge of Selenium WebDriver. If you're new to Selenium, read the official Selenium documentation first.

Prerequisites

Selenium WebDriver installed for your preferred language
ChromeDriver or GeckoDriver (for Firefox)
AI4CAP.COM API key (get one here)
Basic understanding of web automation

Installation

# Install required packages
pip install selenium requests pillow

# For async support (recommended).
# asyncio is part of the Python 3 standard library and must not be installed
# from PyPI, so only aiohttp is needed here.
pip install aiohttp

<!-- Add to your pom.xml -->
<dependencies>
    <dependency>
        <groupId>org.seleniumhq.selenium</groupId>
        <artifactId>selenium-java</artifactId>
        <version>4.15.0</version>
    </dependency>
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>4.11.0</version>
    </dependency>
    <!-- Required by the Java example below, which imports com.google.gson.* -->
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.10.1</version>
    </dependency>
</dependencies>

# Install required packages: the Selenium WebDriver bindings and the axios HTTP client
npm install selenium-webdriver axios

# For TypeScript support: type declarations, saved as a development-only dependency
npm install --save-dev @types/selenium-webdriver

Basic Integration

Here's how to create a Selenium automation that automatically solves CAPTCHAs:

import time
import base64
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class SeleniumCaptchaSolver:
    """Drive a Chrome session and solve image CAPTCHAs through the AI4CAP API.

    Creating an instance starts a Chrome WebDriver session.
    ``automate_with_captcha`` quits that session when it finishes, so an
    instance is meant for a single run.
    """

    def __init__(self, api_key):
        """Store the API key and start a Chrome WebDriver session."""
        self.api_key = api_key
        self.api_url = "https://api.ai4cap.com/v1"
        self.driver = webdriver.Chrome()

    def solve_captcha(self, captcha_element, max_wait=120, poll_interval=2,
                      request_timeout=30):
        """Solve a CAPTCHA element automatically.

        Args:
            captcha_element: WebElement showing the CAPTCHA image.
            max_wait: Maximum number of seconds to wait for the task to
                reach a final state before giving up.
            poll_interval: Seconds to sleep between two status requests.
            request_timeout: Timeout in seconds for each HTTP request.

        Returns:
            The recognised text reported by the API.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            TimeoutError: If no result is available within ``max_wait``.
            Exception: If the API reports the task as failed.
        """
        # Take screenshot of CAPTCHA
        captcha_image = captcha_element.screenshot_as_base64
        headers = {"API-Key": self.api_key}

        # Send to AI4CAP API
        response = requests.post(
            f"{self.api_url}/tasks",
            headers=headers,
            json={
                "type": "ImageToTextTask",
                "body": captcha_image,
            },
            timeout=request_timeout,
        )
        # Fail with the HTTP error instead of a KeyError on "taskId" below.
        response.raise_for_status()
        task_id = response.json()["taskId"]

        # Poll for result, but never longer than max_wait: a task that stays
        # pending forever must not hang the automation.
        deadline = time.monotonic() + max_wait
        while True:
            poll = requests.get(
                f"{self.api_url}/tasks/{task_id}",
                headers=headers,
                timeout=request_timeout,
            )
            poll.raise_for_status()
            result = poll.json()

            if result["status"] == "ready":
                return result["solution"]["text"]
            if result["status"] == "failed":
                raise Exception(f"CAPTCHA solving failed: {result.get('error')}")
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"CAPTCHA task {task_id} not solved within {max_wait} seconds"
                )

            time.sleep(poll_interval)

    def automate_with_captcha(self, url):
        """Example automation with CAPTCHA handling.

        Opens ``url``, waits up to 10 seconds for the CAPTCHA image, types the
        solution into the form and submits it. Errors are printed rather than
        raised, and the browser is always closed afterwards.
        """
        self.driver.get(url)

        try:
            # Wait for CAPTCHA to appear
            captcha_img = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "captcha-image"))
            )

            # Solve the CAPTCHA
            solution = self.solve_captcha(captcha_img)

            # Enter solution
            captcha_input = self.driver.find_element(By.ID, "captcha-input")
            captcha_input.send_keys(solution)

            # Submit form
            submit_btn = self.driver.find_element(By.ID, "submit")
            submit_btn.click()

            print("CAPTCHA solved successfully!")

        except Exception as e:
            print(f"Error: {e}")
        finally:
            self.driver.quit()

# Usage: replace YOUR_API_KEY with your AI4CAP API key.
# Constructing the solver starts Chrome; automate_with_captcha quits the
# browser when it is done.
solver = SeleniumCaptchaSolver("YOUR_API_KEY")
solver.automate_with_captcha("https://example.com/form")

import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.support.ui.ExpectedConditions;
import okhttp3.*;
import com.google.gson.*;
import java.util.Base64;
import java.time.Duration;

/**
 * Drives a Chrome session and solves image CAPTCHAs through the AI4CAP API.
 *
 * <p>Constructing an instance starts a ChromeDriver session;
 * {@link #automateWithCaptcha(String)} quits it when it finishes.
 */
public class SeleniumCaptchaSolver {
    /** Delay between two status polls, in milliseconds. */
    private static final long POLL_INTERVAL_MS = 2000;
    /** Maximum number of status polls (roughly two minutes at the interval above). */
    private static final int MAX_POLL_ATTEMPTS = 60;

    private final String apiKey;
    private final String apiUrl = "https://api.ai4cap.com/v1";
    private final WebDriver driver;
    private final OkHttpClient client;
    private final Gson gson;

    /**
     * Stores the API key, starts Chrome and creates the HTTP and JSON helpers.
     *
     * @param apiKey key sent in the {@code API-Key} header of every request
     */
    public SeleniumCaptchaSolver(String apiKey) {
        this.apiKey = apiKey;
        this.driver = new ChromeDriver();
        this.client = new OkHttpClient();
        this.gson = new Gson();
    }

    /**
     * Sends a screenshot of the CAPTCHA element to the API and waits for the text.
     *
     * @param captchaElement element showing the CAPTCHA image
     * @return the recognised text reported by the API
     * @throws Exception if a request fails, the task is reported as failed, or no
     *                   result is available after {@code MAX_POLL_ATTEMPTS} polls
     */
    public String solveCaptcha(WebElement captchaElement) throws Exception {
        // Take screenshot of CAPTCHA (WebElement already extends TakesScreenshot)
        String base64Image = captchaElement.getScreenshotAs(OutputType.BASE64);

        // Create request body
        JsonObject requestBody = new JsonObject();
        requestBody.addProperty("type", "ImageToTextTask");
        requestBody.addProperty("body", base64Image);

        // Send to API
        Request request = new Request.Builder()
            .url(apiUrl + "/tasks")
            .header("API-Key", apiKey)
            .post(RequestBody.create(
                requestBody.toString(),
                MediaType.parse("application/json")
            ))
            .build();

        String taskId = executeJson(request).get("taskId").getAsString();

        // The poll request never changes, so build it once and reuse it.
        Request pollRequest = new Request.Builder()
            .url(apiUrl + "/tasks/" + taskId)
            .header("API-Key", apiKey)
            .get()
            .build();

        // Poll for result, bounded so a task that never finishes cannot hang the run.
        for (int attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
            JsonObject pollResult = executeJson(pollRequest);
            String status = pollResult.get("status").getAsString();

            if ("ready".equals(status)) {
                return pollResult.getAsJsonObject("solution")
                    .get("text").getAsString();
            } else if ("failed".equals(status)) {
                throw new Exception("CAPTCHA solving failed");
            }

            Thread.sleep(POLL_INTERVAL_MS);
        }

        throw new Exception("Timed out waiting for CAPTCHA solution");
    }

    /** Executes the request, closes the response, and returns its body parsed as JSON. */
    private JsonObject executeJson(Request request) throws Exception {
        // try-with-resources releases the connection even when parsing fails.
        try (Response response = client.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new Exception("AI4CAP API request failed: HTTP " + response.code());
            }
            return gson.fromJson(response.body().string(), JsonObject.class);
        }
    }

    /**
     * Example automation: opens the page, solves the CAPTCHA and submits the form.
     *
     * <p>Exceptions are printed rather than rethrown, and the browser is always closed.
     *
     * @param url page containing the CAPTCHA form
     */
    public void automateWithCaptcha(String url) {
        driver.get(url);
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));

        try {
            // Wait for CAPTCHA
            WebElement captchaImg = wait.until(
                ExpectedConditions.presenceOfElementLocated(
                    By.className("captcha-image")
                )
            );

            // Solve CAPTCHA
            String solution = solveCaptcha(captchaImg);

            // Enter solution
            WebElement input = driver.findElement(By.id("captcha-input"));
            input.sendKeys(solution);

            // Submit
            driver.findElement(By.id("submit")).click();

            System.out.println("CAPTCHA solved successfully!");

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            driver.quit();
        }
    }
}

const { Builder, By, until } = require('selenium-webdriver');
const axios = require('axios');

/**
 * Solves image CAPTCHAs found by Selenium through the AI4CAP API.
 */
class SeleniumCaptchaSolver {
    /**
     * @param {string} apiKey Key sent in the API-Key header of every request.
     */
    constructor(apiKey) {
        this.apiKey = apiKey;
        this.apiUrl = 'https://api.ai4cap.com/v1';
    }

    /**
     * Send a screenshot of the CAPTCHA element to the API and wait for the text.
     *
     * @param {object} driver WebDriver instance (not used here; kept so existing
     *     callers keep working).
     * @param {object} captchaElement Element showing the CAPTCHA image.
     * @param {object} [options]
     * @param {number} [options.pollIntervalMs=2000] Delay between status polls.
     * @param {number} [options.maxAttempts=60] Maximum number of status polls.
     * @returns {Promise<string>} The recognised text reported by the API.
     * @throws {Error} If the task fails or is not ready after maxAttempts polls.
     */
    async solveCaptcha(driver, captchaElement, { pollIntervalMs = 2000, maxAttempts = 60 } = {}) {
        // Take screenshot of CAPTCHA
        const base64Image = await captchaElement.takeScreenshot();
        const requestConfig = { headers: { 'API-Key': this.apiKey } };

        // Send to AI4CAP API
        const response = await axios.post(
            `${this.apiUrl}/tasks`,
            {
                type: 'ImageToTextTask',
                body: base64Image
            },
            requestConfig
        );

        const taskId = response.data.taskId;

        // Poll for result, bounded so a task that never finishes cannot hang the run.
        for (let attempt = 0; attempt < maxAttempts; attempt++) {
            const result = await axios.get(
                `${this.apiUrl}/tasks/${taskId}`,
                requestConfig
            );

            if (result.data.status === 'ready') {
                return result.data.solution.text;
            } else if (result.data.status === 'failed') {
                throw new Error('CAPTCHA solving failed');
            }

            await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
        }

        throw new Error('Timed out waiting for CAPTCHA solution');
    }

    /**
     * Example automation: open the page, solve the CAPTCHA and submit the form.
     * Errors raised after the browser has started are logged, not rethrown,
     * and the browser is always closed.
     *
     * @param {string} url Page containing the CAPTCHA form.
     */
    async automateWithCaptcha(url) {
        const driver = await new Builder()
            .forBrowser('chrome')
            .build();

        try {
            await driver.get(url);

            // Wait for CAPTCHA
            const captchaImg = await driver.wait(
                until.elementLocated(By.className('captcha-image')),
                10000
            );

            // Solve CAPTCHA
            const solution = await this.solveCaptcha(driver, captchaImg);

            // Enter solution
            const input = await driver.findElement(By.id('captcha-input'));
            await input.sendKeys(solution);

            // Submit
            await driver.findElement(By.id('submit')).click();

            console.log('CAPTCHA solved successfully!');

        } catch (error) {
            console.error('Error:', error);
        } finally {
            await driver.quit();
        }
    }
}

// Usage
const solver = new SeleniumCaptchaSolver('YOUR_API_KEY');
// automateWithCaptcha logs errors raised after the browser has started, but a
// failed driver build rejects the returned promise, so handle that here.
solver.automateWithCaptcha('https://example.com/form')
    .catch((error) => {
        console.error('Error:', error);
        process.exitCode = 1;
    });

Advanced Techniques

1. Handling reCAPTCHA v2

For Google reCAPTCHA v2, you need to handle the iframe and token injection:

def solve_recaptcha_v2(driver, site_key, page_url):
    """Solve reCAPTCHA v2 and inject token.

    Args:
        driver: WebDriver whose current page contains the reCAPTCHA widget.
        site_key: Value sent to the API as ``websiteKey``.
        page_url: Value sent to the API as ``websiteURL``.

    Returns:
        The token returned by ``wait_for_solution``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """

    # Get token from API
    response = requests.post(
        f"{API_URL}/tasks",
        headers={"API-Key": API_KEY},
        json={
            "type": "RecaptchaV2TaskProxyless",
            "websiteURL": page_url,
            "websiteKey": site_key,
        },
        timeout=30,
    )
    # Fail with the HTTP error instead of a KeyError on "taskId" below.
    response.raise_for_status()

    task_id = response.json()["taskId"]
    token = wait_for_solution(task_id)

    # Inject token into page. The token is passed as a script argument rather
    # than formatted into the JavaScript source, so quotes or other special
    # characters in it cannot break or alter the script.
    driver.execute_script(
        '''
        const token = arguments[0];
        const field = document.getElementById('g-recaptcha-response');
        field.innerHTML = token;
        field.style.display = 'none';
        if (typeof ___grecaptcha_cfg !== 'undefined') {
            Object.entries(___grecaptcha_cfg.clients).forEach(([key, client]) => {
                if (client.callback) {
                    client.callback(token);
                }
            });
        }
        ''',
        token,
    )

    return token

2. Handling Dynamic CAPTCHAs

For CAPTCHAs that load dynamically or change:

class DynamicCaptchaHandler:
    """Retry wrapper for CAPTCHAs whose image loads late or is replaced.

    ``solve_captcha`` and ``verify_solution`` are called on ``self`` but are
    not defined in this snippet; they have to be supplied elsewhere.
    """

    def __init__(self, driver, api_key):
        """Keep the WebDriver and the API key for later use."""
        self.driver = driver
        self.api_key = api_key

    def wait_and_solve(self, captcha_selector, max_retries=3):
        """Handle dynamic CAPTCHAs with retries.

        Makes up to ``max_retries`` attempts. Each attempt waits up to 20
        seconds for the image to report itself as complete, skips the image
        if its ``src`` equals the one handled last time, and otherwise solves
        it and returns the solution once ``verify_solution`` accepts it.
        Raises ``Exception`` when every attempt is used up.
        """
        never_seen = object()

        def image_is_complete(browser):
            # True once the element's 'complete' attribute reads 'true'.
            image = browser.find_element(By.CSS_SELECTOR, captcha_selector)
            return image.get_attribute('complete') == 'true'

        for attempt_number in range(1, max_retries + 1):
            try:
                WebDriverWait(self.driver, 20).until(image_is_complete)

                image = self.driver.find_element(
                    By.CSS_SELECTOR, captcha_selector
                )

                # An unchanged src means this image was already handled:
                # pause briefly and spend the next attempt instead.
                src_now = image.get_attribute('src')
                src_before = getattr(self, 'last_src', never_seen)
                if src_before is not never_seen and src_before == src_now:
                    time.sleep(1)
                    continue

                self.last_src = src_now

                answer = self.solve_captcha(image)

                # Only hand back a solution that was accepted.
                if self.verify_solution(answer):
                    return answer

            except Exception as error:
                print(f"Attempt {attempt_number} failed: {error}")

        raise Exception("Failed to solve dynamic CAPTCHA")

3. Parallel Processing

Speed up automation by solving multiple CAPTCHAs concurrently:

import asyncio
from concurrent.futures import ThreadPoolExecutor

class ParallelCaptchaSolver:
    """Solve several CAPTCHAs concurrently on a thread pool.

    The blocking work is delegated to ``self.solve_captcha_sync(element)``,
    which is not defined in this snippet and has to be supplied elsewhere
    (for example by a subclass).
    """

    def __init__(self, api_key, max_workers=5):
        """Store the API key and create a pool of ``max_workers`` threads."""
        self.api_key = api_key
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    async def solve_multiple_captchas(self, captcha_elements):
        """Solve multiple CAPTCHAs in parallel.

        Returns a list of solutions in the same order as ``captcha_elements``.
        An exception raised for any element propagates to the caller.
        """
        # gather() schedules every coroutine itself and preserves input order.
        return await asyncio.gather(
            *(self.solve_captcha_async(element) for element in captcha_elements)
        )

    async def solve_captcha_async(self, element):
        """Async wrapper for CAPTCHA solving.

        Runs ``solve_captcha_sync(element)`` on the thread pool so the event
        loop is not blocked while it waits.
        """
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            self.solve_captcha_sync,
            element,
        )

Best Practices

Implement Retry Logic
Solving can fail because of network issues or because the CAPTCHA changes while it is being solved. Always implement retry mechanisms.
Handle Different CAPTCHA Types
Detect the CAPTCHA type and use appropriate solving methods.
Use Explicit Waits
Always wait for CAPTCHAs to fully load before attempting to solve them.
Monitor Success Rates
Track your solving success rates and adjust strategies accordingly.
Respect Rate Limits
Don't overwhelm target websites or the API with too many requests.

Common Issues and Solutions

Issue: CAPTCHA image not loading properly

# Solution: Wait for image to be fully loaded
wait = WebDriverWait(driver, 20)
captcha = wait.until(
    EC.presence_of_element_located((By.ID, "captcha-image"))
)

# Presence in the DOM does not mean the image has been downloaded, so keep
# polling until the browser reports a completed image with a non-zero width.
# The script result must be fed to the wait; calling execute_script on its
# own would check once and discard the answer.
wait.until(
    lambda d: d.execute_script(
        "return arguments[0].complete && "
        "typeof arguments[0].naturalWidth != 'undefined' && "
        "arguments[0].naturalWidth > 0",
        captcha,
    )
)

Issue: Solution not being accepted

# Solution: empty the field first, then set its value from JavaScript
input_element = driver.find_element(By.ID, "captcha-input")
input_element.clear()

# Assign the value directly and fire a bubbling 'input' event for it
set_value_script = (
    "arguments[0].value = arguments[1]; "
    "arguments[0].dispatchEvent(new Event('input', {bubbles: true}));"
)
driver.execute_script(set_value_script, input_element, solution)

Pro Tip: Use Page Object Model

Organize your CAPTCHA handling code using the Page Object Model pattern for better maintainability and reusability across your test suite.

Ready to Automate?

Start integrating CAPTCHA solving into your Selenium projects today with our reliable API.