Learn how to build a basic CAPTCHA solver from scratch using Python, TensorFlow, and computer vision techniques.
By Dr. James Liu, ML Engineer • January 2, 2024 • 20 min read
Building a CAPTCHA solver is an excellent way to learn computer vision, deep learning, and web automation. This comprehensive tutorial will guide you through creating a functional CAPTCHA solver that handles basic text-based CAPTCHAs with over 90% per-character accuracy.
Educational Purpose
This tutorial is for educational purposes. For production use, consider AI4CAP.COM's professional API which handles all CAPTCHA types with 99.9% accuracy.
1. Install dependencies
2. Gather training data
3. Train the neural network
4. Build the API wrapper
5. Validate accuracy
| Tool | Purpose |
|---|---|
| Python 3.8+ | Programming language |
| TensorFlow/PyTorch | Deep learning framework |
| OpenCV | Image processing |
| NumPy/Pandas | Data manipulation |
| Flask/FastAPI | API development |
# requirements.txt
tensorflow>=2.10.0
opencv-python>=4.6.0
numpy>=1.23.0
pandas>=1.5.0
pillow>=9.3.0
scikit-learn>=1.1.0
flask>=2.2.0
requests>=2.28.0
# Install dependencies
pip install -r requirements.txt
# Project structure
captcha-solver/
├── data/
│ ├── raw/
│ ├── processed/
│ └── labels.csv
├── models/
│ ├── cnn_model.py
│ ├── preprocessor.py
│ └── saved_models/
├── src/
│ ├── training.py
│ ├── inference.py
│ └── api.py
├── tests/
└── requirements.txt
Clean and prepare CAPTCHA images for model input
import cv2
import numpy as np
from PIL import Image
import os
class CaptchaPreprocessor:
def __init__(self, target_size=(200, 50)):
self.target_size = target_size
    def preprocess_image(self, image):
        """Preprocess a CAPTCHA image (file path or numpy array) for model input"""
        # Load from disk when a path is given; otherwise use the array directly
        img = cv2.imread(image) if isinstance(image, str) else image
        # Convert to grayscale (skip if the image is already single-channel)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img
# Apply Gaussian blur to reduce noise
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
# Apply adaptive thresholding
thresh = cv2.adaptiveThreshold(
blurred, 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV, 11, 2
)
# Remove small noise with morphological operations
kernel = np.ones((2, 2), np.uint8)
cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
# Resize to target size
resized = cv2.resize(cleaned, self.target_size)
# Normalize pixel values
normalized = resized / 255.0
return normalized
def segment_characters(self, image):
"""Segment individual characters from CAPTCHA"""
# Find contours
contours, _ = cv2.findContours(
image.astype(np.uint8),
cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
)
# Sort contours by x-coordinate
contours = sorted(contours, key=lambda c: cv2.boundingRect(c)[0])
characters = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Filter out noise
if w > 5 and h > 15:
char_img = image[y:y+h, x:x+w]
char_img = cv2.resize(char_img, (32, 32))
characters.append(char_img)
return characters
def augment_data(self, image):
"""Apply data augmentation for training"""
augmented = []
# Original
augmented.append(image)
# Rotation
for angle in [-5, 5]:
matrix = cv2.getRotationMatrix2D(
(image.shape[1]/2, image.shape[0]/2), angle, 1
)
            # warpAffine expects dsize as (width, height)
            rotated = cv2.warpAffine(image, matrix, (image.shape[1], image.shape[0]))
augmented.append(rotated)
# Noise
noise = np.random.normal(0, 0.01, image.shape)
noisy = np.clip(image + noise, 0, 1)
augmented.append(noisy)
# Erosion/Dilation
kernel = np.ones((2, 2), np.uint8)
eroded = cv2.erode(image, kernel, iterations=1)
dilated = cv2.dilate(image, kernel, iterations=1)
augmented.extend([eroded, dilated])
return augmented
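A quick way to sanity-check the preprocessing pipeline is to run it on a single image and inspect the output. A minimal sketch, assuming you have any CAPTCHA image saved locally (the path below is a placeholder):
# quick check of the preprocessing pipeline
from preprocessor import CaptchaPreprocessor  # adjust the import to your project layout

preprocessor = CaptchaPreprocessor(target_size=(200, 50))

# "data/raw/sample.png" is a placeholder for any CAPTCHA image on disk
processed = preprocessor.preprocess_image("data/raw/sample.png")
print(processed.shape)   # (50, 200) -- height x width, values normalized to [0, 1]

# Each training image expands into several augmented variants
variants = preprocessor.augment_data(processed)
print(len(variants))     # original + 2 rotations + noise + erosion + dilation = 6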
CNN + RNN architecture for CAPTCHA recognition
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
class CaptchaSolver:
def __init__(self, num_classes=62, max_length=6):
"""
Initialize CAPTCHA solver model
num_classes: 26 lowercase + 26 uppercase + 10 digits
max_length: Maximum CAPTCHA length
"""
self.num_classes = num_classes
self.max_length = max_length
self.model = self.build_model()
def build_model(self):
"""Build CNN + RNN model for CAPTCHA solving"""
# Input layer
inputs = layers.Input(shape=(50, 200, 1)) # height, width, channels
# CNN layers for feature extraction
x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = layers.MaxPooling2D((2, 1))(x) # Preserve width for sequence
x = layers.BatchNormalization()(x)
        # Reshape for RNN: use the (preserved) width dimension as the time axis
        x = layers.Permute((2, 1, 3))(x)  # (batch, width, height, channels)
        new_shape = (x.shape[1], x.shape[2] * x.shape[3])
        x = layers.Reshape(target_shape=new_shape)(x)
# Bidirectional LSTM for sequence learning
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Dropout(0.25)(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
        # Dense layer for per-timestep character prediction
        # (+1 output unit for the CTC blank token)
        outputs = layers.TimeDistributed(
            layers.Dense(self.num_classes + 1, activation='softmax')
        )(x)
model = models.Model(inputs=inputs, outputs=outputs)
return model
    def compile_model(self):
        """Compile model with CTC loss"""
        # The generic 'accuracy' metric does not apply to CTC targets;
        # sequence accuracy is measured separately from decoded predictions
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss=self.ctc_loss
        )
    def ctc_loss(self, y_true, y_pred):
        """Connectionist Temporal Classification loss"""
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.fill([batch_len], tf.shape(y_pred)[1])
        # Label lengths: count the non-padding (-1) entries per sample
        label_length = tf.reduce_sum(tf.cast(y_true != -1, dtype="int32"), axis=1)
        # Replace the -1 padding with a valid class index; positions beyond
        # label_length are ignored by the loss
        labels = tf.where(y_true < 0, tf.zeros_like(y_true), y_true)
        loss = tf.nn.ctc_loss(
            labels=tf.cast(labels, tf.int32),
            # The model outputs softmax probabilities; take the log to obtain logits
            logits=tf.math.log(y_pred + 1e-7),
            label_length=label_length,
            logit_length=input_length,
            logits_time_major=False,  # y_pred is (batch, time, classes)
            blank_index=self.num_classes
        )
        return tf.reduce_mean(loss)
def train(self, X_train, y_train, X_val, y_val, epochs=50):
"""Train the model"""
# Callbacks
callbacks = [
tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=5,
restore_best_weights=True
),
tf.keras.callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.5,
patience=3,
min_lr=1e-6
),
tf.keras.callbacks.ModelCheckpoint(
'best_model.h5',
monitor='val_loss',
save_best_only=True
)
]
# Train model
history = self.model.fit(
X_train, y_train,
validation_data=(X_val, y_val),
epochs=epochs,
batch_size=32,
callbacks=callbacks
)
return history
def predict(self, image):
"""Predict CAPTCHA text from image"""
# Preprocess image
if len(image.shape) == 2:
image = np.expand_dims(image, axis=-1)
# Add batch dimension
image = np.expand_dims(image, axis=0)
# Get predictions
predictions = self.model.predict(image)
# Decode predictions
decoded = self.decode_predictions(predictions[0])
return decoded
    def decode_predictions(self, predictions):
        """Convert model output to text using greedy CTC decoding"""
        # Character mapping; index 62 (the extra class) is the CTC blank
        chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        blank_idx = len(chars)
        text = ''
        prev_idx = -1
        for timestep in predictions:
            char_idx = int(np.argmax(timestep))
            # Collapse repeated predictions first, then drop blanks
            if char_idx != prev_idx and char_idx != blank_idx:
                text += chars[char_idx]
            prev_idx = char_idx
        return text
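Before training, it helps to confirm the architecture builds and produces the expected output shape. A minimal sketch, assuming the class above is importable (module names vary by layout; the API section below imports it as captcha_solver):
import numpy as np
from captcha_solver import CaptchaSolver  # adjust the import to your project layout

solver = CaptchaSolver(num_classes=62, max_length=6)
solver.model.summary()   # final layer should output (None, 50, 63): 50 timesteps, 62 chars + blank

# Run a dummy grayscale image through the untrained network
dummy = np.random.rand(50, 200).astype("float32")
print(solver.predict(dummy))  # gibberish until the model is trained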
Flask API for serving predictions
from flask import Flask, request, jsonify
import base64
import numpy as np
from PIL import Image
import io
import time
from captcha_solver import CaptchaSolver
from preprocessor import CaptchaPreprocessor
app = Flask(__name__)
# Initialize model and preprocessor
solver = CaptchaSolver()
solver.model.load_weights('best_model.h5')
preprocessor = CaptchaPreprocessor()
# In-memory task storage (use Redis in production)
tasks = {}
@app.route('/api/v1/solve', methods=['POST'])
def solve_captcha():
"""Submit CAPTCHA for solving"""
try:
data = request.json
# Validate input
if 'image' not in data:
return jsonify({'error': 'No image provided'}), 400
# Generate task ID
task_id = str(int(time.time() * 1000))
# Store task
tasks[task_id] = {
'status': 'processing',
'created_at': time.time()
}
# Decode base64 image
image_data = base64.b64decode(data['image'])
        image = Image.open(io.BytesIO(image_data)).convert('L')  # force single-channel grayscale
# Convert to numpy array
image_array = np.array(image)
# Preprocess
processed = preprocessor.preprocess_image(image_array)
# Solve
solution = solver.predict(processed)
# Update task
tasks[task_id] = {
'status': 'completed',
'solution': {
'text': solution,
'confidence': 0.95 # Add confidence scoring
},
'completed_at': time.time()
}
return jsonify({
'success': True,
'taskId': task_id,
'status': 'completed',
'solution': solution
})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/api/v1/solution/<task_id>', methods=['GET'])
def get_solution(task_id):
"""Get CAPTCHA solution by task ID"""
if task_id not in tasks:
return jsonify({'error': 'Task not found'}), 404
task = tasks[task_id]
return jsonify({
'success': True,
'taskId': task_id,
'status': task['status'],
'solution': task.get('solution', None)
})
@app.route('/api/v1/health', methods=['GET'])
def health_check():
"""Health check endpoint"""
return jsonify({
'status': 'healthy',
'model_loaded': solver.model is not None,
'active_tasks': len([t for t in tasks.values() if t['status'] == 'processing'])
})
# Advanced features
@app.route('/api/v1/batch', methods=['POST'])
def batch_solve():
"""Solve multiple CAPTCHAs in batch"""
try:
data = request.json
images = data.get('images', [])
if not images:
return jsonify({'error': 'No images provided'}), 400
results = []
for image_b64 in images:
# Process each image
image_data = base64.b64decode(image_b64)
            image = Image.open(io.BytesIO(image_data)).convert('L')  # force single-channel grayscale
image_array = np.array(image)
processed = preprocessor.preprocess_image(image_array)
solution = solver.predict(processed)
results.append({
'solution': solution,
'confidence': 0.95
})
return jsonify({
'success': True,
'results': results
})
except Exception as e:
return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
app.run(debug=True, port=5000)
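With the Flask server running locally, you can exercise the solve endpoint from a small client script. A sketch using the requests library (the image path is a placeholder; port 5000 matches the app.run call above):
# client.py -- example request against the local API
import base64
import requests

# Encode a CAPTCHA image as base64 (placeholder path)
with open("data/raw/sample.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    "http://localhost:5000/api/v1/solve",
    json={"image": image_b64},
    timeout=30,
)
print(response.json())
# {'success': True, 'taskId': '...', 'status': 'completed', 'solution': '<decoded text>'}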
1. Generate Training Data: Use libraries like captcha or PIL to generate synthetic CAPTCHAs (see the sketch after this list)
2. Label Your Data: Create a CSV file mapping image filenames to their text labels
3. Data Augmentation: Apply rotations, noise, and distortions to increase dataset size
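Here is a rough sketch of steps 1 and 2 using the captcha library: it writes synthetic images into data/raw/ and records their labels in data/labels.csv. The character set matches the model above; the dataset size and text lengths are arbitrary choices:
# generate_dataset.py -- synthetic CAPTCHA generation (sketch)
import csv
import os
import random
from captcha.image import ImageCaptcha

CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
OUT_DIR = 'data/raw'
os.makedirs(OUT_DIR, exist_ok=True)

generator = ImageCaptcha(width=200, height=50)

with open('data/labels.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['filename', 'label'])
    for i in range(10000):  # dataset size is an arbitrary starting point
        text = ''.join(random.choices(CHARS, k=random.randint(4, 6)))
        filename = f'{i:05d}.png'
        generator.write(text, os.path.join(OUT_DIR, filename))
        writer.writerow([filename, text])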
# train.py
import numpy as np
from sklearn.model_selection import train_test_split
from captcha_solver import CaptchaSolver
from data_loader import load_captcha_dataset
# Load dataset
X, y = load_captcha_dataset('data/processed/')
# Split data
X_train, X_val, y_train, y_val = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Initialize and compile model
solver = CaptchaSolver()
solver.compile_model()
# Train
history = solver.train(
X_train, y_train,
X_val, y_val,
epochs=50
)
# Evaluate: CTC loss on the held-out validation set
val_loss = solver.model.evaluate(X_val, y_val)
print(f"Validation CTC loss: {val_loss:.4f}")
Character Accuracy: 92% per character
Full CAPTCHA Accuracy: 85% complete match
Processing Speed: 150ms average
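Exact figures depend on your dataset and training run. You can measure both accuracy metrics on the validation split by decoding predictions and comparing them with the ground-truth strings, as in this sketch (assumes solver, X_val, and y_val from the training script and the same character mapping):
# evaluate_accuracy.py -- character and full-match accuracy (sketch)
CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

def labels_to_text(label_row):
    """Turn a padded integer label row back into its string (ignore -1 padding)."""
    return ''.join(CHARS[i] for i in label_row if i >= 0)

char_hits, char_total, full_hits = 0, 0, 0
for image, label_row in zip(X_val, y_val):
    predicted = solver.predict(image)   # image already carries the channel axis
    expected = labels_to_text(label_row)
    full_hits += int(predicted == expected)
    char_total += len(expected)
    char_hits += sum(p == e for p, e in zip(predicted, expected))

print(f"Character accuracy: {char_hits / char_total:.2%}")
print(f"Full CAPTCHA accuracy: {full_hits / len(X_val):.2%}")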
Distorted Text: Use elastic deformation and spatial transformer networks (see the sketch below)
Overlapping Characters: Implement advanced segmentation algorithms
Background Noise: Apply frequency domain filtering techniques
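For distorted text in particular, a simple elastic deformation can be added to the augmentation pipeline. A minimal sketch (the alpha and sigma values are arbitrary starting points, not tuned):
# elastic_deform.py -- extra augmentation for distorted text (sketch)
import cv2
import numpy as np

def elastic_deform(image, alpha=8.0, sigma=4.0, seed=None):
    """Warp an image with a smooth random displacement field."""
    image = np.asarray(image, dtype=np.float32)
    rng = np.random.default_rng(seed)
    h, w = image.shape[:2]
    # Random displacements in [-1, 1], blurred so neighbouring pixels move together
    dx = cv2.GaussianBlur((rng.random((h, w)) * 2 - 1).astype(np.float32), (0, 0), sigma) * alpha
    dy = cv2.GaussianBlur((rng.random((h, w)) * 2 - 1).astype(np.float32), (0, 0), sigma) * alpha
    x, y = np.meshgrid(np.arange(w), np.arange(h))
    map_x = (x + dx).astype(np.float32)
    map_y = (y + dy).astype(np.float32)
    return cv2.remap(image, map_x, map_y,
                     interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)

# Example: append elastic_deform(image) to the list in CaptchaPreprocessor.augment_data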
Dockerization: Containerize your application for easy deployment
Kubernetes Scaling: Auto-scale based on request volume
Monitoring & Logging: Track accuracy and performance metrics
Next Steps
This basic solver handles simple text CAPTCHAs. For production use with reCAPTCHA, hCaptcha, and other advanced types, consider using AI4CAP.COM's professional API.
Congratulations! You've built a functional CAPTCHA solver from scratch. This project demonstrates key concepts in computer vision, deep learning, and API development. While this basic solver works well for simple CAPTCHAs, modern websites use increasingly sophisticated CAPTCHA systems that require more advanced techniques.
For production applications requiring high accuracy and reliability across all CAPTCHA types, consider using a professional service like AI4CAP.COM. Our API handles the complexity of modern CAPTCHAs while you focus on building your application.