# BLIP-2 Advanced Usage Guide
## Fine-tuning BLIP-2
### LoRA fine-tuning (recommended)
```python
import torch
from transformers import Blip2ForConditionalGeneration, Blip2Processor
from peft import LoraConfig, get_peft_model
# Load base model
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b",
torch_dtype=torch.float16,
device_map="auto"
)
# Configure LoRA for the language model
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# prints trainable vs. total parameter counts (typically well under 1% of the full model)
```
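After LoRA training, only the adapter weights need to be stored. A minimal sketch of saving and reloading the adapter with PEFT; the `blip2-lora-adapter` directory name is illustrative:
```python
from peft import PeftModel

# Save only the LoRA adapter weights (a few megabytes, not the full base model)
model.save_pretrained("blip2-lora-adapter")

# Later: reload the frozen base model and attach the trained adapter
base = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
    device_map="auto"
)
model = PeftModel.from_pretrained(base, "blip2-lora-adapter")
model = model.merge_and_unload()  # optional: fold the adapter into the base weights
```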
### Fine-tuning Q-Former only
```python
# Freeze everything except Q-Former
for name, param in model.named_parameters():
if "qformer" not in name.lower():
param.requires_grad = False
else:
param.requires_grad = True
# Check trainable parameters
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
```
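With only the Q-Former unfrozen, it is cleaner to hand the optimizer just the trainable parameters. A small sketch; the learning rate is illustrative:
```python
import torch

# Optimize only the unfrozen (Q-Former) parameters
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=1e-5,
    weight_decay=0.01,
)
```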
### Custom dataset for fine-tuning
```python
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
class CaptionDataset(Dataset):
def __init__(self, data, processor, max_length=128):
self.data = data # List of {"image_path": str, "caption": str}
self.processor = processor
self.max_length = max_length
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
item = self.data[idx]
image = Image.open(item["image_path"]).convert("RGB")
# Process inputs
encoding = self.processor(
images=image,
text=item["caption"],
padding="max_length",
truncation=True,
max_length=self.max_length,
return_tensors="pt"
)
# Remove batch dimension
encoding = {k: v.squeeze(0) for k, v in encoding.items()}
# Labels for language modeling
encoding["labels"] = encoding["input_ids"].clone()
return encoding
# Create dataloader
dataset = CaptionDataset(train_data, processor)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
```
### Training loop
```python
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed in recent releases
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
# Scheduler
num_epochs = 3
num_training_steps = len(dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=num_training_steps // 10,
num_training_steps=num_training_steps
)
# Training
model.train()
for epoch in range(num_epochs):
total_loss = 0
for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
batch = {k: v.to("cuda") for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
total_loss += loss.item()
avg_loss = total_loss / len(dataloader)
print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")
# Save fine-tuned model
model.save_pretrained("blip2-finetuned")
processor.save_pretrained("blip2-finetuned")
```
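A quick sanity check on a held-out image before relying on the saved checkpoint; this sketch reuses the in-memory `model` and `processor` from the steps above, and `test.jpg` is a placeholder path:
```python
from PIL import Image

model.eval()
image = Image.open("test.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(generated_ids[0], skip_special_tokens=True))
```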
### Fine-tuning with LAVIS
```python
from lavis.models import load_model_and_preprocess
from lavis.common.registry import registry
from lavis.datasets.builders import load_dataset
# Load model
model, vis_processors, txt_processors = load_model_and_preprocess(
name="blip2_opt",
model_type="pretrain_opt2.7b",
is_eval=False, # Training mode
device="cuda"
)
# Load dataset (returns a dict of train/val/test splits)
datasets = load_dataset("coco_caption")
# Get the trainer class; cfg and task come from a LAVIS run config and task setup
# (see LAVIS's train.py and the configs under lavis/projects/blip2) - shown schematically here
runner_cls = registry.get_runner_class("runner_base")
runner = runner_cls(
    cfg=cfg,
    task=task,
    model=model,
    datasets=datasets
)
# Train
runner.train()
```
## Multi-GPU Training
### DataParallel
```python
import torch.nn as nn
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b",
torch_dtype=torch.float16
)
# Wrap with DataParallel
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
model.to("cuda")
```
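`nn.DataParallel` only parallelizes the forward pass and does not expose `generate`; the wrapped model has to be reached through `.module`. A small sketch, where `inputs` stands for a processor output already moved to the GPU:
```python
# DataParallel hides the original model behind .module
wrapped = model.module if isinstance(model, nn.DataParallel) else model
generated_ids = wrapped.generate(**inputs, max_new_tokens=50)
```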
### DistributedDataParallel
```python
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
def setup(rank, world_size):
dist.init_process_group("nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(rank)
def train(rank, world_size):
setup(rank, world_size)
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b",
torch_dtype=torch.float16
).to(rank)
model = DDP(model, device_ids=[rank])
# Use DistributedSampler
sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=4)
# Training loop
for epoch in range(num_epochs):
sampler.set_epoch(epoch)
for batch in dataloader:
# ... training code
pass
dist.destroy_process_group()
# Launch
import torch.multiprocessing as mp
world_size = torch.cuda.device_count()
mp.spawn(train, args=(world_size,), nprocs=world_size)
```
### Accelerate integration
```python
import torch
from accelerate import Accelerator
from transformers import Blip2ForConditionalGeneration, Blip2Processor
accelerator = Accelerator(mixed_precision="fp16")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Prepare for distributed training (dataloader built as in the fine-tuning section above)
model, optimizer, dataloader = accelerator.prepare(
model, optimizer, dataloader
)
# Training loop
for batch in dataloader:
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
optimizer.zero_grad()
```
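When training with Accelerate across multiple processes, save from the main process only and unwrap the model first. A short sketch; the output directory name is illustrative:
```python
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    unwrapped = accelerator.unwrap_model(model)
    unwrapped.save_pretrained("blip2-accelerate-finetuned")
```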
## Integration Patterns
### Gradio interface
```python
import gradio as gr
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# Load model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b",
torch_dtype=torch.float16,
device_map="auto"
)
def caption_image(image, question=None):
if question:
inputs = processor(images=image, text=question, return_tensors="pt")
else:
inputs = processor(images=image, return_tensors="pt")
inputs = inputs.to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=100)
return processor.decode(generated_ids[0], skip_special_tokens=True)
# Create interface
demo = gr.Interface(
fn=caption_image,
inputs=[
gr.Image(type="pil", label="Upload Image"),
gr.Textbox(label="Question (optional)", placeholder="What is in this image?")
],
outputs=gr.Textbox(label="Response"),
title="BLIP-2 Demo",
examples=[
["example1.jpg", None],
["example2.jpg", "What colors are in this image?"]
]
)
demo.launch()
```
### FastAPI server
```python
from fastapi import FastAPI, UploadFile, File
from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import io
app = FastAPI()
# Load model at startup
processor = None
model = None
@app.on_event("startup")
async def load_model():
global processor, model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b",
torch_dtype=torch.float16,
device_map="auto"
)
@app.post("/caption")
async def caption(file: UploadFile = File(...), question: str = None):
# Read image
contents = await file.read()
image = Image.open(io.BytesIO(contents)).convert("RGB")
# Process
if question:
inputs = processor(images=image, text=question, return_tensors="pt")
else:
inputs = processor(images=image, return_tensors="pt")
inputs = inputs.to("cuda", torch.float16)
# Generate
generated_ids = model.generate(**inputs, max_new_tokens=100)
caption = processor.decode(generated_ids[0], skip_special_tokens=True)
return {"caption": caption}
@app.post("/batch_caption")
async def batch_caption(files: list[UploadFile] = File(...)):
images = []
for file in files:
contents = await file.read()
images.append(Image.open(io.BytesIO(contents)).convert("RGB"))
inputs = processor(images=images, return_tensors="pt", padding=True)
inputs = inputs.to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=100)
captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
return {"captions": captions}
# Run: uvicorn server:app --host 0.0.0.0 --port 8000
```
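A small client sketch for the endpoints above, assuming the server is running on `localhost:8000` and a local `photo.jpg` exists:
```python
import requests

# Caption a single image; the optional question is sent as a query parameter
with open("photo.jpg", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/caption",
        files={"file": f},
        params={"question": "What is in this image?"},
    )
print(resp.json()["caption"])
```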
### LangChain integration
```python
from langchain.tools import BaseTool
from langchain.agents import initialize_agent, AgentType
from langchain.llms import OpenAI
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# Load BLIP-2 once at module level; BaseTool is a pydantic model, so storing the
# processor/model as undeclared instance attributes inside __init__ fails validation
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
    device_map="auto"
)

class ImageCaptionTool(BaseTool):
    name: str = "image_caption"
    description: str = "Generate a caption for an image. Input should be an image file path."

    def _run(self, image_path: str) -> str:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=50)
        return processor.decode(generated_ids[0], skip_special_tokens=True)

class VisualQATool(BaseTool):
    name: str = "visual_qa"
    description: str = "Answer questions about an image. Input format: 'image_path|question'"

    def _run(self, query: str) -> str:
        image_path, question = query.split("|", 1)
        image = Image.open(image_path.strip()).convert("RGB")
        inputs = processor(images=image, text=question.strip(), return_tensors="pt")
        inputs = inputs.to("cuda", torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=50)
        return processor.decode(generated_ids[0], skip_special_tokens=True)

# Use with an agent (llm must be a LangChain LLM, e.g. the imported OpenAI wrapper)
llm = OpenAI(temperature=0)
tools = [ImageCaptionTool(), VisualQATool()]
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)
```
## ONNX Export and Deployment
### Export to ONNX
```python
import torch
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# Example inputs
image = Image.open("example.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
# Export vision encoder
torch.onnx.export(
model.vision_model,
inputs["pixel_values"],
"blip2_vision.onnx",
input_names=["pixel_values"],
output_names=["image_embeds"],
dynamic_axes={
"pixel_values": {0: "batch_size"},
"image_embeds": {0: "batch_size"}
},
opset_version=14
)
```
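The exported vision encoder can be checked with ONNX Runtime. A minimal sketch; the export above is fp32, so the pixel values stay float32:
```python
import onnxruntime as ort

session = ort.InferenceSession(
    "blip2_vision.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
# Reuse the processed pixel_values from the export example
image_embeds = session.run(None, {"pixel_values": inputs["pixel_values"].numpy()})[0]
print(image_embeds.shape)  # (batch_size, sequence_length, hidden_size)
```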
### TensorRT optimization
```python
import tensorrt as trt

def build_engine(onnx_path, engine_path):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("Failed to parse ONNX model")
    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)  # Enable FP16
    # 1GB workspace; on TensorRT >= 8.4 use config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    config.max_workspace_size = 1 << 30
    engine = builder.build_serialized_network(network, config)
    with open(engine_path, 'wb') as f:
        f.write(engine)
build_engine("blip2_vision.onnx", "blip2_vision.trt")
```
## Specialized Use Cases
### Video captioning (frame-by-frame)
```python
import cv2
import torch
from PIL import Image
def caption_video(video_path, sample_rate=1):
    """Caption video by sampling `sample_rate` frames per second."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = max(int(fps / sample_rate), 1)  # Process every N-th frame
captions = []
frame_count = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
if frame_count % frame_interval == 0:
# Convert BGR to RGB
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
image = Image.fromarray(rgb_frame)
# Caption
inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=50)
caption = processor.decode(generated_ids[0], skip_special_tokens=True)
timestamp = frame_count / fps
captions.append({"timestamp": timestamp, "caption": caption})
frame_count += 1
cap.release()
return captions
# Usage
captions = caption_video("video.mp4", sample_rate=1) # 1 frame per second
for c in captions:
print(f"[{c['timestamp']:.1f}s] {c['caption']}")
```
### Document understanding
```python
def analyze_document(image_path):
"""Extract information from document image."""
image = Image.open(image_path).convert("RGB")
questions = [
"What type of document is this?",
"What is the title of this document?",
"What are the main sections?",
"Summarize the key information."
]
results = {}
for q in questions:
inputs = processor(images=image, text=q, return_tensors="pt").to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=100)
answer = processor.decode(generated_ids[0], skip_special_tokens=True)
results[q] = answer
return results
# Usage
doc_info = analyze_document("invoice.png")
for q, a in doc_info.items():
print(f"Q: {q}nA: {a}n")
```
### Medical image analysis
```python
def analyze_medical_image(image_path, modality="xray"):
"""Analyze medical images with specific prompts."""
image = Image.open(image_path).convert("RGB")
prompts = {
"xray": [
"Describe any abnormalities visible in this chest X-ray.",
"What anatomical structures are visible?",
"Is there any evidence of pathology?"
],
"ct": [
"Describe the CT scan findings.",
"What organs are visible in this slice?",
"Are there any masses or lesions?"
],
"mri": [
"Describe the MRI findings.",
"What tissues show abnormal signal intensity?",
"What is the most likely diagnosis?"
]
}
results = []
for prompt in prompts.get(modality, prompts["xray"]):
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda", torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=150)
answer = processor.decode(generated_ids[0], skip_special_tokens=True)
results.append({"question": prompt, "answer": answer})
return results
# Note: BLIP-2 is not trained on medical data - use specialized models for clinical use
```
## Evaluation
### Caption evaluation metrics
```python
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
def evaluate_captions(predictions, references):
"""
Evaluate generated captions against references.
Args:
predictions: dict {image_id: [caption]}
references: dict {image_id: [ref1, ref2, ...]}
"""
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(), "METEOR"),
(Rouge(), "ROUGE_L"),
(Cider(), "CIDEr"),
]
results = {}
for scorer, method in scorers:
score, _ = scorer.compute_score(references, predictions)
if isinstance(method, list):
for sc, m in zip(score, method):
results[m] = sc
else:
results[method] = score
return results
# Usage
preds = {0: ["a cat sitting on a mat"], 1: ["a dog running in the park"]}
refs = {0: ["a cat on a mat", "cat sitting"], 1: ["dog in park", "running dog"]}
scores = evaluate_captions(preds, refs)
print(scores)
```
### VQA evaluation
```python
def vqa_accuracy(predictions, ground_truths):
"""
VQA accuracy metric (soft accuracy from VQA challenge).
Args:
predictions: list of predicted answers
ground_truths: list of lists (multiple annotator answers)
"""
def compute_accuracy(pred, gts):
pred = pred.lower().strip()
gts = [gt.lower().strip() for gt in gts]
# Count matches
matches = sum(1 for gt in gts if pred == gt)
return min(matches / 3, 1.0) # Cap at 1.0
accuracies = []
for pred, gts in zip(predictions, ground_truths):
accuracies.append(compute_accuracy(pred, gts))
return sum(accuracies) / len(accuracies)
# Usage
preds = ["yes", "a dog", "blue"]
gts = [["yes", "yes", "no"], ["dog", "a dog", "puppy"], ["blue", "light blue", "azure"]]
acc = vqa_accuracy(preds, gts)
print(f"VQA Accuracy: {acc:.2%}")
```
## Model Comparison
### BLIP-2 variants benchmark
| Model | COCO Caption (CIDEr) | VQAv2 (Acc) | GQA (Acc) | VRAM |
|-------|---------------------|-------------|-----------|------|
| blip2-opt-2.7b | 129.7 | 52.6 | 41.3 | 8GB |
| blip2-opt-6.7b | 133.4 | 54.2 | 42.8 | 16GB |
| blip2-flan-t5-xl | 138.1 | 62.9 | 44.1 | 10GB |
| blip2-flan-t5-xxl | 145.8 | 65.0 | 45.9 | 26GB |
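For the larger variants, 8-bit quantization with bitsandbytes roughly halves the fp16 footprint shown above; a hedged sketch (exact savings depend on the model and hardware):
```python
import torch
from transformers import Blip2ForConditionalGeneration, Blip2Processor, BitsAndBytesConfig

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xxl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xxl",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)
```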
### Comparison with other models
| Model | Architecture | Zero-shot VQA | Training Cost |
|-------|-------------|---------------|---------------|
| BLIP-2 | Q-Former + LLM | Excellent | Low (Q-Former only) |
| LLaVA | Linear + LLM | Good | Medium |
| Flamingo | Perceiver + LLM | Excellent | High |
| InstructBLIP | Q-Former + LLM | Best | Low |
Source: claude-code-templates (MIT).