[ PROMPT_NODE_22681 ]

Tracking

[ SKILL_DOCUMENTATION ]

# Comprehensive Tracking Guide Complete guide to experiment tracking with MLflow. ## Table of Contents - Logging Parameters - Logging Metrics - Logging Artifacts - Logging Models - Autologging - Runs and Experiments - Searching and Comparing ## Logging Parameters ### Basic Parameter Logging ```python import mlflow with mlflow.start_run(): # Single parameter mlflow.log_param("learning_rate", 0.001) mlflow.log_param("batch_size", 32) mlflow.log_param("optimizer", "Adam") # Multiple parameters at once mlflow.log_params({ "epochs": 50, "dropout": 0.2, "weight_decay": 1e-4, "momentum": 0.9 }) ``` ### Structured Parameters ```python # Nested configuration config = { "model": { "architecture": "ResNet50", "pretrained": True, "num_classes": 10 }, "training": { "lr": 0.001, "batch_size": 32, "epochs": 50 }, "data": { "dataset": "ImageNet", "augmentation": True } } with mlflow.start_run(): # Log as flattened params for section, params in config.items(): for key, value in params.items(): mlflow.log_param(f"{section}.{key}", value) # Or log entire config as artifact mlflow.log_dict(config, "config.json") ``` ### Parameter Best Practices ```python with mlflow.start_run(): # ✅ Good: Log all hyperparameters mlflow.log_params({ "learning_rate": 0.001, "batch_size": 32, "optimizer": "Adam", "scheduler": "CosineAnnealing", "weight_decay": 1e-4 }) # ✅ Good: Log data info mlflow.log_params({ "dataset": "ImageNet", "train_samples": len(train_dataset), "val_samples": len(val_dataset), "num_classes": 1000 }) # ✅ Good: Log environment info mlflow.log_params({ "framework": "PyTorch 2.0", "cuda_version": torch.version.cuda, "gpu": torch.cuda.get_device_name(0) }) ``` ## Logging Metrics ### Time-Series Metrics ```python with mlflow.start_run(): for epoch in range(num_epochs): # Train train_loss, train_acc = train_epoch() # Validate val_loss, val_acc = validate() # Log metrics with step mlflow.log_metric("train_loss", train_loss, step=epoch) mlflow.log_metric("train_accuracy", train_acc, 
step=epoch) mlflow.log_metric("val_loss", val_loss, step=epoch) mlflow.log_metric("val_accuracy", val_acc, step=epoch) # Log learning rate current_lr = optimizer.param_groups[0]['lr'] mlflow.log_metric("learning_rate", current_lr, step=epoch) ``` ### Batch-Level Metrics ```python with mlflow.start_run(): global_step = 0 for epoch in range(num_epochs): for batch_idx, (data, target) in enumerate(train_loader): loss = train_batch(data, target) # Log every 100 batches if global_step % 100 == 0: mlflow.log_metric("batch_loss", loss, step=global_step) global_step += 1 # Log epoch metrics val_loss = validate() mlflow.log_metric("epoch_val_loss", val_loss, step=epoch) ``` ### Multiple Metrics at Once ```python with mlflow.start_run(): metrics = { "train_loss": 0.15, "val_loss": 0.18, "train_accuracy": 0.95, "val_accuracy": 0.92, "f1_score": 0.93, "precision": 0.94, "recall": 0.92 } mlflow.log_metrics(metrics, step=epoch) ``` ### Custom Metrics ```python def compute_custom_metrics(y_true, y_pred): """Compute custom evaluation metrics.""" from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score return { "accuracy": accuracy_score(y_true, y_pred), "f1_macro": f1_score(y_true, y_pred, average='macro'), "f1_weighted": f1_score(y_true, y_pred, average='weighted'), "precision": precision_score(y_true, y_pred, average='weighted'), "recall": recall_score(y_true, y_pred, average='weighted') } with mlflow.start_run(): predictions = model.predict(X_test) metrics = compute_custom_metrics(y_test, predictions) # Log all metrics mlflow.log_metrics(metrics) ``` ## Logging Artifacts ### Files and Directories ```python with mlflow.start_run(): # Log single file plt.savefig('loss_curve.png') mlflow.log_artifact('loss_curve.png') # Log directory os.makedirs('plots', exist_ok=True) plt.savefig('plots/train_loss.png') plt.savefig('plots/val_loss.png') mlflow.log_artifacts('plots') # Logs entire directory # Log to specific artifact path mlflow.log_artifact('model.pkl', 
artifact_path='models') # Stored at: artifacts/models/model.pkl ``` ### JSON and YAML ```python import json import yaml with mlflow.start_run(): # Log dict as JSON config = {"lr": 0.001, "batch_size": 32} mlflow.log_dict(config, "config.json") # Log as YAML with open('config.yaml', 'w') as f: yaml.dump(config, f) mlflow.log_artifact('config.yaml') ``` ### Text Files ```python with mlflow.start_run(): # Log training summary summary = f""" Training Summary: - Epochs: {num_epochs} - Final train loss: {final_train_loss:.4f} - Final val loss: {final_val_loss:.4f} - Best accuracy: {best_acc:.4f} - Training time: {training_time:.2f}s """ with open('summary.txt', 'w') as f: f.write(summary) mlflow.log_artifact('summary.txt') ``` ### Model Checkpoints ```python import torch with mlflow.start_run(): # Save checkpoint checkpoint = { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': loss, 'accuracy': accuracy } torch.save(checkpoint, f'checkpoint_epoch_{epoch}.pth') mlflow.log_artifact(f'checkpoint_epoch_{epoch}.pth', artifact_path='checkpoints') ``` ## Logging Models ### Framework-Specific Logging ```python # Scikit-learn import mlflow.sklearn with mlflow.start_run(): model = train_sklearn_model() mlflow.sklearn.log_model(model, "model") # PyTorch import mlflow.pytorch with mlflow.start_run(): model = train_pytorch_model() mlflow.pytorch.log_model(model, "model") # TensorFlow/Keras import mlflow.keras with mlflow.start_run(): model = train_keras_model() mlflow.keras.log_model(model, "model") # XGBoost import mlflow.xgboost with mlflow.start_run(): model = train_xgboost_model() mlflow.xgboost.log_model(model, "model") ``` ### Log Model with Signature ```python from mlflow.models.signature import infer_signature import mlflow.sklearn with mlflow.start_run(): model = train_model() # Infer signature from training data signature = infer_signature(X_train, model.predict(X_train)) # Log with signature 
mlflow.sklearn.log_model( model, "model", signature=signature ) ``` ### Log Model with Input Example ```python with mlflow.start_run(): model = train_model() # Log with input example input_example = X_train[:5] mlflow.sklearn.log_model( model, "model", signature=signature, input_example=input_example ) ``` ### Log Model to Registry ```python with mlflow.start_run(): model = train_model() # Log and register in one step mlflow.sklearn.log_model( model, "model", registered_model_name="my-classifier" # Register immediately ) ``` ## Autologging ### Enable Autologging ```python import mlflow # Enable for all frameworks mlflow.autolog() # Or framework-specific mlflow.sklearn.autolog() mlflow.pytorch.autolog() mlflow.keras.autolog() mlflow.xgboost.autolog() mlflow.lightgbm.autolog() ``` ### Autologging with Scikit-learn ```python import mlflow from sklearn.ensemble import RandomForestClassifier mlflow.sklearn.autolog() with mlflow.start_run(): model = RandomForestClassifier(n_estimators=100, max_depth=5) model.fit(X_train, y_train) # Automatically logs: # - Parameters: n_estimators, max_depth, etc. 
# - Metrics: training score, test score # - Model: pickled model # - Training time ``` ### Autologging with PyTorch Lightning ```python import mlflow import pytorch_lightning as pl mlflow.pytorch.autolog() with mlflow.start_run(): trainer = pl.Trainer(max_epochs=10) trainer.fit(model, datamodule=dm) # Automatically logs: # - Hyperparameters from model and trainer # - Training and validation metrics # - Model checkpoints ``` ### Disable Autologging ```python # Disable for specific framework mlflow.sklearn.autolog(disable=True) # Disable all mlflow.autolog(disable=True) ``` ### Configure Autologging ```python mlflow.sklearn.autolog( log_input_examples=True, # Log input examples log_model_signatures=True, # Log model signatures log_models=True, # Log models disable=False, exclusive=False, disable_for_unsupported_versions=False, silent=False ) ``` ## Runs and Experiments ### Create Experiment ```python # Create experiment experiment_id = mlflow.create_experiment( "my-experiment", artifact_location="s3://my-bucket/mlflow", tags={"project": "classification", "team": "ml-team"} ) # Set active experiment mlflow.set_experiment("my-experiment") # Get experiment experiment = mlflow.get_experiment_by_name("my-experiment") print(f"Experiment ID: {experiment.experiment_id}") ``` ### Nested Runs ```python # Parent run with mlflow.start_run(run_name="hyperparameter-tuning"): parent_run_id = mlflow.active_run().info.run_id # Child runs for lr in [0.001, 0.01, 0.1]: with mlflow.start_run(run_name=f"lr-{lr}", nested=True): mlflow.log_param("learning_rate", lr) model = train(lr) accuracy = evaluate(model) mlflow.log_metric("accuracy", accuracy) ``` ### Run Tags ```python with mlflow.start_run(): # Set tags mlflow.set_tags({ "model_type": "ResNet50", "dataset": "ImageNet", "git_commit": get_git_commit(), "developer": "developer@example.com" }) # Single tag mlflow.set_tag("production_ready", "true") ``` ### Run Notes ```python with mlflow.start_run(): # Add notes 
mlflow.set_tag("mlflow.note.content", """ ## Experiment Notes - Using pretrained ResNet50 - Fine-tuning last 2 layers - Data augmentation: random flip, crop, rotation - Learning rate schedule: cosine annealing ## Results - Best validation accuracy: 95.2% - Converged after 35 epochs """) ``` ## Searching and Comparing ### Search Runs ```python from mlflow.tracking import MlflowClient client = MlflowClient() # Get experiment experiment = mlflow.get_experiment_by_name("my-experiment") experiment_id = experiment.experiment_id # Search all runs runs = client.search_runs( experiment_ids=[experiment_id], filter_string="", order_by=["metrics.accuracy DESC"], max_results=10 ) for run in runs: print(f"Run ID: {run.info.run_id}") print(f"Accuracy: {run.data.metrics.get('accuracy', 'N/A')}") print(f"Params: {run.data.params}") print("---") ``` ### Filter Runs ```python # Filter by metric runs = client.search_runs( experiment_ids=[experiment_id], filter_string="metrics.accuracy > 0.9" ) # Filter by parameter runs = client.search_runs( experiment_ids=[experiment_id], filter_string="params.model = 'ResNet50'" ) # Complex filter (conditions are combined with AND; parameter values are stored as strings, so compare them with a quoted value) runs = client.search_runs( experiment_ids=[experiment_id], filter_string=""" metrics.accuracy > 0.9 AND params.learning_rate = '0.001' """ ) ``` ### Compare Best Runs ```python def compare_best_runs(experiment_name, metric="accuracy", top_n=5): """Print the top N runs of an experiment ranked by a metric.""" experiment = mlflow.get_experiment_by_name(experiment_name) runs = client.search_runs( experiment_ids=[experiment.experiment_id], filter_string=f"metrics.{metric} > 0", order_by=[f"metrics.{metric} DESC"], max_results=top_n ) print(f"Top {top_n} runs by {metric}:") print("-" * 80) for i, run in enumerate(runs, 1): print(f"{i}. Run ID: {run.info.run_id}") print(f" {metric}: {run.data.metrics.get(metric, 'N/A')}") print(f" Params: {run.data.params}") print() compare_best_runs("my-experiment", metric="accuracy", top_n=5) ``` ### Download Artifacts ```python client = MlflowClient() # Download artifact run_id = "abc123" local_path = client.download_artifacts(run_id, "model") print(f"Downloaded to: {local_path}") # Download specific file local_file = client.download_artifacts(run_id, "plots/loss_curve.png") ``` ## Best Practices ### 1. 
Use Descriptive Names ```python # ✅ Good: Descriptive experiment and run names mlflow.set_experiment("sentiment-analysis-bert") with mlflow.start_run(run_name="bert-base-lr1e-5-bs32-epochs10"): train() # ❌ Bad: Generic names mlflow.set_experiment("experiment1") with mlflow.start_run(): train() ``` ### 2. Log Comprehensive Metadata ```python with mlflow.start_run(): # Hyperparameters mlflow.log_params(config) # System info mlflow.set_tags({ "git_commit": get_git_commit(), "framework": f"PyTorch {torch.__version__}", "cuda": torch.version.cuda, "gpu": torch.cuda.get_device_name(0) }) # Data info mlflow.log_params({ "train_samples": len(train_dataset), "val_samples": len(val_dataset), "num_classes": num_classes }) ``` ### 3. Track Time ```python import time with mlflow.start_run(): start_time = time.time() # Training model = train() # Log training time training_time = time.time() - start_time mlflow.log_metric("training_time_seconds", training_time) ``` ### 4. Version Control Integration ```python import subprocess def get_git_commit(): """Get current git commit hash.""" try: return subprocess.check_output( ['git', 'rev-parse', 'HEAD'] ).decode('ascii').strip() except: return "unknown" with mlflow.start_run(): mlflow.set_tag("git_commit", get_git_commit()) mlflow.set_tag("git_branch", get_git_branch()) ``` ### 5. Error Handling ```python with mlflow.start_run(): try: model = train() mlflow.set_tag("status", "completed") except Exception as e: mlflow.set_tag("status", "failed") mlflow.set_tag("error", str(e)) raise ``` ## Resources - **Tracking API**: https://mlflow.org/docs/latest/tracking.html - **Python API**: https://mlflow.org/docs/latest/python_api/mlflow.html - **Examples**: https://github.com/mlflow/mlflow/tree/master/examples

Source: claude-code-templates (MIT). See About Us for full credits.

BAGUA AI