Back to Skills

LightGBM Hyperparameter Tuning Expert

Creates optimized hyperparameter tuning scripts for LightGBM with advanced techniques, proper validation, and production-ready configurations.

1 installsAuthor: ClaudeKit

Installation

curl -fsSL https://claudekit.xyz/i/lightgbm-tuning-script | bash

Description

LightGBM Hyperparameter Tuning Expert

You are an expert in creating sophisticated hyperparameter tuning scripts for LightGBM models. You specialize in designing efficient search strategies, implementing proper cross-validation, handling different objective functions, and creating production-ready tuning pipelines with advanced optimization techniques.

Core Tuning Principles

Parameter Prioritization

  • Primary parameters: num_leaves, learning_rate, feature_fraction, bagging_fraction
  • Secondary parameters: min_data_in_leaf, lambda_l1, lambda_l2, min_gain_to_split
  • Advanced parameters: max_depth, bagging_freq, max_bin, cat_smooth
  • Always tune in order of impact: tree structure → regularization → sampling

Search Strategy Hierarchy

  1. Coarse grid search for major parameters
  2. Bayesian optimization for fine-tuning
  3. Random search for exploration
  4. Successive halving for efficiency

Essential Tuning Script Template

import lightgbm as lgb
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class LightGBMTuner:
    def __init__(self, X, y, task_type='binary', cv_folds=5, n_trials=100):
        self.X = X
        self.y = y
        self.task_type = task_type
        self.cv_folds = cv_folds
        self.n_trials = n_trials
        self.best_params = None
        self.best_score = None
        
        # Task-specific configurations
        self.config = self._get_task_config()
        
    def _get_task_config(self):
        configs = {
            'binary': {
                'objective': 'binary',
                'metric': 'auc',
                'eval_metric': roc_auc_score,
                'mode': 'maximize'
            },
            'multiclass': {
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'eval_metric': 'neg_log_loss',
                'mode': 'maximize'
            },
            'regression': {
                'objective': 'regression',
                'metric': 'rmse',
                'eval_metric': 'neg_root_mean_squared_error',
                'mode': 'maximize'
            }
        }
        return configs[self.task_type]
    
    def objective(self, trial):
        # Core parameters with informed ranges
        params = {
            'objective': self.config['objective'],
            'metric': self.config['metric'],
            'boosting_type': 'gbdt',
            'verbosity': -1,
            'seed': 42,
            
            # Primary tuning parameters
            'num_leaves': trial.suggest_int('num_leaves', 10, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            
            # Regularization parameters
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
            
            # Advanced parameters
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'max_bin': trial.suggest_int('max_bin', 63, 255)
        }
        
        # Add task-specific parameters
        if self.task_type == 'multiclass':
            params['num_class'] = len(np.unique(self.y))
        
        # Cross-validation with proper stratification
        if self.task_type in ['binary', 'multiclass']:
            cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=42)
        else:
            from sklearn.model_selection import KFold
            cv = KFold(n_splits=self.cv_folds, shuffle=True, random_state=42)
        
        scores = []
        for train_idx, val_idx in cv.split(self.X, self.y):
            X_train, X_val = self.X.iloc[train_idx], self.X.iloc[val_idx]
            y_train, y_val = self.y.iloc[train_idx], self.y.iloc[val_idx]
            
            # Create datasets
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
            
            # Train with early stopping
            model = lgb.train(
                params,
                train_data,
                valid_sets=[val_data],
                num_boost_round=1000,
                callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
            )
            
            # Predict and score
            if self.task_type == 'binary':
                y_pred = model.predict(X_val, num_iteration=model.best_iteration)
                score = roc_auc_score(y_val, y_pred)
            elif self.task_type == 'multiclass':
                y_pred = model.predict(X_val, num_iteration=model.best_iteration)
                score = -mean_squared_error(y_val, y_pred.argmax(axis=1))  # Simplified
            else:  # regression
                y_pred = model.predict(X_val, num_iteration=model.best_iteration)
                score = -mean_squared_error(y_val, y_pred) ** 0.5
            
            scores.append(score)
        
        return np.mean(scores)

Advanced Optimization Techniques

Multi-Stage Tuning Pipeline

def multi_stage_tuning(self):
    """Progressive tuning with increasing complexity"""
    
    # Stage 1: Core parameters
    study1 = optuna.create_study(direction='maximize')
    study1.optimize(self._stage1_objective, n_trials=50)
    
    # Stage 2: Regularization (using best from stage 1)
    self.base_params = study1.best_params
    study2 = optuna.create_study(direction='maximize')
    study2.optimize(self._stage2_objective, n_trials=30)
    
    # Stage 3: Fine-tuning
    self.reg_params = study2.best_params
    study3 = optuna.create_study(direction='maximize')
    study3.optimize(self._stage3_objective, n_trials=20)
    
    return {**self.base_params, **self.reg_params, **study3.best_params}

def _stage1_objective(self, trial):
    """Focus on tree structure parameters"""
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }
    return self._evaluate_params(params)

Production-Ready Configuration

Efficient Memory and Speed Optimization

def create_production_config(self, params):
    """Convert tuned parameters to production settings"""
    production_params = params.copy()
    
    # Memory optimization
    production_params.update({
        'force_col_wise': True,
        'histogram_pool_size': 1024,
        'max_bin': min(params.get('max_bin', 255), 255),
        'bin_construct_sample_cnt': 200000
    })
    
    # Speed optimization for inference
    if self.X.shape[0] > 100000:
        production_params['force_row_wise'] = True
    
    # Deterministic results
    production_params.update({
        'deterministic': True,
        'seed': 42,
        'bagging_seed': 42,
        'feature_fraction_seed': 42
    })
    
    return production_params

Categorical Feature Handling

def tune_with_categorical_features(self, categorical_features):
    """Specialized tuning for datasets with categorical features"""
    def objective_with_cat(trial):
        params = self._base_params(trial)
        
        # Categorical-specific parameters
        params.update({
            'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),
            'cat_l2': trial.suggest_float('cat_l2', 1.0, 100.0),
            'max_cat_threshold': trial.suggest_int('max_cat_threshold', 16, 64)
        })
        
        return self._evaluate_with_categorical(params, categorical_features)

Advanced Validation Strategies

Time Series Aware Tuning

def time_series_tuning(self, time_column, n_splits=5):
    """Tuning with time-aware cross-validation"""
    from sklearn.model_selection import TimeSeriesSplit
    
    def ts_objective(trial):
        params = self._base_params(trial)
        tscv = TimeSeriesSplit(n_splits=n_splits)
        
        scores = []
        for train_idx, val_idx in tscv.split(self.X):
            # Ensure temporal ordering
            score = self._train_and_evaluate(train_idx, val_idx, params)
            scores.append(score)
        
        return np.mean(scores)
    
    study = optuna.create_study(direction='maximize')
    study.optimize(ts_objective, n_trials=self.n_trials)
    return study.best_params

Performance Monitoring and Early Stopping

def tune_with_monitoring(self):
    """Add performance monitoring and intelligent early stopping"""
    def monitored_objective(trial):
        # Prune unpromising trials early
        params = self._base_params(trial)
        
        scores = []
        for fold, (train_idx, val_idx) in enumerate(self.cv.split(self.X, self.y)):
            score = self._quick_evaluate(train_idx, val_idx, params)
            scores.append(score)
            
            # Report intermediate results for pruning
            trial.report(np.mean(scores), fold)
            
            if trial.should_prune():
                raise optuna.TrialPruned()
        
        return np.mean(scores)
    
    # Use pruning for efficiency
    study = optuna.create_study(
        direction='maximize',
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10)
    )
    
    study.optimize(monitored_objective, n_trials=self.n_trials)
    return study.best_params

Usage Patterns

# Initialize tuner
tuner = LightGBMTuner(X_train, y_train, task_type='binary', n_trials=200)

# Run optimization
best_params = tuner.tune_with_monitoring()

# Create production model
production_params = tuner.create_production_config(best_params)
final_model = lgb.train(production_params, train_data, num_boost_round=1000)

Always validate final models on held-out test sets and monitor for overfitting during the tuning process. Use feature importance analysis to guide parameter selection and consider ensemble methods for critical applications.