Risk Minimization

#!pip install numpy pandas scikit-learn xgboost matplotlib seaborn
import numpy as np
import pandas as pd

# Set a random seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 1000

# Generate ovd_days (overdue days)
ovd_days = np.random.randint(0, 120, size=n_samples)

# Generate ovd30, ovd60, ovd90 based on ovd_days
ovd30 = np.where(ovd_days >= 30, np.random.uniform(100, 1000, size=n_samples), 0)
ovd60 = np.where(ovd_days >= 60, np.random.uniform(100, 1000, size=n_samples), 0)
ovd90 = np.where(ovd_days >= 90, np.random.uniform(100, 1000, size=n_samples), 0)

# Generate additional 16 features (e.g., income, age, credit score, etc.)
additional_features = {
    f'feature_{i}': np.random.uniform(0, 1, size=n_samples)
    for i in range(1, 17)
}

# Define the target variable 'default'
default = np.where(ovd_days >= 90, 1, 0)

# Create a DataFrame
data = pd.DataFrame({
    'ovd_days': ovd_days,
    'ovd30': ovd30,
    'ovd60': ovd60,
    'ovd90': ovd90,
    'default': default
})

# Add additional features to the DataFrame
for feature_name, values in additional_features.items():
    data[feature_name] = values

# Display the first few rows
print(data.head())
   ovd_days       ovd30       ovd60       ovd90  default  feature_1  \
0       102  347.258617  365.750460  828.264854        1   0.170115   
1        51  286.504884    0.000000    0.000000        0   0.898238   
2        92  890.398601  266.705602  730.030845        1   0.776803   
3        14    0.000000    0.000000    0.000000        0   0.076480   
4       106  142.206821  482.074683  459.190169        1   0.986729   

   feature_2  feature_3  feature_4  feature_5  ...  feature_7  feature_8  \
0   0.772671   0.910665   0.970555   0.874897  ...   0.210387   0.825302   
1   0.714900   0.435672   0.558953   0.771783  ...   0.290295   0.066410   
2   0.578978   0.129086   0.336394   0.417907  ...   0.786544   0.170860   
3   0.014609   0.742299   0.519022   0.748406  ...   0.159390   0.311121   
4   0.360375   0.643104   0.979263   0.961335  ...   0.044092   0.320699   

   feature_9  feature_10  feature_11  feature_12  feature_13  feature_14  \
0   0.452029    0.665473    0.268009    0.653720    0.697315    0.424151   
1   0.101968    0.616738    0.053777    0.278180    0.597098    0.242181   
2   0.073059    0.625900    0.924372    0.472927    0.488936    0.327334   
3   0.462255    0.296270    0.661721    0.846581    0.665769    0.985358   
4   0.301597    0.415145    0.886623    0.335464    0.371363    0.368168   

   feature_15  feature_16  
0    0.601807    0.902878  
1    0.130799    0.234079  
2    0.582533    0.547901  
3    0.088851    0.707345  
4    0.490584    0.271884  

[5 rows x 21 columns]
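Before modeling, note that default is a deterministic function of ovd_days, and that ovd90 is non-zero on exactly the same rows. A quick sanity check on the frame built above (a sketch) makes the dependency explicit:

# The target is exactly the ovd_days >= 90 indicator, and ovd90 encodes it too.
assert (data['default'] == (data['ovd_days'] >= 90).astype(int)).all()
assert ((data['ovd90'] > 0) == (data['default'] == 1)).all()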
import matplotlib.pyplot as plt
import seaborn as sns

# Check for missing values
print("Missing values in each column:")
print(data.isnull().sum())

# Distribution of the target variable
sns.countplot(x='default', data=data)
plt.title('Distribution of Default Variable')
plt.show()

# Correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), cmap='coolwarm', annot=False)
plt.title('Correlation Heatmap')
plt.show()
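A numeric complement to the count plot: with ovd_days uniform on [0, 120) and a cutoff at 90, roughly a quarter of the rows should be defaults. A one-line check (sketch):

# Class balance as proportions; expect roughly 0.75 / 0.25.
print(data['default'].value_counts(normalize=True))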
# Data Pre-processing
from sklearn.preprocessing import StandardScaler

# Separate features and target
X = data.drop('default', axis=1)
y = data['default']

# Feature Scaling
# Note: fitting the scaler on the full dataset before splitting leaks test-set
# statistics into training; a leakage-safe variant is sketched after the split below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the dataset into training and testing sets.
from sklearn.model_selection import train_test_split

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
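As noted above, a leakage-safe alternative fits the scaler on the training split only. A minimal sketch, assuming the unscaled X and y from above (the Xr_* names are illustrative):

# Leakage-safe scaling (sketch): statistics come from the training rows only.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler_safe = StandardScaler().fit(Xr_train)
Xr_train_scaled = scaler_safe.transform(Xr_train)
Xr_test_scaled = scaler_safe.transform(Xr_test)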
# !pip install xgboost
# Building the model
import xgboost as xgb
from xgboost import XGBClassifier

# Initialize the model
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    # use_label_encoder=False,
    eval_metric='logloss'
)

# Train the model
model.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Predictions on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# ROC AUC Score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC AUC Score: {roc_auc:.2f}')

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
Accuracy: 1.00
[Figure: confusion matrix heatmap]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       153
           1       1.00      1.00      1.00        47

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

ROC AUC Score: 1.00
[Figure: ROC curve]

Both scores are perfect by construction: the target is defined as default = 1 exactly when ovd_days >= 90, and ovd_days (along with ovd90, which is non-zero on precisely those rows) is included as a feature, so the model only has to learn that cutoff. This is label leakage, not genuine predictive power.
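A quick diagnostic for suspiciously perfect scores is to look at what the model relies on. A sketch using the fitted model and the column names from X:

# Feature importances (sketch): expect ovd_days / ovd90 to dominate,
# confirming the label leaks through them rather than any genuine signal.
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head())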

Model Comparisons

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Generate synthetic data
np.random.seed(42)
n_samples = 1000
ovd_days = np.random.randint(0, 120, size=n_samples)
ovd30 = np.where(ovd_days >= 30, np.random.uniform(100, 1000, size=n_samples), 0)
ovd60 = np.where(ovd_days >= 60, np.random.uniform(100, 1000, size=n_samples), 0)
ovd90 = np.where(ovd_days >= 90, np.random.uniform(100, 1000, size=n_samples), 0)
additional_features = {f'feature_{i}': np.random.uniform(0, 1, size=n_samples) for i in range(1, 17)}
default = np.where(ovd_days >= 90, 1, 0)

# Create DataFrame
data = pd.DataFrame({'ovd_days': ovd_days, 'ovd30': ovd30, 'ovd60': ovd60, 'ovd90': ovd90, 'default': default})
for feature_name, values in additional_features.items():
    data[feature_name] = values

# Splitting data into train, validation, and holdout sets
X = data.drop('default', axis=1)
y = data['default']

# Train-Validation-Holdout split (60-20-20)
X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Holdout set size: {X_holdout.shape[0]}")
Training set size: 600
Validation set size: 200
Holdout set size: 200
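Since defaults are only about a quarter of the rows, a stratified split keeps that share identical across all three sets. A variant sketch (the *_s names are illustrative and not used below):

# Stratified variant (sketch): each split preserves the class ratio.
X_trf_s, X_ho_s, y_trf_s, y_ho_s = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_tr_s, X_val_s, y_tr_s, y_val_s = train_test_split(
    X_trf_s, y_trf_s, test_size=0.25, random_state=42, stratify=y_trf_s
)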
#!pip install catboost lightgbm
# Individual Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Instantiate individual models
logistic_model = LogisticRegression(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
xgboost_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, eval_metric='logloss')
lightgbm_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
catboost_model = CatBoostClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbose=0)
# Stacking
# For stacking, we will combine the predictions of individual models using a meta-model.
from sklearn.ensemble import StackingClassifier

# Define base models for stacking
base_models = [
    ('logistic', logistic_model),
    ('decision_tree', decision_tree),
    ('random_forest', random_forest),
    ('xgboost', xgboost_model),
    ('lightgbm', lightgbm_model),
    ('catboost', catboost_model)
]

# Meta-model: Logistic Regression
meta_model = LogisticRegression(random_state=42)

# Stacking Classifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
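With cv=5, the stacking classifier trains its meta-model on out-of-fold base-model predictions rather than on in-sample fits. Up to implementation details, that step amounts to the following sketch (oof_features is an illustrative name):

# Out-of-fold stacking features (sketch): one column per base model,
# holding its cross-validated probability for the positive class.
from sklearn.model_selection import cross_val_predict

oof_features = np.column_stack([
    cross_val_predict(est, X_train, y_train, cv=5, method='predict_proba')[:, 1]
    for _, est in base_models
])
# meta_model.fit(oof_features, y_train)  # roughly what StackingClassifier does internally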
# Training all models
models = {
    "Logistic Regression": logistic_model,
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "XGBoost": xgboost_model,
    "LightGBM": lightgbm_model.set_params(verbose=-1),  # Set verbose to -1 to silence output
    "CatBoost": catboost_model,
    "Stacking Model": stacking_model
}

# Train each model on the training set
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training XGBoost...
Training LightGBM...
Training CatBoost...
Training Stacking Model...
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

def evaluate_model(model, X, y):
    """
    Evaluate the model on the given dataset and return various performance metrics.
    
    Parameters:
    - model: Trained model to evaluate
    - X: Feature matrix
    - y: True labels
    
    Returns:
    - accuracy: Accuracy score of the model
    - roc_auc: ROC AUC score
    - precision: Precision score
    - recall: Recall score
    """
    predictions = model.predict(X)
    proba = model.predict_proba(X)[:, 1]  # Probability estimates for the positive class
    accuracy = accuracy_score(y, predictions)
    roc_auc = roc_auc_score(y, proba)
    precision = precision_score(y, predictions)
    recall = recall_score(y, predictions)
    return accuracy, roc_auc, precision, recall

# Evaluating models on the validation set
validation_rows = []
for name, model in models.items():
    accuracy, roc_auc, precision, recall = evaluate_model(model, X_val, y_val)
    validation_rows.append({"Model": name, "Accuracy": accuracy, "ROC AUC": roc_auc,
                            "Precision": precision, "Recall": recall})
validation_results = pd.DataFrame(validation_rows)

# Evaluate each model on the holdout set
holdout_rows = []
for name, model in models.items():
    accuracy, roc_auc, precision, recall = evaluate_model(model, X_holdout, y_holdout)
    holdout_rows.append({"Model": name, "Accuracy": accuracy, "ROC AUC": roc_auc,
                         "Precision": precision, "Recall": recall})
holdout_results = pd.DataFrame(holdout_rows)

# Display holdout results
print(holdout_results)
                 Model  Accuracy  ROC AUC  Precision  Recall
0  Logistic Regression       1.0      1.0        1.0     1.0
1        Decision Tree       1.0      1.0        1.0     1.0
2        Random Forest       1.0      1.0        1.0     1.0
3              XGBoost       1.0      1.0        1.0     1.0
4             LightGBM       1.0      1.0        1.0     1.0
5             CatBoost       1.0      1.0        1.0     1.0
6       Stacking Model       1.0      1.0        1.0     1.0

Every model posts perfect scores on both the validation and holdout sets for the same reason as before: ovd_days and ovd90 deterministically encode the default label, so this comparison says nothing about the models' relative strength on realistic data.
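To make the comparison meaningful, one can drop every column that encodes the label and refit. A closing sketch reusing evaluate_model from above (the Xc_* names are illustrative); with only the random feature_1..feature_16 left, all metrics should fall to roughly chance level:

# Leakage-free re-run (sketch): remove the columns derived from ovd_days.
X_clean = data.drop(columns=['default', 'ovd_days', 'ovd30', 'ovd60', 'ovd90'])
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42
)
clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5,
                        random_state=42, eval_metric='logloss')
clf.fit(Xc_train, yc_train)
print(evaluate_model(clf, Xc_test, yc_test))  # expect accuracy/AUC near chance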