# https://www.kaggle.com/competitions/microsoft-malware-prediction/discussion/76013
# Sort by predicted probability; for every (positive, negative) pair, count how many
# times the positive sample is scored higher, then divide by the total number of
# pairs m*n. (Probabilistic interpretation of AUC, similar to an inverse Monte Carlo.)
import numpy as np
def calculate_auc(y_true, y_prob):
    """Compute ROC-AUC via the rank-statistic (Mann-Whitney U) formulation.

    AUC equals the probability that a randomly chosen positive sample is
    scored higher than a randomly chosen negative one, with ties counted
    as half a win. Using tie-averaged ranks fixes the original version,
    which credited tied pairs as full wins/losses depending on the
    arbitrary sort order.

    Parameters
    ----------
    y_true : 1D array-like of 0/1 labels.
    y_prob : 1D array-like of predicted scores/probabilities.

    Returns
    -------
    float : the AUC. Raises ZeroDivisionError if only one class is present.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n = y_true.size
    # Tie-aware average ranks (1-based): samples sharing a score all get
    # the mean of the rank positions their group occupies.
    _, inverse, counts = np.unique(y_prob, return_inverse=True, return_counts=True)
    group_end = np.cumsum(counts)                 # last rank in each tie group
    avg_rank = group_end - (counts - 1) / 2.0     # mean rank of each tie group
    ranks = avg_rank[inverse]
    n_positive = np.sum(y_true)
    n_negative = n - n_positive
    # U statistic: sum of positive ranks minus the minimum possible sum,
    # normalized by the number of positive/negative pairs.
    u = ranks[y_true == 1].sum() - n_positive * (n_positive + 1) / 2.0
    return u / (n_negative * n_positive)
Another approach: compute the AUC directly from the TPR/FPR points of the ROC curve.
# https://stackoverflow.com/questions/39537443/how-to-calculate-a-partial-area-under-the-curve-auc
import numpy as np
def calculate_auc_tpr_fpr(y_true, y_prob):
    """Compute ROC-AUC by integrating the ROC curve with the trapezoidal rule.

    Fixes two defects of the naive per-sample version:
    - samples sharing the same score must form a single ROC point
      (one threshold), otherwise ties distort the area
      (e.g. y=[1,0], p=[.5,.5] gave 1.0 instead of 0.5);
    - the curve must start at the origin (0, 0).

    Parameters
    ----------
    y_true : 1D array-like of 0/1 labels.
    y_prob : 1D array-like of predicted scores/probabilities.

    Returns
    -------
    float : area under the ROC curve.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    # Sort by predicted score, highest first (mergesort for determinism).
    order = np.argsort(y_prob, kind="mergesort")[::-1]
    labels = y_true[order]
    scores = y_prob[order]
    tp = np.cumsum(labels)        # cumulative true positives
    fp = np.cumsum(1 - labels)    # cumulative false positives
    # Keep only the last cumulative count of each tied-score group:
    # one ROC point per distinct threshold.
    last_in_group = np.r_[scores[1:] != scores[:-1], True]
    tp = tp[last_in_group]
    fp = fp[last_in_group]
    n_positive = tp[-1]
    n_negative = fp[-1]
    # Prepend the (0, 0) origin so the first trapezoid is included.
    tpr = np.r_[0.0, tp / n_positive]
    fpr = np.r_[0.0, fp / n_negative]
    return np.trapz(tpr, fpr)
4. KS
Kolmogorov-Smirnov, a metric commonly used in credit-risk modeling.
The KS curve plots TPR and FPR against the decision threshold; the KS statistic is max(TPR - FPR).
5. average precision
AP (average precision) is the area under the Precision-Recall Curve (PRC).
import numpy as np
def average_precision_score(y_true, y_scores):
    """Calculate the average precision (AP) score.

    AP summarizes the precision-recall curve as the weighted mean of
    precisions at each recall step: AP = sum_n (R_n - R_{n-1}) * P_n.
    Since every positive hit raises recall by exactly 1/num_positives,
    this equals the mean of the precision values at the ranks where a
    positive sample appears.

    This replaces the original np.trapz integration, which silently
    dropped the area from recall=0 to the first point (e.g. y_true=[1],
    y_scores=[0.9] returned 0.0 instead of 1.0) and which linearly
    interpolates the PR curve — a known over-optimistic estimate.

    Parameters
    ----------
    y_true : 1D array-like, true binary labels (0 or 1).
    y_scores : 1D array-like, predicted scores or probabilities for the
        positive class. NOTE: tied scores are broken by input order
        (stable sort), same as the original implementation.

    Returns
    -------
    float : the average precision. Raises ZeroDivisionError if y_true
        contains no positive samples (same as the original).
    """
    # Rank samples by descending score; Python's sort is stable, so ties
    # keep their input order.
    ranked = sorted(zip(y_true, y_scores), key=lambda pair: pair[1], reverse=True)
    num_positive_examples = sum(y_true)
    true_positives = 0
    precision_sum = 0.0
    for rank, (true_label, _score) in enumerate(ranked, start=1):
        if true_label == 1:
            true_positives += 1
            # Precision at this rank, weighted by the recall step 1/P.
            precision_sum += true_positives / rank
    return precision_sum / num_positive_examples