project:
```python
import sys
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns
def load_data(path=None):
    """Return a ``(features, labels)`` pair for the benchmark.

    Args:
        path: Optional location of a CSV file. When truthy, the file is read
            with pandas and its label column is split off from the features;
            a ``target`` column is preferred and ``class`` is the fallback.
            When falsy, scikit-learn's breast-cancer dataset is loaded
            instead.

    Returns:
        A tuple ``(X, y)``: ``X`` is a DataFrame of feature columns and ``y``
        is a Series holding the labels.

    Raises:
        ValueError: If the CSV has neither a ``target`` nor a ``class`` column.
    """
    if not path:
        bunch = load_breast_cancer()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        labels = pd.Series(bunch.target)
        return features, labels

    frame = pd.read_csv(path)
    # "target" takes precedence over "class" when both columns are present.
    label_col = next(
        (candidate for candidate in ('target', 'class') if candidate in frame.columns),
        None,
    )
    if label_col is None:
        raise ValueError('CSV must contain a "target" or "class" column')
    return frame.drop(columns=[label_col]), frame[label_col]
def build_models(random_state=42):
    """Build the candidate classifiers, each wrapped in a scaling pipeline.

    Args:
        random_state: Seed forwarded to every estimator that accepts one
            (logistic regression, SVC, random forest, gradient boosting) so
            that repeated runs produce the same fitted models. The default
            matches the seed used for the train/test split and the
            cross-validation splitter in this script. Pass ``None`` to get
            unseeded estimators.

    Returns:
        dict: Model name -> unfitted ``Pipeline`` with two steps, ``'s'``
        (a ``StandardScaler``) and ``'m'`` (the classifier).
    """
    classifiers = {
        'LogisticRegression': LogisticRegression(max_iter=10000, random_state=random_state),
        # probability=True makes SVC fit an internal cross-validated
        # calibration, which is randomised unless a seed is given.
        'SVM': SVC(probability=True, random_state=random_state),
        'RandomForest': RandomForestClassifier(n_estimators=200, random_state=random_state),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=200, random_state=random_state),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'GaussianNB': GaussianNB(),
    }
    # Every pipeline gets its own scaler instance; nothing is shared between models.
    return {
        name: Pipeline([('s', StandardScaler()), ('m', classifier)])
        for name, classifier in classifiers.items()
    }
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Args:
        models: Mapping of model name to an estimator exposing ``fit`` and
            ``predict`` plus either ``predict_proba`` or ``decision_function``.
            The estimators are fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        dict: Model name -> dict with keys ``model`` (the fitted estimator),
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc``, ``cm``
        (confusion matrix), ``y_proba`` (positive-class scores) and
        ``y_pred`` (hard predictions).
    """
    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        if not hasattr(estimator, "predict_proba"):
            # No probabilities available: min-max rescale the raw decision
            # scores; the tiny epsilon avoids dividing by zero when all
            # scores are equal.
            raw_scores = estimator.decision_function(X_test)
            low, high = raw_scores.min(), raw_scores.max()
            scores = (raw_scores - low) / (high - low + 1e-12)
        else:
            # Column 1 holds the score of the second class.
            scores = estimator.predict_proba(X_test)[:, 1]

        results[label] = {
            'model': estimator,
            'accuracy': accuracy_score(y_test, predictions),
            'precision': precision_score(y_test, predictions),
            'recall': recall_score(y_test, predictions),
            'f1': f1_score(y_test, predictions),
            'auc': roc_auc_score(y_test, scores),
            'cm': confusion_matrix(y_test, predictions),
            'y_proba': scores,
            'y_pred': predictions,
        }
    return results
def cross_validate_models(models, X, y, folds=5):
    """Compute per-fold ROC AUC for every model with stratified K-fold CV.

    Args:
        models: Mapping of model name to estimator.
        X: Feature matrix.
        y: Labels.
        folds: Number of stratified folds (default 5).

    Returns:
        dict: Model name -> the per-fold scores returned by
        ``cross_val_score`` with ``scoring='roc_auc'``.
    """
    # One shuffled, seeded splitter is shared so every model sees the same folds.
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    return {
        label: cross_val_score(estimator, X, y, cv=splitter, scoring='roc_auc')
        for label, estimator in models.items()
    }
def plot_roc_curves(results, y_test):
    """Draw the ROC curve of every evaluated model on one figure and show it.

    Args:
        results: Mapping of model name to a dict containing at least
            ``y_proba`` (scores for the test samples) and ``auc``.
        y_test: True labels of the test samples.
    """
    plt.figure(figsize=(8, 6))

    for label, entry in results.items():
        false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, entry['y_proba'])
        legend_text = f"{label} (AUC={entry['auc']:.3f})"
        plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

    # Dashed diagonal as a visual reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.4)

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.tight_layout()
    plt.show()
def plot_confusion_matrices(results):
    """Show one annotated confusion-matrix heatmap per model in a 3-wide grid.

    Args:
        results: Mapping of model name to a dict containing at least ``cm``,
            the confusion matrix to draw.
    """
    n_cols = 3
    # Ceiling division: enough rows to hold every model.
    n_rows = -(-len(results) // n_cols)
    plt.figure(figsize=(4 * n_cols, 3 * n_rows))

    # Subplot positions are 1-based.
    for position, (label, entry) in enumerate(results.items(), start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.heatmap(entry['cm'], annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title(label)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')

    plt.tight_layout()
    plt.show()
def print_summary(results, cv_scores=None):
    """Print a metric table for all models, highest ROC AUC first.

    Args:
        results: Mapping of model name to a dict with the keys ``accuracy``,
            ``precision``, ``recall``, ``f1`` and ``auc``.
        cv_scores: Optional mapping of model name to an array of per-fold
            scores (anything with ``mean()`` and ``std()``). When truthy, a
            "mean ± std" line is printed per model after the table.
    """
    columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC']
    metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'auc')

    rows = [
        [model_name, *(metrics[key] for key in metric_keys)]
        for model_name, metrics in results.items()
    ]
    table = pd.DataFrame(rows, columns=columns)
    table = table.sort_values('ROC_AUC', ascending=False)
    print(table.to_string(index=False))

    if not cv_scores:
        return

    print("\nCross-validated ROC AUC (mean ± std):")
    for model_name, fold_scores in cv_scores.items():
        print(f"{model_name}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")
def main():
    """Run the full benchmark: load, split, cross-validate, evaluate, report.

    The first command-line argument, if given, is used as the CSV path passed
    to ``load_data``; otherwise ``load_data`` is called with ``None``.
    """
    cli_args = sys.argv[1:]
    csv_path = cli_args[0] if cli_args else None

    X, y = load_data(csv_path)
    # Stratified 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    candidates = build_models()
    # Cross-validation uses the training split only; the test split is
    # reserved for the final evaluation below.
    fold_scores = cross_validate_models(candidates, X_train, y_train, folds=5)
    evaluation = evaluate_models(candidates, X_train, X_test, y_train, y_test)

    print_summary(evaluation, fold_scores)
    plot_roc_curves(evaluation, y_test)
    plot_confusion_matrices(evaluation)

    # Pick the model with the highest test-set ROC AUC and print its report.
    winner = max(evaluation, key=lambda model_name: evaluation[model_name]['auc'])
    print("\nBest model by ROC AUC:", winner)
    print(classification_report(y_test, evaluation[winner]['y_pred']))
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
```