Drawer

project:

```python
import sys
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(path=None):
    """Return a ``(features, labels)`` pair from a CSV file or the bundled dataset.

    Args:
        path: Location of a CSV file. When falsy (``None`` or an empty string),
            the scikit-learn breast cancer dataset is loaded instead.

    Returns:
        A tuple ``(X, y)``. For a CSV, ``X`` is the frame without the label
        column and ``y`` is that column. For the bundled dataset, ``X`` is a
        DataFrame named by ``feature_names`` and ``y`` is a Series of targets.

    Raises:
        ValueError: If the CSV has neither a ``target`` nor a ``class`` column.
    """
    if not path:
        bundled = load_breast_cancer()
        features = pd.DataFrame(bundled.data, columns=bundled.feature_names)
        labels = pd.Series(bundled.target)
        return features, labels

    frame = pd.read_csv(path)
    # Checked in priority order: 'target' wins when both columns are present,
    # in which case 'class' stays among the features.
    for label_column in ('target', 'class'):
        if label_column in frame.columns:
            return frame.drop(columns=[label_column]), frame[label_column]
    raise ValueError('CSV must contain a "target" or "class" column')

def build_models(random_state=42):
    """Build the candidate classifiers, each wrapped in a scaling pipeline.

    Every entry is a ``Pipeline`` whose first step ``'s'`` is a
    ``StandardScaler`` and whose second step ``'m'`` is the classifier.

    Args:
        random_state: Seed handed to the estimators configured with one here
            (SVC, RandomForest and GradientBoosting). Defaults to 42, the
            same seed the rest of this script uses for the train/test split
            and the CV folds, so repeated runs give the same numbers. Pass
            ``None`` to get unseeded estimators.

    Returns:
        dict mapping a model name to its unfitted ``Pipeline``.
    """
    def scaled(classifier):
        # Same two-step layout (and step names) for every model.
        return Pipeline([('s', StandardScaler()), ('m', classifier)])

    return {
        'LogisticRegression': scaled(LogisticRegression(max_iter=10000)),
        # probability=True makes SVC expose predict_proba; its internal
        # calibration is randomized, hence the seed.
        'SVM': scaled(SVC(probability=True, random_state=random_state)),
        'RandomForest': scaled(
            RandomForestClassifier(n_estimators=200, random_state=random_state)
        ),
        'GradientBoosting': scaled(
            GradientBoostingClassifier(n_estimators=200, random_state=random_state)
        ),
        'KNN': scaled(KNeighborsClassifier(n_neighbors=5)),
        'GaussianNB': scaled(GaussianNB()),
    }

def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the test split.

    Args:
        models: Mapping of model name to estimator. Each estimator is fitted
            in place.
        X_train, y_train: Data used for fitting.
        X_test, y_test: Data used for prediction and scoring.

    Returns:
        dict mapping each model name to a dict with keys ``'model'``,
        ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``, ``'auc'``,
        ``'cm'`` (confusion matrix), ``'y_proba'`` (scores fed to ROC AUC)
        and ``'y_pred'`` (predicted labels).
    """
    report = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        if not hasattr(estimator, "predict_proba"):
            # No probabilities available: min-max rescale the decision
            # values; the tiny epsilon avoids dividing by zero.
            raw = estimator.decision_function(X_test)
            span = raw.max() - raw.min() + 1e-12
            scores = (raw - raw.min()) / span
        else:
            # Second probability column — presumably the positive class of
            # a binary problem; confirm if used with other label sets.
            scores = estimator.predict_proba(X_test)[:, 1]

        report[label] = {
            'model': estimator,
            'accuracy': accuracy_score(y_test, predictions),
            'precision': precision_score(y_test, predictions),
            'recall': recall_score(y_test, predictions),
            'f1': f1_score(y_test, predictions),
            'auc': roc_auc_score(y_test, scores),
            'cm': confusion_matrix(y_test, predictions),
            'y_proba': scores,
            'y_pred': predictions,
        }
    return report

def cross_validate_models(models, X, y, folds=5):
    """Score every model with stratified, shuffled k-fold ROC AUC.

    Args:
        models: Mapping of model name to estimator.
        X: Feature data passed to ``cross_val_score``.
        y: Labels passed to ``cross_val_score``.
        folds: Number of splits for the ``StratifiedKFold`` splitter.

    Returns:
        dict mapping each model name to the scores returned by
        ``cross_val_score`` for that model.
    """
    # One splitter (fixed seed) is shared by all models.
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    return {
        label: cross_val_score(estimator, X, y, cv=splitter, scoring='roc_auc')
        for label, estimator in models.items()
    }

def plot_roc_curves(results, y_test):
    """Draw one ROC curve per model on a single figure and show it.

    Args:
        results: Mapping of model name to a dict holding at least
            ``'y_proba'`` (scores for ``roc_curve``) and ``'auc'`` (the value
            printed in the legend).
        y_test: True labels the scores are compared against.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    for label, entry in results.items():
        false_pos, true_pos, _thresholds = roc_curve(y_test, entry['y_proba'])
        ax.plot(false_pos, true_pos, label=f"{label} (AUC={entry['auc']:.3f})")
    # Dashed diagonal as the chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', alpha=0.4)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves')
    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_confusion_matrices(results):
    """Show every model's confusion matrix as a heatmap in a 3-column grid.

    Args:
        results: Mapping of model name to a dict holding at least ``'cm'``,
            the matrix drawn for that model.
    """
    per_row = 3
    # Ceiling division: a partially filled last row still needs a row.
    n_rows, leftover = divmod(len(results), per_row)
    if leftover:
        n_rows += 1
    plt.figure(figsize=(4 * per_row, 3 * n_rows))
    for position, (label, entry) in enumerate(results.items(), start=1):
        axis = plt.subplot(n_rows, per_row, position)
        sns.heatmap(entry['cm'], annot=True, fmt='d', cmap='Blues', cbar=False, ax=axis)
        axis.set_title(label)
        axis.set_xlabel('Predicted')
        axis.set_ylabel('Actual')
    plt.tight_layout()
    plt.show()

def print_summary(results, cv_scores=None):
    """Print a metrics table sorted by ROC AUC, then optional CV statistics.

    Args:
        results: Mapping of model name to a dict holding ``'accuracy'``,
            ``'precision'``, ``'recall'``, ``'f1'`` and ``'auc'``.
        cv_scores: Optional mapping of model name to an object exposing
            ``mean()`` and ``std()`` (e.g. a NumPy array of fold scores).
            Nothing extra is printed when it is ``None`` or empty.
    """
    metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'auc')
    rows = [
        [label] + [entry[key] for key in metric_keys]
        for label, entry in results.items()
    ]
    table = pd.DataFrame(
        rows,
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC'],
    )
    # Best ROC AUC first.
    table = table.sort_values('ROC_AUC', ascending=False)
    print(table.to_string(index=False))

    if not cv_scores:
        return
    print("\nCross-validated ROC AUC (mean ± std):")
    for label, fold_scores in cv_scores.items():
        print(f"{label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")

def main():
    """Run the full comparison: load, split, cross-validate, evaluate, report.

    The first command-line argument, when given, is used as the CSV path;
    otherwise ``load_data`` falls back to its bundled dataset. After printing
    the summary and showing the plots, the model with the highest test ROC AUC
    is named and its classification report printed.
    """
    csv_path = None
    if len(sys.argv) > 1:
        csv_path = sys.argv[1]
    features, labels = load_data(csv_path)

    # Stratified 80/20 split with a fixed seed.
    split = train_test_split(
        features, labels, test_size=0.2, stratify=labels, random_state=42
    )
    X_train, X_test, y_train, y_test = split

    candidates = build_models()
    # Cross-validation only sees the training portion.
    fold_scores = cross_validate_models(candidates, X_train, y_train, folds=5)
    outcome = evaluate_models(candidates, X_train, X_test, y_train, y_test)

    print_summary(outcome, fold_scores)
    plot_roc_curves(outcome, y_test)
    plot_confusion_matrices(outcome)

    winner = max(outcome, key=lambda label: outcome[label]['auc'])
    print("\nBest model by ROC AUC:", winner)
    print(classification_report(y_test, outcome[winner]['y_pred']))

# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
```