# Source code for ML_tools.classifiers

import sys
from pathlib import Path
import os

sys.path.insert(0, str(Path(os.getcwd()).parent))

import numpy as np
import graphviz
from scipy import stats
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.tree import export_graphviz
from sklearn.feature_selection import RFECV
from ML_tools.score_and_error import performance_scores


# Search space sampled by RandomizedSearchCV in the Random Forest pipelines:
# each hyperparameter is drawn from a discrete uniform (scipy randint) distribution.
param_dist = dict(
    n_estimators=stats.randint(50, 500),
    max_depth=stats.randint(1, 20),
)


def RFPipeline_noPCA(df1, df2, n_iter, cv):
    """
    Creates a pipeline that performs Random Forest classification on the data without Principal Component Analysis.
    The input data is split into training (90%) and test (10%) sets, then a Randomized Search (with cross-validation)
    is performed on the training set to find the best hyperparameters for the model. The fitted pipeline is then
    evaluated on the test set through ``performance_scores`` and the first three trees of the best forest are
    rendered with graphviz.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features. Its column names are used as feature names in the rendered trees.
    df2 : pandas.DataFrame
        Dataframe containing the labels.
    n_iter : int
        Number of parameter settings that are sampled.
    cv : int
        Number of cross-validation folds to use.

    Returns
    -------
    pipeline_simple : sklearn.pipeline.Pipeline
        A fitted pipeline (includes hyperparameter optimization using RandomizedSearchCV and a Random Forest Classifier
        model).

    See Also
    --------
    RandomizedSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    """

    X = df1.values
    y = df2.values
    region = list(df1.columns.values)

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1, random_state=7)

    hyper_opt = RandomizedSearchCV(RandomForestClassifier(),
                                   param_distributions=param_dist,
                                   n_iter=n_iter,
                                   cv=cv,
                                   random_state=10)
    pipeline_simple = Pipeline(steps=[("hyper_opt", hyper_opt)])

    pipeline_simple.fit(X_tr, y_tr)

    y_pred = pipeline_simple.predict(X_tst)
    y_prob = pipeline_simple.predict_proba(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its reporting side effects; its return value is not used here.
    performance_scores(y_tst, y_pred, y_prob)

    # Render the first three trees of the best forest found by the search.
    # NOTE(review): no output filename is passed to render(), so every tree presumably goes to
    # graphviz's default output name and overwrites the previous one - confirm this is intended.
    best_forest = pipeline_simple["hyper_opt"].best_estimator_
    for tree in best_forest.estimators_[:3]:
        dot_data = export_graphviz(tree,
                                   feature_names=region,
                                   filled=True,
                                   impurity=False,
                                   proportion=True,
                                   class_names=["CN", "AD"])
        graph = graphviz.Source(dot_data)
        graph.render(view=True)

    # Return the fitted pipeline itself (as documented), not its bound ``fit`` method.
    return pipeline_simple


def RFPipeline_PCA(df1, df2, n_iter, cv):
    """
    Creates a pipeline that performs Random Forest classification on the data with Principal Component Analysis.
    The input data is split into training (90%) and test (10%) sets, then PCA followed by a Randomized Search (with
    cross-validation) is fitted on the training set to find the best hyperparameters for the model. The fitted
    pipeline is then evaluated on the test set through ``performance_scores`` and the number of principal
    components is printed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features.
    df2 : pandas.DataFrame
        Dataframe containing the labels.
    n_iter : int
        Number of parameter settings that are sampled.
    cv : int
        Number of cross-validation folds to use.

    Returns
    -------
    pipeline_PCA : sklearn.pipeline.Pipeline
        A fitted pipeline (includes PCA, hyperparameter optimization using RandomizedSearchCV and a Random Forest
        Classifier model).

    See Also
    --------
    PCA : https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    RandomizedSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    """

    X = df1.values
    y = df2.values

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1, random_state=6)

    hyper_opt = RandomizedSearchCV(RandomForestClassifier(),
                                   param_distributions=param_dist,
                                   n_iter=n_iter,
                                   cv=cv,
                                   random_state=9)
    pipeline_PCA = Pipeline(steps=[("dim_reduction", PCA()),
                                   ("hyper_opt", hyper_opt)])

    pipeline_PCA.fit(X_tr, y_tr)

    y_pred = pipeline_PCA.predict(X_tst)
    y_prob = pipeline_PCA.predict_proba(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its reporting side effects; its return value is not used here.
    performance_scores(y_tst, y_pred, y_prob)

    # First axis of components_ is the number of principal components kept by PCA.
    print("Components shape is:", np.shape(pipeline_PCA["dim_reduction"].components_)[0])

    # Return the fitted pipeline itself (as documented), not its bound ``fit`` method.
    return pipeline_PCA


def SVM_simple(df1, df2, ker: str):
    """
    Performs SVM classification on the data. The input data is split into training (90%) and test (10%) sets, then a
    Grid Search (with cross-validation) is performed on the training set to find the best hyperparameters for the
    model. The refitted best model is evaluated on the test set through ``performance_scores``. Feature reduction is
    not implemented in this function.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features.
    df2 : pandas.DataFrame
        Dataframe containing the labels.
    ker : str
        Kernel type. For ``'linear'`` the grid values of ``C`` and ``gamma`` are drawn at random from exponential
        distributions; for any other kernel fixed logarithmic grids are used.

    Returns
    -------
    grid : sklearn.model_selection.GridSearchCV
        A fitted grid search object with the best parameters for the SVM model.

    See Also
    --------
    GridSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    """

    if ker == 'linear':
        # NOTE(review): these draws are unseeded, so the grid differs between runs, and 100 x 100 x 2
        # candidates are evaluated. gamma is presumably irrelevant for a linear kernel - confirm whether
        # sweeping it here is intended.
        param_grid = {'C': stats.expon.rvs(size=100),
                      'gamma': stats.expon(scale=.1).rvs(size=100),
                      'kernel': [ker],
                      'class_weight': ['balanced', None]}
    else:
        param_grid = {'C': np.logspace(-2, 10, 13),
                      'gamma': np.logspace(-9, 3, 13),
                      'kernel': [ker],
                      'class_weight': ['balanced', None]}

    X = df1.values
    y = df2.values

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1, random_state=6)

    clf = svm.SVC(kernel=ker)

    grid = GridSearchCV(clf, param_grid, refit=True, n_jobs=-1)

    # fitting the model for grid search
    grid.fit(X_tr, y_tr)

    y_pred = grid.predict(X_tst)
    # SVC is built without probability estimates, so the decision-function values are used as scores.
    y_prob = grid.decision_function(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its reporting side effects; its return value is not used here.
    performance_scores(y_tst, y_pred, y_prob)

    # Return the fitted grid search itself (as documented), not its bound ``fit`` method.
    return grid


def SVM_feature_reduction(df1, df2):
    """
    Performs linear SVM classification on the data with feature reduction. The input data is split into training
    (90%) and test (10%) sets, then a Grid Search (with cross-validation) is performed on the training set over the
    hyperparameters of the SVM wrapped in a recursive feature elimination step (RFECV). The refitted best model is
    evaluated on the test set through ``performance_scores``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features.
    df2 : pandas.DataFrame
        Dataframe containing the labels.

    Returns
    -------
    grid : sklearn.model_selection.GridSearchCV
        A fitted grid search object with the best parameters for the SVM model using the selected features.

    See Also
    --------
    RFECV: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    GridSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    """

    X = df1.values
    y = df2.values

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1, random_state=6)

    # NOTE(review): these draws are unseeded, so the grid differs between runs.
    C_range = stats.expon.rvs(size=8)
    gamma_range = stats.expon(scale=.1).rvs(size=8)

    # defining parameter range; the "estimator__" prefix routes each value to the SVC wrapped by RFECV
    param_grid = {'estimator__C': C_range,
                  'estimator__gamma': gamma_range,
                  'estimator__kernel': ['linear'],
                  'estimator__class_weight': ['balanced', None]}

    clf = svm.SVC(kernel="linear")

    # NOTE(review): min_features_to_select is set to len(y), i.e. the total number of samples rather
    # than a number of features - confirm this is the intended lower bound.
    rfecv = RFECV(estimator=clf,
                  step=1,
                  cv=5,
                  scoring="accuracy",
                  min_features_to_select=len(y),
                  n_jobs=-1)

    grid = GridSearchCV(rfecv, param_grid, refit=True, n_jobs=-1)

    # fitting the model
    grid.fit(X_tr, y_tr)

    y_pred = grid.predict(X_tst)
    # SVC is built without probability estimates, so the decision-function values are used as scores.
    y_prob = grid.decision_function(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its reporting side effects; its return value is not used here.
    performance_scores(y_tst, y_pred, y_prob)

    # Return the fitted grid search itself (as documented), not its bound ``fit`` method.
    return grid