# Source code for ML_tools.classifiers

import sys
from pathlib import Path
import os

# Put the parent of the current working directory first on the module search
# path (presumably so that the ``ML_tools`` package imported below can be
# found). NOTE(review): this depends on the directory the process is started
# from, not on the location of this file.
sys.path.insert(0, str(Path(os.getcwd()).parent))

import numpy as np
import graphviz
from scipy import stats
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.tree import export_graphviz
from sklearn.feature_selection import RFECV
from ML_tools.score_and_error import performance_scores


# Hyperparameter search space handed to RandomizedSearchCV by the Random
# Forest pipelines in this module. Each entry is a frozen discrete-uniform
# distribution: the number of trees is drawn from the integers 50..499 and
# the maximum tree depth from the integers 1..19.
param_dist = dict(
    n_estimators=stats.randint(50, 500),
    max_depth=stats.randint(1, 20),
)


def RFPipeline_noPCA(df1, df2, n_iter, cv):
    """
    Fit a Random Forest classifier, without PCA, through a randomized search.

    The input data is split into training (90%) and test (10%) sets with a
    fixed seed. A pipeline whose only step is a ``RandomizedSearchCV`` around
    a ``RandomForestClassifier`` is fitted on the training set, sampling the
    hyperparameters from the module-level ``param_dist``. The test-set
    predictions and class probabilities are passed to ``performance_scores``.
    Finally the first three trees of the best forest are rendered with
    graphviz, each to its own file (``RF_tree_0``, ``RF_tree_1``,
    ``RF_tree_2`` plus the rendered output) in the current directory, and
    opened in the viewer.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features. Its column names are used as
        feature names in the rendered trees.
    df2 : pandas.DataFrame
        Dataframe containing the labels.
    n_iter : int
        Number of parameter settings that are sampled.
    cv : int
        Number of cross-validation folds to use.

    Returns
    -------
    method
        The bound ``fit`` method of the fitted pipeline (the pipeline itself
        is reachable through its ``__self__`` attribute). See the review note
        at the ``return`` statement.

    See Also
    --------
    RandomizedSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    """
    X = df1.values
    y = df2.values
    region = list(df1.columns.values)

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1,
                                                random_state=7)

    pipeline_simple = Pipeline(
        steps=[("hyper_opt",
                RandomizedSearchCV(RandomForestClassifier(),
                                   param_distributions=param_dist,
                                   n_iter=n_iter,
                                   cv=cv,
                                   random_state=10))
               ]
    )

    pipeline_simple.fit(X_tr, y_tr)
    y_pred = pipeline_simple.predict(X_tst)
    y_prob = pipeline_simple.predict_proba(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its own reporting; its return value is not
    # used by this function.
    performance_scores(y_tst, y_pred, y_prob)

    # Indexing the fitted forest returns its individual decision trees.
    best_forest = pipeline_simple["hyper_opt"].best_estimator_
    for i in range(3):
        tree = best_forest[i]
        dot_data = export_graphviz(tree,
                                   feature_names=region,
                                   filled=True,
                                   impurity=False,
                                   proportion=True,
                                   class_names=["CN", "AD"])
        graph = graphviz.Source(dot_data)
        # Give every tree its own file name: with the default name each
        # render would overwrite the previous one.
        graph.render(filename="RF_tree_{}".format(i), view=True)

    # NOTE(review): this returns the bound ``fit`` method, not the fitted
    # pipeline. It was probably meant to be ``return pipeline_simple``; the
    # current value is kept so existing callers are not affected. Confirm
    # with the callers before changing it.
    return pipeline_simple.fit
def RFPipeline_PCA(df1, df2, n_iter, cv):
    """
    Fit a PCA + Random Forest pipeline through a randomized search.

    The input data is split into training (90%) and test (10%) sets with a
    fixed seed. A pipeline made of a ``PCA`` step (default settings, i.e. no
    explicit number of components is requested) followed by a
    ``RandomizedSearchCV`` around a ``RandomForestClassifier`` is fitted on
    the training set, sampling the hyperparameters from the module-level
    ``param_dist``. The test-set predictions and class probabilities are
    passed to ``performance_scores`` and the number of principal components
    kept by the PCA step is printed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features.
    df2 : pandas.DataFrame
        Dataframe containing the labels.
    n_iter : int
        Number of parameter settings that are sampled.
    cv : int
        Number of cross-validation folds to use.

    Returns
    -------
    method
        The bound ``fit`` method of the fitted pipeline (the pipeline itself
        is reachable through its ``__self__`` attribute). See the review note
        at the ``return`` statement.

    See Also
    --------
    PCA : https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    RandomizedSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    """
    X = df1.values
    y = df2.values

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1,
                                                random_state=6)

    pipeline_PCA = Pipeline(
        steps=[("dim_reduction", PCA()),
               ("hyper_opt",
                RandomizedSearchCV(RandomForestClassifier(),
                                   param_distributions=param_dist,
                                   n_iter=n_iter,
                                   cv=cv,
                                   random_state=9))
               ]
    )

    pipeline_PCA.fit(X_tr, y_tr)
    y_pred = pipeline_PCA.predict(X_tst)
    y_prob = pipeline_PCA.predict_proba(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its own reporting; its return value is not
    # used by this function.
    performance_scores(y_tst, y_pred, y_prob)

    # First dimension of ``components_`` = number of principal components.
    print("Components shape is:",
          np.shape(pipeline_PCA["dim_reduction"].components_)[0])

    # NOTE(review): this returns the bound ``fit`` method, not the fitted
    # pipeline. It was probably meant to be ``return pipeline_PCA``; the
    # current value is kept so existing callers are not affected. Confirm
    # with the callers before changing it.
    return pipeline_PCA.fit
def SVM_simple(df1, df2, ker: str):
    """
    Perform SVM classification on the data, tuned with a grid search.

    The input data is split into training (90%) and test (10%) sets with a
    fixed seed, then a ``GridSearchCV`` (default cross-validation, all cores)
    over an ``SVC`` with the requested kernel is fitted on the training set.
    The test-set predictions and decision-function values are passed to
    ``performance_scores``. Feature reduction is not implemented in this
    function.

    The grid depends on the kernel:

    * ``'linear'``: 100 values of ``C`` drawn at random from an exponential
      distribution (unseeded, so they differ between runs) and both
      ``class_weight`` options. ``gamma`` is not searched because the linear
      kernel does not use it.
    * any other kernel: ``C`` in ``logspace(-2, 10, 13)``, ``gamma`` in
      ``logspace(-9, 3, 13)`` and both ``class_weight`` options.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features.
    df2 : pandas.DataFrame
        Dataframe containing the labels.
    ker : str
        Kernel type, forwarded to ``sklearn.svm.SVC``.

    Returns
    -------
    method
        The bound ``fit`` method of the fitted grid search (the
        ``GridSearchCV`` object itself is reachable through its ``__self__``
        attribute). See the review note at the ``return`` statement.

    See Also
    --------
    GridSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    """
    if ker == 'linear':
        # gamma is a coefficient of the rbf/poly/sigmoid kernels only, so
        # searching it with a linear kernel just multiplies the number of
        # fits without changing any score; it is left out of this grid.
        param_grid = {'C': stats.expon.rvs(size=100),
                      'kernel': [ker],
                      'class_weight': ['balanced', None]}
    else:
        param_grid = {'C': np.logspace(-2, 10, 13),
                      'gamma': np.logspace(-9, 3, 13),
                      'kernel': [ker],
                      'class_weight': ['balanced', None]}

    X = df1.values
    y = df2.values

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1,
                                                random_state=6)

    clf = svm.SVC(kernel=ker)
    grid = GridSearchCV(clf, param_grid, refit=True, n_jobs=-1)

    # fitting the model for grid search
    grid.fit(X_tr, y_tr)
    y_pred = grid.predict(X_tst)
    # Signed distances from the decision boundary, not probabilities.
    y_score = grid.decision_function(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its own reporting; its return value is not
    # used by this function.
    performance_scores(y_tst, y_pred, y_score)

    # NOTE(review): this returns the bound ``fit`` method, not the fitted
    # grid search. It was probably meant to be ``return grid``; the current
    # value is kept so existing callers are not affected. Confirm with the
    # callers before changing it.
    return grid.fit
def SVM_feature_reduction(df1, df2):
    """
    Perform linear SVM classification with recursive feature elimination.

    The input data is split into training (90%) and test (10%) sets with a
    fixed seed. An ``RFECV`` (5 folds, accuracy scoring, one feature removed
    per step) around a linear ``SVC`` is tuned with a ``GridSearchCV`` over
    8 values of ``C`` drawn at random from an exponential distribution
    (unseeded, so they differ between runs) and both ``class_weight``
    options. ``gamma`` is not searched because the linear kernel does not use
    it. The test-set predictions and decision-function values are passed to
    ``performance_scores``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Dataframe containing the features.
    df2 : pandas.DataFrame
        Dataframe containing the labels.

    Returns
    -------
    method
        The bound ``fit`` method of the fitted grid search (the
        ``GridSearchCV`` object itself is reachable through its ``__self__``
        attribute). See the review note at the ``return`` statement.

    See Also
    --------
    RFECV: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    GridSearchCV : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    """
    X = df1.values
    y = df2.values

    X_tr, X_tst, y_tr, y_tst = train_test_split(X, y, test_size=.1,
                                                random_state=6)

    C_range = stats.expon.rvs(size=8)

    # defining parameter range
    # gamma is a coefficient of the rbf/poly/sigmoid kernels only, so
    # searching it with a linear kernel just multiplies the number of fits
    # without changing any score; it is left out of the grid.
    param_grid = {'estimator__C': C_range,
                  'estimator__kernel': ['linear'],
                  'estimator__class_weight': ['balanced', None]}

    clf = svm.SVC(kernel="linear")
    # NOTE(review): min_features_to_select is set to len(y), the number of
    # samples in the whole dataset, although the parameter is a number of
    # features. When it is at least the number of feature columns this
    # presumably disables the elimination altogether. Confirm the intended
    # value (scikit-learn's default is 1) before relying on the selection.
    rfecv = RFECV(estimator=clf,
                  step=1,
                  cv=5,
                  scoring="accuracy",
                  min_features_to_select=len(y),
                  n_jobs=-1)

    grid = GridSearchCV(rfecv, param_grid, refit=True, n_jobs=-1)

    # fitting the model
    grid.fit(X_tr, y_tr)
    y_pred = grid.predict(X_tst)
    # Signed distances from the decision boundary, not probabilities.
    y_score = grid.decision_function(X_tst)

    # SCORES WITH 95% CONFIDENCE INTERVAL
    # The helper is called for its own reporting; its return value is not
    # used by this function.
    performance_scores(y_tst, y_pred, y_score)

    # NOTE(review): this returns the bound ``fit`` method, not the fitted
    # grid search. It was probably meant to be ``return grid``; the current
    # value is kept so existing callers are not affected. Confirm with the
    # callers before changing it.
    return grid.fit