Matheus Schmitz
LinkedIn
Github Portfolio
Performance comparison on a multilabel classification task using a single Random Forest Classifier versus a stacked ensemble whose head is also a Random Forest Classifier.
The dataset is a multi-label classification problem with 6 different labels: {Beach, Sunset, FallFoliage, Field, Mountain, Urban}.
Multilabel Scenes Dataset Source: https://sourceforge.net/projects/mulan/files/datasets/scene.rar
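The archive is a .rar file; one way to unpack it into data/ is sketched below, assuming the rarfile package plus an unrar backend are installed (extracting manually works just as well).
import rarfile
# Extract scene-train.arff and scene-test.arff from the downloaded archive into data/
rarfile.RarFile('scene.rar').extractall('data')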
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
# Load data
arff_train = loadarff('data/scene-train.arff')
df_train = pd.DataFrame(arff_train[0])
arff_test = loadarff('data/scene-test.arff')
df_test = pd.DataFrame(arff_test[0])
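A quick sanity check on the loaded frames (my addition): each split should have the scene dataset's 294 numeric features plus the 6 label columns.
# Confirm shapes and that the label columns landed where expected
print(df_train.shape, df_test.shape)
print(df_train.columns[-6:].tolist())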
# Convert labels read as bytes into integers
labels = ['Beach','Sunset','FallFoliage','Field','Mountain','Urban']
for label in labels:
    df_train[label] = df_train[label].str.decode("utf-8")
    df_train[label] = df_train[label].astype(int)
    df_test[label] = df_test[label].str.decode("utf-8")
    df_test[label] = df_test[label].astype(int)
# Split X and Y
X_train = df_train.drop(labels, axis=1)
y_train = df_train[labels]
X_test = df_test.drop(labels, axis=1)
y_test = df_test[labels]
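Before modeling, a quick look at how often each label fires and how many labels an average image carries (a small addition for context):
# Per-label frequency and label cardinality (mean number of labels per image)
print(y_train.mean().round(3))
print("Label cardinality:", y_train.sum(axis=1).mean().round(3))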
# Scikit-Learn utilities for converting single-label models into multilabel ones
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
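Both wrappers turn a single-label classifier into a multilabel one, but differently: MultiOutputClassifier fits one independent copy of the base estimator per label, while ClassifierChain feeds each earlier label's prediction to the next classifier, so it can exploit label correlations (a sunset photo is rarely also urban). A minimal sketch of the independent alternative, for comparison only:
from sklearn.svm import SVC
# One independent SVC per label; no information flows between labels
independent_model = MultiOutputClassifier(SVC())
independent_model.fit(X_train, y_train)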
# Support Vector Machine
from sklearn.svm import SVC
model1 = ClassifierChain(SVC())
model1.fit(X_train, y_train)
predictions1_train = pd.DataFrame(model1.predict(X_train))
predictions1_test = pd.DataFrame(model1.predict(X_test))
# KNN (KNeighborsClassifier natively supports multilabel targets, so no wrapper is needed)
from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier()
model2.fit(X_train, y_train)
predictions2_train = pd.DataFrame(model2.predict(X_train))
predictions2_test = pd.DataFrame(model2.predict(X_test))
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
model3 = ClassifierChain(GaussianNB())
model3.fit(X_train, y_train)
predictions3_train = pd.DataFrame(model3.predict(X_train))
predictions3_test = pd.DataFrame(model3.predict(X_test))
# Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model4 = ClassifierChain(LinearDiscriminantAnalysis())
model4.fit(X_train, y_train)
predictions4_train = pd.DataFrame(model4.predict(X_train))
predictions4_test = pd.DataFrame(model4.predict(X_test))
# Logistic Regression
from sklearn.linear_model import LogisticRegression
model5 = ClassifierChain(LogisticRegression(max_iter=500))
model5.fit(X_train, y_train)
predictions5_train = pd.DataFrame(model5.predict(X_train))
predictions5_test = pd.DataFrame(model5.predict(X_test))
# Multi-Layer Perceptron (also natively multilabel, so it is used without a wrapper)
from sklearn.neural_network import MLPClassifier
model6 = MLPClassifier(max_iter=1000)
model6.fit(X_train, y_train)
predictions6_train = pd.DataFrame(model6.predict(X_train))
predictions6_test = pd.DataFrame(model6.predict(X_test))
# Gaussian Process
from sklearn.gaussian_process import GaussianProcessClassifier
model7 = ClassifierChain(GaussianProcessClassifier())
model7.fit(X_train, y_train)
predictions7_train = pd.DataFrame(model7.predict(X_train))
predictions7_test = pd.DataFrame(model7.predict(X_test))
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
model8 = ClassifierChain(GradientBoostingClassifier())
model8.fit(X_train, y_train)
predictions8_train = pd.DataFrame(model8.predict(X_train))
predictions8_test = pd.DataFrame(model8.predict(X_test))
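Before stacking, it is worth checking that each base learner is individually reasonable; a quick scoring loop over the held-out test predictions (my addition, using the same Hamming score reported at the end):
from sklearn.metrics import hamming_loss
base_test_preds = {'SVC': predictions1_test, 'KNN': predictions2_test,
                   'NaiveBayes': predictions3_test, 'LDA': predictions4_test,
                   'LogReg': predictions5_test, 'MLP': predictions6_test,
                   'GaussianProcess': predictions7_test, 'GradBoost': predictions8_test}
for name, preds in base_test_preds.items():
    print(f"{name:>15}: {100 * (1 - hamming_loss(y_test, preds)):.3f} %")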
# Generate ensemble feature sets: the original features plus each base model's predictions
base_train = [predictions1_train, predictions2_train, predictions3_train, predictions4_train,
              predictions5_train, predictions6_train, predictions7_train, predictions8_train]
base_test = [predictions1_test, predictions2_test, predictions3_test, predictions4_test,
             predictions5_test, predictions6_test, predictions7_test, predictions8_test]
# Prefix the prediction columns (m1_0 ... m8_5) so every feature name is a unique string
X_train_ensemble = pd.concat([X_train] + [p.add_prefix(f'm{i+1}_') for i, p in enumerate(base_train)], axis=1)
X_test_ensemble = pd.concat([X_test] + [p.add_prefix(f'm{i+1}_') for i, p in enumerate(base_test)], axis=1)
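A caveat worth flagging: the head model below is trained on the base models' predictions for the very rows those models were fit on, so the stacked features look cleaner in training than they will at test time. Standard stacking instead feeds the head out-of-fold predictions; a minimal sketch for one base model, assuming 5-fold splitting (this is not what the notebook above does):
from sklearn.model_selection import cross_val_predict
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC
# Each training row is predicted by a fold model that never saw it during fitting
oof_preds1_train = pd.DataFrame(cross_val_predict(ClassifierChain(SVC()), X_train, y_train, cv=5))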
# Random Forest head trained on the augmented (stacked) feature set
from sklearn.ensemble import RandomForestClassifier
model_ensemble = RandomForestClassifier()
model_ensemble.fit(X_train_ensemble, y_train)
predictions_ensemble_train = model_ensemble.predict(X_train_ensemble)
predictions_ensemble_test = model_ensemble.predict(X_test_ensemble)
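To gauge how much the head actually leans on the stacked predictions versus the raw features, its impurity-based importances can be grouped by source. A small sketch, assuming the m{i}_ column prefixes introduced above:
# Share of total impurity-based importance carried by the base-model prediction columns
importances = pd.Series(model_ensemble.feature_importances_, index=X_train_ensemble.columns)
pred_cols = [f'm{i}_{j}' for i in range(1, 9) for j in range(6)]  # 8 models x 6 labels
print(f"Importance on stacked predictions: {importances[pred_cols].sum():.1%}")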
# Random Forest baseline trained on the raw features alone
model_simple = RandomForestClassifier()
model_simple.fit(X_train, y_train)
predictions_simple_train = model_simple.predict(X_train)
predictions_simple_test = model_simple.predict(X_test)
from sklearn.metrics import accuracy_score, hamming_loss
# For multilabel targets, accuracy_score is the exact-match (subset) accuracy: a
# prediction only counts if all 6 labels are correct. The Hamming score
# (1 - Hamming loss) instead credits every correctly predicted label individually.
simple_accuracy_score = accuracy_score(y_test, predictions_simple_test)
ensemble_accuracy_score = accuracy_score(y_test, predictions_ensemble_test)
simple_hamming_score = 1 - hamming_loss(y_test, predictions_simple_test)
ensemble_hamming_score = 1 - hamming_loss(y_test, predictions_ensemble_test)
print(f"Simple Accuracy Score  : {100*simple_accuracy_score:.3f} %")
print(f"Ensemble Accuracy Score: {100*ensemble_accuracy_score:.3f} %")
print()
print(f"Simple Hamming Score   : {100*simple_hamming_score:.3f} %")
print(f"Ensemble Hamming Score : {100*ensemble_hamming_score:.3f} %")
Simple Accuracy Score  : 50.669 %
Ensemble Accuracy Score: 63.127 %

Simple Hamming Score   : 90.914 %
Ensemble Hamming Score : 91.207 %
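With the ensemble, exact-match accuracy rises by roughly 12.5 points while the Hamming score barely moves, which suggests the stacked head mostly corrects a handful of wrong labels on images that were already almost right, rather than improving per-label decisions across the board.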