DSCI 552 | Machine Learning for Data Science
Homework 6
Matheus Schmitz
USC ID: 5039286453
# Data Science
import numpy as np
import pandas as pd
# Scikit Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
# Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
# Progress Bar
from tqdm import tqdm
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
Source Link: https://archive.ics.uci.edu/ml/datasets/banknote+authentication
Data Set Information:
Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400x400 pixels. Due to the object lens and the distance to the investigated object, gray-scale pictures with a resolution of about 660 dpi were obtained. A Wavelet Transform tool was used to extract features from the images.
Attribute Information:
1. Variance of Wavelet Transformed image (continuous)
2. Skewness of Wavelet Transformed image (continuous)
3. Curtosis of Wavelet Transformed image (continuous)
4. Entropy of image (continuous)
5. Class (integer)
# Load Data
df = pd.read_csv('../data/data_banknote_authentication.txt', header=None,
                 names=['Variance', 'Skewness', 'Curtosis', 'Entropy', 'Class'])
print(f'df.shape: {df.shape}')
df.head(3)
# Train-Test Split: hold out 472 samples for testing, leaving 900 for training
df_train, df_test = train_test_split(df, test_size=472/len(df), stratify=df.Class)
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
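Since the split is stratified on `Class`, a quick sanity check (a minimal sketch, not part of the original pipeline) can confirm that both partitions keep roughly the same class proportions:
# Sanity check: class proportions should be nearly identical across the splits
print(df_train.Class.value_counts(normalize=True))
print(df_test.Class.value_counts(normalize=True))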
# Specify number of Monte-Carlo Simulations
N_SIMULATIONS = 50
# The active and passive runs each take about 20 seconds per simulation, i.e. roughly 40 seconds combined.
# For 50 simulations the expected processing time is therefore 50 * 40s = 2000s, or about 33.3 minutes.
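The draws below rely on pandas' `.sample()`, which (unless given a `random_state`) uses NumPy's global random state. If reproducible runs are wanted, a seed can be fixed up front; a minimal sketch is shown below, with the caveat that the original experiment did not fix one and the value is arbitrary:
# Optional: fix the global NumPy seed so the .sample() draws below are reproducible
# (the seed value 42 is an arbitrary choice, not part of the original experiment)
np.random.seed(42)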
# Grid Search to find best penalty parameter
svc = LinearSVC(penalty='l1', dual=False)
param_grid = {'C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(svc, param_grid=param_grid, cv=5, n_jobs=-1)
# Dictionaries to store errors at each n_train_samples
train_error_passive, valid_error_passive, test_error_passive = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_passive[train_samples] = []
    valid_error_passive[train_samples] = []
    test_error_passive[train_samples] = []
# 50 Monte-Carlo Simulations
for M in tqdm(range(N_SIMULATIONS)):
    # Dataframes to store samples during the steps
    train_samples = pd.DataFrame(columns=df.columns)
    valid_samples = df_train.copy()
    new_samples = pd.DataFrame(columns=df.columns)
    # As long as there are validation datapoints to be drawn, keep iterating
    while len(valid_samples) > 0:
        # Randomly sample 10 datapoints; if they are all from the same class,
        # redraw until both classes have been sampled
        while len(new_samples.Class.unique()) < 2:
            new_samples = valid_samples.sample(10)
            # If valid_samples no longer contains both classes, a dual-class
            # draw is impossible, so stop redrawing
            if len(valid_samples.Class.unique()) < 2:
                break
        # Move the samples from valid_samples to train_samples
        # (pd.concat replaces the deprecated DataFrame.append)
        train_samples = pd.concat([train_samples, new_samples])
        valid_samples.drop(new_samples.index, inplace=True)
        # Reset the new_samples object
        new_samples = pd.DataFrame(columns=df.columns)
        # Set the new train, validation and test data
        x_train = train_samples.iloc[:, :-1].copy()
        y_train = train_samples.iloc[:, -1].astype('int').copy()
        x_valid = valid_samples.iloc[:, :-1].copy()
        y_valid = valid_samples.iloc[:, -1].astype('int').copy()
        x_test = df_test.iloc[:, :-1].copy()
        y_test = df_test.iloc[:, -1].astype('int').copy()
        # Train the SVM with the current train_samples
        grid.fit(x_train, y_train)
        # Predict and calculate the error on the train data
        pred_train = grid.predict(x_train)
        train_error_passive[len(x_train)].append(1 - accuracy_score(y_train, pred_train))
        # Predict and calculate the error on the validation data
        if len(x_valid) > 0:
            pred_valid = grid.predict(x_valid)
            valid_error_passive[len(x_train)].append(1 - accuracy_score(y_valid, pred_valid))
        else:
            # In the final cycle there is no validation data left, so carry over the last error
            valid_error_passive[len(x_train)].append(valid_error_passive[len(x_train) - 10][-1])
        # Predict and calculate the error on the test data
        pred_test = grid.predict(x_test)
        test_error_passive[len(x_train)].append(1 - accuracy_score(y_test, pred_test))
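Before moving on, a quick consistency check (an illustrative sketch, not part of the original notebook) can verify that every training-set size accumulated exactly one error value per simulation:
# Each of the 90 keys (10, 20, ..., 900) should hold N_SIMULATIONS error values
assert all(len(v) == N_SIMULATIONS for v in train_error_passive.values())
assert all(len(v) == N_SIMULATIONS for v in test_error_passive.values())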
`decision_function(X)` predicts confidence scores for samples: the confidence score for a sample is the signed distance of that sample to the separating hyperplane. The active learner below uses the magnitude of this score to find the points the current model is least certain about.
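As a standalone illustration of how this score drives the sample selection below, the following self-contained sketch ranks an unlabeled pool by absolute distance to a fitted LinearSVC hyperplane; the toy data is synthetic and only demonstrates the mechanics:
# Illustrative sketch: pick the 10 points a linear SVM is least certain about
import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_labeled = np.vstack([rng.normal(-2, 1, (20, 2)), rng.normal(2, 1, (20, 2))])
y_labeled = np.array([0] * 20 + [1] * 20)
X_pool = rng.normal(0, 2, (100, 2))  # unlabeled candidate pool

clf = LinearSVC(dual=False).fit(X_labeled, y_labeled)
distances = np.abs(clf.decision_function(X_pool))  # |signed distance| to hyperplane
most_uncertain_idx = np.argsort(distances)[:10]    # 10 smallest margins first
print(most_uncertain_idx)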
# Grid Search to find best penalty parameter
svc = LinearSVC(penalty='l1', dual=False)
param_grid = {'C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(svc, param_grid=param_grid, cv=5, n_jobs=-1)
# Dictionaries to store errors at each n_train_samples
train_error_active, valid_error_active, test_error_active = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_active[train_samples] = []
    valid_error_active[train_samples] = []
    test_error_active[train_samples] = []
# 50 Monte-Carlo Simulations
for M in tqdm(range(N_SIMULATIONS)):
    # Dataframes to store samples during the steps
    train_samples = pd.DataFrame(columns=df.columns)
    valid_samples = df_train.copy()
    new_samples = pd.DataFrame(columns=df.columns)
    # As long as there are validation datapoints to be drawn, keep iterating
    while len(valid_samples) > 0:
        # For the first round, pick 10 random samples
        if len(train_samples) == 0:
            # If the samples are all from the same class, redraw until both classes have been sampled
            while len(new_samples.Class.unique()) < 2:
                new_samples = valid_samples.sample(10)
        # For all subsequent rounds, actively select the 10 most uncertain samples
        # (the 10 closest to the hyperplane)
        else:
            sample_distances = np.absolute(grid.decision_function(valid_samples.iloc[:, :-1]))
            idx_closest = np.argsort(sample_distances)[:10]
            new_samples = valid_samples.iloc[idx_closest]
        # Move the samples from valid_samples to train_samples
        # (pd.concat replaces the deprecated DataFrame.append)
        train_samples = pd.concat([train_samples, new_samples])
        valid_samples.drop(new_samples.index, inplace=True)
        # Reset the new_samples object
        new_samples = pd.DataFrame(columns=df.columns)
        # Set the new train, validation and test data
        x_train = train_samples.iloc[:, :-1].copy()
        y_train = train_samples.iloc[:, -1].astype('int').copy()
        x_valid = valid_samples.iloc[:, :-1].copy()
        y_valid = valid_samples.iloc[:, -1].astype('int').copy()
        x_test = df_test.iloc[:, :-1].copy()
        y_test = df_test.iloc[:, -1].astype('int').copy()
        # Train the SVM with the current train_samples
        grid.fit(x_train, y_train)
        # Predict and calculate the error on the train data
        pred_train = grid.predict(x_train)
        train_error_active[len(x_train)].append(1 - accuracy_score(y_train, pred_train))
        # Predict and calculate the error on the validation data
        if len(x_valid) > 0:
            pred_valid = grid.predict(x_valid)
            valid_error_active[len(x_train)].append(1 - accuracy_score(y_valid, pred_valid))
        else:
            # In the final cycle there is no validation data left, so carry over the last error
            valid_error_active[len(x_train)].append(valid_error_active[len(x_train) - 10][-1])
        # Predict and calculate the error on the test data
        pred_test = grid.predict(x_test)
        test_error_active[len(x_train)].append(1 - accuracy_score(y_test, pred_test))
# Average the errors for passive learning
# For each of the 90 SVMs, calculate the average error over the 50 simulations
train_error_passive_avg, valid_error_passive_avg, test_error_passive_avg = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_passive_avg[train_samples] = np.mean(train_error_passive[train_samples])
    valid_error_passive_avg[train_samples] = np.mean(valid_error_passive[train_samples])
    test_error_passive_avg[train_samples] = np.mean(test_error_passive[train_samples])
# Average the errors for active learning
# For each of the 90 SVMs, calculate the average error over the 50 simulations
train_error_active_avg, valid_error_active_avg, test_error_active_avg = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_active_avg[train_samples] = np.mean(train_error_active[train_samples])
    valid_error_active_avg[train_samples] = np.mean(valid_error_active[train_samples])
    test_error_active_avg[train_samples] = np.mean(test_error_active[train_samples])
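Beyond the means, the spread across simulations can be summarized the same way; the sketch below (supplementary, not part of the original averaging) uses `np.std` over the stored error lists to gauge Monte-Carlo variability at each training-set size:
# Supplementary: standard deviation of the test error across the 50 simulations
test_error_passive_std = {n: np.std(test_error_passive[n]) for n in range(10, 901, 10)}
test_error_active_std = {n: np.std(test_error_active[n]) for n in range(10, 901, 10)}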
# Dataframe with all averaged errors
summary_dict = {'Train Error Passive': train_error_passive_avg,
                'Valid Error Passive': valid_error_passive_avg,
                'Test Error Passive': test_error_passive_avg,
                'Train Error Active': train_error_active_avg,
                'Valid Error Active': valid_error_active_avg,
                'Test Error Active': test_error_active_avg}
summary = pd.DataFrame.from_dict(summary_dict, orient='columns')
summary
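As an optional follow-up (exploratory only, not part of the assignment), one could count at how many training-set sizes the active strategy attains a lower average test error than the passive one:
# Count the training-set sizes where active learning beats passive on test error
# (the interpretation depends on the actual run's results)
print((summary['Test Error Active'] < summary['Test Error Passive']).sum(), 'of', len(summary))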
# Plotting test errors for passive and active learning
# Customize color palette
custom_palette = sns.color_palette("husl", 2)
sns.set_palette(custom_palette)
# Create figure and axis
fig, axs = plt.subplots(figsize=(16,8))
# Plot lineplots
l1 = sns.lineplot(y=summary['Test Error Passive'].values, x=summary.index)
l2 = sns.lineplot(y=summary['Test Error Active'].values, x=summary.index)
# Format figure and axis
axs.legend(labels=['Test Error Passive', 'Test Error Active'], fontsize=16)
fig.suptitle('Learning Curve', y=0.95, size=25, fontweight='bold')
axs.set_xlabel('Training Set Size', fontsize=16, labelpad=15)
axs.xaxis.set_major_locator(ticker.MultipleLocator(100))
axs.set_ylabel('Classification Error', fontsize=16, labelpad=15)
plt.show()