DSCI 552 | Machine Learning for Data Science
Homework 6
Matheus Schmitz
USC ID: 5039286453
# Data Science
import numpy as np
import pandas as pd
# Scikit Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
# Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
# Progress Bar
from tqdm import tqdm
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
Source Link: https://archive.ics.uci.edu/ml/datasets/banknote+authentication
Data Set Information:
Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400x400 pixels. Due to the object lens and the distance to the investigated object, gray-scale pictures with a resolution of about 660 dpi were obtained. A Wavelet Transform tool was used to extract features from the images.
Attribute Information:
1. Variance of Wavelet Transformed image (continuous)
2. Skewness of Wavelet Transformed image (continuous)
3. Curtosis of Wavelet Transformed image (continuous)
4. Entropy of image (continuous)
5. Class (integer)
# Load Data
df = pd.read_csv('../data/data_banknote_authentication.txt', header=None,
                 names=['Variance', 'Skewness', 'Curtosis', 'Entropy', 'Class'])
print(f'df.shape: {df.shape}')
df.head(3)
# Train-Test Split: hold out 472 samples for testing, leaving 900 for training
df_train, df_test = train_test_split(df, test_size=472/len(df), stratify=df.Class)
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
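Since the split is stratified on `Class`, a quick sanity check (a minimal sketch, not part of the original pipeline) can confirm that both partitions keep roughly the same class proportions:
# Sanity check: class proportions should be nearly identical across the splits
print(df_train.Class.value_counts(normalize=True))
print(df_test.Class.value_counts(normalize=True))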
# Specify number of Monte-Carlo Simulations
N_SIMULATIONS = 50
# The active and passive runs each take about 20 seconds per simulation, i.e. roughly 40 seconds combined.
# For 50 simulations the expected processing time is therefore 50 * 40s = 2000s, or about 33.3 minutes.
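The draws below rely on pandas' `.sample()`, which (unless given a `random_state`) uses NumPy's global random state. If reproducible runs are wanted, a seed can be fixed up front; a minimal sketch is shown below, with the caveat that the original experiment did not fix one and the value is arbitrary:
# Optional: fix the global NumPy seed so the .sample() draws below are reproducible
# (the seed value 42 is an arbitrary choice, not part of the original experiment)
np.random.seed(42)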
# Grid Search to find best penalty parameter
svc = LinearSVC(penalty='l1', dual=False)
param_grid = {'C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(svc, param_grid=param_grid, cv=5, n_jobs=-1)
# Dictionaries to store errors at each n_train_samples
train_error_passive, valid_error_passive, test_error_passive = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_passive[train_samples] = []
    valid_error_passive[train_samples] = []
    test_error_passive[train_samples] = []
# 50 Monte-Carlo Simulations
for M in tqdm(range(N_SIMULATIONS)):
    # Dataframes to store samples during the steps
    train_samples = pd.DataFrame(columns=df.columns)
    valid_samples = df_train.copy()
    new_samples = pd.DataFrame(columns=df.columns)
    # As long as there are validation datapoints to be drawn, keep iterating
    while len(valid_samples) > 0:
        # Randomly sample 10 datapoints; if they are all from the same class,
        # redraw until both classes have been sampled
        while len(new_samples.Class.unique()) < 2:
            new_samples = valid_samples.sample(10)
            # If valid_samples no longer contains both classes, a dual-class
            # draw is impossible, so stop redrawing
            if len(valid_samples.Class.unique()) < 2:
                break
        # Move the samples from valid_samples to train_samples
        # (pd.concat replaces the deprecated DataFrame.append)
        train_samples = pd.concat([train_samples, new_samples])
        valid_samples.drop(new_samples.index, inplace=True)
        # Reset the new_samples object
        new_samples = pd.DataFrame(columns=df.columns)
        # Set the new train, validation and test data
        x_train = train_samples.iloc[:, :-1].copy()
        y_train = train_samples.iloc[:, -1].astype('int').copy()
        x_valid = valid_samples.iloc[:, :-1].copy()
        y_valid = valid_samples.iloc[:, -1].astype('int').copy()
        x_test = df_test.iloc[:, :-1].copy()
        y_test = df_test.iloc[:, -1].astype('int').copy()
        # Train the SVM with the current train_samples
        grid.fit(x_train, y_train)
        # Predict and calculate the error on the train data
        pred_train = grid.predict(x_train)
        train_error_passive[len(x_train)].append(1 - accuracy_score(y_train, pred_train))
        # Predict and calculate the error on the validation data
        if len(x_valid) > 0:
            pred_valid = grid.predict(x_valid)
            valid_error_passive[len(x_train)].append(1 - accuracy_score(y_valid, pred_valid))
        else:
            # In the final cycle there is no validation data left, so carry over the last error
            valid_error_passive[len(x_train)].append(valid_error_passive[len(x_train) - 10][-1])
        # Predict and calculate the error on the test data
        pred_test = grid.predict(x_test)
        test_error_passive[len(x_train)].append(1 - accuracy_score(y_test, pred_test))
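Before moving on, a quick consistency check (an illustrative sketch, not part of the original notebook) can verify that every training-set size accumulated exactly one error value per simulation:
# Each of the 90 keys (10, 20, ..., 900) should hold N_SIMULATIONS error values
assert all(len(v) == N_SIMULATIONS for v in train_error_passive.values())
assert all(len(v) == N_SIMULATIONS for v in test_error_passive.values())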
`decision_function(X)` predicts confidence scores for samples: the confidence score for a sample is the signed distance of that sample to the separating hyperplane. The active learner below uses the magnitude of this score to find the points the current model is least certain about.
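As a standalone illustration of how this score drives the sample selection below, the following self-contained sketch ranks an unlabeled pool by absolute distance to a fitted LinearSVC hyperplane; the toy data is synthetic and only demonstrates the mechanics:
# Illustrative sketch: pick the 10 points a linear SVM is least certain about
import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_labeled = np.vstack([rng.normal(-2, 1, (20, 2)), rng.normal(2, 1, (20, 2))])
y_labeled = np.array([0] * 20 + [1] * 20)
X_pool = rng.normal(0, 2, (100, 2))  # unlabeled candidate pool

clf = LinearSVC(dual=False).fit(X_labeled, y_labeled)
distances = np.abs(clf.decision_function(X_pool))  # |signed distance| to hyperplane
most_uncertain_idx = np.argsort(distances)[:10]    # 10 smallest margins first
print(most_uncertain_idx)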
# Grid Search to find best penalty parameter
svc = LinearSVC(penalty='l1', dual=False)
param_grid = {'C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(svc, param_grid=param_grid, cv=5, n_jobs=-1)
# Dictionaries to store errors at each n_train_samples
train_error_active, valid_error_active, test_error_active = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_active[train_samples] = []
    valid_error_active[train_samples] = []
    test_error_active[train_samples] = []
# 50 Monte-Carlo Simulations
for M in tqdm(range(N_SIMULATIONS)):
    # Dataframes to store samples during the steps
    train_samples = pd.DataFrame(columns=df.columns)
    valid_samples = df_train.copy()
    new_samples = pd.DataFrame(columns=df.columns)
    # As long as there are validation datapoints to be drawn, keep iterating
    while len(valid_samples) > 0:
        # For the first round, pick 10 random samples
        if len(train_samples) == 0:
            # If the samples are all from the same class, redraw until both classes have been sampled
            while len(new_samples.Class.unique()) < 2:
                new_samples = valid_samples.sample(10)
        # For all subsequent rounds, actively select the 10 most uncertain samples
        # (the 10 closest to the hyperplane)
        else:
            sample_distances = np.absolute(grid.decision_function(valid_samples.iloc[:, :-1]))
            idx_closest = np.argsort(sample_distances)[:10]
            new_samples = valid_samples.iloc[idx_closest]
        # Move the samples from valid_samples to train_samples
        # (pd.concat replaces the deprecated DataFrame.append)
        train_samples = pd.concat([train_samples, new_samples])
        valid_samples.drop(new_samples.index, inplace=True)
        # Reset the new_samples object
        new_samples = pd.DataFrame(columns=df.columns)
        # Set the new train, validation and test data
        x_train = train_samples.iloc[:, :-1].copy()
        y_train = train_samples.iloc[:, -1].astype('int').copy()
        x_valid = valid_samples.iloc[:, :-1].copy()
        y_valid = valid_samples.iloc[:, -1].astype('int').copy()
        x_test = df_test.iloc[:, :-1].copy()
        y_test = df_test.iloc[:, -1].astype('int').copy()
        # Train the SVM with the current train_samples
        grid.fit(x_train, y_train)
        # Predict and calculate the error on the train data
        pred_train = grid.predict(x_train)
        train_error_active[len(x_train)].append(1 - accuracy_score(y_train, pred_train))
        # Predict and calculate the error on the validation data
        if len(x_valid) > 0:
            pred_valid = grid.predict(x_valid)
            valid_error_active[len(x_train)].append(1 - accuracy_score(y_valid, pred_valid))
        else:
            # In the final cycle there is no validation data left, so carry over the last error
            valid_error_active[len(x_train)].append(valid_error_active[len(x_train) - 10][-1])
        # Predict and calculate the error on the test data
        pred_test = grid.predict(x_test)
        test_error_active[len(x_train)].append(1 - accuracy_score(y_test, pred_test))
# Average the errors for passive learning
# For each of the 90 SVMs, calculate the average error over the 50 simulations
train_error_passive_avg, valid_error_passive_avg, test_error_passive_avg = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_passive_avg[train_samples] = np.mean(train_error_passive[train_samples])
    valid_error_passive_avg[train_samples] = np.mean(valid_error_passive[train_samples])
    test_error_passive_avg[train_samples] = np.mean(test_error_passive[train_samples])
# Average the errors for active learning
# For each of the 90 SVMs, calculate the average error over the 50 simulations
train_error_active_avg, valid_error_active_avg, test_error_active_avg = {}, {}, {}
for train_samples in range(10, 901, 10):
    train_error_active_avg[train_samples] = np.mean(train_error_active[train_samples])
    valid_error_active_avg[train_samples] = np.mean(valid_error_active[train_samples])
    test_error_active_avg[train_samples] = np.mean(test_error_active[train_samples])
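Beyond the means, the spread across simulations can be summarized the same way; the sketch below (supplementary, not part of the original averaging) uses `np.std` over the stored error lists to gauge Monte-Carlo variability at each training-set size:
# Supplementary: standard deviation of the test error across the 50 simulations
test_error_passive_std = {n: np.std(test_error_passive[n]) for n in range(10, 901, 10)}
test_error_active_std = {n: np.std(test_error_active[n]) for n in range(10, 901, 10)}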
# Dataframe with all averaged errors
summary_dict = {'Train Error Passive': train_error_passive_avg,
                'Valid Error Passive': valid_error_passive_avg,
                'Test Error Passive': test_error_passive_avg,
                'Train Error Active': train_error_active_avg,
                'Valid Error Active': valid_error_active_avg,
                'Test Error Active': test_error_active_avg}
summary = pd.DataFrame.from_dict(summary_dict, orient='columns')
summary
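As an optional follow-up (exploratory only, not part of the assignment), one could count at how many training-set sizes the active strategy attains a lower average test error than the passive one:
# Count the training-set sizes where active learning beats passive on test error
# (the interpretation depends on the actual run's results)
print((summary['Test Error Active'] < summary['Test Error Passive']).sum(), 'of', len(summary))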
# Plotting test errors for passive and active learning
# Customize color palette
custom_palette = sns.color_palette("husl", 2)
sns.set_palette(custom_palette)
# Create figure and axis
fig, axs = plt.subplots(figsize=(16,8))
# Plot lineplots
l1 = sns.lineplot(y=summary['Test Error Passive'].values, x=summary.index)
l2 = sns.lineplot(y=summary['Test Error Active'].values, x=summary.index)
# Format figure and axis
axs.legend(labels=['Test Error Passive', 'Test Error Active'], fontsize=16)
fig.suptitle('Learning Curve', y=0.95, size=25, fontweight='bold')
axs.set_xlabel('Training Set Size', fontsize=16, labelpad=15)
axs.xaxis.set_major_locator(ticker.MultipleLocator(100))
axs.set_ylabel('Classification Error', fontsize=16, labelpad=15)
plt.show()