Business Problem

Detecting gender from an audio file.

Data Source

Using a dataset from Google available at:

https://research.google.com/audioset/

The training dataset consists of YouTube audio files totaling 10 minutes: 5 minutes per gender, spoken by 5 different people each, so 1 minute per person (5 male and 5 female speakers).

The testing dataset was extracted from AudioSet, a corpus containing 558 speech clips spoken by women and 546 spoken by men. Each audio file is 10 seconds long and sampled at 16,000 Hz.

Packages

In [1]:
# Imports

# Data manipulation and visualization
import os
import matplotlib
import numpy as np
import pickle as cPickle
import matplotlib.pyplot as plt

# Feature extraction
import scipy
import librosa
import python_speech_features as psf
from scipy.io import wavfile

# Model training
import sklearn
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture as GMM

# Audio Recording
import sounddevice as sd
import soundfile as sf
In [2]:
# Package versions
%reload_ext watermark
%watermark --iversions
matplotlib  3.2.2
sklearn     0.23.1
sounddevice 0.3.15
librosa     0.7.2
numpy       1.18.5
scipy       1.5.0
soundfile   0.10.3

Loading Data

There will be two models, one for male data and another for female data. The audio files are already split accordingly.

Audio Files - Male Voices

In [3]:
# Path for the training data with male voices
train_voice_m = 'data/train/youtube/male'
In [4]:
# Create list with all .wav files
list_files_m = [os.path.join(train_voice_m, f) for f in os.listdir(train_voice_m) if f.endswith('.wav')]
In [5]:
# Check list content
list_files_m
Out[5]:
['data/train/youtube/male\\male1.wav',
 'data/train/youtube/male\\male2.wav',
 'data/train/youtube/male\\male3.wav',
 'data/train/youtube/male\\male4.wav',
 'data/train/youtube/male\\male5.wav']
In [6]:
# Object type
type(list_files_m)
Out[6]:
list

Audio Files - Female Voices

In [7]:
# Path for the training data with female voices
train_voice_f = 'data/train/youtube/female'
In [8]:
# Create list with all .wav files
list_files_f = [os.path.join(train_voice_f, f) for f in os.listdir(train_voice_f) if f.endswith('.wav')]
In [9]:
# Check list content
list_files_f
Out[9]:
['data/train/youtube/female\\female1.wav',
 'data/train/youtube/female\\female2.wav',
 'data/train/youtube/female\\female3.wav',
 'data/train/youtube/female\\female4.wav',
 'data/train/youtube/female\\female5.wav']
In [10]:
# Object type
type(list_files_f)
Out[10]:
list

Extracting MFCC Features

Using the python_speech_features package: https://python-speech-features.readthedocs.io/en/latest/

It provides the mfcc() function to extract MFCC features from an audio signal.

In [11]:
# Function to extract MFCC features and scale them
def extract_mfcc(sr, audio):
    
    # Extract features from audio frames
    features = psf.mfcc(audio, sr, 
                        winlen = 0.025, winstep = 0.01,
                        numcep = 13, appendEnergy = True)
    
    # Standardizing (scaling) features
    features = preprocessing.scale(features)
    
    return features
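As a quick sanity check (not part of the original notebook), extract_mfcc can be run on a short synthetic tone. The output is a 2-D array with one row per 25 ms frame (10 ms step) and 13 columns, one per cepstral coefficient. The signal below is made up purely for illustration.

# Hypothetical sanity check: 1 second of a 440 Hz tone at 16 kHz
sr_demo = 16000
t = np.linspace(0, 1, sr_demo, endpoint = False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

demo_features = extract_mfcc(sr_demo, tone)
print(demo_features.shape)   # (n_frames, 13); roughly 100 frames for 1 second of audio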
In [12]:
# Creating arrays to receive the extracted features
features_m = np.asarray(())
features_f = np.asarray(())
In [13]:
# Extracting features for the male audio files
for file_m in list_files_m:
    
    # Read the audio file
    sr, audio = wavfile.read(file_m)
    
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    
    # If the accumulator array is still empty, assign the first feature matrix directly;
    # otherwise stack the new features below the existing ones.
    # The check is needed because np.vstack cannot stack onto an empty array.
    if features_m.size == 0:
        features_m = feature_vector
    else:
        features_m = np.vstack((features_m, feature_vector))
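As a side note, an equivalent pattern (just a sketch, not what this notebook uses) is to collect each file's feature matrix in a Python list and stack once at the end, which avoids the empty-array special case:

# Alternative sketch: accumulate per-file feature matrices in a list and stack once
blocks = []
for file_m in list_files_m:
    sr, audio = wavfile.read(file_m)
    blocks.append(extract_mfcc(sr, audio))
features_m_alt = np.vstack(blocks)   # same content as features_m built above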
In [14]:
# Extracted features
features_m
Out[14]:
array([[-2.17442346, -0.55387187,  0.14534254, ..., -0.17595846,
        -0.69355552,  1.06524329],
       [-2.20149991, -0.47781889,  0.48374196, ..., -0.38345243,
        -0.76356038,  1.13956846],
       [-2.18603295, -0.62058102,  0.21628953, ...,  0.10395744,
        -0.56469634,  0.79154986],
       ...,
       [-2.06799885, -1.14589413,  0.65274852, ...,  1.13735423,
         0.23905922,  0.33625924],
       [-2.1203576 , -1.03779545,  0.83049988, ...,  0.95875965,
         0.65757472,  0.85409856],
       [-2.27073073, -0.88582047,  1.07441446, ...,  0.98584747,
         0.46769639,  0.68195901]])
In [15]:
# Extracting features for the female audio files
for file_f in list_files_f:
    
    # Read the audio file
    sr, audio = wavfile.read(file_f)
    
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    
    # If the accumulator array is still empty, assign the first feature matrix directly;
    # otherwise stack the new features below the existing ones.
    # The check is needed because np.vstack cannot stack onto an empty array.
    if features_f.size == 0:
        features_f = feature_vector
    else:
        features_f = np.vstack((features_f, feature_vector))
In [16]:
# Extracted features
features_f
Out[16]:
array([[ 0.39577634,  0.5179585 ,  0.95218191, ...,  1.33169242,
         1.16834709, -1.3924513 ],
       [ 0.18475744,  1.2545105 ,  0.9253783 , ...,  0.33311528,
         0.8407119 , -1.86215752],
       [-0.21848578,  0.89438619,  1.35277608, ...,  0.53590941,
         0.25581686, -1.11834446],
       ...,
       [-1.74177759, -0.22278724,  0.62891626, ..., -0.16816487,
        -0.5033839 , -1.81041936],
       [-1.50532737, -0.37325447,  0.51987615, ..., -1.02577787,
        -0.96314989, -2.67721113],
       [-1.6352069 , -0.39439019,  0.65016017, ..., -1.2399293 ,
        -0.41338792, -1.87413114]])

MFCC Feature Visualization

In [17]:
len(features_m)
Out[17]:
30169
In [18]:
len(features_f)
Out[18]:
30065
In [19]:
# Plot to visualize the features as signals

# Plot area
plt.figure(figsize = (16, 8))

# Loop through the features
for i in range(1, 30000, 1000):
    plt.plot(features_m[i], c = 'blue', linewidth = 0.5, alpha = 0.5)
    plt.plot(features_f[i], c = 'magenta', linewidth = 0.5, alpha = 0.5)
    
# Plot
plt.plot(features_m[i+1], c = 'blue', label = 'Male Voice', linewidth = 0.5, alpha = 0.5)
plt.plot(features_f[i+1], c = 'magenta', label = 'Female Voice', linewidth = 0.5, alpha = 0.5)
plt.legend()
plt.title('\nMFCC Features')
plt.show()

Machine Learning

Using a Gaussian Mixture Model (GMM), an unsupervised algorithm.

From Wikipedia: In statistics, a mixture model is a probabilistic model for representing the presence of subpopulations within an overall population, without requiring that an observed data set should identify the sub-population to which an individual observation belongs.

The idea behind using a GMM is to approximate the probability distribution of a class by a linear combination of k Gaussians, also known as the GMM components.

Essentially, it groups together samples that, taken as a group, follow a normal distribution.
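In other words, with diagonal covariances the density of a 13-dimensional MFCC frame x is modeled as a weighted sum of k Gaussians, p(x) = Σ w_i · N(x; μ_i, σ_i²), and scoring a frame returns its log-likelihood under that mixture. A minimal illustration on made-up 1-D data (purely to show the mechanics, not part of the pipeline):

# Toy GMM on synthetic 1-D data, for illustration only
rng = np.random.RandomState(0)
toy = np.concatenate([rng.normal(-2, 0.5, 500), rng.normal(3, 1.0, 500)]).reshape(-1, 1)

toy_gmm = GMM(n_components = 2, covariance_type = 'diag').fit(toy)
print(toy_gmm.weights_)                  # mixing weights w_i (sum to 1)
print(toy_gmm.means_.ravel())            # component means, close to -2 and 3
print(toy_gmm.score(np.array([[3.0]])))  # log-likelihood of a point under the mixture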

Model for Male Voices

In [20]:
# Create model
model_gmm_m = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
In [21]:
%%time

# Train model
model_gmm_m.fit(features_m)
Wall time: 1.83 s
Out[21]:
GaussianMixture(covariance_type='diag', max_iter=200, n_components=8, n_init=3)
In [22]:
# Save model to disk
cPickle.dump(model_gmm_m, open('models/'+'model_gmm_m', 'wb'))

Model for Female Voices

In [23]:
# Create model
model_gmm_f = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
In [24]:
%%time

# Train model
model_gmm_f.fit(features_f)
Wall time: 1.51 s
Out[24]:
GaussianMixture(covariance_type='diag', max_iter=200, n_components=8, n_init=3)
In [25]:
# Save model to disk
cPickle.dump(model_gmm_f, open('models/'+'model_gmm_f', 'wb'))

Model Evaluation

Loading Test Data

In [26]:
# Path for the test data
test_voice_m = 'data/test/AudioSet/male_clips'
test_voice_f = 'data/test/AudioSet/female_clips'
In [27]:
# Create lists with all .wav files
list_test_m = [os.path.join(test_voice_m, f) for f in os.listdir(test_voice_m) if f.endswith('.wav')]
list_test_f = [os.path.join(test_voice_f, f) for f in os.listdir(test_voice_f) if f.endswith('.wav')]
In [28]:
# Sample test data
list_test_m[:3]
Out[28]:
['data/test/AudioSet/male_clips\\-0CAdy06NRo.wav',
 'data/test/AudioSet/male_clips\\-0DO0ulATPY.wav',
 'data/test/AudioSet/male_clips\\-0kDcUEDfmY.wav']
In [29]:
# Sample test data
list_test_f[:3]
Out[29]:
['data/test/AudioSet/female_clips\\--EQQVMYe50.wav',
 'data/test/AudioSet/female_clips\\--IHiTgsaIk.wav',
 'data/test/AudioSet/female_clips\\--K91QrLI4g.wav']

Extract MFCC Features

In [30]:
# Creating arrays to receive the extracted features
features_test_m = np.asarray(())
features_test_f = np.asarray(())
In [31]:
%%time
# Extracting features for the male audio files
for file_test_m in list_test_m:
    
    # Read the audio file
    sr, audio = wavfile.read(file_test_m)
    
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    
    # If the accumulator array is still empty, assign the first feature matrix directly;
    # otherwise stack the new features below the existing ones.
    # The check is needed because np.vstack cannot stack onto an empty array.
    if features_test_m.size == 0:
        features_test_m = feature_vector
    else:
        features_test_m = np.vstack((features_test_m, feature_vector))
Wall time: 34 s
In [32]:
%%time
# Extracting features for the female audio files
for file_test_f in list_test_f:
    
    # Read the audio file
    sr, audio = wavfile.read(file_test_f)
    
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    
    # If the accumulator array is still empty, assign the first feature matrix directly;
    # otherwise stack the new features below the existing ones.
    # The check is needed because np.vstack cannot stack onto an empty array.
    if features_test_f.size == 0:
        features_test_f = feature_vector
    else:
        features_test_f = np.vstack((features_test_f, feature_vector))
Wall time: 34.6 s
In [33]:
# Shape
features_test_m.shape
Out[33]:
(544372, 13)
In [34]:
# Shape
features_test_f.shape
Out[34]:
(555154, 13)

Test Model

In [35]:
# List to receive output
output_m = []
In [36]:
%%time
# Loop through the MFCC features for the male voices
# Testing them on both (male and female) models
for f in features_test_m:
    
    # Apply the male voice model
    log_likelihood_male = np.array(model_gmm_m.score([f])).sum()
    
    # Apply the female voice model
    log_likelihood_female = np.array(model_gmm_f.score([f])).sum()   
    
    # Check which model yields the greater log-likelihood and assign the gender
    # Correct predictions are recorded as 0, which simplifies the accuracy calculation below
    if log_likelihood_male > log_likelihood_female:
        output_m.append(0)
    else:
        output_m.append(1)
Wall time: 3min 9s
In [37]:
# Accuracy
acc_m = (1 - sum(output_m) / len(output_m))
print('Model Accuracy Recognizing Male Voices: ', acc_m)
Model Accuracy Recognizing Male Voices:  0.5329407096617754
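The per-frame loop above calls score once for every frame, which is why it takes minutes. A vectorized sketch (assuming the same models and feature arrays) produces identical labels in a single call via score_samples:

# Vectorized sketch: per-frame log-likelihoods for all male test frames at once
ll_m = model_gmm_m.score_samples(features_test_m)   # log-likelihood under the male model
ll_f = model_gmm_f.score_samples(features_test_m)   # log-likelihood under the female model

output_m_vec = (ll_m <= ll_f).astype(int)           # 0 = classified as male, 1 = as female
print('Accuracy (vectorized):', 1 - output_m_vec.mean())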
In [38]:
# List to receive output
output_f = []
In [39]:
%%time
# Loop through the MFCC features for the female voices
# Testing them on both (male and female) models
for f in features_test_f:
    
    # Apply the male voice model
    log_likelihood_male = np.array(model_gmm_m.score([f])).sum()
    
    # Apply the female voice model
    log_likelihood_female = np.array(model_gmm_f.score([f])).sum()   
    
    # Check which model yields the greater log-likelihood and assign the gender
    # Correct predictions are recorded as 0, which simplifies the accuracy calculation below
    if log_likelihood_female > log_likelihood_male:
        output_f.append(0)
    else:
        output_f.append(1)
Wall time: 3min 21s
In [40]:
# Accuracy
acc_f = (1 - sum(output_f) / len(output_f))
print('Model Accuracy Recognizing Female Voices: ', acc_f)
Model Accuracy Recognizing Female Voices:  0.6735392341584497

Training with the Test Dataset (aka heresy)

In [41]:
# Function to extract MFCC features from all available data
def extract_features_all(source):
    
    # Split files
    list_files = [os.path.join(source, f) for f in os.listdir(source) if f.endswith('.wav')]
    
    # Feature list
    features = []
    
    # Loop through the file list
    for file in list_files:
        
        # Read the audio file
        sr, audio = wavfile.read(file)
        
        # Extract the feature vector
        feature_vector = extract_mfcc(sr, audio)

        # If the features accumulator is still empty, assign the first feature matrix directly;
        # otherwise stack the new features below the existing ones.
        # The check is needed because np.vstack cannot stack onto an empty array.
        if len(features) == 0:
            features = feature_vector
        else:
            features = np.vstack((features, feature_vector)) 
            
    return features

Model with Male Voices

In [42]:
%%time
# Extract the male MFCC features
source_male = 'data/test/AudioSet/male_clips'
features_male = extract_features_all(source_male)

# Train model with male voices
model_gmm_m_2 = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
model_gmm_m_2.fit(features_male)
Wall time: 1min 13s
Out[42]:
GaussianMixture(covariance_type='diag', max_iter=200, n_components=8, n_init=3)
In [43]:
# Save model to disk
cPickle.dump(model_gmm_m_2, open('models/' + 'model_gmm_m_2.gmm', 'wb'))

Model with Female Voices

In [44]:
%%time
# Extract the female MFCC features
source_female = 'data/test/AudioSet/female_clips'
features_female = extract_features_all(source_female)

# Train model with female voices
model_gmm_f_2 = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
model_gmm_f_2.fit(features_female)
Wall time: 1min 28s
Out[44]:
GaussianMixture(covariance_type='diag', max_iter=200, n_components=8, n_init=3)
In [45]:
# Save model to disk
cPickle.dump(model_gmm_f_2, open('models/' + 'model_gmm_f_2.gmm', 'wb'))
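Because both retrained models are pickled, they can be reloaded later (for instance before the real-time recognition step) without refitting. A small sketch, assuming the same file paths used in the dump calls above:

# Reload the saved models from disk
with open('models/model_gmm_m_2.gmm', 'rb') as f_in:
    model_gmm_m_2 = cPickle.load(f_in)

with open('models/model_gmm_f_2.gmm', 'rb') as f_in:
    model_gmm_f_2 = cPickle.load(f_in)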

Visualize MFCC Features

In [46]:
len(features_male)
Out[46]:
544372
In [47]:
len(features_female)
Out[47]:
555154
In [48]:
# Plot to visualize the features as signals

# Plot area
plt.figure(figsize = (16, 8))

# Loop through the features
for i in range(1, 540000, 1000):
    plt.plot(features_male[i], c = 'blue', linewidth = 0.5, alpha = 0.5)
    plt.plot(features_female[i], c = 'magenta', linewidth = 0.5, alpha = 0.5)
    
# Plot
plt.plot(features_male[i+1], c = 'blue', label = 'Male Voice', linewidth = 0.5, alpha = 0.5)
plt.plot(features_female[i+1], c = 'magenta', label = 'Female Voice', linewidth = 0.5, alpha = 0.5)
plt.legend()
plt.title('\nMFCC Features')
plt.show()

Testing Model with Training Dataset (reverse heresy)

In [49]:
# List to receive output
output_male = []
In [50]:
%%time
# Loop through the MFCC features for the male voices
# Testing them on both (male and female) models
for f in features_m:
    
    # Apply the male voice model
    log_likelihood_male = np.array(model_gmm_m_2.score([f])).sum()
    
    # Apply the female voice model
    log_likelihood_female = np.array(model_gmm_f_2.score([f])).sum()   
    
    # Check which model yields the greater log-likelihood and assign the gender
    # Correct predictions are recorded as 0, which simplifies the accuracy calculation below
    if log_likelihood_male > log_likelihood_female:
        output_male.append(0)
    else:
        output_male.append(1)
Wall time: 11.2 s
In [51]:
# Accuracy
acc_m_2 = (1 - sum(output_male) / len(output_male))
print('Model Accuracy Recognizing Male Voices: ', acc_m_2)
Model Accuracy Recognizing Male Voices:  0.6503695846730086
In [52]:
# List to receive output
output_female = []
In [53]:
%%time
# Loop through the MFCC features for the female voices
# Testing them on both (male and female) models
for f in features_f:
    
    # Apply the male voice model
    log_likelihood_male = np.array(model_gmm_m_2.score([f])).sum()
    
    # Apply the female voice model
    log_likelihood_female = np.array(model_gmm_f_2.score([f])).sum()   
    
    # Check which model yields the greater log-likelihood and assign the gender
    # Correct predictions are recorded as 0, which simplifies the accuracy calculation below
    if log_likelihood_female > log_likelihood_male:
        output_female.append(0)
    else:
        output_female.append(1)
Wall time: 11 s
In [54]:
# Accuracy
acc_f_2 = (1 - sum(output_female) / len(output_female))
print('Model Accuracy Recognizing Female Voices: ', acc_f_2)
Model Accuracy Recognizing Female Voices:  0.701879261599867

Both the male and female models improved when trained on the larger dataset. Not a big surprise, considering how small the original training set is.

Real Time Voice Recognition

In [55]:
# Function to record audio and recognize gender in real time
# Function to record audio and recognize gender in real time
# Note: the filename parameter is kept in the signature but the clip is not written to disk
def real_time_gender_recognition(sr = 16000, channels = 1, duration = 3, filename = 'temp/voice.wav'):
    
    # Record voice (sd.rec returns immediately; the buffer fills asynchronously)
    audio_record = sd.rec(int(duration * sr), samplerate = sr, channels = channels)
    
    # Wait until the recording is finished before using the buffer
    sd.wait()
    
    # Flatten to a 1-D signal
    audio_record = audio_record.reshape(-1)
    
    # Extract MFCC features
    mfcc_features = extract_mfcc(sr, audio_record)
    
    # Apply the models (log-likelihood of the recording under each model)
    log_likelihood_m = np.array(model_gmm_m_2.score(mfcc_features)).sum()
    log_likelihood_f = np.array(model_gmm_f_2.score(mfcc_features)).sum()

    # Check which model yields the higher log-likelihood and assign the gender
    if log_likelihood_m >= log_likelihood_f:
        return 'Male Voice'
    else:
        return 'Female Voice'
In [56]:
# Execute function
real_time_gender_recognition()
Out[56]:
'Male Voice'

The End