Detecting gender from an audio file.
Using a dataset from Google available at:
https://research.google.com/audioset/
The training dataset consists of YouTube audio files totaling 10 minutes: 5 minutes per gender, 1 minute from each of 5 different speakers (5 male and 5 female).
The testing dataset was extracted from "AudioSet", a corpus containing 558 speech clips spoken by women and 546 spoken by men. Each audio file is 10 seconds long and sampled at 16,000 Hz.
# Imports
# Data manipulation and visualization
import os
import matplotlib
import numpy as np
import pickle as cPickle  # Python 3's pickle (the cPickle alias is kept from legacy naming)
import matplotlib.pyplot as plt
# Feature extraction
import scipy
import librosa
import python_speech_features as psf
from scipy.io import wavfile
# Model training
import sklearn
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture as GMM
# Audio Recording
import sounddevice as sd
import soundfile as sf
# Package versions
%reload_ext watermark
%watermark --iversions
There will be two models, one for male data and another for female data. The audio files are already split accordingly.
Audio Files - Male Voices
# Path for the training data with male voices
train_voice_m = 'data/train/youtube/male'
# Create list with all .wav files
list_files_m = [os.path.join(train_voice_m, f) for f in os.listdir(train_voice_m) if f.endswith('.wav')]
# Check list content
list_files_m
# Object type
type(list_files_m)
Audio Files - Female Voices
# Path for the training data with female voices
train_voice_f = 'data/train/youtube/female'
# Create list with all .wav files
list_files_f = [os.path.join(train_voice_f, f) for f in os.listdir(train_voice_f) if f.endswith('.wav')]
# Check list content
list_files_f
# Object type
type(list_files_f)
Feature extraction uses the python_speech_features package: https://python-speech-features.readthedocs.io/en/latest/
It provides a function to extract MFCC features from audio signals.
# Function to extract MFCC features and scale them
def extract_mfcc(sr, audio):
    # Extract MFCC features from 25 ms frames with a 10 ms step
    features = psf.mfcc(audio, sr,
                        winlen = 0.025, winstep = 0.01,
                        numcep = 13, appendEnergy = True)
    # Standardize (scale) the features
    features = preprocessing.scale(features)
    return features
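As a quick sanity check (a minimal sketch using the first male training file listed above), extract_mfcc should return one 13-dimensional row per 10 ms frame step:
# Read the first male training file and inspect the feature matrix
sr, audio = wavfile.read(list_files_m[0])
print(extract_mfcc(sr, audio).shape)  # (number of frames, 13)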
# Creating arrays to receive the extracted features
features_m = np.asarray(())
features_f = np.asarray(())
# Extracting features from the male audio files
for file_m in list_files_m:
    # Read the audio file
    sr, audio = wavfile.read(file_m)
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    # If the accumulator array (created in the previous cell) is still empty,
    # assign the first feature matrix directly; otherwise stack vertically
    # (np.vstack fails when one operand is an empty array)
    if features_m.size == 0:
        features_m = feature_vector
    else:
        features_m = np.vstack((features_m, feature_vector))
# Extracted features
features_m
# Extracting features from the female audio files
for file_f in list_files_f:
    # Read the audio file
    sr, audio = wavfile.read(file_f)
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    # Assign directly if the accumulator is empty; otherwise stack vertically
    # (np.vstack fails when one operand is an empty array)
    if features_f.size == 0:
        features_f = feature_vector
    else:
        features_f = np.vstack((features_f, feature_vector))
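Note: growing an array with np.vstack inside a loop re-copies the data on every iteration. A sketch of an equivalent, faster pattern (features_m_alt is a hypothetical name, not used in the runs below): collect the per-file matrices in a list and stack once at the end.
# Equivalent accumulation without repeated vstack calls
feature_list = []
for file_m in list_files_m:
    sr, audio = wavfile.read(file_m)
    feature_list.append(extract_mfcc(sr, audio))
features_m_alt = np.vstack(feature_list)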
# Extracted features
features_f
len(features_m)
len(features_f)
# Plot to visualize the features as signals
# Plot area
plt.figure(figsize = (16, 8))
# Loop through the features
for i in range(1, 30000, 1000):
    plt.plot(features_m[i], c = 'blue', linewidth = 0.5, alpha = 0.5)
    plt.plot(features_f[i], c = 'magenta', linewidth = 0.5, alpha = 0.5)
# Plot one more pair with labels so the legend shows each class once
plt.plot(features_m[i+1], c = 'blue', label = 'Male Voice', linewidth = 0.5, alpha = 0.5)
plt.plot(features_f[i+1], c = 'magenta', label = 'Female Voice', linewidth = 0.5, alpha = 0.5)
plt.legend()
plt.title('\nMFCC Features')
plt.show()
Using a Gaussian Mixture Model (GMM), an unsupervised algorithm.
From Wikipedia: In statistics, a mixture model is a probabilistic model for representing the presence of subpopulations within an overall population, without requiring that an observed data set should identify the sub-population to which an individual observation belongs.
The idea behind using a GMM is to approximate the probability distribution of a class by a linear combination of k Gaussians, also known as the GMM components.
Essentially, it groups together samples that, taken as a group, follow a normal distribution.
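A minimal illustration of the idea on synthetic 1-D data (demo_data and demo_gmm are hypothetical names, used only for this demo): fitting a 2-component GMM on two overlapping normal populations should recover their means and mixing weights.
# Fit a 2-component GMM on two synthetic normal populations
demo_data = np.concatenate([np.random.normal(0, 1, 500),
                            np.random.normal(5, 1, 500)]).reshape(-1, 1)
demo_gmm = GMM(n_components = 2, covariance_type = 'diag').fit(demo_data)
print(demo_gmm.means_)    # component means, near 0 and 5
print(demo_gmm.weights_)  # mixing weights, near 0.5 each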
Model for Male Voices
# Create model
model_gmm_m = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
%%time
# Train model
model_gmm_m.fit(features_m)
# Save model to disk
with open('models/' + 'model_gmm_m', 'wb') as f:
    cPickle.dump(model_gmm_m, f)
Model for Female Voices
# Create model
model_gmm_f = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
%%time
# Train model
model_gmm_f.fit(features_f)
# Save model to disk
with open('models/' + 'model_gmm_f', 'wb') as f:
    cPickle.dump(model_gmm_f, f)
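To reuse a saved model later without refitting, it can be loaded back from disk (a minimal sketch; model_loaded is a hypothetical name):
# Load a saved model back from disk
with open('models/model_gmm_m', 'rb') as f:
    model_loaded = cPickle.load(f)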
# Path for the test data
test_voice_m = 'data/test/AudioSet/male_clips'
test_voice_f = 'data/test/AudioSet/female_clips'
# Create lists with all .wav files
list_test_m = [os.path.join(test_voice_m, f) for f in os.listdir(test_voice_m) if f.endswith('.wav')]
list_test_f = [os.path.join(test_voice_f, f) for f in os.listdir(test_voice_f) if f.endswith('.wav')]
# Sample test data
list_test_m[:3]
# Sample test data
list_test_f[:3]
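A quick sanity check against the corpus description above: each test clip should be about 10 seconds long at 16,000 Hz.
# Verify sample rate and duration of the first test clip
sr, audio = wavfile.read(list_test_m[0])
print(sr, len(audio) / sr)  # expect 16000 and ~10.0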
# Creating arrays to receive the extracted features
features_test_m = np.asarray(())
features_test_f = np.asarray(())
%%time
# Extracting features from the male audio files
for file_test_m in list_test_m:
    # Read the audio file
    sr, audio = wavfile.read(file_test_m)
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    # Assign directly if the accumulator is empty; otherwise stack vertically
    # (np.vstack fails when one operand is an empty array)
    if features_test_m.size == 0:
        features_test_m = feature_vector
    else:
        features_test_m = np.vstack((features_test_m, feature_vector))
%%time
# Extracting features from the female audio files
for file_test_f in list_test_f:
    # Read the audio file
    sr, audio = wavfile.read(file_test_f)
    # Extract the feature vector
    feature_vector = extract_mfcc(sr, audio)
    # Assign directly if the accumulator is empty; otherwise stack vertically
    # (np.vstack fails when one operand is an empty array)
    if features_test_f.size == 0:
        features_test_f = feature_vector
    else:
        features_test_f = np.vstack((features_test_f, feature_vector))
# Shape
features_test_m.shape
# Shape
features_test_f.shape
# List to receive output
output_m = []
%%time
# Loop through the MFCC features for the male voices,
# testing each frame on both (male and female) models
for f in features_test_m:
    # Apply the male voice model
    log_likelihood_male = model_gmm_m.score([f])
    # Apply the female voice model
    log_likelihood_female = model_gmm_f.score([f])
    # Check which model has the greater likelihood and assign the gender
    # Using 0 (zero) for a correct prediction simplifies the accuracy calculation below
    if log_likelihood_male > log_likelihood_female:
        output_m.append(0)
    else:
        output_m.append(1)
# Accuracy
acc_m = (1 - sum(output_m) / len(output_m))
print('Model Accuracy Recognizing Male Voices: ', acc_m)
# List to receive output
output_f = []
%%time
# Loop through the MFCC features for the female voices,
# testing each frame on both (male and female) models
for f in features_test_f:
    # Apply the male voice model
    log_likelihood_male = model_gmm_m.score([f])
    # Apply the female voice model
    log_likelihood_female = model_gmm_f.score([f])
    # Check which model has the greater likelihood and assign the gender
    # Using 0 (zero) for a correct prediction simplifies the accuracy calculation below
    if log_likelihood_female > log_likelihood_male:
        output_f.append(0)
    else:
        output_f.append(1)
# Accuracy
acc_f = (1 - sum(output_f) / len(output_f))
print('Model Accuracy Recognizing Female Voices: ', acc_f)
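As an aside, the frame-by-frame loops above can be vectorized with score_samples, which returns the per-frame log-likelihoods for a whole feature matrix in one call (a sketch; up to tie-handling it matches the loops):
# Per-frame log-likelihoods under each model, computed in one call
acc_m_vec = np.mean(model_gmm_m.score_samples(features_test_m) >
                    model_gmm_f.score_samples(features_test_m))
acc_f_vec = np.mean(model_gmm_f.score_samples(features_test_f) >
                    model_gmm_m.score_samples(features_test_f))
print(acc_m_vec, acc_f_vec)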
# Function to extract MFCC features from all .wav files in a directory
def extract_features_all(source):
    # List all .wav files
    list_files = [os.path.join(source, f) for f in os.listdir(source) if f.endswith('.wav')]
    # Feature accumulator
    features = []
    # Loop through the file list
    for file in list_files:
        # Read the audio file
        sr, audio = wavfile.read(file)
        # Extract the feature vector
        feature_vector = extract_mfcc(sr, audio)
        # Assign directly if the accumulator is empty; otherwise stack vertically
        # (np.vstack fails when one operand is an empty array)
        if len(features) == 0:
            features = feature_vector
        else:
            features = np.vstack((features, feature_vector))
    return features
Model with Male Voices
%%time
# Extract the male MFCC features
source_male = 'data/test/AudioSet/male_clips'
features_male = extract_features_all(source_male)
# Train model with male voices
model_gmm_m_2 = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
model_gmm_m_2.fit(features_male)
# Save model to disk
with open('models/' + 'model_gmm_m_2.gmm', 'wb') as f:
    cPickle.dump(model_gmm_m_2, f)
Model with Female Voices
%%time
# Extract the female MFCC features
source_female = 'data/test/AudioSet/female_clips'
features_female = extract_features_all(source_female)
# Train model with female voices
model_gmm_f_2 = GMM(n_components = 8, max_iter = 200, covariance_type = 'diag', n_init = 3)
model_gmm_f_2.fit(features_female)
# Save model to disk
with open('models/' + 'model_gmm_f_2.gmm', 'wb') as f:
    cPickle.dump(model_gmm_f_2, f)
len(features_male)
len(features_female)
# Plot to visualize the features as signals
# Plot area
plt.figure(figsize = (16, 8))
# Loop through the features
for i in range(1, 540000, 1000):
    plt.plot(features_male[i], c = 'blue', linewidth = 0.5, alpha = 0.5)
    plt.plot(features_female[i], c = 'magenta', linewidth = 0.5, alpha = 0.5)
# Plot one more pair with labels so the legend shows each class once
plt.plot(features_male[i+1], c = 'blue', label = 'Male Voice', linewidth = 0.5, alpha = 0.5)
plt.plot(features_female[i+1], c = 'magenta', label = 'Female Voice', linewidth = 0.5, alpha = 0.5)
plt.legend()
plt.title('\nMFCC Features')
plt.show()
# List to receive output
output_male = []
%%time
# Loop through the MFCC features for the male voices (YouTube set),
# testing each frame on both (male and female) models
for f in features_m:
    # Apply the male voice model
    log_likelihood_male = model_gmm_m_2.score([f])
    # Apply the female voice model
    log_likelihood_female = model_gmm_f_2.score([f])
    # Check which model has the greater likelihood and assign the gender
    # Using 0 (zero) for a correct prediction simplifies the accuracy calculation below
    if log_likelihood_male > log_likelihood_female:
        output_male.append(0)
    else:
        output_male.append(1)
# Accuracy
acc_m_2 = (1 - sum(output_male) / len(output_male))
print('Model Accuracy Recognizing Male Voices: ', acc_m_2)
# List to receive output
output_female = []
%%time
# Loop through the MFCC features for the female voices (YouTube set),
# testing each frame on both (male and female) models
for f in features_f:
    # Apply the male voice model
    log_likelihood_male = model_gmm_m_2.score([f])
    # Apply the female voice model
    log_likelihood_female = model_gmm_f_2.score([f])
    # Check which model has the greater likelihood and assign the gender
    # Using 0 (zero) for a correct prediction simplifies the accuracy calculation below
    if log_likelihood_female > log_likelihood_male:
        output_female.append(0)
    else:
        output_female.append(1)
# Accuracy
acc_f_2 = (1 - sum(output_female) / len(output_female))
print('Model Accuracy Recognizing Female Voices: ', acc_f_2)
Both the male and female models improved with the bigger dataset. Not a big surprise, considering how small the original training set was.
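One refinement worth noting: the evaluation above classifies individual frames. A per-clip decision, common in speaker and gender recognition, compares the average log-likelihood over all frames of a file before choosing a model. A minimal sketch (classify_clip is a hypothetical helper, not part of the pipeline above):
# Classify a whole clip by its average per-frame log-likelihood
def classify_clip(path):
    # Read and featurize the whole clip
    sr, audio = wavfile.read(path)
    feats = extract_mfcc(sr, audio)
    # score() returns the mean log-likelihood over all frames
    if model_gmm_m_2.score(feats) > model_gmm_f_2.score(feats):
        return 'Male Voice'
    return 'Female Voice'
# Example: classify the first male test clip
print(classify_clip(list_test_m[0]))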
# Function to record audio and recognize gender in real time
def real_time_gender_recognition(sr = 16000, channels = 1, duration = 3, filename = 'temp/voice.wav'):
    # Record voice
    audio_record = sd.rec(int(duration * sr), samplerate = sr, channels = channels)
    # Wait for the recording to finish
    sd.wait()
    # Flatten to a 1-D signal
    audio_record = audio_record.reshape(-1)
    # Save the recording to disk
    sf.write(filename, audio_record, sr)
    # Extract MFCC features
    mfcc_features = extract_mfcc(sr, audio_record)
    # Apply both models; score() returns the mean per-frame log-likelihood
    log_likelihood_m = model_gmm_m_2.score(mfcc_features)
    log_likelihood_f = model_gmm_f_2.score(mfcc_features)
    # Check which model has the greater likelihood and assign the gender
    if log_likelihood_m >= log_likelihood_f:
        return 'Male Voice'
    else:
        return 'Female Voice'
# Execute function
real_time_gender_recognition()