Matheus Schmitz
LinkedIn
Github Portfolio
A CSV file containing empirical data for a fermentation lab experiment (e.g. yogurt).
The variables of each experiment are: protein_quantity, starch_quantity, probiotic_quantity, water_quantity, and starting_ph.
The post-fermentation measurements of each experiment are: fermentation_time and coagulation_quality.
For this challenge, we want to leverage the fermentation lab experiment dataset to implement a function that can suggest optimal experimental parameters to yield desired output results. In other words, suggest what experiment a scientist would have to conduct in order for the experiment to yield a specific fermentation time and coagulation quality:
suggest_experimental_parameters(
    desired_fermentation_time=83,
    desired_coagulation_quality=1.22
) -> {
    protein_quantity: 0.22,
    starch_quantity: 0.41,
    probiotic_quantity: 0.35,
    water_quantity: 0.16,
    starting_ph: 6.3
}  # just an example
Hint: How certain are you that the suggestions would yield the desired results? Is there more than one likely experiment that would yield the desired post-fermentation properties? Keep in mind the explanatory versus response variables of your model.
# Data Manipulation
import numpy as np
import pandas as pd
pd.options.display.float_format = "{:,.3f}".format
from collections import defaultdict
# Auxiliary
from tqdm import tqdm
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Warnings
import warnings
warnings.filterwarnings('ignore')
# Load Data
df = pd.read_csv('dataset_B.csv')
# Split
X = df.drop(columns=['fermentation_time', 'coagulation_quality'])
fermentation_time = df['fermentation_time']
coagulation_quality = df['coagulation_quality']
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   protein_quantity     98 non-null     float64
 1   starch_quantity      98 non-null     float64
 2   probiotic_quantity   98 non-null     float64
 3   water_quantity       98 non-null     float64
 4   starting_ph          98 non-null     float64
 5   fermentation_time    98 non-null     int64
 6   coagulation_quality  98 non-null     float64
dtypes: float64(6), int64(1)
memory usage: 5.5 KB
df.head(3)
|   | protein_quantity | starch_quantity | probiotic_quantity | water_quantity | starting_ph | fermentation_time | coagulation_quality |
|---|---|---|---|---|---|---|---|
| 0 | 0.315 | 0.062 | 0.590 | 0.033 | 7.500 | 90 | 0.500 |
| 1 | 0.435 | 0.232 | 0.248 | 0.085 | 7.800 | 351 | 6.900 |
| 2 | 0.231 | 0.298 | 0.328 | 0.143 | 7.500 | 294 | 4.700 |
# Are the inputs represented as percentages?
df[['protein_quantity', 'starch_quantity', 'probiotic_quantity', 'water_quantity']].sum(axis='columns').round(2).value_counts()
1.000    98
dtype: int64
# Correlations with fermentation_time
sns.pairplot(df, palette='coolwarm', hue='fermentation_time')
plt.show()
# Correlations with coagulation_quality
sns.pairplot(df, palette='coolwarm', hue='coagulation_quality')
plt.show()
fig, ax = plt.subplots(figsize=(8,6))
plt.title('Correlation Plot', size=20, pad=20)
# Mask the upper triangle of the correlation matrix, keeping the diagonal visible
mask = np.triu(df.corr())
mask[np.diag_indices_from(mask)] = False
sns.heatmap(df.corr(), mask=mask,
            cmap='coolwarm', vmin=-1, vmax=1,
            annot=True, fmt=".2f")
plt.xticks(rotation=45)
plt.axis('equal')
plt.show()
# Data Manipulation
import numpy as np
import pandas as pd
pd.options.display.float_format = "{:,.3f}".format
from collections import defaultdict
# Auxiliary
from tqdm import tqdm
# Machine Learning
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# Genetic Algorithm
import pygad
# Load Data
df = pd.read_csv('dataset_B.csv')
# Split
X = df.drop(columns=['fermentation_time', 'coagulation_quality'])
fermentation_time = df['fermentation_time']
coagulation_quality = df['coagulation_quality']
# Scale data
mms = defaultdict(lambda: MinMaxScaler())
mms['X'].fit(X.drop(columns='starting_ph'))
mms['ph'].fit(X['starting_ph'].to_numpy().reshape(-1, 1))
mms['ft'].fit(fermentation_time.to_numpy().reshape(-1, 1))
mms['cq'].fit(coagulation_quality.to_numpy().reshape(-1, 1))
MinMaxScaler()
Models which learn to predict the properties resulting from a given set of experimental parameters.
They are used later as part of the discriminator system, so that the members of each generation can be scored by how well they approximate the desired properties.
Fermentation Time
%%time
# Pipeline to scale the features, then run the regressor
R_ft = Pipeline([("scaler", MinMaxScaler()),
                 ("rf", GradientBoostingRegressor())])
# Grid with parameters to be tested via CV
R_ft_param_grid_ = {'rf__max_depth': [3, 4, 5, 6],
                    'rf__min_samples_leaf': [1, 2, 3, 4],
                    'rf__ccp_alpha': np.logspace(-3, 0, 4)}
# Instantiate GridSearchCV using negative RMSE as the scorer
R_ft_gridCV = GridSearchCV(R_ft, R_ft_param_grid_, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')
# Run GridSearchCV
R_ft_gridCV = R_ft_gridCV.fit(X, fermentation_time)
Wall time: 16.7 s
R_ft_gridCV.best_estimator_
Pipeline(steps=[('scaler', MinMaxScaler()), ('rf', GradientBoostingRegressor(ccp_alpha=0.01, max_depth=6))])
R_ft_gridCV.best_score_
-87.37462382294984
Coagulation Quality
%%time
# Pipeline to scale the features, then run the regressor
R_cq = Pipeline([("scaler", MinMaxScaler()),
                 ("rf", GradientBoostingRegressor())])
# Grid with parameters to be tested via CV
R_cq_param_grid_ = {'rf__max_depth': [3, 4, 5, 6],
                    'rf__min_samples_leaf': [1, 2, 3, 4],
                    'rf__ccp_alpha': np.logspace(-3, 0, 4)}
# Instantiate GridSearchCV using negative RMSE as the scorer
R_cq_gridCV = GridSearchCV(R_cq, R_cq_param_grid_, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')
# Run GridSearchCV
R_cq_gridCV = R_cq_gridCV.fit(X, coagulation_quality)
Wall time: 4.01 s
Generator: Genetic Algorithm
Discriminator: Fitness Function
We note that features 1-4 are percentages, hence I modify the genetic algorithm so that its mutations are applied as relative changes to the percentages, keeping the sum of those first four features equal to one. This amounts to renormalizing the genes back onto the unit simplex after every generation (similar in spirit to a softmax, but via plain normalization rather than exponentiation).
Additionally, since the regression models were only trained on feature values within the ranges seen in the training data, I bound the possible gene mutations within the Genetic Algorithm to the range observed for each feature at training time.
def scale_genes(ga_obj):
    # Renormalize the first four genes (ingredient percentages) so they sum to 1
    offspring_pct = ga_obj.population[:, :4]
    offspring_pct_sums = offspring_pct.sum(axis=1).reshape(-1, 1)
    scaled = offspring_pct / offspring_pct_sums
    ga_obj.population[:, :4] = scaled
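As a quick sanity check (a minimal sketch: pygad passes the GA instance itself to the on_generation callback, so a hypothetical stand-in object with only a population attribute suffices here), we can confirm that the callback renormalizes the first four genes:

import numpy as np
from types import SimpleNamespace

# Hypothetical stand-in for a pygad.GA instance; scale_genes() only
# touches the `population` attribute
toy_ga = SimpleNamespace(population=np.array([[0.2, 0.2, 0.2, 0.2, 6.5],
                                              [0.1, 0.3, 0.4, 0.4, 7.0]]))
scale_genes(toy_ga)
print(toy_ga.population[:, :4].sum(axis=1))  # [1. 1.]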
# Specify the range limit for each gene based on historic data
gene_range = []
min_values = df.min(axis='rows')[:5].to_numpy()
max_values = df.max(axis='rows')[:5].to_numpy()
for low, high in zip(min_values, max_values):
    gene_range.append({'low': low, 'high': high})
Use the training data to determine which set of hyperparameters results in the best approximations of the desired properties.
For each sample, the fitness of a candidate solution is the negative squared difference between the solution's predicted properties and the target properties, computed on min-max-scaled values so that both objectives carry comparable weight: fitness = -(ft_target - ft_pred)² - (cq_target - cq_pred)² (negated because pygad maximizes fitness).
# Candidate GA hyperparameters; only crossover type and probability are
# searched below, the remaining knobs are kept at fixed values
_parent_selection_type = ['sss', 'sus', 'tournament']  # fixed to 'sss' below
_crossover_type = ['single_point', 'scattered']
_crossover_probability = [0.33, 0.66]
_mutation_type = ['random', 'swap']  # fixed to 'adaptive' below
_mutation_probability = [0.2, 0.5, 0.8]  # fixed to (0.8, 0.2) below
best_params = defaultdict(lambda: None)
min_error = float('inf')
for _ct in _crossover_type:
    for _cp in _crossover_probability:
        error = 0
        # Evaluate this parameter combination on every sample in the dataset
        for idx, row in df.iterrows():
            # Targets
            target_ft = row['fermentation_time']
            target_cq = row['coagulation_quality']
            # Scale
            scaled_target_ft = mms['ft'].transform([[target_ft]])[0][0]
            scaled_target_cq = mms['cq'].transform([[target_cq]])[0][0]

            def fitness_func(solution, solution_idx):
                # Error = solution's predicted Fermentation Time vs target Fermentation Time
                pred_ft = R_ft_gridCV.predict([solution])[0]
                scaled_pred_ft = mms['ft'].transform([[pred_ft]])[0][0]
                ft_error = -(scaled_target_ft - scaled_pred_ft)**2 # negative squared error for maximization problem
                # Error = solution's predicted Coagulation Quality vs target Coagulation Quality
                pred_cq = R_cq_gridCV.predict([solution])[0]
                scaled_pred_cq = mms['cq'].transform([[pred_cq]])[0][0]
                cq_error = -(scaled_target_cq - scaled_pred_cq)**2 # negative squared error for maximization problem
                return ft_error + cq_error

            # Genetic Algorithm
            ga_instance = pygad.GA(num_generations = 200,
                                   num_parents_mating = 5,
                                   sol_per_pop = 10,
                                   num_genes = X.shape[1],
                                   fitness_func = fitness_func,
                                   gene_space = gene_range,
                                   parent_selection_type = 'sss',
                                   keep_parents = 1,
                                   crossover_type = _ct,
                                   crossover_probability = _cp,
                                   mutation_type = 'adaptive',
                                   mutation_probability = (0.8, 0.2),
                                   save_best_solutions = True,
                                   on_generation = scale_genes,
                                   suppress_warnings = True)
            ga_instance.run()
            # Get the final population and its predicted properties
            population = ga_instance.population
            pred_fts = R_ft_gridCV.predict(population)
            pred_cqs = R_cq_gridCV.predict(population)
            # Accumulate the population's squared error against the targets
            scaled_pred_fts = mms['ft'].transform(pred_fts.reshape(-1, 1))
            scaled_pred_cqs = mms['cq'].transform(pred_cqs.reshape(-1, 1))
            error += sum((scaled_pred_fts - scaled_target_ft)**2 + (scaled_pred_cqs - scaled_target_cq)**2)
        if error < min_error:
            best_params['_ct'] = _ct
            best_params['_cp'] = _cp
            best_params['error'] = error
            min_error = error
        print(f'ct: {_ct} | cp: {_cp} | error: {error}')
ct: single_point | cp: 0.33 | error: [158.0246514]
ct: single_point | cp: 0.66 | error: [158.16179847]
ct: scattered | cp: 0.33 | error: [157.36472263]
ct: scattered | cp: 0.66 | error: [154.32976128]
Using the trained ML models and the tuned Genetic Algorithm, we can then generate a set of suggested experimental parameters from the last generation's population.
# Best combination found by the search above
best_params = {'_ct': 'scattered',
               '_cp': 0.66}
def recommend(target_ft, target_cq):
    # Scale the target values
    scaled_target_ft = mms['ft'].transform([[target_ft]])[0][0]
    scaled_target_cq = mms['cq'].transform([[target_cq]])[0][0]

    def fitness_func(solution, solution_idx):
        # Error = solution's predicted Fermentation Time vs target Fermentation Time
        pred_ft = R_ft_gridCV.predict([solution])[0]
        scaled_pred_ft = mms['ft'].transform([[pred_ft]])[0][0]
        ft_error = -(scaled_target_ft - scaled_pred_ft)**2 # negative squared error for maximization problem
        # Error = solution's predicted Coagulation Quality vs target Coagulation Quality
        pred_cq = R_cq_gridCV.predict([solution])[0]
        scaled_pred_cq = mms['cq'].transform([[pred_cq]])[0][0]
        cq_error = -(scaled_target_cq - scaled_pred_cq)**2 # negative squared error for maximization problem
        return ft_error + cq_error

    # Instantiate and run the Genetic Algorithm
    ga_instance = pygad.GA(num_generations = 500,
                           num_parents_mating = 5,
                           sol_per_pop = 10,
                           num_genes = X.shape[1],
                           fitness_func = fitness_func,
                           gene_space = gene_range,
                           parent_selection_type = 'sss',
                           keep_parents = 1,
                           crossover_type = best_params['_ct'],
                           crossover_probability = best_params['_cp'],
                           mutation_type = 'adaptive',
                           mutation_probability = (0.8, 0.2),
                           save_best_solutions = True,
                           on_generation = scale_genes,
                           suppress_warnings = True)
    ga_instance.run()
    # Get the final population and its predicted properties
    population = ga_instance.population
    pred_fts = R_ft_gridCV.predict(population)
    pred_cqs = R_cq_gridCV.predict(population)
    # Merge suggested parameters and predicted properties into one DataFrame
    df_X = pd.DataFrame(population)
    df_ft = pd.DataFrame(pred_fts)
    df_cq = pd.DataFrame(pred_cqs)
    output = pd.concat([df_X, df_ft, df_cq], axis='columns')
    output.columns = df.columns
    return output
# Let's try to replicate the inputs for one of the samples in the dataset
df.iloc[1]
protein_quantity         0.435
starch_quantity          0.232
probiotic_quantity       0.248
water_quantity           0.085
starting_ph              7.800
fermentation_time      351.000
coagulation_quality      6.900
Name: 1, dtype: float64
target_ft = 351
target_cq = 6.9
r = recommend(target_ft, target_cq)
r
|   | protein_quantity | starch_quantity | probiotic_quantity | water_quantity | starting_ph | fermentation_time | coagulation_quality |
|---|---|---|---|---|---|---|---|
| 0 | 0.073 | 0.291 | 0.291 | 0.346 | 7.630 | 195.942 | 6.561 |
| 1 | 0.194 | 0.195 | 0.194 | 0.417 | 8.087 | 409.926 | 5.201 |
| 2 | 0.075 | 0.299 | 0.246 | 0.380 | 8.487 | 283.581 | 7.425 |
| 3 | 0.117 | 0.451 | 0.098 | 0.334 | 7.630 | 326.153 | 8.135 |
| 4 | 0.351 | 0.401 | 0.076 | 0.173 | 7.630 | 328.283 | 6.995 |
| 5 | 0.185 | 0.276 | 0.312 | 0.228 | 8.487 | 434.621 | 5.945 |
| 6 | 0.438 | 0.279 | 0.226 | 0.057 | 7.630 | 307.934 | 5.408 |
| 7 | 0.106 | 0.417 | 0.218 | 0.259 | 7.630 | 323.831 | 8.277 |
| 8 | 0.095 | 0.300 | 0.247 | 0.357 | 7.630 | 314.156 | 6.887 |
| 9 | 0.359 | 0.453 | 0.102 | 0.086 | 8.248 | 484.315 | 7.826 |
# Check percentages
r[['protein_quantity', 'starch_quantity', 'probiotic_quantity', 'water_quantity']].sum(axis='columns').round(2).tolist()
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
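The ten suggestions above differ substantially in their parameters while all targeting the same two outputs, which speaks to the hint: the inverse mapping from desired properties back to experimental parameters is one-to-many, so no single suggestion should be taken with high certainty. A minimal sketch of how one could quantify this spread and the residual error of each suggestion (reusing the r, target_ft and target_cq variables from above):

# Spread of the suggested parameters: a large standard deviation relative to
# the mean signals that several distinct experiments are plausible
print(r.drop(columns=['fermentation_time', 'coagulation_quality']).agg(['mean', 'std']))
# Distance between each suggestion's predicted outputs and the targets
print(pd.DataFrame({'ft_error': (r['fermentation_time'] - target_ft).abs(),
                    'cq_error': (r['coagulation_quality'] - target_cq).abs()}).describe())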