evoFE.Rmd

evoFE (Evolutionary Feature Engineering) uses a genetic algorithm to automatically discover useful feature transformations for tabular data. Instead of manually crafting interaction terms, ratios, or binning strategies, you let evolution explore the space of possible transformations and keep the ones that improve predictive performance.
The result is an evo_recipe — a reusable transformation pipeline that can be applied to new data at prediction time.
# Install from local source
devtools::install_local("path/to/evoFE")
# Or install directly from a git repo
# devtools::install_github("your-org/evoFE")

Let’s classify whether a car has an automatic or manual transmission
using the mtcars dataset.
library(evoFE)
data(mtcars)
df <- mtcars
df$am <- as.integer(df$am) # target: 0 = automatic, 1 = manual
res <- evolve_features(
data = df,
target_col = "am",
task = "classification",
evaluator = "xgboost",
generations = 5,
pop_size = 8,
cv_folds = 3,
early_stopping_rounds = 3,
seed = 42,
verbose = TRUE
)
#> Starting Evolutionary Feature Engineering...
#> Task: classification
#> Evaluator: xgboost
#> Generations: 5, Population Size: 8, CV Folds: 3
#> Original Numeric columns: mpg, cyl, disp, hp, drat, wt, qsec, vs, gear, carb
#> Original Categorical columns:
#>
#> [Gen 0] Initialized Population:
#> Individual 1: [Original features only]
#> Individual 2: [mst_score(mpg, gear, cyl, hp, vs, wt), mst_score(gear, drat, wt, hp, cyl)]
#> Individual 3: [genie(gear, carb, disp, hp), subtract(hp, cyl)]
#> Individual 4: [lumbermark(carb, mpg, cyl, wt), log_binning9(carb)]
#> Individual 5: [add(wt, cyl, drat, hp, cyl), log_binning_cat4(cyl)]
#> Individual 6: [log_binning_cat2(qsec), quantile_binning4(carb)]
#> Individual 7: [multiply(cyl, hp, disp), log_binning3(drat)]
#> Individual 8: [quantile_binning_cat4(wt), truncated_svd1(mpg, carb, qsec, hp, drat, cyl), truncated_svd2(mpg, carb, qsec, hp, drat, cyl), truncated_svd3(mpg, carb, qsec, hp, drat, cyl)]
#>
#> --- Generation 1 / 5 ---
#> Tested Individual 1 (New Best!) -> Fitness: 0.6862
#> Tested Individual 2 -> Fitness: 0.6836
#> Tested Individual 3 -> Fitness: 0.6862
#> Tested Individual 4 -> Fitness: 0.6857
#> Tested Individual 5 -> Fitness: 0.6862
#> Tested Individual 6 -> Fitness: 0.6862
#> Tested Individual 7 -> Fitness: 0.6862
#> Tested Individual 8 -> Fitness: 0.6862
#> Gen 1 Best Fitness: 0.6862
#> Gen 1 Best Recipe: [Original features only]
#>
#> --- Generation 2 / 5 (Current Best Fitness: 0.6862) ---
#> Tested Individual 2 -> Fitness: 0.6862
#> Tested Individual 3 -> Fitness: 0.6862
#> Tested Individual 4 -> Fitness: 0.6862
#> Tested Individual 5 -> Fitness: 0.6862
#> Tested Individual 6 -> Fitness: 0.6787
#> Tested Individual 7 -> Fitness: 0.6862
#> Tested Individual 8 -> Fitness: 0.6862
#> Gen 2 Best Fitness: 0.6862
#> Gen 2 Best Recipe: [Original features only]
#>
#> --- Generation 3 / 5 (Current Best Fitness: 0.6862) ---
#> Tested Individual 2 -> Fitness: 0.6862
#> Tested Individual 3 -> Fitness: 0.6862
#> Tested Individual 4 (New Best!) -> Fitness: 0.6896
#> Tested Individual 5 -> Fitness: 0.6862
#> Tested Individual 6 (New Best!) -> Fitness: 0.7051
#> Tested Individual 7 -> Fitness: 0.6862 (cached)
#> Tested Individual 8 -> Fitness: 0.6862
#> Tested Individual 9 -> Fitness: 0.6875
#> Tested Individual 10 -> Fitness: 0.6862
#> Tested Individual 11 -> Fitness: 0.6862
#> Tested Individual 12 -> Fitness: 0.6862
#> Gen 3 Best Fitness: 0.7051
#> Gen 3 Best Recipe: [umap1(disp, wt), umap2(disp, wt)]
#>
#> --- Generation 4 / 5 (Current Best Fitness: 0.7051) ---
#> Tested Individual 2 -> Fitness: 0.6908
#> Tested Individual 3 -> Fitness: 0.6896
#> Tested Individual 4 -> Fitness: 0.6862
#> Tested Individual 5 -> Fitness: 0.6862
#> Tested Individual 6 -> Fitness: 0.6831
#> Tested Individual 7 (New Best!) -> Fitness: 0.7073
#> Tested Individual 8 -> Fitness: 0.6816
#> Gen 4 Best Fitness: 0.7073
#> Gen 4 Best Recipe: [multiply(disp, gear, wt, wt, wt), pca1(carb, vs, hp), pca2(carb, vs, hp), pca3(carb, vs, hp), umap1(disp, wt), umap2(disp, wt)]
#>
#> --- Generation 5 / 5 (Current Best Fitness: 0.7073) ---
#> Tested Individual 2 -> Fitness: 0.6910
#> Tested Individual 3 -> Fitness: 0.6607
#> Tested Individual 4 -> Fitness: 0.6819
#> Tested Individual 5 (New Best!) -> Fitness: 0.7179
#> Tested Individual 6 -> Fitness: 0.7147
#> Tested Individual 7 -> Fitness: 0.6814
#> Tested Individual 8 -> Fitness: 0.7073
#> Gen 5 Best Fitness: 0.7179
#> Gen 5 Best Recipe: [multiply(disp, gear, wt, wt, wt), pca1(carb, vs, hp), pca2(carb, vs, hp), umap2(disp, wt)]
#>
#> Evolution Complete. Best Fitness: 0.7179
#> Best recipe: [multiply(disp, gear, wt, wt, wt), pca1(carb, vs, hp), pca2(carb, vs, hp), umap2(disp, wt)]
#> Generated columns: ((disp*gear*wt*wt*wt)), PCA1(car_vs_hp), PCA2(car_vs_hp), UMAP2(dis_wt)
#> Training final model on full dataset...

The returned evo_recipe object contains the best
individual (feature recipe), the fitted model, and the evolution
history (the final-generation population, kept for inspection).
# View the winning recipe
cat("Best recipe:", individual_to_recipe_string(res$best_individual), "\n")
#> Best recipe: [multiply(disp, gear, wt, wt, wt), pca1(carb, vs, hp), pca2(carb, vs, hp), umap2(disp, wt)]
cat("Fitness: ", res$best_individual$fitness, "\n")
#> Fitness: 0.7178567

predict() applies the evolved transformations to new
data and returns the engineered feature matrix:
engineered <- predict(res, df[1:5, ])
head(engineered)
#> mpg cyl disp hp drat wt qsec vs gear carb
#> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
#> 1: 21.0 6 160 110 3.90 2.620 16.46 0 4 4
#> 2: 21.0 6 160 110 3.90 2.875 17.02 0 4 4
#> 3: 22.8 4 108 93 3.85 2.320 18.61 1 4 1
#> 4: 21.4 6 258 110 3.08 3.215 19.44 1 3 1
#> 5: 18.7 8 360 175 3.15 3.440 17.02 0 3 2
#> ((disp*gear*wt*wt*wt)) PCA1(car_vs_hp) PCA2(car_vs_hp) UMAP2(dis_wt)
#> <num> <num> <num> <num>
#> 1: 11510.226 -0.5758895 0.17083974 4.737757
#> 2: 15208.750 -0.5758895 0.17083974 4.526803
#> 3: 5394.457 1.7331290 -0.02830549 7.232712
#> 4: 25720.766 1.5826875 -0.04046764 -6.888471
#> 5: 43964.191 -0.4499549 0.95866969 -5.179038

predict_model() goes one step further — it applies the
transformations and runs the trained model to produce
predictions:
preds <- predict_model(res, df[1:5, ])
preds
#> [1] 0.96213979 0.96213979 0.91886902 0.03086278 0.05843740

Predict petal length from the iris dataset:
data(iris)
res_reg <- evolve_features(
data = iris[, 1:5],
target_col = "Petal.Length",
task = "regression",
evaluator = "xgboost",
generations = 5,
pop_size = 8,
cv_folds = 3,
early_stopping_rounds = 3,
seed = 123,
verbose = TRUE
)
#> Starting Evolutionary Feature Engineering...
#> Task: regression
#> Evaluator: xgboost
#> Generations: 5, Population Size: 8, CV Folds: 3
#> Original Numeric columns: Sepal.Length, Sepal.Width, Petal.Width
#> Original Categorical columns: Species
#>
#> [Gen 0] Initialized Population:
#> Individual 1: [Original features only]
#> Individual 2: [groupby_min(Species, Petal.Width), groupby_max(Species, Petal.Width)]
#> Individual 3: [truncated_svd1(Sepal.Width, Petal.Width), truncated_svd2(Sepal.Width, Petal.Width), truncated_svd3(Sepal.Width, Petal.Width)]
#> Individual 4: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width)]
#> Individual 5: [normalized_difference(Sepal.Length, Sepal.Length), deadwood(Petal.Width, Sepal.Width)]
#> Individual 6: [log_ratio(Petal.Width, Sepal.Width), pca1(Sepal.Width, Sepal.Length), pca2(Sepal.Width, Sepal.Length), pca3(Sepal.Width, Sepal.Length)]
#> Individual 7: [divide(Sepal.Length, Sepal.Width), groupby_min(Species, Sepal.Width)]
#> Individual 8: [groupby_sd(Species, Petal.Width), pca1(Sepal.Length, Sepal.Width), pca2(Sepal.Length, Sepal.Width), pca3(Sepal.Length, Sepal.Width)]
#>
#> --- Generation 1 / 5 ---
#> Tested Individual 1 (New Best!) -> Fitness: -0.2819
#> Tested Individual 2 -> Fitness: -0.2819
#> Tested Individual 3 -> Fitness: -0.2833
#> Tested Individual 4 (New Best!) -> Fitness: -0.2793
#> Tested Individual 5 -> Fitness: -0.2810
#> Tested Individual 6 -> Fitness: -0.2881
#> Tested Individual 7 -> Fitness: -0.2831
#> Tested Individual 8 -> Fitness: -0.2878
#> Gen 1 Best Fitness: -0.2793
#> Gen 1 Best Recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width)]
#>
#> --- Generation 2 / 5 (Current Best Fitness: -0.2793) ---
#> Tested Individual 2 -> Fitness: -0.2793
#> Tested Individual 3 -> Fitness: -0.2810
#> Tested Individual 4 -> Fitness: -0.2793
#> Tested Individual 5 -> Fitness: -0.2819
#> Tested Individual 6 -> Fitness: -0.2812
#> Tested Individual 7 -> Fitness: -0.2819
#> Tested Individual 8 -> Fitness: -0.2819
#> Gen 2 Best Fitness: -0.2793
#> Gen 2 Best Recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width)]
#>
#> --- Generation 3 / 5 (Current Best Fitness: -0.2793) ---
#> Tested Individual 2 -> Fitness: -0.2819
#> Tested Individual 3 -> Fitness: -0.2793 (cached)
#> Tested Individual 4 -> Fitness: -0.2793
#> Tested Individual 5 -> Fitness: -0.2812
#> Tested Individual 6 -> Fitness: -0.2819
#> Tested Individual 7 (New Best!) -> Fitness: -0.2768
#> Tested Individual 8 -> Fitness: -0.2819
#> Tested Individual 9 -> Fitness: -0.2798
#> Tested Individual 10 -> Fitness: -0.2810
#> Tested Individual 11 (New Best!) -> Fitness: -0.2739
#> Tested Individual 12 -> Fitness: -0.2793
#> Gen 3 Best Fitness: -0.2739
#> Gen 3 Best Recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width), genie(ratio_Petal.Width_by_Species, Sepal.Length, logbin6(Sepal.Length))]
#>
#> --- Generation 4 / 5 (Current Best Fitness: -0.2739) ---
#> Tested Individual 2 -> Fitness: -0.2768
#> Tested Individual 3 -> Fitness: -0.2793
#> Tested Individual 4 -> Fitness: -0.2793
#> Tested Individual 5 -> Fitness: -0.2793
#> Tested Individual 6 -> Fitness: -0.2793
#> Tested Individual 7 -> Fitness: -0.2793
#> Tested Individual 8 -> Fitness: -0.2817
#> Gen 4 Best Fitness: -0.2739
#> Gen 4 Best Recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width), genie(ratio_Petal.Width_by_Species, Sepal.Length, logbin6(Sepal.Length))]
#>
#> --- Generation 5 / 5 (Current Best Fitness: -0.2739) ---
#> Tested Individual 2 -> Fitness: -0.2809
#> Tested Individual 3 -> Fitness: -0.2816
#> Tested Individual 4 -> Fitness: -0.2768
#> Tested Individual 5 -> Fitness: -0.2739
#> Tested Individual 6 -> Fitness: -0.2793
#> Tested Individual 7 -> Fitness: -0.2739
#> Tested Individual 8 -> Fitness: -0.2819
#> Tested Individual 9 -> Fitness: -0.2793
#> Tested Individual 10 -> Fitness: -0.2792
#> Tested Individual 11 -> Fitness: -0.2768
#> Tested Individual 12 -> Fitness: -0.2793
#> Gen 5 Best Fitness: -0.2739
#> Gen 5 Best Recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width), genie(ratio_Petal.Width_by_Species, Sepal.Length, logbin6(Sepal.Length))]
#>
#> Evolution Complete. Best Fitness: -0.2739
#> Best recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width), genie(ratio_Petal.Width_by_Species, Sepal.Length, logbin6(Sepal.Length))]
#> Generated columns: logbin6(Sepal.Length), ratio_Petal.Width_by_Species, Genie4(rat_Sep_log)
#> Training final model on full dataset...
cat("Best recipe:", individual_to_recipe_string(res_reg$best_individual), "\n")
#> Best recipe: [log_binning6(Sepal.Length), groupby_ratio(Species, Petal.Width), genie(ratio_Petal.Width_by_Species, Sepal.Length, logbin6(Sepal.Length))]
cat("Fitness (neg RMSE):", res_reg$best_individual$fitness, "\n")
#> Fitness (neg RMSE): -0.2739487
preds_reg <- predict_model(res_reg, iris[1:10, ])
# Compare predictions to actuals
data.frame(
actual = iris$Petal.Length[1:10],
predicted = round(preds_reg, 2)
)
#> actual predicted
#> 1 1.4 1.45
#> 2 1.4 1.47
#> 3 1.3 1.49
#> 4 1.5 1.39
#> 5 1.4 1.44
#> 6 1.7 1.56
#> 7 1.4 1.40
#> 8 1.5 1.49
#> 9 1.4 1.36
#> 10 1.5 1.48

Classify iris species (3 classes). Note the use of
task = "multiclass":
iris_mc <- iris
iris_mc$Species <- as.character(iris_mc$Species)
res_mc <- evolve_features(
data = iris_mc,
target_col = "Species",
task = "multiclass",
evaluator = "xgboost",
generations = 5,
pop_size = 8,
cv_folds = 3,
early_stopping_rounds = 3,
seed = 99,
verbose = TRUE
)
#> Starting Evolutionary Feature Engineering...
#> Task: multiclass
#> Evaluator: xgboost
#> Generations: 5, Population Size: 8, CV Folds: 3
#> Original Numeric columns: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
#> Original Categorical columns:
#>
#> [Gen 0] Initialized Population:
#> Individual 1: [Original features only]
#> Individual 2: [mst_score(Sepal.Length, Sepal.Width), quantile_binning_cat5(Sepal.Length)]
#> Individual 3: [sqrt(Petal.Width), multiply(Petal.Width, Petal.Width)]
#> Individual 4: [add(Sepal.Width, Petal.Width), quantile_binning_cat7(Petal.Width)]
#> Individual 5: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width)]
#> Individual 6: [subtract(Petal.Width, Petal.Length), random_projection(Sepal.Width, Sepal.Length)]
#> Individual 7: [mst_score(Sepal.Width, Petal.Length), log(Sepal.Width)]
#> Individual 8: [deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length)]
#>
#> --- Generation 1 / 5 ---
#> Tested Individual 1 (New Best!) -> Fitness: 0.8425
#> Tested Individual 2 -> Fitness: 0.8391
#> Tested Individual 3 -> Fitness: 0.8425
#> Tested Individual 4 -> Fitness: 0.8382
#> Tested Individual 5 (New Best!) -> Fitness: 0.8499
#> Tested Individual 6 -> Fitness: 0.8460
#> Tested Individual 7 -> Fitness: 0.8456
#> Tested Individual 8 -> Fitness: 0.8467
#> Gen 1 Best Fitness: 0.8499
#> Gen 1 Best Recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width)]
#>
#> --- Generation 2 / 5 (Current Best Fitness: 0.8499) ---
#> Tested Individual 2 -> Fitness: 0.8467
#> Tested Individual 3 -> Fitness: 0.8456
#> Tested Individual 4 -> Fitness: 0.8431
#> Tested Individual 5 -> Fitness: 0.8499
#> Tested Individual 6 -> Fitness: 0.8448
#> Tested Individual 7 (New Best!) -> Fitness: 0.8536
#> Tested Individual 8 -> Fitness: 0.8456
#> Gen 2 Best Fitness: 0.8536
#> Gen 2 Best Recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width), deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length)]
#>
#> --- Generation 3 / 5 (Current Best Fitness: 0.8536) ---
#> Tested Individual 2 -> Fitness: 0.8536
#> Tested Individual 3 -> Fitness: 0.8499
#> Tested Individual 4 -> Fitness: 0.8499
#> Tested Individual 5 -> Fitness: 0.8443
#> Tested Individual 6 (New Best!) -> Fitness: 0.8589
#> Tested Individual 7 -> Fitness: 0.8536
#> Tested Individual 8 -> Fitness: 0.8536
#> Gen 3 Best Fitness: 0.8589
#> Gen 3 Best Recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width), deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length), add(Petal.Length, Petal.Length, Petal.Width, Petal.Length)]
#>
#> --- Generation 4 / 5 (Current Best Fitness: 0.8589) ---
#> Tested Individual 2 -> Fitness: 0.8536
#> Tested Individual 3 -> Fitness: 0.8499
#> Tested Individual 4 -> Fitness: 0.8536
#> Tested Individual 5 -> Fitness: 0.8440
#> Tested Individual 6 -> Fitness: 0.8565
#> Tested Individual 7 -> Fitness: 0.8536
#> Tested Individual 8 -> Fitness: 0.8589
#> Gen 4 Best Fitness: 0.8589
#> Gen 4 Best Recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width), deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length), add(Petal.Length, Petal.Length, Petal.Width, Petal.Length)]
#>
#> --- Generation 5 / 5 (Current Best Fitness: 0.8589) ---
#> Tested Individual 2 -> Fitness: 0.8589
#> Tested Individual 3 -> Fitness: 0.8536
#> Tested Individual 4 -> Fitness: 0.8477
#> Tested Individual 5 -> Fitness: 0.8565
#> Tested Individual 6 -> Fitness: 0.8589
#> Tested Individual 7 -> Fitness: 0.8589
#> Tested Individual 8 -> Fitness: 0.8550
#> Tested Individual 9 (New Best!) -> Fitness: 0.8591
#> Tested Individual 10 -> Fitness: 0.8575
#> Tested Individual 11 -> Fitness: 0.8589
#> Tested Individual 12 -> Fitness: 0.8589
#> Gen 5 Best Fitness: 0.8591
#> Gen 5 Best Recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width), deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length), add(Petal.Length, Petal.Length, Petal.Width, Petal.Length), mst_score(((Sepal.Length+Petal.Length+Sepal.Width)), Sepal.Length)]
#>
#> Evolution Complete. Best Fitness: 0.8591
#> Best recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width), deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length), add(Petal.Length, Petal.Length, Petal.Width, Petal.Length), mst_score(((Sepal.Length+Petal.Length+Sepal.Width)), Sepal.Length)]
#> Generated columns: ((Sepal.Length+Petal.Length+Sepal.Width)), ((Sepal.Width/Petal.Width)), Deadwood(Pet_Sep), ((Petal.Length*Petal.Width*Petal.Width*Sepal.Length)), ((Petal.Length+Petal.Length+Petal.Width+Petal.Length)), MSTScore(((S_Sep)
#> Training final model on full dataset...
cat("Best recipe:", individual_to_recipe_string(res_mc$best_individual), "\n")
#> Best recipe: [add(Sepal.Length, Petal.Length, Sepal.Width), divide(Sepal.Width, Petal.Width), deadwood(Petal.Length, Sepal.Width), multiply(Petal.Length, Petal.Width, Petal.Width, Sepal.Length), add(Petal.Length, Petal.Length, Petal.Width, Petal.Length), mst_score(((Sepal.Length+Petal.Length+Sepal.Width)), Sepal.Length)]

For multiclass, predict_model() returns a probability
matrix — one column per class:
probs <- predict_model(res_mc, iris_mc[c(1, 51, 101), ])
round(probs, 3)
#> setosa versicolor virginica
#> [1,] 0.986 0.008 0.006
#> [2,] 0.009 0.973 0.018
#> [3,] 0.006 0.007 0.987

evoFE ships with 30 built-in transformers that the genetic algorithm can select from during evolution. The tables below group them by category.
| Transformer | Arity | Description |
|---|---|---|
| log | unary | Natural logarithm (safe: log(abs(x) + 1)) |
| sqrt | unary | Square root (safe: sqrt(abs(x))) |
| reciprocal | unary | 1 / (x + ε) |
| add | multi | Element-wise sum of 2+ columns |
| subtract | binary | x₁ − x₂ |
| multiply | multi | Element-wise product of 2+ columns |
| divide | binary | x₁ / (x₂ + ε) |
| normalized_difference | binary | (x₁ − x₂) / (x₁ + x₂ + ε) |
| log_ratio | binary | log((x₁ + ε) / (x₂ + ε)) |
These combine a categorical grouping column with a numeric value column.
| Transformer | Description |
|---|---|
| groupby_mean | Mean of value within each group |
| groupby_sd | Standard deviation within each group |
| groupby_max / groupby_min | Max / min within each group |
| groupby_ratio | value / group_mean |
| groupby_zscore | (value − group_mean) / group_sd |
| Transformer | Input → Output | Description |
|---|---|---|
| target_encode | cat → num | Supervised mean-target encoding with leave-one-out |
| frequency_encode | cat → num | Proportion of each category in the data |
| one_hot_encode | cat → num | Binary one-hot encoding indicator for a specific category (or “other” for rare categories) |
| quantile_binning | num → num | Assign quantile rank (1–5) |
| log_binning | num → num | Assign log-scale bin index |
| quantile_binning_cat | num → cat | Same as quantile_binning, output as factor |
| log_binning_cat | num → cat | Same as log_binning, output as factor |
| Transformer | Description |
|---|---|
| pca | Principal component of 2+ columns (recipes index the components as pca1, pca2, …) |
| truncated_svd | Component from truncated SVD (recipes index the components as truncated_svd1, truncated_svd2, …) |
| random_projection | Random linear combination of 2+ columns |
| umap | Low-dimensional UMAP projection |
| Transformer | Output | Description |
|---|---|---|
| genie | categorical | Genie robust hierarchical clustering |
| lumbermark | categorical | Lumbermark hierarchical clustering |
| mst_score | numeric | Minimum Spanning Tree-based anomaly score |
| deadwood | categorical | Deadwood anomaly detection (outlier indicators) |
One of evoFE’s powerful capabilities is hierarchical feature construction. After a gene has been evaluated and proven useful, subsequent generations can build on top of its output.
For example:
Gen 1: log_ratio(Sepal.Length, Petal.Width) → tested ✓
Gen 2: divide(Petal.Width, logratio(…)) → chains from tested gene ✓
Important safety rule: a gene can only chain from outputs that have been evaluated in a previous generation. A brand-new untested gene is never used as input for another gene in the same individual. This prevents fragile dependency chains built on unproven transformations.
evolve_features() returns an evo_recipe S3
object with:
| Field | Description |
|---|---|
| best_individual | The winning recipe (list of genes, column sets, fitness) |
| best_model | The final LightGBM/XGBoost model trained on all data |
| history | Full final-generation population (for inspection) |
| task | The task type used |
| evaluator | The evaluator used |
| classes | Class labels (multiclass only) |
ind <- res$best_individual
# Human-readable recipe string
cat(individual_to_recipe_string(ind), "\n")
#> [multiply(disp, gear, wt, wt, wt), pca1(carb, vs, hp), pca2(carb, vs, hp), umap2(disp, wt)]
# Number of evolved genes
cat("Evolved genes:", length(ind$genes), "\n")
#> Evolved genes: 4
# Original columns retained
cat("Numeric cols: ", paste(ind$numeric_cols, collapse = ", "), "\n")
#> Numeric cols: mpg, cyl, disp, hp, drat, wt, qsec, vs, gear, carb
cat("Categorical cols:", paste(ind$categorical_cols, collapse = ", "), "\n")
#> Categorical cols:
# Individual gene details
for (g in ind$genes) {
cat(sprintf(" %s(%s) → %s\n",
g$transformer_name,
paste(g$input_cols, collapse = ", "),
g$output_col))
}
#> multiply(disp, gear, wt, wt, wt) -> ((disp*gear*wt*wt*wt))
#> pca(carb, vs, hp) -> PCA1(car_vs_hp)
#> pca(carb, vs, hp) -> PCA2(car_vs_hp)
#> umap(disp, wt) -> UMAP2(dis_wt)

evoFE supports two evaluation strategies for scoring
individuals:

- Cross-validation (cv): The default strategy. Evaluates the fitness of individuals using k-fold cross-validation (cv_folds parameter).
- Train/validation/holdout split (split): Useful for faster evaluation on larger datasets. You configure it with evaluation_strategy = "split" and split_ratio (e.g., c(0.6, 0.2, 0.2)).
  - The first two proportions of split_ratio are used as the Train and Validation sets to score the candidate recipes during the evolutionary search.

evolve_features() parameters:
| Parameter | Default | Description |
|---|---|---|
| generations | 10 | Maximum number of evolutionary generations |
| pop_size | 10 | Number of individuals per generation |
| evaluation_strategy | "cv" | Evaluation method: "cv" (cross-validation) or "split" (train/val/holdout split) |
| cv_folds | 3 | Cross-validation folds for fitness evaluation (only used if strategy is "cv") |
| split_ratio | c(0.6, 0.2, 0.2) | Proportions for Train/Val/Holdout split (only used if strategy is "split") |
| split_ids | NULL | Optional user-defined vector of split assignments ("train", "val", "holdout") |
| early_stopping_rounds | 3 | Stop if there is no improvement for this many consecutive generations |
| evaluator | "lightgbm" | Model backend: "lightgbm" or "xgboost" |
| dynamic_population | TRUE | Expand population during stagnation |
| crossover_type | "both" | "random", "union", or "both" |
| threads | 8 | Parallelism for model training |
| seed | NULL | RNG seed for reproducibility |
- Start small: generations = 5, pop_size = 8 is enough to validate the pipeline. Scale up once you confirm the setup works.
- Increase pop_size for wider exploration. Useful when you have many columns (> 20) and diverse transformer options.
- Increase generations for deeper search. Works best when combined with dynamic_population = TRUE so stagnation triggers population expansion.
- Set seed for reproducible experiments and benchmarking.
- crossover_type = "union" tends to produce larger recipes (more features). "random" keeps recipes leaner.

Setting a seed guarantees identical results across runs:
r1 <- evolve_features(iris[,1:5], "Petal.Length", task = "regression",
generations = 3, pop_size = 5, evaluator = "xgboost",
seed = 42, verbose = FALSE)
r2 <- evolve_features(iris[,1:5], "Petal.Length", task = "regression",
generations = 3, pop_size = 5, evaluator = "xgboost",
seed = 42, verbose = FALSE)
identical(r1$best_individual$fitness, r2$best_individual$fitness)
#> [1] TRUE
identical(
individual_to_recipe_string(r1$best_individual),
individual_to_recipe_string(r2$best_individual)
)
#> [1] TRUE

A realistic workflow with hold-out evaluation:
data(iris)
set.seed(1)
idx <- sample(nrow(iris), 0.7 * nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# Evolve on training data only
recipe <- evolve_features(
data = train[, 1:4], # exclude Species
target_col = "Petal.Length",
task = "regression",
evaluator = "xgboost",
generations = 5,
pop_size = 8,
seed = 7,
verbose = FALSE
)
# Predict on held-out test data
test_preds <- predict_model(recipe, test[, 1:4])
# Evaluate
rmse <- sqrt(mean((test$Petal.Length - test_preds)^2))
cat(sprintf("Test RMSE: %.4f\n", rmse))
#> Test RMSE: 0.2764
cat(sprintf("Recipe: %s\n", individual_to_recipe_string(recipe$best_individual)))
#> Recipe: [lumbermark(Sepal.Length, Petal.Width), add(Petal.Width, Sepal.Length), log_ratio(((Petal.Width+Sepal.Length)), Petal.Width)]
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: aarch64-apple-darwin25.4.0
#> Running under: macOS Tahoe 26.5
#>
#> Matrix products: default
#> BLAS: /opt/homebrew/Cellar/openblas/0.3.33/lib/libopenblasp-r0.3.33.dylib
#> LAPACK: /opt/homebrew/Cellar/r/4.6.0/lib/R/lib/libRlapack.dylib; LAPACK version 3.12.1
#>
#> locale:
#> [1] C
#>
#> time zone: Europe/Stockholm
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] evoFE_0.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] lumbermark_0.9.0 cli_3.6.6 knitr_1.51
#> [4] rlang_1.2.0 xfun_0.57 otel_0.2.0
#> [7] quitefastmst_0.9.1 textshaping_1.0.5 jsonlite_2.0.0
#> [10] data.table_1.18.4 htmltools_0.5.9 ragg_1.5.2
#> [13] sass_0.4.10 uwot_0.2.4 rmarkdown_2.31
#> [16] grid_4.6.0 deadwood_0.9.0-3 evaluate_1.0.5
#> [19] jquerylib_0.1.4 fastmap_1.2.0 yaml_2.3.12
#> [22] lifecycle_1.0.5 RhpcBLASctl_0.23-42 compiler_4.6.0
#> [25] codetools_0.2-20 fs_2.1.0 htmlwidgets_1.6.4
#> [28] Rcpp_1.1.1-1.1 lattice_0.22-9 systemfonts_1.3.2
#> [31] digest_0.6.39 xgboost_3.2.1.1 R6_2.6.1
#> [34] Matrix_1.7-5 bslib_0.10.0 tools_4.6.0
#> [37] genieclust_1.3.0 RcppAnnoy_0.0.23 pkgdown_2.2.0
#> [40] cachem_1.1.0 desc_1.4.3