automl benchmark script #74

Draft
wants to merge 15 commits into base: master
41 changes: 41 additions & 0 deletions process_ablation.py
@@ -0,0 +1,41 @@
import pandas as pd

# Load the processed data
df = pd.read_csv('classification_regression.csv')

# Define the desired dataset (Task) order
task_order = [
    "fashion_mnist", "food101", "stanfordcars", "magnetictiledefects",
    "europeanflooddepth", "oxfordflowers", "OxfordIIITPet", "cd18", "ham10000",
    "hateful_meme", "petfinder", "memotion", "financial_news", "MLDoc-11000",
    "MultiATIS-5000", "fb_dialog", "SNIPS", "ag_news", "airbnb", "kick_start",
    "cloth_review", "news_popularity", "cal_house"
]

# Pivot the DataFrame
pivoted_df = df.pivot(index='task', columns='framework', values='result')

# Reorder the rows to follow task_order; any task missing from the data becomes an all-NaN row
pivoted_df = pivoted_df.reindex(task_order)

# Specify the desired column (Framework) order
column_order = [
    "autokeras_master",
    "ablation_base",
    "ablation_greedy_soup",
    "ablation_gradient_clip",
    "ablation_warmup_steps",
    "ablation_cosine_decay",
    "ablation_weight_decay",
    "ablation_lr_decay",
]

# Reorder the columns according to the specified order
pivoted_df = pivoted_df[column_order]

# Save the reformatted DataFrame to a new CSV file
pivoted_df.to_csv('reformatted_results.csv')

print("Reformatted results saved to 'reformatted_results.csv'.")

34 changes: 34 additions & 0 deletions process_results.py
@@ -0,0 +1,34 @@
import pandas as pd
import numpy as np
from scipy.stats import sem # Import the sem function for standard error of mean calculation

input_file = 'classification_regression.csv'
output_file = 'result_file.csv'

df = pd.read_csv(input_file)
grouped = df.groupby(['framework', 'task'])

results = []

# Iterate over each (framework, task) group and summarize its runs
for (framework, task), group in grouped:
    results_data = group['result'].dropna()

    mean = results_data.mean()
    se = sem(results_data)
    se_196 = se * 1.96  # half-width of a 95% confidence interval, assuming normality

    results.append({
        'Framework': framework,
        'Task': task,
        'Result': f"{mean:.3f}({se_196:.3f})"
    })

results_df = pd.DataFrame(results)

results_df.sort_values(by=['Framework', 'Task'], inplace=True)

results_df.to_csv(output_file, index=False)

print(f"Results have been saved to {output_file}")

65 changes: 65 additions & 0 deletions sample_configs/bench_all.py
@@ -0,0 +1,65 @@
import random

# Random seed draw, kept for reference; the fixed lists below override it.
n_experiments = 5
seeds = [random.randint(0, 100) for _ in range(n_experiments)]

seeds = [22, 92, 54, 86, 41]
seeds = [22]  # overrides the list above: only seed 22 is run

config_paths = [
    "sample_configs/paper_image_cloud_configs.yaml",
    "sample_configs/paper_text_tabular_cloud_configs.yaml",
    "sample_configs/paper_text_cloud_configs.yaml",
]

frameworks = [
    "AutoGluon_best_master",
    "ablation_base",
    "ablation_add_greedy",
    "ablation_add_grad_clip",
    "ablation_add_warmup_steps",
    "ablation_add_cosine_decay",
    "ablation_add_weight_decay",
    "ablation_add_lr_decay",
    "AutoGluon_del_greedy",
    "AutoGluon_del_grad_clip",
    "AutoGluon_del_warmup_steps",
    "AutoGluon_del_cosine_decay",
    "AutoGluon_del_weight_decay",
    "AutoGluon_del_lr_decay",
]

#frameworks = [
# "ablation_base",
# "ablation_greedy_soup",
# "ablation_gradient_clip",
# "ablation_warmup_steps",
# "ablation_cosine_decay",
# "ablation_weight_decay",
# "ablation_lr_decay",
# "autokeras_master",
#]
constraints = [
    "g4_12x"
]
# module = "autokeras"
module = "multimodal"

import yaml
import os
import subprocess

config_root = "./temp_configs"
os.makedirs(config_root, exist_ok=True)

for seed in seeds:
    print("Seed: ", seed)
    for constraint in constraints:
        os.makedirs(f"{config_root}/{constraint}", exist_ok=True)
        for framework in frameworks:
            # for shot in fs:
            config_dir = f"{config_root}/{constraint}/{framework}"
            os.makedirs(config_dir, exist_ok=True)

            for config_path in config_paths:
                with open(config_path, "r") as f:
                    configs = yaml.safe_load(f)
                if constraint == "g4_12x":
                    configs["cdk_context"]["PREFIX"] = f"{configs['cdk_context']['PREFIX']}-multi"
                configs["constraint"] = constraint
                configs["framework"] = framework
                configs["module"] = module
                configs["seed"] = seed
                # configs["custom_dataloader"]["shot"] = shot
                configs["benchmark_name"] = f"{configs['benchmark_name']}-{seed}"
                new_config_path = os.path.join(config_dir, os.path.basename(config_path))
                with open(new_config_path, "w") as new_f:
                    yaml.dump(configs, new_f)
                print("Running config: ", new_config_path)
                command = ["agbench", "run", new_config_path]
                subprocess.run(command)
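A caveat on the launcher loop: subprocess.run(command) does not raise when agbench exits non-zero, so one failed run in a long sweep can pass unnoticed. A small wrapper, as a sketch (the warning format and return convention are assumptions, not part of this PR):

import subprocess

def run_benchmark(config_path: str) -> bool:
    # Hypothetical wrapper: surfaces failures instead of continuing silently.
    proc = subprocess.run(["agbench", "run", config_path])
    if proc.returncode != 0:
        print(f"WARNING: agbench exited with code {proc.returncode} for {config_path}")
        return False
    return True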

@@ -16,7 +16,7 @@ def path_expander(path, base_folder):
 logger = logging.getLogger(__name__)
 
 
-class VisionDataLoaer:
+class ImageDataLoader:
     def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "train"):
         with open(dataset_config_file, "r") as f:
             config = yaml.safe_load(f)
@@ -31,7 +31,9 @@ def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "tr
 
         self.name = dataset_name
         self.split = split
-        self.feature_columns = self.dataset_config["feature_columns"]
+        self.image_columns = self.dataset_config["image_columns"] or []
+        self.text_columns = self.dataset_config["text_columns"] or []
+        self.columns_to_drop = self.dataset_config["columns_to_drop"] or []
         self.label_columns = self.dataset_config["label_columns"]
 
         url = self.dataset_config["url"].format(name=self.name)
@@ -43,10 +45,15 @@ def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "tr
         image_path_pattern = self.dataset_config["image_path"]
 
         self.data = pd.read_csv(os.path.join(self.dataset_dir, annotation_filename))
-        _columns_to_drop = self.data.columns.difference(self.feature_columns + self.label_columns)
-        self.data.drop(columns=_columns_to_drop, inplace=True)
+        self.tabular_columns = self.data.columns.difference(self.image_columns + self.text_columns + self.label_columns + self.columns_to_drop)
+        print("Image columns: ", self.image_columns)
+        print("Text columns: ", self.text_columns)
+        print("Tabular columns: ", self.tabular_columns)
+        self.data.drop(columns=self.columns_to_drop, inplace=True)
         self.data.dropna(inplace=True)
 
         image_base_path = image_path_pattern.format(name=self.name, split=self.split, value="")
-        for col in self.feature_columns:
+        for col in self.image_columns:
             self.data[col] = self.data[col].apply(
                 lambda ele: path_expander(ele, base_folder=os.path.join(self.dataset_dir, image_base_path))
             )
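For context, a usage sketch of the renamed loader (the import path and dataset name are assumptions based on the YAML comment below, not code exercised in this PR):

# Hypothetical usage; assumes the class lives in sample_configs/vision_dataset.py
# and that the dataset config below is available locally.
from vision_dataset import ImageDataLoader

train_data = ImageDataLoader(
    dataset_name="fashion_mnist",
    dataset_config_file="sample_configs/dataloaders/paper_image_datasets.yaml",
    split="train",
)
print(train_data.data.head())  # image paths expanded, drop columns removed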
119 changes: 119 additions & 0 deletions sample_configs/dataloaders/paper_image_datasets.yaml
@@ -0,0 +1,119 @@
# data = custom_class(dataset_name=dataset_name, split=split, **kwargs)
# custom_class: custom dataloader class, e.g. sample_configs/vision_dataset.py


base: &base
  url: s3://zs-models/datasets/{name}.zip
  splits:
    - train
    - test
  image_columns:
    - ImageID
  text_columns:
  label_columns:
    - LabelName
  columns_to_drop:
  annotation: "{name}_{split}_annotations.csv"
  image_path: "{name}/{value}"
  metric: acc
  problem_type: multiclass

fashion_mnist:
  <<: *base
  image_path: "{split}/{value}"

casting:
  <<: *base
  image_path: "{value}"
  problem_type: binary
  metric: roc_auc

food101:
  <<: *base

oxfordflowers:
  <<: *base
  image_path: "{name}/{split}/{value}"

OxfordIIITPet:
  <<: *base
  splits:
    - train
    - validation
    - test
  annotation: "{name}_{split}_anno.csv"
  image_path: "{split}/{value}"

europeanflooddepth:
  <<: *base
  problem_type: binary
  metric: roc_auc

magnetictiledefects:
  <<: *base

stanfordcars:
  <<: *base

cub200:
  <<: *base

petfinder:
  <<: *base
  splits:
    - train
  text_columns:
    - Description
  label_columns:
    - AdoptionSpeed
  annotation: "{name}_{split}_annotations.csv"
  image_path: "{value}"
  metric: acc
  problem_type: multiclass

ham10000:
  <<: *base
  splits:
    - train
  label_columns:
    - dx
  image_path: "{split}/{value}"

cd18:
  <<: *base
  splits:
    - train
  label_columns:
    - Price
  image_path: "{split}/{value}"
  metric: rmse
  problem_type: regression

hateful_meme:
  <<: *base
  splits:
    - train
  text_columns:
    - text
  image_path: "{value}"
  metric: roc_auc
  problem_type: binary

memotion:
  <<: *base
  splits:
    - train
  text_columns:
    - text_corrected
  columns_to_drop:
    - text_ocr
  label_columns:
    - overall_sentiment
  annotation: "{name}_{split}_annotations.csv"
  image_path: "{split}/{value}"
  metric: acc
  problem_type: multiclass
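The &base / <<: *base pattern above is standard YAML anchor-and-merge: each dataset entry inherits every key from base and overrides only the keys it restates. PyYAML resolves the merge on load, so the loader never sees the anchors; a quick way to inspect a resolved entry (a sketch, assuming the file path below):

import yaml

with open("sample_configs/dataloaders/paper_image_datasets.yaml") as f:
    datasets = yaml.safe_load(f)

print(datasets["fashion_mnist"]["image_path"])  # overridden: "{split}/{value}"
print(datasets["fashion_mnist"]["metric"])      # inherited from base: "acc"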