automl benchmark script #74

Draft
wants to merge 15 commits into base: master
41 changes: 41 additions & 0 deletions process_ablation.py
@@ -0,0 +1,41 @@
import pandas as pd

# Load the processed data
df = pd.read_csv('classification_regression.csv')

# Define the desired dataset (Task) order
task_order = [
    "fashion_mnist", "food101", "stanfordcars", "magnetictiledefects",
    "europeanflooddepth", "oxfordflowers", "OxfordIIITPet", "cd18", "ham10000",
    "hateful_meme", "petfinder", "memotion", "financial_news", "MLDoc-11000",
    "MultiATIS-5000", "fb_dialog", "SNIPS", "ag_news", "airbnb", "kick_start",
    "cloth_review", "news_popularity", "cal_house"
]

# Pivot the DataFrame
pivoted_df = df.pivot(index='task', columns='framework', values='result')

# Reorder the rows to follow task_order; any task missing from the data becomes an all-NaN row
pivoted_df = pivoted_df.reindex(task_order)

# Specify the desired column (Framework) order
column_order = [
    "autokeras_master",
    "ablation_base",
    "ablation_greedy_soup",
    "ablation_gradient_clip",
    "ablation_warmup_steps",
    "ablation_cosine_decay",
    "ablation_weight_decay",
    "ablation_lr_decay",
]

# Reorder the columns according to the specified order
pivoted_df = pivoted_df[column_order]

# Save the reformatted DataFrame to a new CSV file
pivoted_df.to_csv('reformatted_results.csv')

print("Reformatted results saved to 'reformatted_results.csv'.")

34 changes: 34 additions & 0 deletions process_results.py
@@ -0,0 +1,34 @@
import pandas as pd
import numpy as np
from scipy.stats import sem # Import the sem function for standard error of mean calculation

input_file = 'classification_regression.csv'
output_file = 'result_file.csv'

df = pd.read_csv(input_file)
grouped = df.groupby(['framework', 'task'])

results = []

# Iterate over each (framework, task) group and summarize its runs
for (framework, task), group in grouped:
    results_data = group['result'].dropna()

    mean = results_data.mean()
    se = sem(results_data)
    se_196 = se * 1.96  # half-width of a 95% confidence interval, assuming normality

    results.append({
        'Framework': framework,
        'Task': task,
        'Result': f"{mean:.3f}({se_196:.3f})"
    })

results_df = pd.DataFrame(results)

results_df.sort_values(by=['Framework', 'Task'], inplace=True)

results_df.to_csv(output_file, index=False)

print(f"Results have been saved to {output_file}")

65 changes: 65 additions & 0 deletions sample_configs/bench_all.py
@@ -0,0 +1,65 @@
import random

# Random seed draw, kept for reference; the fixed lists below override it.
n_experiments = 5
seeds = [random.randint(0, 100) for _ in range(n_experiments)]

seeds = [22, 92, 54, 86, 41]
seeds = [22]  # overrides the list above: only seed 22 is run

config_paths = [
    "sample_configs/paper_image_cloud_configs.yaml",
    "sample_configs/paper_text_tabular_cloud_configs.yaml",
    "sample_configs/paper_text_cloud_configs.yaml",
]

frameworks = [
    "AutoGluon_best_master",
    "ablation_base",
    "ablation_add_greedy",
    "ablation_add_grad_clip",
    "ablation_add_warmup_steps",
    "ablation_add_cosine_decay",
    "ablation_add_weight_decay",
    "ablation_add_lr_decay",
    "AutoGluon_del_greedy",
    "AutoGluon_del_grad_clip",
    "AutoGluon_del_warmup_steps",
    "AutoGluon_del_cosine_decay",
    "AutoGluon_del_weight_decay",
    "AutoGluon_del_lr_decay",
]

#frameworks = [
# "ablation_base",
# "ablation_greedy_soup",
# "ablation_gradient_clip",
# "ablation_warmup_steps",
# "ablation_cosine_decay",
# "ablation_weight_decay",
# "ablation_lr_decay",
# "autokeras_master",
#]
constraints = [
    "g4_12x"
]
# module = "autokeras"
module = "multimodal"

import yaml
import os
import subprocess

config_root = "./temp_configs"
os.makedirs(config_root, exist_ok=True)

for seed in seeds:
    print("Seed: ", seed)
    for constraint in constraints:
        os.makedirs(f"{config_root}/{constraint}", exist_ok=True)
        for framework in frameworks:
            # for shot in fs:
            config_dir = f"{config_root}/{constraint}/{framework}"
            os.makedirs(config_dir, exist_ok=True)

            for config_path in config_paths:
                with open(config_path, "r") as f:
                    configs = yaml.safe_load(f)
                if constraint == "g4_12x":
                    configs["cdk_context"]["PREFIX"] = f"{configs['cdk_context']['PREFIX']}-multi"
                configs["constraint"] = constraint
                configs["framework"] = framework
                configs["module"] = module
                configs["seed"] = seed
                # configs["custom_dataloader"]["shot"] = shot
                configs["benchmark_name"] = f"{configs['benchmark_name']}-{seed}"
                new_config_path = os.path.join(config_dir, os.path.basename(config_path))
                with open(new_config_path, "w") as new_f:
                    yaml.dump(configs, new_f)
                print("Running config: ", new_config_path)
                command = ["agbench", "run", new_config_path]
                subprocess.run(command)
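A caveat on the launcher loop: subprocess.run(command) does not raise when agbench exits non-zero, so one failed run in a long sweep can pass unnoticed. A small wrapper, as a sketch (the warning format and return convention are assumptions, not part of this PR):

import subprocess

def run_benchmark(config_path: str) -> bool:
    # Hypothetical wrapper: surfaces failures instead of continuing silently.
    proc = subprocess.run(["agbench", "run", config_path])
    if proc.returncode != 0:
        print(f"WARNING: agbench exited with code {proc.returncode} for {config_path}")
        return False
    return True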

@@ -16,7 +16,7 @@ def path_expander(path, base_folder):
 logger = logging.getLogger(__name__)
 
 
-class VisionDataLoaer:
+class ImageDataLoader:
     def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "train"):
         with open(dataset_config_file, "r") as f:
             config = yaml.safe_load(f)
@@ -31,7 +31,9 @@ def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "tr
 
         self.name = dataset_name
         self.split = split
-        self.feature_columns = self.dataset_config["feature_columns"]
+        self.image_columns = self.dataset_config["image_columns"] or []
+        self.text_columns = self.dataset_config["text_columns"] or []
+        self.columns_to_drop = self.dataset_config["columns_to_drop"] or []
         self.label_columns = self.dataset_config["label_columns"]
 
         url = self.dataset_config["url"].format(name=self.name)
@@ -43,10 +45,15 @@ def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "tr
         image_path_pattern = self.dataset_config["image_path"]
 
         self.data = pd.read_csv(os.path.join(self.dataset_dir, annotation_filename))
-        _columns_to_drop = self.data.columns.difference(self.feature_columns + self.label_columns)
-        self.data.drop(columns=_columns_to_drop, inplace=True)
+        self.tabular_columns = self.data.columns.difference(self.image_columns + self.text_columns + self.label_columns + self.columns_to_drop)
+        print("Image columns: ", self.image_columns)
+        print("Text columns: ", self.text_columns)
+        print("Tabular columns: ", self.tabular_columns)
+        self.data.drop(columns=self.columns_to_drop, inplace=True)
         self.data.dropna(inplace=True)
 
         image_base_path = image_path_pattern.format(name=self.name, split=self.split, value="")
-        for col in self.feature_columns:
+        for col in self.image_columns:
             self.data[col] = self.data[col].apply(
                 lambda ele: path_expander(ele, base_folder=os.path.join(self.dataset_dir, image_base_path))
             )
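For context, a usage sketch of the renamed loader (the import path and dataset name are assumptions based on the YAML comment below, not code exercised in this PR):

# Hypothetical usage; assumes the class lives in sample_configs/vision_dataset.py
# and that the dataset config below is available locally.
from vision_dataset import ImageDataLoader

train_data = ImageDataLoader(
    dataset_name="fashion_mnist",
    dataset_config_file="sample_configs/dataloaders/paper_image_datasets.yaml",
    split="train",
)
print(train_data.data.head())  # image paths expanded, drop columns removed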
119 changes: 119 additions & 0 deletions sample_configs/dataloaders/paper_image_datasets.yaml
@@ -0,0 +1,119 @@
# data = custom_class(dataset_name=dataset_name, split=split, **kwargs)
# custom_class: custom dataloader class, e.g. sample_configs/vision_dataset.py


base: &base
  url: s3://zs-models/datasets/{name}.zip
  splits:
    - train
    - test
  image_columns:
    - ImageID
  text_columns:
  label_columns:
    - LabelName
  columns_to_drop:
  annotation: "{name}_{split}_annotations.csv"
  image_path: "{name}/{value}"
  metric: acc
  problem_type: multiclass

fashion_mnist:
  <<: *base
  image_path: "{split}/{value}"

casting:
  <<: *base
  image_path: "{value}"
  problem_type: binary
  metric: roc_auc

food101:
  <<: *base

oxfordflowers:
  <<: *base
  image_path: "{name}/{split}/{value}"

OxfordIIITPet:
  <<: *base
  splits:
    - train
    - validation
    - test
  annotation: "{name}_{split}_anno.csv"
  image_path: "{split}/{value}"

europeanflooddepth:
  <<: *base
  problem_type: binary
  metric: roc_auc

magnetictiledefects:
  <<: *base

stanfordcars:
  <<: *base

cub200:
  <<: *base

petfinder:
  <<: *base
  splits:
    - train
  text_columns:
    - Description
  label_columns:
    - AdoptionSpeed
  annotation: "{name}_{split}_annotations.csv"
  image_path: "{value}"
  metric: acc
  problem_type: multiclass

ham10000:
  <<: *base
  splits:
    - train
  label_columns:
    - dx
  image_path: "{split}/{value}"

cd18:
  <<: *base
  splits:
    - train
  label_columns:
    - Price
  image_path: "{split}/{value}"
  metric: rmse
  problem_type: regression

hateful_meme:
  <<: *base
  splits:
    - train
  text_columns:
    - text
  image_path: "{value}"
  metric: roc_auc
  problem_type: binary

memotion:
  <<: *base
  splits:
    - train
  text_columns:
    - text_corrected
  columns_to_drop:
    - text_ocr
  label_columns:
    - overall_sentiment
  annotation: "{name}_{split}_annotations.csv"
  image_path: "{split}/{value}"
  metric: acc
  problem_type: multiclass
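The &base / <<: *base pattern above is standard YAML anchor-and-merge: each dataset entry inherits every key from base and overrides only the keys it restates. PyYAML resolves the merge on load, so the loader never sees the anchors; a quick way to inspect a resolved entry (a sketch, assuming the file path below):

import yaml

with open("sample_configs/dataloaders/paper_image_datasets.yaml") as f:
    datasets = yaml.safe_load(f)

print(datasets["fashion_mnist"]["image_path"])  # overridden: "{split}/{value}"
print(datasets["fashion_mnist"]["metric"])      # inherited from base: "acc"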