erikbern · jiangyinzuo · May 17, 2024 · May 17, 2024 · jiangyinzuo · May 21, 2024
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
@@ -213,7 +213,7 @@ def glove(out_fn: str, d: int) -> None:
         for line in z.open(z_fn):
             v = [float(x) for x in line.strip().split()[1:]]
             X.append(numpy.array(v))
-        X_train, X_test = train_test_split(X)
+        X_train, X_test = train_test_split(numpy.array(X))
         write_output(numpy.array(X_train), numpy.array(X_test), out_fn, "angular")
 
 

diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py
@@ -84,7 +84,7 @@ def load_all_results(dataset: Optional[str] = None,
     Yields:
         tuple: A tuple containing properties as a dictionary and an h5py file object.
     """
-    for root, _, files in os.walk(build_result_filepath(dataset, count)):
+    for root, _, files in os.walk(build_result_filepath(dataset, count, batch_mode=batch_mode)):
 def build_result_filepath(dataset_name: Optional[str] = None,
                           count: Optional[int] = None,
                           definition: Optional[Definition] = None,
                           query_arguments: Optional[Any] = None,
                           batch_mode: bool = False) -> str:
     """
     Constructs the filepath for storing the results.
     The path always starts with "results"; each further component is
     appended only when the corresponding argument is truthy.
     Args:
         dataset_name (str, optional): The name of the dataset.
         count (int, optional): The count of records.
         definition (Definition, optional): The definition of the algorithm.
             Its ``algorithm`` names the directory and its ``arguments``
             feed the file name.
         query_arguments (Any, optional): Additional arguments for the query,
             concatenated onto ``definition.arguments``. ``None`` means no
             additional arguments.
         batch_mode (bool, optional): If True, "-batch" is appended to the
             algorithm directory name.
     Returns:
         str: The constructed filepath.
     """
     d = ["results"]
     if dataset_name:
         d.append(dataset_name)
     if count:
         d.append(str(count))
     if definition:
         d.append(definition.algorithm + ("-batch" if batch_mode else ""))
         # query_arguments defaults to None; concatenating None onto the
         # definition arguments would raise TypeError, so skip it in that case.
         data = definition.arguments
         if query_arguments is not None:
             data = data + query_arguments
         # Serialise the arguments and collapse every run of non-word
         # characters into "_" so the result is usable as a file name.
         d.append(re.sub(r"\W+", "_", json.dumps(data, sort_keys=True)).strip("_") + ".hdf5")
     return os.path.join(*d)
     continue 
 try: 
     with h5py.File(os.path.join(root, filename), "r+") as f: 
         properties = dict(f.attrs) 
         if batch_mode != properties["batch_mode"]: 
             continue 
         yield properties, f 
 except Exception: 
 def build_result_filepath(dataset_name: Optional[str] = None,
                           count: Optional[int] = None,
                           definition: Optional[Definition] = None,
                           query_arguments: Optional[Any] = None,
                           batch_mode: bool = False) -> str:
     """
     Constructs the filepath for storing the results.
 
     The path always starts with "results"; each further component is
     appended only when the corresponding argument is truthy.
 
     Args:
         dataset_name (str, optional): The name of the dataset.
         count (int, optional): The count of records.
         definition (Definition, optional): The definition of the algorithm.
             Its ``algorithm`` names the directory and its ``arguments``
             feed the file name.
         query_arguments (Any, optional): Additional arguments for the query,
             concatenated onto ``definition.arguments``. ``None`` means no
             additional arguments.
         batch_mode (bool, optional): If True, "-batch" is appended to the
             algorithm directory name.
 
     Returns:
         str: The constructed filepath.
     """
     d = ["results"]
     if dataset_name:
         d.append(dataset_name)
     if count:
         d.append(str(count))
     if definition:
         d.append(definition.algorithm + ("-batch" if batch_mode else ""))
         # query_arguments defaults to None; concatenating None onto the
         # definition arguments would raise TypeError, so skip it in that case.
         data = definition.arguments
         if query_arguments is not None:
             data = data + query_arguments
         # Serialise the arguments and collapse every run of non-word
         # characters into "_" so the result is usable as a file name.
         d.append(re.sub(r"\W+", "_", json.dumps(data, sort_keys=True)).strip("_") + ".hdf5")
     return os.path.join(*d)
     continue 
 try: 
     with h5py.File(os.path.join(root, filename), "r+") as f: 
         properties = dict(f.attrs) 
         if batch_mode != properties["batch_mode"]: 
             continue 
         yield properties, f 
 except Exception: 
         for filename in files:
             if os.path.splitext(filename)[-1] != ".hdf5":
                 continue
@@ -110,4 +110,4 @@ def get_unique_algorithms() -> Set[str]:
     for batch_mode in [False, True]:
         for properties, _ in load_all_results(batch_mode=batch_mode):
             algorithms.add(properties["algo"])
-    return algorithms
+    return algorithms
diff --git a/data_export.py b/data_export.py
@@ -9,14 +9,21 @@
     parser = argparse.ArgumentParser()
     parser.add_argument("--output", help="Path to the output file", required=True)
     parser.add_argument("--recompute", action="store_true", help="Recompute metrics")
+    parser.add_argument(
+        "-k", "--count", default=10, type=int, help="The number of near neighbours to search for"
+    )
+    parser.add_argument("--batch", action="store_true", help="Batch mode")
     args = parser.parse_args()
 
     datasets = DATASETS.keys()
     dfs = []
     for dataset_name in datasets:
         print("Looking at dataset", dataset_name)
-        if len(list(load_all_results(dataset_name))) > 0:
-            results = load_all_results(dataset_name)
+        if len(list(load_all_results(dataset_name,
+                                        count=args.count,
+                                        batch_mode=args.batch
+                                     ))) > 0:
+            results = load_all_results(dataset_name, count=args.count, batch_mode=args.batch)
             dataset, _ = get_dataset(dataset_name)
             results = compute_metrics_all_runs(dataset, results, args.recompute)
             for res in results: