Skip to content

Commit

Permalink
Merge pull request #2 from divyegala/python-ann-bench-use-gbench
Browse files Browse the repository at this point in the history
  • Loading branch information
cjnolet authored Aug 26, 2023
2 parents 617c60f + 902f9f4 commit 9b82f85
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 186 deletions.
80 changes: 0 additions & 80 deletions bench/ann/data_export.py

This file was deleted.

44 changes: 29 additions & 15 deletions bench/ann/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,25 +192,38 @@ def inv_fun(x):
plt.close()


def load_all_results(dataset_path):
    """Collect benchmark search results for every algorithm under a dataset.

    Scans ``<dataset_path>/result/search`` for google-benchmark CSV outputs
    (one file per algorithm).  Each file may begin with free-form preamble
    lines, followed by a header row containing ``"QPS"`` and then one data
    row per benchmark case.

    Parameters
    ----------
    dataset_path : str
        Root folder of the dataset (e.g. ``<data-dir>/glove-100-inner``).

    Returns
    -------
    dict
        Maps algorithm name to a list of ``[algo_name, col12, col10]``
        entries parsed from each data row.  Columns 12 and 10 appear to be
        the recall and QPS counters emitted by ANN_BENCH — TODO confirm
        against the benchmark's CSV column layout.
    """
    results = dict()
    results_path = os.path.join(dataset_path, "result", "search")
    for result_filename in os.listdir(results_path):
        with open(os.path.join(results_path, result_filename), 'r') as f:
            lines = f.readlines()
        # Skip everything up to and including the header row (identified by
        # the "QPS" column name); if no header is found, fall back to
        # skipping only the first line.
        idx = 0
        for pos, line in enumerate(lines):
            if "QPS" in line:
                idx = pos
                break

        for line in lines[idx + 1:]:
            split_lines = line.split(',')
            # Guard against blank or truncated trailing lines, which would
            # otherwise raise IndexError/ValueError on the float columns.
            if len(split_lines) <= 12:
                continue
            # Benchmark names look like "algo.param_variant" and may be
            # quoted in CSV output; keep only the algorithm prefix.
            algo_name = split_lines[0].split('.')[0].strip("\"")
            if algo_name not in results:
                results[algo_name] = []
            results[algo_name].append([algo_name, float(split_lines[12]),
                                       float(split_lines[10])])
    return results


def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--result-csv", help="Path to CSV Results", required=True)
parser.add_argument("--output", help="Path to the PNG output file",
default=f"{os.getcwd()}/out.png")
parser.add_argument("--dataset", help="dataset to download",
default="glove-100-inner")
parser.add_argument("--dataset-path", help="path to dataset folder",
default=os.path.join(os.getenv("RAFT_HOME"),
"bench", "ann", "data"))
parser.add_argument("--output-filename",
default="plot.png")
parser.add_argument(
"--x-scale",
help="Scale to use when drawing the X-axis. \
Expand All @@ -228,12 +241,13 @@ def main():
)
args = parser.parse_args()

print(f"writing output to {args.output}")
output_filepath = os.path.join(args.dataset_path, args.dataset, args.output_filename)
print(f"writing output to {output_filepath}")

results = load_all_results(args.result_csv)
results = load_all_results(os.path.join(args.dataset_path, args.dataset))
linestyles = create_linestyles(sorted(results.keys()))

create_plot(results, args.raw, args.x_scale, args.y_scale, args.output, linestyles)
create_plot(results, args.raw, args.x_scale, args.y_scale, output_filepath, linestyles)


if __name__ == "__main__":
Expand Down
97 changes: 53 additions & 44 deletions bench/ann/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,49 +41,62 @@ def find_executable(algos_conf, algo):
executable)
build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable)
if os.path.exists(conda_path):
return (executable, conda_path)
return (executable, conda_path, algo)
elif os.path.exists(build_path):
return (executable, build_path)
return (executable, build_path, algo)
else:
raise FileNotFoundError(executable)


def run_build_and_search(conf_file, conf_filename, conf_filedir,
                         executables_to_run, dataset_path, force,
                         build, search, k, batch_size):
    """Run the ANN_BENCH build and/or search phases for each executable.

    For every ``(executable, path, algo)`` key in ``executables_to_run``, a
    temporary JSON configuration containing only that executable's index
    definitions is written next to the original config, the benchmark binary
    is invoked as a subprocess, and the temporary file is removed afterwards.

    Parameters
    ----------
    conf_file : dict
        Parsed configuration (must contain "dataset" and
        "search_basic_param" sections).
    conf_filename : str
        Base name of the configuration file (used for the temp file name).
    conf_filedir : str
        Directory of the configuration file (temp file is written here).
    executables_to_run : dict
        Maps (executable, executable_path, algo) -> {"index": [...]}.
    dataset_path : str
        Root data directory passed to the benchmark via --data_prefix.
    force : bool
        When True, pass --overwrite to the benchmark binary.
    build, search : bool
        Which phases to run.
    k, batch_size : int
        Search-phase overrides (top-k and number of queries per batch).
    """
    for executable, ann_executable_path, algo in executables_to_run:
        # ANN_BENCH consumes one config file per invocation, so write a
        # temporary configuration holding only this executable's indices.
        temp_conf_filename = f"temporary_{conf_filename}"
        temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename)
        with open(temp_conf_filepath, "w") as f:
            temp_conf = {
                "dataset": conf_file["dataset"],
                "search_basic_param": conf_file["search_basic_param"],
                "index": executables_to_run[(executable,
                                             ann_executable_path,
                                             algo)]["index"],
            }
            json.dump(temp_conf, f)

        try:
            legacy_result_folder = os.path.join(
                dataset_path, conf_file["dataset"]["name"], "result")
            os.makedirs(legacy_result_folder, exist_ok=True)

            if build:
                build_folder = os.path.join(legacy_result_folder, "build")
                os.makedirs(build_folder, exist_ok=True)
                cmd = [ann_executable_path,
                       "--build",
                       "--data_prefix=" + dataset_path,
                       "--benchmark_out_format=csv",
                       f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"]
                if force:
                    cmd = cmd + ["--overwrite"]
                cmd = cmd + [temp_conf_filepath]
                print(cmd)
                p = subprocess.Popen(cmd)
                p.wait()

            if search:
                search_folder = os.path.join(legacy_result_folder, "search")
                os.makedirs(search_folder, exist_ok=True)
                cmd = [ann_executable_path,
                       "--search",
                       "--data_prefix=" + dataset_path,
                       "--benchmark_counters_tabular",
                       "--override_kv=k:%s" % k,
                       "--override_kv=n_queries:%s" % batch_size,
                       "--benchmark_out_format=csv",
                       f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"]
                if force:
                    cmd = cmd + ["--overwrite"]
                cmd = cmd + [temp_conf_filepath]
                print(cmd)
                p = subprocess.Popen(cmd)
                p.wait()
        finally:
            # Always clean up the temporary configuration, even when the
            # benchmark invocation or directory creation fails.
            os.remove(temp_conf_filepath)
Expand Down Expand Up @@ -157,28 +170,24 @@ def main():
# Read configuration file associated to dataset
if args.configuration:
conf_filepath = args.configuration
elif args.dataset:
conf_filepath = \
os.path.join(scripts_path, "conf", f"{args.dataset}.json")
else:
conf_filepath = os.path.join(scripts_path, "conf", f"{args.dataset}.json")
raise ValueError("One of parameters `configuration` or \
`dataset` need to be provided")
conf_filename = conf_filepath.split("/")[-1]
conf_filedir = "/".join(conf_filepath.split("/")[:-1])
dataset_name = conf_filename.replace(".json", "")
dataset_path = os.path.realpath(os.path.join(args.dataset_path, dataset_name))
dataset_path = args.dataset_path
if not os.path.exists(conf_filepath):
raise FileNotFoundError(conf_filename)
if not os.path.exists(os.path.join(args.dataset_path, dataset_name)):
raise FileNotFoundError(os.path.join(args.dataset_path, dataset_name))

with open(conf_filepath, "r") as f:
conf_file = json.load(f)

# Replace base, query to dataset-path
conf_file["dataset"]["base_file"] = os.path.join(dataset_path, "base.fbin")
conf_file["dataset"]["query_file"] = os.path.join(dataset_path, "query.fbin")
conf_file["dataset"]["groundtruth_neighbors_file"] = os.path.join(dataset_path, "groundtruth.neighbors.ibin")
# Ensure base and query files exist for dataset
if not os.path.exists(conf_file["dataset"]["base_file"]):
raise FileNotFoundError(conf_file["dataset"]["base_file"])
if not os.path.exists(conf_file["dataset"]["query_file"]):
raise FileNotFoundError(conf_file["dataset"]["query_file"])

executables_to_run = dict()
# At least one named index should exist in config file
if args.indices:
Expand Down Expand Up @@ -218,16 +227,16 @@ def main():
executables_to_run[executable_path] = {"index": []}
executables_to_run[executable_path]["index"].append(index)

# Replace build, search to dataset path
# Replace index to dataset path
for executable_path in executables_to_run:
for pos, index in enumerate(executables_to_run[executable_path]["index"]):
index["file"] = os.path.join(dataset_path, "index", index["name"])
index["search_result_file"] = \
os.path.join(dataset_path, "result", index["name"])
index["file"] = os.path.join(dataset_path, dataset_name, "index", index["name"])
executables_to_run[executable_path]["index"][pos] = index

run_build_and_search(conf_filename, conf_file, executables_to_run,
args.force, conf_filedir, build, search, k, batch_size)
run_build_and_search(conf_file, conf_filename, conf_filedir,
executables_to_run, dataset_path,
args.force, build, search,
k, batch_size)


if __name__ == "__main__":
Expand Down
13 changes: 11 additions & 2 deletions cpp/bench/ann/src/common/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,17 @@ auto load_lib(const std::string& algo) -> void*
auto found = libs.find(algo);

if (found != libs.end()) { return found->second.handle; }
auto lib_name = "lib" + algo + "_ann_bench.so";
return libs.emplace(algo, lib_name).first->second.handle;
auto lib_name = "lib" + algo + "_ann_bench.so";
std::string lib_path = "";
if (std::getenv("CONDA_PREFIX") != nullptr) {
auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/";
if (std::filesystem::exists(conda_path + "ANN_BENCH")) { lib_path = conda_path; }
}
if (std::getenv("RAFT_HOME") != nullptr) {
auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/";
if (std::filesystem::exists(build_path + "ANN_BENCH")) { lib_path = build_path; }
}
return libs.emplace(algo, lib_path + lib_name).first->second.handle;
}

auto get_fun_name(void* addr) -> std::string
Expand Down
Loading

0 comments on commit 9b82f85

Please sign in to comment.