From f06ea8454482f1393e54298681ee51db2fada073 Mon Sep 17 00:00:00 2001
From: Felipe Maia Polo
Date: Sun, 13 Oct 2024 04:59:37 +0000
Subject: [PATCH 1/6] added option --examples

---
 lm_eval/__main__.py  | 22 +++++++++++++++++++---
 lm_eval/api/task.py  | 32 +++++++++++++++++++++++---------
 lm_eval/evaluator.py | 14 ++++++++++++--
 3 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py
index ab68781939..5b134b57a5 100644
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -128,6 +128,15 @@ def setup_parser() -> argparse.ArgumentParser:
         help="Limit the number of examples per task. "
         "If <1, limit is a percentage of the total number of examples.",
     )
+    parser.add_argument(
+        "--examples",
+        "-E",
+        nargs='+',
+        type=int,
+        default=None,
+        help="Examples to test. "
+        "Should be in the format x1 x2 x3 ... xn, where xi is an integer number.",
+    )
     parser.add_argument(
         "--use_cache",
         "-c",
@@ -309,11 +318,17 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         )

     if args.limit:
+        limit = args.limit
         eval_logger.warning(
             " --limit SHOULD ONLY BE USED FOR TESTING."
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )
-
+    if args.examples:
+        assert args.limit is None, "If --examples is not None, then --limit must be None."
+        assert all(isinstance(x, int) and x >= 0 for x in args.examples), "Elements of the list given in --examples should be non-negative integers."
+        examples = args.examples
+        limit = len(examples)
+
     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
         sys.exit()
@@ -388,7 +403,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         max_batch_size=args.max_batch_size,
         device=args.device,
         use_cache=args.use_cache,
-        limit=args.limit,
+        limit=limit,
+        examples=examples,
         check_integrity=args.check_integrity,
         write_out=args.write_out,
         log_samples=args.log_samples,
@@ -445,7 +461,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             evaluation_tracker.recreate_metadata_card()

         print(
-            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {limit}, num_fewshot: {args.num_fewshot}, "
             f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
         )
         print(make_table(results))
diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
index 532e9e7ae6..f97c377830 100644
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -373,6 +373,7 @@ def build_all_requests(
         self,
         *,
         limit: Union[int, None] = None,
+        examples: Optional[List[int]] = None,
         rank: int = 0,
         world_size: int = 1,
         cache_requests: bool = False,
@@ -425,7 +426,7 @@ def build_all_requests(
                 limit = None

         doc_id_docs = list(
-            self.doc_iterator(rank=rank, limit=limit, world_size=world_size)
+            self.doc_iterator(rank=rank, limit=limit, examples=examples, world_size=world_size)
         )

         num_docs = len(doc_id_docs)
@@ -676,15 +677,28 @@ def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
             )

     def doc_iterator(
-        self, *, rank: int = 0, limit: Union[int, None] = None, world_size: int = 1
+        self, *, rank: int = 0,
+        limit: Union[int, None] = None,
+        examples: Optional[List[int]] = None,
+        world_size: int = 1
     ) -> Iterator[Tuple[int, Any]]:
-        limit = int(limit) if limit else None
-        doc_iterator = utils.create_iterator(
-            enumerate(self.eval_docs),
-            rank=int(rank),
-            limit=limit,
-            world_size=int(world_size),
-        )
+        if examples:
+            n = self.eval_docs.to_pandas().shape[0]
+            assert all([e
Date: Mon, 21 Oct 2024 02:15:13 +0000
Subject: [PATCH 2/6] specifying examples in dictionary

---
 lm_eval/__main__.py  | 12 ++++++------
 lm_eval/evaluator.py | 18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py
index 5b134b57a5..457b5449fc 100644
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -131,11 +131,11 @@ def setup_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--examples",
         "-E",
-        nargs='+',
-        type=int,
         default=None,
+        type=str,
+        metavar="/path/to/json",
         help="Examples to test. "
-        "Should be in the format x1 x2 x3 ... xn, where xi is an integer number.",
+        "Should be a json file which loads into a Python dictionary. E.g., {'mmlu_anatomy':[0,1],'mmlu_astronomy':[1,2,3]}.",
     )
     parser.add_argument(
         "--use_cache",
         "-c",
@@ -325,9 +325,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         )
     if args.examples:
         assert args.limit is None, "If --examples is not None, then --limit must be None."
-        assert all(isinstance(x, int) and x >= 0 for x in args.examples), "Elements of the list given in --examples should be non-negative integers."
-        examples = args.examples
-        limit = len(examples)
+        limit = None
+        with open(args.examples, 'r') as json_file:
+            examples = json.load(json_file)

     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index 5b21f5a7c7..c260de1cbf 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -58,7 +58,7 @@ def simple_evaluate(
     rewrite_requests_cache: bool = False,
     delete_requests_cache: bool = False,
     limit: Optional[Union[int, float]] = None,
-    examples: Optional[List[int]] = None,
+    examples: Optional[Dict] = None,
     bootstrap_iters: int = 100000,
     check_integrity: bool = False,
     write_out: bool = False,
@@ -103,8 +103,8 @@ def simple_evaluate(
         Deletes all of the request cache if set to `True`. `None` if not desired.
     :param limit: int or float, optional
         Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
-    :param examples: list of ints, optional
-        List indicating which examples should be tested.
+    :param examples: dictionary, optional
+        Dictionary indicating which examples should be tested in each task, e.g., {'mmlu_astronomy':[0,3,6],'mmlu_anatomy':[1,4,7,10]}.
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed.
     :param check_integrity: bool
@@ -365,7 +365,7 @@ def evaluate(
     lm: "LM",
     task_dict,
     limit: Optional[int] = None,
-    examples: Optional[List[int]] = None,
+    examples: Optional[Dict] = None,
     cache_requests: bool = False,
     rewrite_requests_cache: bool = False,
     bootstrap_iters: Optional[int] = 100000,
@@ -384,8 +384,8 @@ def evaluate(
         Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
     :param limit: int, optional
         Limit the number of examples per task (only use this for testing)
-    :param examples: list of ints, optional
-        List indicating which examples should be tested.
+    :param examples: dictionary, optional
+        Dictionary indicating which examples should be tested in each task, e.g., {'mmlu_astronomy':[0,3,6],'mmlu_anatomy':[1,4,7,10]}.
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
     :param write_out: bool
@@ -450,7 +450,7 @@ def evaluate(
             limits.append(limit)
             task.build_all_requests(
                 limit=limit,
-                examples=examples,
+                examples=examples[task_output.task_name],
                 rank=lm.rank,
                 world_size=lm.world_size,
                 cache_requests=cache_requests,
@@ -535,10 +535,10 @@ def evaluate(
         # iterate over different filters used
         for filter_key in task.instances[0].filtered_resps.keys():
             doc_iterator = task.doc_iterator(
-                rank=RANK, limit=limit, examples=examples, world_size=WORLD_SIZE
+                rank=RANK, limit=limit, examples=examples[task_output.task_name], world_size=WORLD_SIZE
             )
             for doc_id, doc in doc_iterator:
-                if examples: doc_id_true = examples[doc_id]
+                if examples: doc_id_true = examples[task_output.task_name][doc_id]
                 else: doc_id_true = doc_id
                 requests = instances_by_doc_id[doc_id]
                 metrics = task.process_results(
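
With PATCH 2/6, --examples no longer takes a flat list of integers: it takes the path of a JSON file mapping each task name to the list of document indices to evaluate, and --limit must be left unset. The short sketch below shows how such a file might be prepared; it is not part of the patch series, and the task names, indices, and command-line arguments are placeholders echoing the example in the help text above.

    # Illustrative sketch (not patch content): build the per-task examples file
    # expected by --examples after PATCH 2/6. Task names and indices are placeholders.
    import json

    examples = {
        "mmlu_anatomy": [0, 1],        # evaluate only documents 0 and 1 of this task
        "mmlu_astronomy": [1, 2, 3],   # evaluate only documents 1, 2 and 3 of this task
    }

    with open("examples.json", "w") as f:
        json.dump(examples, f)

    # Hypothetical invocation (model and task arguments are examples only):
    #   lm_eval --model hf --model_args pretrained=<your-model> \
    #       --tasks mmlu_anatomy,mmlu_astronomy --examples examples.json
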
From 4863977a36e0e5f1b8f16edf84bb9246db916a75 Mon Sep 17 00:00:00 2001
From: mirianfrsilva
Date: Tue, 26 Nov 2024 19:49:15 +0000
Subject: [PATCH 3/6] run pre-commit - fix arg type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mírian Silva
 None:
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )
     if args.examples:
-        assert args.limit is None, "If --examples is not None, then --limit must be None."
+        assert (
+            args.limit is None
+        ), "If --examples is not None, then --limit must be None."
         limit = None
-        with open(args.examples, 'r') as json_file:
-            examples = json.load(json_file)
-
+        with open(args.examples, "r") as json_file:
+            examples = json.load(json_file)
+
     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
         sys.exit()
diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
index f97c377830..58d78464fc 100644
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -426,7 +426,9 @@ def build_all_requests(
                 limit = None

         doc_id_docs = list(
-            self.doc_iterator(rank=rank, limit=limit, examples=examples, world_size=world_size)
+            self.doc_iterator(
+                rank=rank, limit=limit, examples=examples, world_size=world_size
+            )
         )

         num_docs = len(doc_id_docs)
@@ -677,18 +679,28 @@ def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
             )

     def doc_iterator(
-        self, *, rank: int = 0,
+        self,
+        *,
+        rank: int = 0,
         limit: Union[int, None] = None,
         examples: Optional[List[int]] = None,
-        world_size: int = 1
+        world_size: int = 1,
     ) -> Iterator[Tuple[int, Any]]:
         if examples:
             n = self.eval_docs.to_pandas().shape[0]
-            assert all([e
Date: Wed, 4 Dec 2024 20:41:18 +0000
Subject: [PATCH 4/6] fixing bug when examples==None

---
 lm_eval/evaluator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index e460c0c3e7..79800d5a39 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -450,7 +450,7 @@ def evaluate(
             limits.append(limit)
             task.build_all_requests(
                 limit=limit,
-                examples=examples[task_output.task_name],
+                examples=examples[task_output.task_name] if examples is None else examples,
                 rank=lm.rank,
                 world_size=lm.world_size,
                 cache_requests=cache_requests,
@@ -537,7 +537,7 @@ def evaluate(
                 doc_iterator = task.doc_iterator(
                     rank=RANK,
                     limit=limit,
-                    examples=examples[task_output.task_name],
+                    examples=examples[task_output.task_name] if examples is None else examples,
                     world_size=WORLD_SIZE,
                 )
                 for doc_id, doc in doc_iterator:

From 724612d4205d13b69ba236c95d22fa5b3e44fb96 Mon Sep 17 00:00:00 2001
From: Felipe Maia Polo
Date: Wed, 4 Dec 2024 15:53:41 -0500
Subject: [PATCH 5/6] fixing bug when examples==None

---
 lm_eval/evaluator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index 79800d5a39..7fcdf77ced 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -450,7 +450,7 @@ def evaluate(
             limits.append(limit)
             task.build_all_requests(
                 limit=limit,
-                examples=examples[task_output.task_name] if examples is None else examples,
+                examples=examples[task_output.task_name] if examples is not None else examples,
                 rank=lm.rank,
                 world_size=lm.world_size,
                 cache_requests=cache_requests,
@@ -537,7 +537,7 @@ def evaluate(
                 doc_iterator = task.doc_iterator(
                     rank=RANK,
                     limit=limit,
-                    examples=examples[task_output.task_name] if examples is None else examples,
+                    examples=examples[task_output.task_name] if examples is not None else examples,
                     world_size=WORLD_SIZE,
                 )
                 for doc_id, doc in doc_iterator:

From 76139904f1bc0f1b4945c4b80c98b56977546144 Mon Sep 17 00:00:00 2001
From: Felipe Maia Polo
Date: Wed, 4 Dec 2024 16:14:31 -0500
Subject: [PATCH 6/6] limit or examples must be None in simple_evaluate.py and in evaluator.py

---
 lm_eval/evaluator.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
index 7fcdf77ced..fdbce7c5fc 100644
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -142,6 +142,9 @@ def simple_evaluate(
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))
     start_date = time.time()

+    if limit is not None and examples is not None:
+        raise ValueError("Either 'limit' or 'examples' must be None, but both are not None.")
+
     if delete_requests_cache:
         eval_logger.info("Deleting requests cache...")
         delete_cache()
@@ -407,6 +410,9 @@ def evaluate(
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))

+    if limit is not None and examples is not None:
+        raise ValueError("Either 'limit' or 'examples' must be None, but both are not None.")
+
     # tracks all Instances/requests a model must generate output on.
     requests = defaultdict(list)
     # stores the amount to pad out reqs per req. type so that
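
A detail worth noting in the evaluator hunks above: once a per-task subset is selected, the document iterator yields local positions 0..k-1, and the requested index list is used to recover the original document id (the `doc_id_true = examples[task_output.task_name][doc_id]` line introduced in PATCH 2/6). The sketch below is a stand-alone illustration of that mapping with made-up data; it is not the harness code itself.

    # Illustrative sketch (not harness code): mapping local positions in a selected
    # subset back to the original document ids, mirroring the doc_id_true logic above.
    docs = ["doc-a", "doc-b", "doc-c", "doc-d", "doc-e"]   # stand-in for a task's eval_docs
    examples_for_task = [1, 3, 4]                          # indices requested for this task

    subset = [docs[i] for i in examples_for_task]          # only the requested documents
    for doc_id, doc in enumerate(subset):                  # doc_id is the local position
        doc_id_true = examples_for_task[doc_id]            # recover the original index
        print(f"local doc_id={doc_id} -> original doc_id={doc_id_true} ({doc})")

After PATCH 6/6, passing both --limit and an examples file is rejected with a ValueError, so the two selection mechanisms cannot silently interact.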