
Commit

Merge branch 'EleutherAI:main' into models/gigachat_llms
seldereyy authored Nov 22, 2024
2 parents 96442f6 + 867413f commit 9df7a6b
Showing 111 changed files with 1,419 additions and 61 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/new_tasks.yml
@@ -16,7 +16,7 @@ jobs:
     name: Scan for changed tasks
     steps:
       - name: checkout
-         uses: actions/checkout@v3
+         uses: actions/checkout@v4
          with:
            fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.

@@ -47,7 +47,7 @@ jobs:
       - name: Set up Python 3.9
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: 3.9
           cache: 'pip'
8 changes: 4 additions & 4 deletions .github/workflows/publish.yml
@@ -13,7 +13,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: "3.x"

@@ -26,7 +26,7 @@
       - name: Build a binary wheel and a source tarball
         run: python3 -m build
       - name: Store the distribution packages
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
@@ -46,7 +46,7 @@

     steps:
       - name: Download all the dists
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
@@ -68,7 +68,7 @@

     steps:
       - name: Download all the dists
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
10 changes: 2 additions & 8 deletions .github/workflows/unit_tests.yml
@@ -63,9 +63,9 @@ jobs:
       - name: Test with pytest
         run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
       - name: Archive artifacts
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          name: output_results
+          name: output_testcpu${{ matrix.python-version }}
          path: |
            test_logs/*
   testmodels:
@@ -87,9 +87,3 @@ jobs:
         pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
       - name: Test with pytest
         run: python -m pytest tests/models --showlocals -s -vv
-      - name: Archive artifacts
-        uses: actions/upload-artifact@v3
-        with:
-          name: output_results
-          path: |
-            test_logs/*
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@
 exclude: ^tests/testdata/
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: check-added-large-files
       - id: check-ast
@@ -29,7 +29,7 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.8
+    rev: v0.7.4
     hooks:
       # Run the linter.
       - id: ruff
4 changes: 3 additions & 1 deletion lm_eval/evaluator.py
@@ -294,7 +294,9 @@ def _adjust_config(task_dict):
             model_source=model,
             model_args=model_args,
             system_instruction=system_instruction,
-            chat_template=lm.chat_template(apply_chat_template),
+            chat_template=lm.chat_template(apply_chat_template)
+            if apply_chat_template
+            else None,
             fewshot_as_multiturn=fewshot_as_multiturn,
         )

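For context, this change guards the chat-template lookup so that `lm.chat_template(...)` is only resolved when `apply_chat_template` is actually set, recording `None` otherwise. A minimal sketch of the pattern, with a hypothetical `DummyLM` standing in for the real model wrapper:

```python
# Hypothetical stand-in for an lm_eval model wrapper; only the guarded
# chat_template resolution from the diff above is being illustrated.
class DummyLM:
    def chat_template(self, template_name):
        # The real wrapper resolves a Jinja chat template from its tokenizer.
        return "<resolved chat template>"

def resolve_chat_template(lm, apply_chat_template):
    # Resolve only when the caller asked for a template; otherwise record
    # None so downstream logging does not claim a template was applied.
    return lm.chat_template(apply_chat_template) if apply_chat_template else None

print(resolve_chat_template(DummyLM(), True))   # <resolved chat template>
print(resolve_chat_template(DummyLM(), False))  # None
```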
8 changes: 4 additions & 4 deletions lm_eval/models/huggingface.py
@@ -464,7 +464,7 @@ def _get_backend(
         elif backend == "seq2seq":
             self.backend = backend
             eval_logger.info(
-                f"Overrode HF model backend type, and using type '{backend}'"
+                f"Overrode HF model backend type, and using type '{self.backend}'"
             )
         else:
             # determine and use the default HF backend for this model, based on its config + metadata.
@@ -476,12 +476,12 @@ def _get_backend(
                 # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
                 # these special cases should be treated as seq2seq models.
                 self.backend = "seq2seq"
-                eval_logger.info(f"Using model type '{backend}'")
+                eval_logger.debug(f"Using model type '{self.backend}'")
             elif (
                 getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
             ):
                 self.backend = "causal"
-                eval_logger.info(f"Using model type '{backend}'")
+                eval_logger.debug(f"Using model type '{self.backend}'")
             else:
                 if not trust_remote_code:
                     eval_logger.warning(
@@ -493,7 +493,7 @@

             # then we default to assuming AutoModelForCausalLM
             self.backend = "causal"
             eval_logger.info(
-                f"Model type cannot be determined. Using default model type '{backend}'"
+                f"Model type cannot be determined. Using default model type '{self.backend}'"
             )

         if self.AUTO_MODEL_CLASS is None:
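These hunks swap the local `backend` argument for `self.backend` in the log messages: after auto-detection, the local variable may still hold the caller's original value (e.g. `"default"`) while `self.backend` holds the resolved one. A simplified, hypothetical reproduction of the bug being fixed:

```python
# Simplified, hypothetical illustration of the logging bug fixed above.
class Wrapper:
    def _get_backend(self, backend: str = "default") -> None:
        if backend == "default":
            # Auto-detect from model config; the local `backend` is untouched.
            self.backend = "causal"
        else:
            self.backend = backend
        # Interpolating the local `backend` here would print 'default';
        # `self.backend` prints the value that was actually chosen.
        print(f"Using model type '{self.backend}'")

Wrapper()._get_backend()  # -> Using model type 'causal'
```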
105 changes: 72 additions & 33 deletions lm_eval/models/mamba_lm.py
@@ -12,6 +12,8 @@ class MambaLMWrapper(HFLM):
     def __init__(
         self,
         pretrained="state-spaces/mamba-130m",
+        # To use the HF compatible variant
+        is_hf: bool = False,
         **kwargs,
     ) -> None:
         """
@@ -52,7 +54,7 @@ def __init__(
         if "backend" in kwargs:
             # mamba currently only supports causal models
             assert kwargs["backend"] == "causal"
-
+        self.is_hf = is_hf or (True if pretrained.endswith("hf") else False)
         super().__init__(
             pretrained=pretrained,
             # set appropriate defaults for tokenizer, max length, etc
@@ -67,15 +69,18 @@ def _get_config(
         pretrained: str,
         **kwargs,
     ) -> None:
-        try:
-            from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
-        except ModuleNotFoundError as exception:
-            raise type(exception)(
-                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
-please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
-            )
-
-        self._config = load_config_hf(pretrained)
+        if self.is_hf:
+            super()._get_config(pretrained, **kwargs)
+        else:
+            try:
+                from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
+            except ModuleNotFoundError as exception:
+                raise type(exception)(
+                    "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+                )
+
+            self._config = load_config_hf(pretrained)

     def _create_model(
         self,
@@ -86,24 +91,32 @@ def _create_model(
         # Mamba does not support arbitrary HF from_pretrained() args
         **kwargs,
     ) -> None:
-        try:
-            from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
-        except ModuleNotFoundError as exception:
-            raise type(exception)(
-                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
-please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
-            )
-
-        self._model = MambaLMHeadModel.from_pretrained(
-            pretrained,
-            device=self._device,
-            dtype=torch.float16
-            if dtype == "auto"
-            else lm_eval.models.utils.get_dtype(dtype),
-        )
+        if self.is_hf:
+            super()._create_model(pretrained, dtype=dtype, **kwargs)
+        else:
+            try:
+                from mamba_ssm.models.mixer_seq_simple import (
+                    MambaLMHeadModel,  # noqa: F811
+                )
+            except ModuleNotFoundError as exception:
+                raise type(exception)(
+                    "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+                )
+
+            self._model = MambaLMHeadModel.from_pretrained(
+                pretrained,
+                device=self._device,
+                dtype=torch.float16
+                if dtype == "auto"
+                else lm_eval.models.utils.get_dtype(dtype),
+            )

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        for key in ("do_sample", "attention_mask"):
+        remove_arg = (
+            ["attention_mask"] if self.is_hf else ["do_sample", "attention_mask"]
+        )
+        for key in remove_arg:
             if key in generation_kwargs:
                 generation_kwargs.pop(key)

@@ -116,11 +129,37 @@ def _model_generate(self, context, max_length, stop, **generation_kwargs):
             # self.tokenizer, stop, 1, context.shape[0]
             # )

-        return self.model.generate(
-            input_ids=context,
-            max_length=max_length,
-            # stopping_criteria=stopping_criteria,
-            # pad_token_id=self.tokenizer.pad_token_id,
-            # use_cache=True,
-            **generation_kwargs,
-        )
+        if not self.is_hf:
+            return self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                # stopping_criteria=stopping_criteria,
+                # pad_token_id=self.tokenizer.pad_token_id,
+                # use_cache=True,
+                **generation_kwargs,
+            )
+        else:
+            stopping_criteria = lm_eval.models.utils.stop_sequences_criteria(
+                self.tokenizer,
+                stop,
+                context.shape[1],
+                context.shape[0],
+            )
+
+            generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+            do_sample = generation_kwargs.get("do_sample", None)
+
+            # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
+            if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+                generation_kwargs["do_sample"] = do_sample = False
+            if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+                generation_kwargs.pop("temperature")
+
+            return self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                stopping_criteria=stopping_criteria,
+                pad_token_id=self.tokenizer.pad_token_id,
+                use_cache=True,
+                **generation_kwargs,
+            )
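The HF branch above normalizes `temperature` and `do_sample` so that HF's `generate()` does not reject a temperature of 0.0. The normalization is self-contained and can be sketched on its own (the function name here is mine, not from the diff):

```python
# Standalone sketch of the greedy-decoding normalization in the HF branch above.
def normalize_generation_kwargs(generation_kwargs: dict) -> dict:
    generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
    do_sample = generation_kwargs.get("do_sample", None)
    # HF's generate() rejects temperature == 0.0 unless sampling is disabled,
    # so fall back to greedy decoding and drop the temperature entirely.
    if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
        generation_kwargs["do_sample"] = do_sample = False
    if do_sample is False and generation_kwargs.get("temperature") == 0.0:
        generation_kwargs.pop("temperature")
    return generation_kwargs

print(normalize_generation_kwargs({}))                    # {'do_sample': False}
print(normalize_generation_kwargs({"temperature": 0.7}))  # {'temperature': 0.7}
```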
2 changes: 1 addition & 1 deletion lm_eval/models/vllm_causallms.py
@@ -118,7 +118,7 @@ def __init__(
             tokenizer if tokenizer else pretrained,
             tokenizer_mode=tokenizer_mode,
             trust_remote_code=trust_remote_code,
-            tokenizer_revision=tokenizer_revision,
+            revision=tokenizer_revision,
         )
         self.tokenizer = configure_pad_token(self.tokenizer)
         self.add_bos_token = add_bos_token
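This one-line change tracks newer vllm releases, where `get_tokenizer` takes the tokenizer revision through a `revision=` keyword rather than the old `tokenizer_revision=` name. A hedged sketch of the call, assuming a recent vllm; the model id and revision are examples:

```python
# Sketch assuming a recent vllm; model id and revision are examples.
from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer(
    "EleutherAI/pythia-70m",   # example HF repo id
    tokenizer_mode="auto",
    trust_remote_code=False,
    revision="main",           # formerly passed as tokenizer_revision=
)
```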
2 changes: 2 additions & 0 deletions lm_eval/tasks/README.md
@@ -56,6 +56,7 @@
 | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
 | [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
 | [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
+| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
 | [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
 | [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
 | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
@@ -70,6 +71,7 @@
 | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
 | [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
 | [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
+| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
 | medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
 | medqa | Multiple choice question answering based on the United States Medical License Exams. | |
 | [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
1 change: 0 additions & 1 deletion lm_eval/tasks/catalan_bench/_arc_ca_common_yaml
@@ -1,4 +1,3 @@
-tag: arc_ca
 dataset_path: projecte-aina/arc_ca
 output_type: multiple_choice
 training_split: null
(Diffs for the remaining changed files are not shown here.)
