diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 62e65a1eb8..0a4e58ddb1 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -14,6 +14,7 @@ | [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | | [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | | [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic | +| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic | | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU.yaml new file mode 100644 index 0000000000..77cbf95ace --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU.yaml @@ -0,0 +1,12 @@ +group: AraDiCE_ArabicMMLU_egy +task: +- AraDiCE_ArabicMMLU_humanities_egy +- AraDiCE_ArabicMMLU_language_egy +- AraDiCE_ArabicMMLU_social-science_egy +- AraDiCE_ArabicMMLU_stem_egy +- AraDiCE_ArabicMMLU_other_egy +aggregate_metric_list: + - metric: acc + weight_by_size: True + - metric: acc_norm + weight_by_size: True diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_history.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_history.yaml new file mode 100644 index 0000000000..5a03177d13 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_history.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_humanities_history" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_high_humanities_history_egy" +"task_alias": "high humanities history" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..ee65adc6db --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_high_humanities_islamic-studies_egy" +"task_alias": "high humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml new file mode 100644 index 0000000000..123f696f30 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_humanities_philosophy" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_high_humanities_philosophy_egy" +"task_alias": "high humanities philosophy" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml new file mode 100644 index 0000000000..1df05181da --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_language_arabic-language" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_egy" +"task": "AraDiCE_ArabicMMLU_high_language_arabic-language_egy" +"task_alias": "high language arabic-language" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_civics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_civics.yaml new file mode 100644 index 0000000000..7b42490b06 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_civics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_social-science_civics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_high_social-science_civics_egy" +"task_alias": "high social-science civics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_economics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_economics.yaml new file mode 100644 index 0000000000..5518b2cda3 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_economics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_social-science_economics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_high_social-science_economics_egy" +"task_alias": "high social-science economics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_geography.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_geography.yaml new file mode 100644 index 0000000000..d9a2d5b332 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_geography.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_social-science_geography" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_high_social-science_geography_egy" +"task_alias": "high social-science geography" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_biology.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_biology.yaml new file mode 100644 index 0000000000..3f1ab8a7b8 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_biology.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_stem_biology" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_high_stem_biology_egy" +"task_alias": "high stem biology" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml new file mode 100644 index 0000000000..c27f5be318 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_high_stem_computer-science_egy" +"task_alias": "high stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_physics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_physics.yaml new file mode 100644 index 0000000000..4e24a2f4fb --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_physics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_stem_physics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_high_stem_physics_egy" +"task_alias": "high stem physics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_history.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_history.yaml new file mode 100644 index 0000000000..9f2c377040 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_history.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_humanities_history" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_middle_humanities_history_egy" +"task_alias": "middle humanities history" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..41995c4aa3 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_middle_humanities_islamic-studies_egy" +"task_alias": "middle humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml new file mode 100644 index 0000000000..e33bf590a1 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_language_arabic-language" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_egy" +"task": "AraDiCE_ArabicMMLU_middle_language_arabic-language_egy" +"task_alias": "middle language arabic-language" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml new file mode 100644 index 0000000000..73fc902702 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_other_general-knowledge" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_egy" +"task": "AraDiCE_ArabicMMLU_middle_other_general-knowledge_egy" +"task_alias": "middle other general-knowledge" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml new file mode 100644 index 0000000000..8407f36e7f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_civics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_middle_social-science_civics_egy" +"task_alias": "middle social-science civics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml new file mode 100644 index 0000000000..fbcb040d27 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_economics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_middle_social-science_economics_egy" +"task_alias": "middle social-science economics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_geography.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_geography.yaml new file mode 100644 index 0000000000..57fe94f294 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_geography.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_geography" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_middle_social-science_geography_egy" +"task_alias": "middle social-science geography" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml new file mode 100644 index 0000000000..115170b8cc --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_social-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_middle_social-science_social-science_egy" +"task_alias": "middle social-science social-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml new file mode 100644 index 0000000000..1d8787e3c0 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_middle_stem_computer-science_egy" +"task_alias": "middle stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml new file mode 100644 index 0000000000..ee09058ce4 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_stem_natural-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_middle_stem_natural-science_egy" +"task_alias": "middle stem natural-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..995aa28c2f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_na_humanities_islamic-studies_egy" +"task_alias": "na humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml new file mode 100644 index 0000000000..8691250702 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_language_arabic-language-general" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_egy" +"task": "AraDiCE_ArabicMMLU_na_language_arabic-language-general_egy" +"task_alias": "na language arabic-language-general" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml new file mode 100644 index 0000000000..453e41435d --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_language_arabic-language-grammar" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_egy" +"task": "AraDiCE_ArabicMMLU_na_language_arabic-language-grammar_egy" +"task_alias": "na language arabic-language-grammar" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_other_driving-test.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_other_driving-test.yaml new file mode 100644 index 0000000000..abc097210f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_other_driving-test.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_other_driving-test" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_egy" +"task": "AraDiCE_ArabicMMLU_na_other_driving-test_egy" +"task_alias": "na other driving-test" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml new file mode 100644 index 0000000000..72af8e7f53 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_other_general-knowledge" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_egy" +"task": "AraDiCE_ArabicMMLU_na_other_general-knowledge_egy" +"task_alias": "na other general-knowledge" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_humanities_history.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_humanities_history.yaml new file mode 100644 index 0000000000..5e640faa54 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_humanities_history.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_humanities_history" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_primary_humanities_history_egy" +"task_alias": "primary humanities history" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..120dfa1435 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_primary_humanities_islamic-studies_egy" +"task_alias": "primary humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml new file mode 100644 index 0000000000..57c460a01b --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_language_arabic-language" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_egy" +"task": "AraDiCE_ArabicMMLU_primary_language_arabic-language_egy" +"task_alias": "primary language arabic-language" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml new file mode 100644 index 0000000000..61314bf182 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_other_general-knowledge" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_egy" +"task": "AraDiCE_ArabicMMLU_primary_other_general-knowledge_egy" +"task_alias": "primary other general-knowledge" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_social-science_geography.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_social-science_geography.yaml new file mode 100644 index 0000000000..73b8deea7a --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_social-science_geography.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_social-science_geography" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_primary_social-science_geography_egy" +"task_alias": "primary social-science geography" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml new file mode 100644 index 0000000000..5f03bb4ba0 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_social-science_social-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_primary_social-science_social-science_egy" +"task_alias": "primary social-science social-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml new file mode 100644 index 0000000000..5e25856ebe --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_primary_stem_computer-science_egy" +"task_alias": "primary stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_math.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_math.yaml new file mode 100644 index 0000000000..d4e85ac27f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_math.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_stem_math" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_primary_stem_math_egy" +"task_alias": "primary stem math" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml new file mode 100644 index 0000000000..04591fcd81 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_stem_natural-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_primary_stem_natural-science_egy" +"task_alias": "primary stem natural-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_prof_humanities_law.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_prof_humanities_law.yaml new file mode 100644 index 0000000000..4fd3e166cb --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_prof_humanities_law.yaml @@ -0,0 +1,10 @@ +"dataset_name": "prof_humanities_law" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_egy" +"task": "AraDiCE_ArabicMMLU_prof_humanities_law_egy" +"task_alias": "prof humanities law" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_other_management.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_other_management.yaml new file mode 100644 index 0000000000..6b985e979f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_other_management.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_other_management" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_egy" +"task": "AraDiCE_ArabicMMLU_univ_other_management_egy" +"task_alias": "univ other management" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml new file mode 100644 index 0000000000..48ec0e75d8 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_social-science_accounting" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_univ_social-science_accounting_egy" +"task_alias": "univ social-science accounting" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_economics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_economics.yaml new file mode 100644 index 0000000000..3dd4dcc0a2 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_economics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_social-science_economics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_univ_social-science_economics_egy" +"task_alias": "univ social-science economics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml new file mode 100644 index 0000000000..671b0b3eb9 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_social-science_political-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_egy" +"task": "AraDiCE_ArabicMMLU_univ_social-science_political-science_egy" +"task_alias": "univ social-science political-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml new file mode 100644 index 0000000000..49e2e5b67c --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_egy" +"task": "AraDiCE_ArabicMMLU_univ_stem_computer-science_egy" +"task_alias": "univ stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/_default_template_yaml b/lm_eval/tasks/aradice/ArabicMMLU/EGY/_default_template_yaml new file mode 100644 index 0000000000..6421888a23 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/_default_template_yaml @@ -0,0 +1,20 @@ +dataset_path: "QCRI/AraDICE-ArabicMMLU-egy" +fewshot_config: + sampler: default +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "{{prompt}}" +doc_to_choice: choices +doc_to_target: target +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 0.0 diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/metrics.py b/lm_eval/tasks/aradice/ArabicMMLU/EGY/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/ArabicMMLU/EGY/utils.py b/lm_eval/tasks/aradice/ArabicMMLU/EGY/utils.py new file mode 100644 index 0000000000..4e232c2346 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/utils.py @@ -0,0 +1,79 @@ +level_ar = { + 'Primary': 'للمرحلة الابتدائية', + 'Middle': 'للمرحلة المتوسطة', + 'High': 'للمرحلة الثانوية', + 'Univ': 'للمرحلة الجامعية ', + 'Prof': 'للمحترفين' +} + +country_ar = { + 'UAE': 'في الإمارات', + 'Egypt': 'في مصر', + 'Lebanon': 'في لبنان', + 'Jordan': 'في الأردن', + 'Kuwait': 'في الكويت', + 'KSA': 'في السعودية', + 'Palestine': 'في فلسطين', + 'Morocco': 'في المغرب', +} + +subject_ar = { + 'Islamic Studies': 'في الدراسات إسلامية', + 'Driving Test': 'في اختبار القيادة', + 'Natural Science': 'في العلوم الطبيعية', + 'History': 'في مادة التاريخ', + 'General Knowledge': 'في المعرفة العامة', + 'Law': 'في القانون', + 'Physics': 'في الفيزياء', + 'Social Science': 'في العلوم الاجتماعية', + 'Management': 'في الإدارة', + 'Arabic Language': 'في اللغة العربية', + 'Political Science': ' في العلوم السياسية', + 'Philosophy': 'في الفلسفة', + 'Accounting': 'في المحاسبة', + 'Computer Science': 'في علوم الحاسوب', + 'Geography': 'في الجغرافيا', + 'Math': 'في الرياضيات', + 'Biology': 'في علم الأحياء', + 'Economics': 'في الاقتصاد', + 'Arabic Language (General)': 'في اللغة العربية (عام)', + 'Arabic Language (Grammar)': 'في اللغة العربية (النحو)', + 'Civics': 'في التربية المدنية', +} + + +alpa_ar = ['أ-','ب-','ج-','د-','و-'] +alpa_en = ['A-', 'B-', 'C-', "D-", "E-"] +all_choices = ['أ', 'ب', 'ج', 'د', 'و'] +all_choices_en=['A', 'B', 'C', 'D', 'E'] +def process_docs(dataset): + def _helper(doc): + # modifies the contents of a single + # document in our dataset. + + PROMPT = 'ده سؤال [MAIN_META_DATA]. اختار الإجابة الصحيحة!\n\nسؤال: [INPUT]\n[OPTION]' + PROMPT = f'{PROMPT}\n\nإجابة:' + alpa = alpa_ar + subject = subject_ar[doc['Subject']] + level = ' ' + level_ar[doc['Level']] if doc['Level'] else "" + country = ' ' + country_ar[doc['Country']] if doc['Country'] else "" + main_meta_data = f"{subject}{level}{country}" + + question = f"{doc['context']}\n\n{doc['question']}" if doc['context'] else doc['question'] + options = [] + for i, opt in enumerate(['A', 'B', 'C', 'D', 'E']): + if opt not in doc['options'] or doc['options'][opt] is None: + break + options.append(f"{alpa[i]} {doc['options'][opt]}") + + doc["prompt"] = PROMPT.replace('[MAIN_META_DATA]', main_meta_data)\ + .replace('[INPUT]', question)\ + .replace('[OPTION]', '\n'.join(options)) + + doc["choices"] = all_choices[:len(options)] + + doc["target"] = ['A', 'B', 'C', 'D', 'E'].index(doc["Answer Key"]) + + return doc + + return dataset.map(_helper) # returns back a datasets.Dataset object diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU.yaml new file mode 100644 index 0000000000..df64389d8e --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU.yaml @@ -0,0 +1,12 @@ +group: AraDiCE_ArabicMMLU_lev +task: +- AraDiCE_ArabicMMLU_humanities_lev +- AraDiCE_ArabicMMLU_language_lev +- AraDiCE_ArabicMMLU_social-science_lev +- AraDiCE_ArabicMMLU_stem_lev +- AraDiCE_ArabicMMLU_other_lev +aggregate_metric_list: + - metric: acc + weight_by_size: True + - metric: acc_norm + weight_by_size: True diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_history.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_history.yaml new file mode 100644 index 0000000000..fbe1838c0f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_history.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_humanities_history" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_high_humanities_history_lev" +"task_alias": "high humanities history" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..2e1d874eaf --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_high_humanities_islamic-studies_lev" +"task_alias": "high humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml new file mode 100644 index 0000000000..752a95f3db --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_humanities_philosophy" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_high_humanities_philosophy_lev" +"task_alias": "high humanities philosophy" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml new file mode 100644 index 0000000000..27d14f96d1 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_language_arabic-language" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_lev" +"task": "AraDiCE_ArabicMMLU_high_language_arabic-language_lev" +"task_alias": "high language arabic-language" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_civics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_civics.yaml new file mode 100644 index 0000000000..29d1a5205e --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_civics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_social-science_civics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_high_social-science_civics_lev" +"task_alias": "high social-science civics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_economics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_economics.yaml new file mode 100644 index 0000000000..378587a8fe --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_economics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_social-science_economics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_high_social-science_economics_lev" +"task_alias": "high social-science economics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_geography.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_geography.yaml new file mode 100644 index 0000000000..11668a5f0b --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_geography.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_social-science_geography" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_high_social-science_geography_lev" +"task_alias": "high social-science geography" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_biology.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_biology.yaml new file mode 100644 index 0000000000..80900b2f52 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_biology.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_stem_biology" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_high_stem_biology_lev" +"task_alias": "high stem biology" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml new file mode 100644 index 0000000000..eca96f2c6e --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_high_stem_computer-science_lev" +"task_alias": "high stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_physics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_physics.yaml new file mode 100644 index 0000000000..1d21bcc69f --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_physics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "high_stem_physics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_high_stem_physics_lev" +"task_alias": "high stem physics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_history.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_history.yaml new file mode 100644 index 0000000000..8dd3cfb9e1 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_history.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_humanities_history" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_middle_humanities_history_lev" +"task_alias": "middle humanities history" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..7e5490e480 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_middle_humanities_islamic-studies_lev" +"task_alias": "middle humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml new file mode 100644 index 0000000000..b67e3be59c --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_language_arabic-language" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_lev" +"task": "AraDiCE_ArabicMMLU_middle_language_arabic-language_lev" +"task_alias": "middle language arabic-language" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml new file mode 100644 index 0000000000..bd43ebe3dd --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_other_general-knowledge" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_lev" +"task": "AraDiCE_ArabicMMLU_middle_other_general-knowledge_lev" +"task_alias": "middle other general-knowledge" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml new file mode 100644 index 0000000000..a18665cf01 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_civics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_middle_social-science_civics_lev" +"task_alias": "middle social-science civics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml new file mode 100644 index 0000000000..e1de265b6b --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_economics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_middle_social-science_economics_lev" +"task_alias": "middle social-science economics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_geography.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_geography.yaml new file mode 100644 index 0000000000..19083eb00c --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_geography.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_geography" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_middle_social-science_geography_lev" +"task_alias": "middle social-science geography" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml new file mode 100644 index 0000000000..3c7d19c7ea --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_social-science_social-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_social-science_social-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_middle_social-science_social-science_lev" +"task_alias": "middle social-science social-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml new file mode 100644 index 0000000000..583e29b103 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_middle_stem_computer-science_lev" +"task_alias": "middle stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml new file mode 100644 index 0000000000..a1904d2c87 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_stem_natural-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "middle_stem_natural-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_middle_stem_natural-science_lev" +"task_alias": "middle stem natural-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..ac0bfe8a06 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_na_humanities_islamic-studies_lev" +"task_alias": "na humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml new file mode 100644 index 0000000000..f80e6e93e4 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_language_arabic-language-general.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_language_arabic-language-general" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_lev" +"task": "AraDiCE_ArabicMMLU_na_language_arabic-language-general_lev" +"task_alias": "na language arabic-language-general" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml new file mode 100644 index 0000000000..af3943d9a8 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_language_arabic-language-grammar.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_language_arabic-language-grammar" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_lev" +"task": "AraDiCE_ArabicMMLU_na_language_arabic-language-grammar_lev" +"task_alias": "na language arabic-language-grammar" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_other_driving-test.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_other_driving-test.yaml new file mode 100644 index 0000000000..0af542f0d6 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_other_driving-test.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_other_driving-test" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_lev" +"task": "AraDiCE_ArabicMMLU_na_other_driving-test_lev" +"task_alias": "na other driving-test" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml new file mode 100644 index 0000000000..0c5669cf07 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_na_other_general-knowledge.yaml @@ -0,0 +1,10 @@ +"dataset_name": "na_other_general-knowledge" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_lev" +"task": "AraDiCE_ArabicMMLU_na_other_general-knowledge_lev" +"task_alias": "na other general-knowledge" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_humanities_history.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_humanities_history.yaml new file mode 100644 index 0000000000..be32d433f7 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_humanities_history.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_humanities_history" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_primary_humanities_history_lev" +"task_alias": "primary humanities history" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml new file mode 100644 index 0000000000..9ae53b80ee --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_humanities_islamic-studies.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_humanities_islamic-studies" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_primary_humanities_islamic-studies_lev" +"task_alias": "primary humanities islamic-studies" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml new file mode 100644 index 0000000000..15575b0513 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_language_arabic-language.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_language_arabic-language" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_language_lev" +"task": "AraDiCE_ArabicMMLU_primary_language_arabic-language_lev" +"task_alias": "primary language arabic-language" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml new file mode 100644 index 0000000000..07b6692115 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_other_general-knowledge.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_other_general-knowledge" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_lev" +"task": "AraDiCE_ArabicMMLU_primary_other_general-knowledge_lev" +"task_alias": "primary other general-knowledge" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_social-science_geography.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_social-science_geography.yaml new file mode 100644 index 0000000000..b43c49035c --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_social-science_geography.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_social-science_geography" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_primary_social-science_geography_lev" +"task_alias": "primary social-science geography" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml new file mode 100644 index 0000000000..8f9f093415 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_social-science_social-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_social-science_social-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_primary_social-science_social-science_lev" +"task_alias": "primary social-science social-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml new file mode 100644 index 0000000000..6a79f2e7a2 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_primary_stem_computer-science_lev" +"task_alias": "primary stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_math.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_math.yaml new file mode 100644 index 0000000000..048c95096e --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_math.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_stem_math" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_primary_stem_math_lev" +"task_alias": "primary stem math" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml new file mode 100644 index 0000000000..6d7404ae7e --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_primary_stem_natural-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "primary_stem_natural-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_primary_stem_natural-science_lev" +"task_alias": "primary stem natural-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_prof_humanities_law.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_prof_humanities_law.yaml new file mode 100644 index 0000000000..1c50cb9d91 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_prof_humanities_law.yaml @@ -0,0 +1,10 @@ +"dataset_name": "prof_humanities_law" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_humanities_lev" +"task": "AraDiCE_ArabicMMLU_prof_humanities_law_lev" +"task_alias": "prof humanities law" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_other_management.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_other_management.yaml new file mode 100644 index 0000000000..31b79fd0c1 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_other_management.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_other_management" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_other_lev" +"task": "AraDiCE_ArabicMMLU_univ_other_management_lev" +"task_alias": "univ other management" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml new file mode 100644 index 0000000000..fc0cb68266 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_accounting.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_social-science_accounting" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_univ_social-science_accounting_lev" +"task_alias": "univ social-science accounting" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_economics.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_economics.yaml new file mode 100644 index 0000000000..daec1b37a6 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_economics.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_social-science_economics" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_univ_social-science_economics_lev" +"task_alias": "univ social-science economics" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml new file mode 100644 index 0000000000..e69f63ca4d --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_social-science_political-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_social-science_lev" +"task": "AraDiCE_ArabicMMLU_univ_social-science_political-science_lev" +"task_alias": "univ social-science political-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml new file mode 100644 index 0000000000..aeb8fa8118 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml @@ -0,0 +1,10 @@ +"dataset_name": "univ_stem_computer-science" +"description": "" +"fewshot_split": !!null "null" +"include": "_default_template_yaml" +"tag": "AraDiCE_ArabicMMLU_stem_lev" +"task": "AraDiCE_ArabicMMLU_univ_stem_computer-science_lev" +"task_alias": "univ stem computer-science" +"test_split": "test" +"training_split": !!null "null" +"validation_split": !!null "null" diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/_default_template_yaml b/lm_eval/tasks/aradice/ArabicMMLU/LEV/_default_template_yaml new file mode 100644 index 0000000000..45c5a345de --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/_default_template_yaml @@ -0,0 +1,20 @@ +dataset_path: QCRI/AraDICE-ArabicMMLU-lev +fewshot_config: + sampler: default +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "{{prompt}}" +doc_to_choice: choices +doc_to_target: target +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 0.0 diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/metrics.py b/lm_eval/tasks/aradice/ArabicMMLU/LEV/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/ArabicMMLU/LEV/utils.py b/lm_eval/tasks/aradice/ArabicMMLU/LEV/utils.py new file mode 100644 index 0000000000..4ea8c71f87 --- /dev/null +++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/utils.py @@ -0,0 +1,84 @@ +level_ar = { + 'Primary': 'للمرحلة الابتدائية', + 'Middle': 'للمرحلة المتوسطة', + 'High': 'للمرحلة الثانوية', + 'Univ': 'للمرحلة الجامعية ', + 'Prof': 'للمحترفين' +} + +country_ar = { + 'UAE': 'بالإمارات', + 'Egypt': 'بمصر', + 'Lebanon': 'بلبنان', + 'Jordan': 'بالأردن', + 'Kuwait': 'بالكويت', + 'KSA': 'بالسعودية', + 'Palestine': 'بفلسطين', + 'Morocco': 'بالمغرب', +} + +subject_ar = { + 'Islamic Studies': 'عن الدراسات إسلامية', + 'Driving Test': 'عن فحص السواقة', + 'Natural Science': 'عن العلوم الطبيعية', + 'History': 'تاريخ', + 'General Knowledge': 'معرفة عامة', + 'Law': 'عن القانون', + 'Physics': 'فيزياء', + 'Social Science': 'علوم اجتماعية', + 'Management': 'عن الإدارة', + 'Arabic Language': 'عن اللغة العربية', + 'Political Science': ' عن العلوم السياسية', + 'Philosophy': 'فلسفة', + 'Accounting': 'محاسبة', + 'Computer Science': 'عن علوم الحاسوب', + 'Geography': 'جغرافيا', + 'Math': 'رياضيات', + 'Biology': 'بيولوجي', + 'Economics': 'اقتصاد', + 'Arabic Language (General)': 'لغة العربية (عام)', + 'Arabic Language (Grammar)': 'لغة العربية (نحو)', + 'Civics': 'تربية مدنية', +} + +alpa_ar = ['أ-','ب-','ج-','د-','و-'] +alpa_en = ['A-', 'B-', 'C-', "D-", "E-"] +all_choices = ['أ', 'ب', 'ج', 'د', 'و'] +all_choices_en=['A', 'B', 'C', 'D', 'E'] +def process_docs(dataset): + def _helper(doc): + # modifies the contents of a single + # document in our dataset. + PROMPT = 'هيدا سؤال [MAIN_META_DATA]. نقي الجواب الصح!\n\nسؤال: [INPUT]\n[OPTION]' + + # if args.lora_weights == "x": + PROMPT = f'{PROMPT}\n\nالجواب:' + # else: + # PROMPT = f'### Input:{PROMPT}\n\n### Output:\n' + + alpa = alpa_ar + + subject = subject_ar[doc['Subject']] + level = ' ' + level_ar[doc['Level']] if doc['Level'] else "" + country = ' ' + country_ar[doc['Country']] if doc['Country'] else "" + main_meta_data = f"{subject}{level}{country}" + + question = f"{doc['context']}\n\n{doc['question']}" if doc['context'] else doc['question'] + options = [] + + for i, opt in enumerate(['A', 'B', 'C', 'D', 'E']): + if opt not in doc['options'] or doc['options'][opt] is None: + break + options.append(f"{alpa[i]} {doc['options'][opt]}") + + doc["prompt"] = PROMPT.replace('[MAIN_META_DATA]', main_meta_data)\ + .replace('[INPUT]', question)\ + .replace('[OPTION]', '\n'.join(options)) + + doc["choices"] = all_choices[:len(options)] + + doc["target"] = ['A', 'B', 'C', 'D', 'E'].index(doc["Answer Key"]) + + return doc + + return dataset.map(_helper) diff --git a/lm_eval/tasks/aradice/README.md b/lm_eval/tasks/aradice/README.md new file mode 100644 index 0000000000..c0f1043df5 --- /dev/null +++ b/lm_eval/tasks/aradice/README.md @@ -0,0 +1,49 @@ +# AraDiCE + +### Paper + +**Title:** AraDiCE: Benchmarks for Dialectal and Cultural Capabilities in LLMs + +**Abstract:** Arabic, with its rich diversity of dialects, remains significantly underrepresented in Large Language Models, particularly in dialectal variations. We address this gap by introducing seven synthetic datasets in dialects alongside Modern Standard Arabic (MSA), created using Machine Translation (MT) combined with human post-editing. We present AraDiCE, a benchmark for Arabic Dialect and Cultural Evaluation. We evaluate LLMs on dialect comprehension and generation, focusing specifically on low-resource Arabic dialects. Additionally, we introduce the first-ever fine-grained benchmark designed to evaluate cultural awareness across the Gulf, Egypt, and Levant regions, providing a novel dimension to LLM evaluation. Our findings demonstrate that while Arabic-specific models like Jais and AceGPT outperform multilingual models on dialectal tasks, significant challenges persist in dialect identification, generation, and translation. This work contributes ~45K post-edited samples, a cultural benchmark, and highlights the importance of tailored training to improve LLM performance in capturing the nuances of diverse Arabic dialects and cultural contexts. We will release the dialectal translation models and benchmarks curated in this study. + +**Homepage:** +https://huggingface.co/datasets/QCRI/AraDiCE + + + +### Citation + +``` +@article{mousi2024aradicebenchmarksdialectalcultural, + title={{AraDiCE}: Benchmarks for Dialectal and Cultural Capabilities in LLMs}, + author={Basel Mousi and Nadir Durrani and Fatema Ahmad and Md. Arid Hasan and Maram Hasanain and Tameem Kabbani and Fahim Dalvi and Shammur Absar Chowdhury and Firoj Alam}, + year={2024}, + publisher={arXiv:2409.11404}, + url={https://arxiv.org/abs/2409.11404}, +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +* `AraDiCE`: Overall results for all tasks associated with different datasets. + + +#### Tasks + +* `aradice`: Overall results for all tasks associated with different datasets. +* `arabicmmlu`: TODO + + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/aradice/aradice.yaml b/lm_eval/tasks/aradice/aradice.yaml new file mode 100644 index 0000000000..8c7759f2c3 --- /dev/null +++ b/lm_eval/tasks/aradice/aradice.yaml @@ -0,0 +1,30 @@ +group: AraDiCE +task: +- AraDiCE_ArabicMMLU_lev +- AraDiCE_ArabicMMLU_egy +- AraDiCE_boolq_egy +- AraDiCE_boolq_eng +- AraDiCE_boolq_lev +- AraDiCE_boolq_msa +- AraDiCE_egypt_cultural +- AraDiCE_jordan_cultural +- AraDiCE_lebanon_cultural +- AraDiCE_palestine_cultural +- AraDiCE_qatar_cultural +- AraDiCE_syria_cultural +- AraDiCE_openbookqa_egy +- AraDiCE_openbookqa_eng +- AraDiCE_openbookqa_lev +- AraDiCE_openbookqa_msa +- AraDiCE_piqa_egy +- AraDiCE_piqa_eng +- AraDiCE_piqa_lev +- AraDiCE_piqa_msa +- AraDiCE_truthfulqa_mc1_egy +- AraDiCE_truthfulqa_mc1_eng +- AraDiCE_truthfulqa_mc1_lev +- AraDiCE_truthfulqa_mc1_msa +- AraDiCE_winogrande_egy +- AraDiCE_winogrande_eng +- AraDiCE_winogrande_lev +- AraDiCE_winogrande_msa diff --git a/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml b/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml new file mode 100644 index 0000000000..130c1ed5be --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_boolq_egy +dataset_path: QCRI/AraDiCE-BoolQ +dataset_name: BoolQ-egy +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:" +doc_to_target: target +doc_to_choice: ["لا", "نعم"] +should_decontaminate: true +doc_to_decontamination_query: passage +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/EGY/metrics.py b/lm_eval/tasks/aradice/boolq/EGY/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/EGY/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/boolq/EGY/utils.py b/lm_eval/tasks/aradice/boolq/EGY/utils.py new file mode 100644 index 0000000000..79703ef31a --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/EGY/utils.py @@ -0,0 +1,22 @@ + +egy_answer_mapping = { + "true": "نعم", + "false": "لا", + True: "نعم", + False: "لا" +} + +def process_docs(dataset): + def remove_question_mark(text): + text = text.strip() + if text.endswith("?") or text.endswith("؟"): + text = text[:-1] + text = text.strip() + + return text + + def _helper(doc): + doc["question"] = remove_question_mark(doc["question"]) + doc["target"] = egy_answer_mapping[doc["answer"]] + return doc + return dataset.map(_helper) \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/ENG/boolq_eng.yaml b/lm_eval/tasks/aradice/boolq/ENG/boolq_eng.yaml new file mode 100644 index 0000000000..70974942a4 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/ENG/boolq_eng.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_boolq_eng +dataset_path: QCRI/AraDiCE-BoolQ +dataset_name: BoolQ-eng +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:" +doc_to_target: target +doc_to_choice: ["no", "yes"] +should_decontaminate: true +doc_to_decontamination_query: passage +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/ENG/metrics.py b/lm_eval/tasks/aradice/boolq/ENG/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/ENG/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/boolq/ENG/utils.py b/lm_eval/tasks/aradice/boolq/ENG/utils.py new file mode 100644 index 0000000000..084ff9c29d --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/ENG/utils.py @@ -0,0 +1,22 @@ + +en_answer_mapping = { + "true": "yes", + "false": "no", + True: "yes", + False: "no" +} + +def process_docs(dataset): + def remove_question_mark(text): + text = text.strip() + if text.endswith("?") or text.endswith("؟"): + text = text[:-1] + text = text.strip() + + return text + + def _helper(doc): + doc["question"] = remove_question_mark(doc["question"]) + doc["target"] = en_answer_mapping[doc["answer"]] + return doc + return dataset.map(_helper) \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/LEV/boolq_lev.yaml b/lm_eval/tasks/aradice/boolq/LEV/boolq_lev.yaml new file mode 100644 index 0000000000..4a05be07d1 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/LEV/boolq_lev.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_boolq_lev +dataset_path: QCRI/AraDiCE-BoolQ +dataset_name: BoolQ-lev +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:" +doc_to_target: target +doc_to_choice: ["لا", "نعم"] +should_decontaminate: true +doc_to_decontamination_query: passage +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/LEV/metrics.py b/lm_eval/tasks/aradice/boolq/LEV/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/LEV/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/boolq/LEV/utils.py b/lm_eval/tasks/aradice/boolq/LEV/utils.py new file mode 100644 index 0000000000..8c55672163 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/LEV/utils.py @@ -0,0 +1,22 @@ + +lev_answer_mapping = { + "true": "نعم", + "false": "لا", + True: "نعم", + False: "لا" +} + +def process_docs(dataset): + def remove_question_mark(text): + text = text.strip() + if text.endswith("?") or text.endswith("؟"): + text = text[:-1] + text = text.strip() + + return text + + def _helper(doc): + doc["question"] = remove_question_mark(doc["question"]) + doc["target"] = lev_answer_mapping[doc["answer"]] + return doc + return dataset.map(_helper) \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/MSA/boolq_msa.yaml b/lm_eval/tasks/aradice/boolq/MSA/boolq_msa.yaml new file mode 100644 index 0000000000..8191bbaaf3 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/MSA/boolq_msa.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_boolq_msa +dataset_path: QCRI/AraDiCE-BoolQ +dataset_name: BoolQ-msa +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:" +doc_to_target: target +doc_to_choice: ["لا", "نعم"] +should_decontaminate: true +doc_to_decontamination_query: passage +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/aradice/boolq/MSA/metrics.py b/lm_eval/tasks/aradice/boolq/MSA/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/MSA/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/boolq/MSA/utils.py b/lm_eval/tasks/aradice/boolq/MSA/utils.py new file mode 100644 index 0000000000..5148715ddc --- /dev/null +++ b/lm_eval/tasks/aradice/boolq/MSA/utils.py @@ -0,0 +1,22 @@ + +msa_answer_mapping = { + "true": "نعم", + "false": "لا", + True: "نعم", + False: "لا" +} + +def process_docs(dataset): + def remove_question_mark(text): + text = text.strip() + if text.endswith("?") or text.endswith("؟"): + text = text[:-1] + text = text.strip() + + return text + + def _helper(doc): + doc["question"] = remove_question_mark(doc["question"]) + doc["target"] = msa_answer_mapping[doc["answer"]] + return doc + return dataset.map(_helper) \ No newline at end of file diff --git a/lm_eval/tasks/aradice/cultural-benchmark/egypt.yaml b/lm_eval/tasks/aradice/cultural-benchmark/egypt.yaml new file mode 100644 index 0000000000..c2d5da2ecf --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/egypt.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_egypt_cultural +dataset_path: QCRI/AraDiCE-Culture +dataset_name: Egypt +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "سؤال : {{Question}}\nإجابة :" +doc_to_target: 0 +doc_to_choice: choices +should_decontaminate: true +doc_to_decontamination_query: Question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/cultural-benchmark/jordan.yaml b/lm_eval/tasks/aradice/cultural-benchmark/jordan.yaml new file mode 100644 index 0000000000..dc2b3db5e4 --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/jordan.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_jordan_cultural +dataset_path: QCRI/AraDiCE-Culture +dataset_name: Jordan +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "سؤال : {{Question}}\nإجابة :" +doc_to_target: 0 +doc_to_choice: choices +should_decontaminate: true +doc_to_decontamination_query: Question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/cultural-benchmark/lebanon.yaml b/lm_eval/tasks/aradice/cultural-benchmark/lebanon.yaml new file mode 100644 index 0000000000..e2811422fc --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/lebanon.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_lebanon_cultural +dataset_path: QCRI/AraDiCE-Culture +dataset_name: Lebanon +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "سؤال : {{Question}}\nإجابة :" +doc_to_target: 0 +doc_to_choice: choices +should_decontaminate: true +doc_to_decontamination_query: Question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/cultural-benchmark/metrics.py b/lm_eval/tasks/aradice/cultural-benchmark/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/cultural-benchmark/palestine.yaml b/lm_eval/tasks/aradice/cultural-benchmark/palestine.yaml new file mode 100644 index 0000000000..8854c10f5d --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/palestine.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_palestine_cultural +dataset_path: QCRI/AraDiCE-Culture +dataset_name: Palestine +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "سؤال : {{Question}}\nإجابة :" +doc_to_target: 0 +doc_to_choice: choices +should_decontaminate: true +doc_to_decontamination_query: Question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/cultural-benchmark/qatar.yaml b/lm_eval/tasks/aradice/cultural-benchmark/qatar.yaml new file mode 100644 index 0000000000..b9df210076 --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/qatar.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_qatar_cultural +dataset_path: QCRI/AraDiCE-Culture +dataset_name: Qatar +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "سؤال : {{Question}}\nإجابة :" +doc_to_target: 0 +doc_to_choice: choices +should_decontaminate: true +doc_to_decontamination_query: Question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/cultural-benchmark/syria.yaml b/lm_eval/tasks/aradice/cultural-benchmark/syria.yaml new file mode 100644 index 0000000000..faf957c22e --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/syria.yaml @@ -0,0 +1,25 @@ +task: AraDiCE_syria_cultural +dataset_path: QCRI/AraDiCE-Culture +dataset_name: Syria +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +process_docs: !function utils.process_docs +doc_to_text: "سؤال : {{Question}}\nإجابة :" +doc_to_target: 0 +doc_to_choice: choices +should_decontaminate: true +doc_to_decontamination_query: Question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/cultural-benchmark/utils.py b/lm_eval/tasks/aradice/cultural-benchmark/utils.py new file mode 100644 index 0000000000..eb598ada85 --- /dev/null +++ b/lm_eval/tasks/aradice/cultural-benchmark/utils.py @@ -0,0 +1,8 @@ +def process_docs(dataset): + def _helper(doc): + doc["choices"] = [doc['Option A'], doc['Option B'], doc['Option C']] + return doc + + return dataset.map(_helper) + + diff --git a/lm_eval/tasks/aradice/openbookqa/metrics.py b/lm_eval/tasks/aradice/openbookqa/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/openbookqa/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/openbookqa/openbookqa_egy.yaml b/lm_eval/tasks/aradice/openbookqa/openbookqa_egy.yaml new file mode 100644 index 0000000000..0604b3d77e --- /dev/null +++ b/lm_eval/tasks/aradice/openbookqa/openbookqa_egy.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_openbookqa_egy +dataset_path: QCRI/AraDiCE-OpenBookQA +dataset_name: OBQA-egy +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "{{question.stem}}" +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question.stem}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/openbookqa/openbookqa_eng.yaml b/lm_eval/tasks/aradice/openbookqa/openbookqa_eng.yaml new file mode 100644 index 0000000000..4053d8df6b --- /dev/null +++ b/lm_eval/tasks/aradice/openbookqa/openbookqa_eng.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_openbookqa_eng +dataset_path: QCRI/AraDiCE-OpenBookQA +dataset_name: OBQA-eng +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "{{question.stem}}" +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question.stem}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/openbookqa/openbookqa_lev.yaml b/lm_eval/tasks/aradice/openbookqa/openbookqa_lev.yaml new file mode 100644 index 0000000000..a7ee31444f --- /dev/null +++ b/lm_eval/tasks/aradice/openbookqa/openbookqa_lev.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_openbookqa_lev +dataset_path: QCRI/AraDiCE-OpenBookQA +dataset_name: OBQA-lev +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "{{question.stem}}" +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question.stem}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/openbookqa/openbookqa_msa.yaml b/lm_eval/tasks/aradice/openbookqa/openbookqa_msa.yaml new file mode 100644 index 0000000000..37214169cd --- /dev/null +++ b/lm_eval/tasks/aradice/openbookqa/openbookqa_msa.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_openbookqa_msa +dataset_path: QCRI/AraDiCE-OpenBookQA +dataset_name: OBQA-msa +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "{{question.stem}}" +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question.stem}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/openbookqa/utils.py b/lm_eval/tasks/aradice/openbookqa/utils.py new file mode 100644 index 0000000000..c50dc59844 --- /dev/null +++ b/lm_eval/tasks/aradice/openbookqa/utils.py @@ -0,0 +1,14 @@ + + +def doc_to_target(doc): + labels = [c['label'] for c in doc['question']['choices']] + + try: + i = labels.index(doc['answerKey'].lstrip()) + except Exception as e: + print("Failed", e) + return + return i +def doc_to_choice(doc): + texts = [c['text'] for c in doc['question']['choices']] + return texts diff --git a/lm_eval/tasks/aradice/piqa/metrics.py b/lm_eval/tasks/aradice/piqa/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/piqa/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/piqa/piqa_egy.yaml b/lm_eval/tasks/aradice/piqa/piqa_egy.yaml new file mode 100644 index 0000000000..79d682d3d0 --- /dev/null +++ b/lm_eval/tasks/aradice/piqa/piqa_egy.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_piqa_egy +dataset_path: QCRI/AraDiCE-PIQA +dataset_name: PIQA-egy +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "سؤال : {{goal}}\nإجابة :" +doc_to_target: label +doc_to_choice: "{{[sol1, sol2]}}" +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/piqa/piqa_eng.yaml b/lm_eval/tasks/aradice/piqa/piqa_eng.yaml new file mode 100644 index 0000000000..a2967f3d51 --- /dev/null +++ b/lm_eval/tasks/aradice/piqa/piqa_eng.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_piqa_eng +dataset_path: QCRI/AraDiCE-PIQA +dataset_name: PIQA-eng +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "سؤال : {{goal}}\nإجابة :" +doc_to_target: label +doc_to_choice: "{{[sol1, sol2]}}" +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/piqa/piqa_lev.yaml b/lm_eval/tasks/aradice/piqa/piqa_lev.yaml new file mode 100644 index 0000000000..c5853912d1 --- /dev/null +++ b/lm_eval/tasks/aradice/piqa/piqa_lev.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_piqa_lev +dataset_path: QCRI/AraDiCE-PIQA +dataset_name: PIQA-lev +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "سؤال : {{goal}}\nإجابة :" +doc_to_target: label +doc_to_choice: "{{[sol1, sol2]}}" +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/piqa/piqa_msa.yaml b/lm_eval/tasks/aradice/piqa/piqa_msa.yaml new file mode 100644 index 0000000000..3495f3c9a7 --- /dev/null +++ b/lm_eval/tasks/aradice/piqa/piqa_msa.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_piqa_msa +dataset_path: QCRI/AraDiCE-PIQA +dataset_name: PIQA-msa +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: "سؤال : {{goal}}\nإجابة :" +doc_to_target: label +doc_to_choice: "{{[sol1, sol2]}}" +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/truthfulqa_mcq/metrics.py b/lm_eval/tasks/aradice/truthfulqa_mcq/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/truthfulqa_mcq/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_egy.yaml b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_egy.yaml new file mode 100644 index 0000000000..14f18b8f3b --- /dev/null +++ b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_egy.yaml @@ -0,0 +1,39 @@ +task: AraDiCE_truthfulqa_mc1_egy +dataset_path: QCRI/AraDiCE-TruthfulQA +dataset_name: TruthfulQA-egy +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "\ + {% set prompt_qa = '\ + س: إيه هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\ + ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\ + س: مين اللي كان رئيس الولايات المتحدة سنة 1955م؟\n\ + ج: كان دوايت دي آيزنهاور رئيس الولايات المتحدة سنة 1955م\n\n\ + س: وكان بينتمي لحزب إيه؟\n\ + ج: كان بينتمي للحزب الجمهوري\n\n\ + س: إيه هو الجذر التربيعي للموز؟\n\ + ج: معنديش تعلييء\n\n\ + س: هو التلسكوب بيشتغل ازاي؟\n\ + ج: التلسكوب بيستخدم عدسات أو مرايات لتركيز الضوء وجعل الأجسام تبان أقرب\n\n\ + س: هي الألعاب الأولمبية سنة 1992م كانت فين؟ \n\ + ج: الألعاب الأولمبية سنة 1992م كانت في برشلونة، إسبانيا\ + '%}\ + {{prompt_qa + '\n\nس: ' + question + '\nج:'}}" +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_eng.yaml b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_eng.yaml new file mode 100644 index 0000000000..f3a398d37c --- /dev/null +++ b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_eng.yaml @@ -0,0 +1,40 @@ +task: AraDiCE_truthfulqa_mc1_eng +dataset_path: QCRI/AraDiCE-TruthfulQA +dataset_name: TruthfulQA-eng +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +num_fewshot: 0 +doc_to_text: "\ + {% set prompt_qa = '\ + Q: What is human life expectancy in the United States?\n\ + A: Human life expectancy in the United States is 78 years.\n\n\ + Q: Who was president of the United States in 1955?\n\ + A: Dwight D. Eisenhower was president of the United States in 1955.\n\n\ + Q: Which party did he belong to?\n\ + A: He belonged to the Republican Party.\n\n\ + Q: What is the square root of banana?\n\ + A: I have no comment.\n\n\ + Q: How does a telescope work?\n\ + A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n\ + Q: Where were the 1992 Olympics held?\n\ + A: The 1992 Olympics were held in Barcelona, Spain.\ + '%}\ + {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}" +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: !function metrics.micro_f1_score + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_lev.yaml b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_lev.yaml new file mode 100644 index 0000000000..67e44d10de --- /dev/null +++ b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_lev.yaml @@ -0,0 +1,39 @@ +task: AraDiCE_truthfulqa_mc1_lev +dataset_path: QCRI/AraDiCE-TruthfulQA +dataset_name: TruthfulQA-lev +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "\ + {% set prompt_qa = '\ + س: شو هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\ + ج: متوسط عمر الإنسان المتوقع بالولايات المتحدة هو 78 سنة\n\n\ + س: مين كان رئيس الولايات المتحدة بسنة 1955م؟\n\ + ج: دوايت دي آيزنهاور كان رئيس للولايات المتحدة سنة 1955م\n\n\ + س: لأي حزب كان بينتمي؟\n\ + ج: كان بينتمي للحزب الجمهوري\n\n\ + س: شو هوي الجذر التربيعي للموز؟\n\ + ج: لا تعليق\n\n\ + س: كيف بيشتغل التلسكوب؟\n\ + ج: بيستخدم التلسكوب عدسات أو مرايا لتركيز الضو ويخلي الأجسام تبين أقرب\n\n\ + س: وين انعملت الألعاب الأولمبية بسنة 1992م؟\n\ + ج: انعملت الألعاب الأولمبية بسنة 1992م في برشلونة، إسبانيا\ + '%}\ + {{prompt_qa + '\n\nس: ' + question + '\nج:'}}" +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_msa.yaml b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_msa.yaml new file mode 100644 index 0000000000..dd24837d0b --- /dev/null +++ b/lm_eval/tasks/aradice/truthfulqa_mcq/truthfulqa_mc1_msa.yaml @@ -0,0 +1,39 @@ +task: AraDiCE_truthfulqa_mc1_msa +dataset_path: QCRI/AraDiCE-TruthfulQA +dataset_name: TruthfulQA-msa +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "\ + {% set prompt_qa = '\ + س: ما هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\ + ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\ + س: من كان رئيس الولايات المتحدة في عام 1955م؟\n\ + ج: كان دوايت دي آيزنهاور رئيسًا للولايات المتحدة في عام 1955م\n\n\ + س: إلى أي حزب كان ينتمي؟\n\ + ج: كان ينتمي إلى الحزب الجمهوري\n\n\ + س: ما هو الجذر التربيعي للموز؟\n\ + ج: لا تعليق\n\n\ + س: كيف يعمل التلسكوب؟\n\ + ج: يستخدم التلسكوب عدسات أو مرايا لتركيز الضوء وجعل الأجسام تبدو أقرب\n\n\ + س: أين أقيمت الألعاب الأولمبية لعام 1992م؟ \n\ + ج: أقيمت الألعاب الأولمبية لعام 1992م في برشلونة، إسبانيا\ + '%}\ + {{prompt_qa + '\n\nس: ' + question + '\nج:'}}" +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +should_decontaminate: True +doc_to_decontamination_query: question +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/winogrande/metrics.py b/lm_eval/tasks/aradice/winogrande/metrics.py new file mode 100644 index 0000000000..47e49ded46 --- /dev/null +++ b/lm_eval/tasks/aradice/winogrande/metrics.py @@ -0,0 +1,25 @@ +from sklearn.metrics import f1_score + + +def macro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore + + +def micro_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="micro") + return fscore + + +def weighted_f1_score(items): + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/lm_eval/tasks/aradice/winogrande/utils.py b/lm_eval/tasks/aradice/winogrande/utils.py new file mode 100644 index 0000000000..bddeacdf23 --- /dev/null +++ b/lm_eval/tasks/aradice/winogrande/utils.py @@ -0,0 +1,17 @@ +def doc_to_text(doc): + answer_to_num = {"1": 0, "2": 1} + return answer_to_num[doc["answer"]] + + +def doc_to_target(doc): + idx = doc["sentence"].index("_") + 1 + return doc["sentence"][idx:].strip() + + +def doc_to_choice(doc): + idx = doc["sentence"].index("_") + options = [doc["option1"], doc["option2"]] + return [doc["sentence"][:idx] + opt for opt in options] + + + diff --git a/lm_eval/tasks/aradice/winogrande/winogrande_egy.yaml b/lm_eval/tasks/aradice/winogrande/winogrande_egy.yaml new file mode 100644 index 0000000000..2fae9ffbfc --- /dev/null +++ b/lm_eval/tasks/aradice/winogrande/winogrande_egy.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_winogrande_egy +dataset_path: QCRI/AraDiCE-WinoGrande +dataset_name: Winogrande-egy +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/winogrande/winogrande_eng.yaml b/lm_eval/tasks/aradice/winogrande/winogrande_eng.yaml new file mode 100644 index 0000000000..c2509e7984 --- /dev/null +++ b/lm_eval/tasks/aradice/winogrande/winogrande_eng.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_winogrande_eng +dataset_path: QCRI/AraDiCE-WinoGrande +dataset_name: Winogrande-eng +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/winogrande/winogrande_lev.yaml b/lm_eval/tasks/aradice/winogrande/winogrande_lev.yaml new file mode 100644 index 0000000000..2393c07c70 --- /dev/null +++ b/lm_eval/tasks/aradice/winogrande/winogrande_lev.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_winogrande_lev +dataset_path: QCRI/AraDiCE-WinoGrande +dataset_name: Winogrande-lev +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0 diff --git a/lm_eval/tasks/aradice/winogrande/winogrande_msa.yaml b/lm_eval/tasks/aradice/winogrande/winogrande_msa.yaml new file mode 100644 index 0000000000..aba03a2106 --- /dev/null +++ b/lm_eval/tasks/aradice/winogrande/winogrande_msa.yaml @@ -0,0 +1,24 @@ +task: AraDiCE_winogrande_msa +dataset_path: QCRI/AraDiCE-WinoGrande +dataset_name: Winogrande-msa +training_split: null +validation_split: null +test_split: test +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true + - metric: f1 + higher_is_better: true + aggregation: !function metrics.micro_f1_score +metadata: + version: 1.0