henrycastillo committed on
Commit 9413fee · verified · 1 Parent(s): 47d993b

more lm eval harness

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. lm-evaluation-harness/docs/API_guide.md +211 -0
  3. lm-evaluation-harness/docs/CONTRIBUTING.md +83 -0
  4. lm-evaluation-harness/docs/README.md +11 -0
  5. lm-evaluation-harness/docs/chat-template-readme.md +31 -0
  6. lm-evaluation-harness/docs/decontamination.md +76 -0
  7. lm-evaluation-harness/docs/footguns.md +58 -0
  8. lm-evaluation-harness/docs/img/fewshot_example_gpt3.png +3 -0
  9. lm-evaluation-harness/docs/interface.md +170 -0
  10. lm-evaluation-harness/docs/model_guide.md +192 -0
  11. lm-evaluation-harness/docs/new_task_guide.md +521 -0
  12. lm-evaluation-harness/docs/task_guide.md +335 -0
  13. lm-evaluation-harness/examples/lm-eval-overview.ipynb +1240 -0
  14. lm-evaluation-harness/examples/transformer-lens.py +59 -0
  15. lm-evaluation-harness/examples/visualize-wandb.ipynb +172 -0
  16. lm-evaluation-harness/examples/visualize-zeno.ipynb +115 -0
  17. lm-evaluation-harness/lm_eval/__init__.py +21 -0
  18. lm-evaluation-harness/lm_eval/__main__.py +536 -0
  19. lm-evaluation-harness/lm_eval/api/__init__.py +0 -0
  20. lm-evaluation-harness/lm_eval/api/filter.py +56 -0
  21. lm-evaluation-harness/lm_eval/api/group.py +115 -0
  22. lm-evaluation-harness/lm_eval/api/instance.py +38 -0
  23. lm-evaluation-harness/lm_eval/api/metrics.py +629 -0
  24. lm-evaluation-harness/lm_eval/api/model.py +502 -0
  25. lm-evaluation-harness/lm_eval/api/registry.py +196 -0
  26. lm-evaluation-harness/lm_eval/api/samplers.py +232 -0
  27. lm-evaluation-harness/lm_eval/api/task.py +1885 -0
  28. lm-evaluation-harness/lm_eval/caching/__init__.py +0 -0
  29. lm-evaluation-harness/lm_eval/caching/cache.py +59 -0
  30. lm-evaluation-harness/lm_eval/decontamination/__init__.py +0 -0
  31. lm-evaluation-harness/lm_eval/decontamination/archiver.py +174 -0
  32. lm-evaluation-harness/lm_eval/decontamination/decontaminate.py +166 -0
  33. lm-evaluation-harness/lm_eval/decontamination/janitor.py +329 -0
  34. lm-evaluation-harness/lm_eval/evaluator.py +787 -0
  35. lm-evaluation-harness/lm_eval/evaluator_utils.py +554 -0
  36. lm-evaluation-harness/lm_eval/filters/__init__.py +25 -0
  37. lm-evaluation-harness/lm_eval/filters/custom.py +17 -0
  38. lm-evaluation-harness/lm_eval/filters/decontamination.py +25 -0
  39. lm-evaluation-harness/lm_eval/filters/extraction.py +233 -0
  40. lm-evaluation-harness/lm_eval/filters/selection.py +61 -0
  41. lm-evaluation-harness/lm_eval/filters/transformation.py +122 -0
  42. lm-evaluation-harness/lm_eval/loggers/__init__.py +2 -0
  43. lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py +537 -0
  44. lm-evaluation-harness/lm_eval/loggers/utils.py +149 -0
  45. lm-evaluation-harness/lm_eval/loggers/wandb_logger.py +358 -0
  46. lm-evaluation-harness/lm_eval/models/__init__.py +35 -0
  47. lm-evaluation-harness/lm_eval/models/anthropic_llms.py +382 -0
  48. lm-evaluation-harness/lm_eval/models/api_models.py +810 -0
  49. lm-evaluation-harness/lm_eval/models/dummy.py +41 -0
  50. lm-evaluation-harness/lm_eval/models/gguf.py +132 -0
.gitattributes CHANGED
@@ -45,3 +45,5 @@ records/012625_BatchSize/ablations.png filter=lfs diff=lfs merge=lfs -text
45
  records/102924_Optimizers/nanogpt_speedrun81w.png filter=lfs diff=lfs merge=lfs -text
46
  records/102924_Optimizers/nanogpt_speedrun82w.png filter=lfs diff=lfs merge=lfs -text
47
  records/110624_ShortcutsTweaks/nanogpt_speedrun111.png filter=lfs diff=lfs merge=lfs -text
48
+ lm-evaluation-harness/docs/img/fewshot_example_gpt3.png filter=lfs diff=lfs merge=lfs -text
49
+ lm-evaluation-harness/lm_eval/tasks/noreval/noreval.jpg filter=lfs diff=lfs merge=lfs -text
lm-evaluation-harness/docs/API_guide.md ADDED
@@ -0,0 +1,211 @@
1
+ # TemplateAPI Usage Guide
2
+
3
+ The `TemplateAPI` class is a versatile superclass designed to facilitate the integration of various API-based language models into the lm-evaluation-harness framework. This guide explains how to use and extend the `TemplateAPI` class to implement your own API models. If your API is compatible with the OpenAI API, you can use the `local-completions` or the `local-chat-completions` model types (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)), which can also serve as examples of how to effectively subclass this template.
4
+
5
+ ## Overview
6
+
7
+ The `TemplateAPI` class provides a template for creating API-based model implementations. It handles common functionalities such as:
8
+
9
+ - Tokenization (optional)
10
+ - Batch processing
11
+ - Caching
12
+ - Retrying failed requests
13
+ - Parsing API responses
14
+
15
+ To use this class, you typically need to subclass it and implement specific methods for your API.
16
+
17
+ ## Key Methods to Implement
18
+
19
+ When subclassing `TemplateAPI`, you need to implement the following methods:
20
+
21
+ 1. `_create_payload`: Creates the JSON payload for API requests.
22
+ 2. `parse_logprobs`: Parses log probabilities from API responses.
23
+ 3. `parse_generations`: Parses generated text from API responses.
24
+
25
+ Optional Properties:
26
+
27
+ 4. `header`: Returns the headers for the API request.
28
+ 5. `api_key`: Returns the API key for authentication (if required).
29
+
30
+ You may also need to override other methods or properties depending on your API's specific requirements.
31
+
32
+ > [!NOTE]
33
+ > Currently, loglikelihood and MCQ-based tasks (such as MMLU) are only supported for completion endpoints, not for chat-completion endpoints (those that expect a list of dicts). Completion APIs which support instruct-tuned models can be evaluated with the `--apply_chat_template` option in order to evaluate models using a chat template format while still being able to access the model logits needed for loglikelihood-based tasks.
34
+
35
+ ## TemplateAPI Arguments
36
+
37
+ When initializing a `TemplateAPI` instance or a subclass, you can provide several arguments to customize its behavior. Here's a detailed explanation of some important arguments:
38
+
39
+ - `model` or `pretrained` (str):
40
+ - The name or identifier of the model to use.
41
+ - `model` takes precedence over `pretrained` when both are provided.
42
+
43
+ - `base_url` (str):
44
+ - The base URL for the API endpoint.
45
+
46
+ - `tokenizer` (str, optional):
47
+ - The name or path of the tokenizer to use.
48
+ - If not provided, it defaults to using the same tokenizer name as the model.
49
+
50
+ - `num_concurrent` (int):
51
+ - Number of concurrent requests to make to the API.
52
+ - Useful for APIs that support parallel processing.
53
+ - Default is 1 (sequential processing).
54
+
55
+ - `timeout` (int, optional):
56
+ - Timeout for API requests in seconds.
57
+ - Default is 30.
58
+
59
+ - `tokenized_requests` (bool):
60
+ - Determines whether the input is pre-tokenized. Defaults to `True`.
61
+ - Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1).
62
+ - For loglikelihood-based tasks, prompts require tokenization to calculate the context length. If `False`, prompts are decoded back to text before being sent to the API.
63
+ - Not as important for `generate_until` tasks.
64
+ - Ignored for chat formatted inputs (list[dict...]) or if tokenizer_backend is None.
65
+
66
+ - `tokenizer_backend` (str, optional):
67
+ - Required for loglikelihood-based or MCQ tasks.
68
+ - Specifies the tokenizer library to use. Options are "tiktoken", "huggingface", or None.
69
+ - Default is "huggingface".
70
+
71
+ - `max_length` (int, optional):
72
+ - Maximum length of input + output.
73
+ - Default is 2048.
74
+
75
+ - `max_retries` (int, optional):
76
+ - Maximum number of retries for failed API requests.
77
+ - Default is 3.
78
+
79
+ - `max_gen_toks` (int, optional):
80
+ - Maximum number of tokens to generate in completion tasks.
81
+ - Default is 256 or set in task yaml.
82
+
83
+ - `batch_size` (int or str, optional):
84
+ - Number of requests to batch together (if the API supports batching).
85
+ - Can be an integer or "auto" (which defaults to 1 for API models).
86
+ - Default is 1.
87
+
88
+ - `seed` (int, optional):
89
+ - Random seed for reproducibility.
90
+ - Default is 1234.
91
+
92
+ - `add_bos_token` (bool, optional):
93
+ - Whether to add the beginning-of-sequence token to inputs (when tokenizing).
94
+ - Default is False.
95
+
96
+ - `custom_prefix_token_id` (int, optional):
97
+ - Custom token ID to use as a prefix for inputs.
98
+ - If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True).
99
+
100
+ - `verify_certificate` (bool, optional):
101
+ - Whether to validate the certificate of the API endpoint (if HTTPS).
102
+ - Default is True.
103
+
104
+ - `header` (dict, optional):
105
+ - Custom headers for API requests.
106
+ - If not provided, uses `{"Authorization": f"Bearer {self.api_key}"}` by default.
107
+
108
+ Example usage:
109
+
110
+ ```python
111
+ class MyAPIModel(TemplateAPI):
112
+ def __init__(self, **kwargs):
113
+ super().__init__(
114
+ model="my-model",
115
+ base_url="https://api.mymodel.com/v1/completions",
116
+ tokenizer_backend="huggingface",
117
+ num_concurrent=5,
118
+ max_retries=5,
119
+ batch_size=10,
120
+ **kwargs
121
+ )
122
+
123
+ # Implement other required methods...
124
+ ```
125
+
126
+ When subclassing `TemplateAPI`, you can override these arguments in your `__init__` method to set default values specific to your API. You can also add additional (potentially user-specified) arguments as needed for your specific implementation.
127
+
128
+ ## Example Implementation: OpenAI API
129
+
130
+ The `OpenAICompletionsAPI` and `OpenAIChatCompletion` classes (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) demonstrate how to implement API models using the `TemplateAPI` class. Here's a breakdown of the key components:
131
+
132
+ ### 1. Subclassing and Initialization
133
+
134
+ ```python
135
+ @register_model("openai-completions")
136
+ class OpenAICompletionsAPI(LocalCompletionsAPI):
137
+ def __init__(
138
+ self,
139
+ base_url="https://api.openai.com/v1/completions",
140
+ tokenizer_backend="tiktoken",
141
+ **kwargs,
142
+ ):
143
+ super().__init__(
144
+ base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
145
+ )
146
+ ```
147
+
148
+ ### 2. Implementing API Key Retrieval
149
+
150
+ ```python
151
+ @cached_property
152
+ def api_key(self):
153
+ key = os.environ.get("OPENAI_API_KEY", None)
154
+ if key is None:
155
+ raise ValueError(
156
+ "API key not found. Please set the OPENAI_API_KEY environment variable."
157
+ )
158
+ return key
159
+ ```
160
+
161
+ ### 3. Creating the Payload
162
+
163
+ ```python
164
+ def _create_payload(
165
+ self,
166
+ messages: Union[List[List[int]], List[dict], List[str], str],
167
+ generate=False,
168
+ gen_kwargs: Optional[dict] = None,
169
+ **kwargs,
170
+ ) -> dict:
171
+ if generate:
172
+ # ... (implementation for generation)
173
+ else:
174
+ # ... (implementation for log likelihood)
175
+ ```
176
+
177
+ ### 4. Parsing API Responses
178
+
179
+ ```python
180
+ @staticmethod
181
+ def parse_logprobs(
182
+ outputs: Union[Dict, List[Dict]],
183
+ tokens: List[List[int]] = None,
184
+ ctxlens: List[int] = None,
185
+ **kwargs,
186
+ ) -> List[Tuple[float, bool]]:
187
+ # ... (implementation)
188
+
189
+ @staticmethod
190
+ def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
191
+ # ... (implementation)
192
+ ```
193
+
194
+ The requests are initiated in the `model_call` or the `amodel_call` methods.
195
+
196
+ ## Implementing Your Own API Model
197
+
198
+ To implement your own API model:
199
+
200
+ 1. Subclass `TemplateAPI` or one of its subclasses (e.g., `LocalCompletionsAPI`).
201
+ 2. Override the `__init__` method if you need to set specific parameters.
202
+ 3. Implement the `_create_payload` and `header` methods to create the appropriate payload for your API.
203
+ 4. Implement the `parse_logprobs` and `parse_generations` methods to parse your API's responses.
204
+ 5. Override the `api_key` property if your API requires authentication.
205
+ 6. Override any other methods as necessary to match your API's behavior.
206
+
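+ Putting these steps together, a minimal subclass might look like the sketch below. This is illustrative only: the endpoint URL, the environment variable name, and the payload/response field names (`"prompt"`, `"choices"`, `"text"`, `"logprobs"`) are assumptions modeled loosely on completion-style APIs, and should be adapted to whatever your API actually expects and returns (you may also need to include the model name in the payload).
+
+ ```python
+ import os
+ from typing import Dict, List, Optional, Tuple, Union
+
+ from lm_eval.api.registry import register_model
+ from lm_eval.models.api_models import TemplateAPI
+
+
+ @register_model("my-api")
+ class MyAPIModel(TemplateAPI):
+     def __init__(self, base_url="https://api.example.com/v1/completions", **kwargs):
+         super().__init__(base_url=base_url, tokenizer_backend="huggingface", **kwargs)
+
+     @property
+     def api_key(self):
+         # read the key from the environment rather than hard-coding it
+         return os.environ.get("MY_API_KEY", "")
+
+     def _create_payload(
+         self, messages, generate=False, gen_kwargs: Optional[dict] = None, **kwargs
+     ) -> dict:
+         if generate:
+             gen_kwargs = dict(gen_kwargs or {})
+             max_tokens = gen_kwargs.pop("max_gen_toks", 256)
+             return {"prompt": messages, "max_tokens": max_tokens, **gen_kwargs}
+         # loglikelihood requests: ask the API to echo the prompt and return logprobs
+         return {"prompt": messages, "max_tokens": 0, "echo": True, "logprobs": 1}
+
+     @staticmethod
+     def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
+         if isinstance(outputs, dict):
+             outputs = [outputs]
+         return [choice["text"] for out in outputs for choice in out["choices"]]
+
+     @staticmethod
+     def parse_logprobs(
+         outputs: Union[Dict, List[Dict]],
+         tokens: List[List[int]] = None,
+         ctxlens: List[int] = None,
+         **kwargs,
+     ) -> List[Tuple[float, bool]]:
+         # sum token logprobs over the continuation (tokens after the context length);
+         # a fuller implementation would also check whether the continuation matched the greedy decode
+         if isinstance(outputs, dict):
+             outputs = [outputs]
+         results = []
+         for out in outputs:
+             for choice, ctxlen in zip(out["choices"], ctxlens):
+                 continuation_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:]
+                 results.append((sum(continuation_logprobs), False))
+         return results
+ ```
+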
207
+ ## Best Practices
208
+
209
+ 1. Use the `@register_model` decorator to register your model with the framework (and import it in `lm_eval/models/__init__.py`!).
210
+ 2. Use environment variables for sensitive information like API keys.
211
+ 3. Properly handle batching and concurrent requests if supported by your API.
lm-evaluation-harness/docs/CONTRIBUTING.md ADDED
@@ -0,0 +1,83 @@
1
+ # Contributing to LM Evaluation Harness
2
+
3
+ Welcome, and thank you for your interest in the LM Evaluation Harness! We appreciate contributions and feedback, value the time you spend with our library, and hope you find it useful!
4
+
5
+ ## Important Resources
6
+
7
+ Information about the LM Evaluation Harness can be found in several places:
8
+
9
+ - Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
10
+ - We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
11
+ - We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
12
+ - Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai).
13
+
14
+ ## Code Style
15
+
16
+ LM Evaluation Harness uses [ruff](https://github.com/astral-sh/ruff) for linting via [pre-commit](https://pre-commit.com/).
17
+
18
+ You can install linters and dev tools via
19
+
20
+ ```pip install lm_eval[dev]``` or ```pip install -e ".[dev]"```
21
+
22
+ Then, run
23
+
24
+ ```pre-commit install```
25
+
26
+ in order to ensure linters and other checks will be run upon committing.
27
+
28
+ ## Testing
29
+
30
+ We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:
31
+
32
+ ```bash
33
+ python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py
34
+ ```
35
+
36
+ ## Contributor License Agreement
37
+
38
+ We ask that new contributors agree to a Contributor License Agreement affirming that EleutherAI has the rights to use your contribution to our library.
39
+ First-time pull requests will have a reply added by @CLAassistant containing instructions for how to confirm this, and we require it before merging your PR.
40
+
41
+ ## Contribution Best Practices
42
+
43
+ We recommend a few best practices to make your contributions or reported errors easier to assist with.
44
+
45
+ **For Pull Requests:**
46
+
47
+ - PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution.
48
+ - New features should have appropriate documentation added alongside them.
49
+ - Aim for code maintainability, and minimize code copying.
50
+ - If contributing a new task, try to share test results on the task using a publicly-available model, and compare to any public results that are available for the task.
51
+
52
+ **For Feature Requests:**
53
+
54
+ - Provide a short paragraph's worth of description. What is the feature you are requesting? What is its motivation, and an example use case of it? How does this differ from what is currently supported?
55
+
56
+ **For Bug Reports**:
57
+
58
+ - Provide a short description of the bug.
59
+ - Provide a *reproducible example*--what is the command you run with our library that results in this error? Have you tried any other steps to resolve it?
60
+ - Provide a *full error traceback* of the error that occurs, if applicable. A one-line error message or small screenshot snippet is unhelpful without the surrounding context.
61
+ - Note what version of the codebase you are using, and any specifics of your environment and setup that may be relevant.
62
+
63
+ **For Requesting New Tasks**:
64
+
65
+ - Provide a 1-2 sentence description of what the task is and what it evaluates.
66
+ - Provide a link to the paper introducing the task.
67
+ - Provide a link to where the dataset can be found.
68
+ - Provide a link to a paper containing results on an open-source model on the task, for use in comparisons and implementation validation.
69
+ - If applicable, link to any codebase that has implemented the task (especially the original publication's codebase, if existent).
70
+
71
+ ## How Can I Get Involved?
72
+
73
+ To quickly get started, we maintain a list of good first issues, which can be found [on our project board](https://github.com/orgs/EleutherAI/projects/25/views/8) or by [filtering GH Issues](https://github.com/EleutherAI/lm-evaluation-harness/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3A%22help+wanted%22). These are typically smaller code changes or self-contained features which can be added without extensive familiarity with library internals, and we recommend new contributors consider taking a stab at one of these first if they are feeling uncertain where to begin.
74
+
75
+ There are a number of distinct ways to contribute to LM Evaluation Harness, and all are extremely helpful! A sampling of ways to contribute include:
76
+
77
+ - **Implementing and verifying new evaluation tasks**: Is there a task you'd like to see LM Evaluation Harness support? Consider opening an issue requesting it, or helping add it! Verifying and cross-checking task implementations with their original versions is also a very valuable form of assistance in ensuring standardized evaluation.
78
+ - **Improving documentation** - Improvements to the documentation, or noting pain points / gaps in documentation, are helpful in order for us to improve the user experience of the library and clarity + coverage of documentation.
79
+ - **Testing and devops** - We are very grateful for any assistance in adding tests for the library that can be run for new PRs, and other devops workflows.
80
+ - **Adding new modeling / inference library integrations** - We hope to support a broad range of commonly-used inference libraries popular among the community, and welcome PRs for new integrations, so long as they are documented properly and maintainable.
81
+ - **Proposing or Contributing New Features** - We want LM Evaluation Harness to support a broad range of evaluation usecases. If you have a feature that is not currently supported but desired, feel free to open an issue describing the feature and, if applicable, how you intend to implement it. We would be happy to give feedback on the cleanest way to implement new functionalities and are happy to coordinate with interested contributors via GH discussions or via discord.
82
+
83
+ We hope that this has been helpful, and appreciate your interest in contributing! Further questions can be directed to [our Discord](https://discord.gg/eleutherai).
lm-evaluation-harness/docs/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # Eval Harness Documentation
2
+
3
+ Welcome to the docs for the LM Evaluation Harness!
4
+
5
+ ## Table of Contents
6
+
7
+ * To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md).
8
+ * To learn how to add a new inference library, API, or model type, as well as a quick explainer on the ways in which one can evaluate an LM, see the [Model Guide](./model_guide.md).
9
+ * For an extended description of how to extend the library to new model classes served over an API, see the [API Guide](./API_guide.md).
10
+ * For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
11
+ * To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
lm-evaluation-harness/docs/chat-template-readme.md ADDED
@@ -0,0 +1,31 @@
1
+ # Chat Template Delimiter Handling Update
2
+
3
+ ## Overview
4
+
5
+ This change modifies how delimiters are handled when applying chat templates in the request construction process for likelihood and multiple-choice based tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of using the configured delimiter.
6
+
7
+ ## Background
8
+
9
+ By default, the system uses a target delimiter (typically a whitespace " ") between the context and target text when constructing prompts. The full string is constructed as:
10
+
11
+ ```text
12
+ doc_to_text(doc) + target_delimiter + doc_to_target(doc)
13
+ ```
14
+
15
+ While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.
16
+
17
+ ## The Change
18
+
19
+ - When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace
20
+ - This prevents interference between chat template formatting and the default delimiter system
21
+ - Particularly important for multiple choice tasks where the template itself handles spacing
22
+
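+ In pseudocode, the selection logic amounts to the following sketch (a simplification, not the harness's actual internals):
+
+ ```python
+ def build_prompt(doc, doc_to_text, doc_to_target, apply_chat_template=False, target_delimiter=" "):
+     # when a chat template is applied, the template itself is responsible for spacing,
+     # so the configured delimiter is replaced with an empty string
+     delimiter = "" if apply_chat_template else target_delimiter
+     return doc_to_text(doc) + delimiter + doc_to_target(doc)
+ ```
+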
23
+ ## Example
24
+
25
+ ```text
26
+ # Before (with default delimiter " ")
27
+ <user>Question: What color is the sky?\nAnswer:<assistant> blue
28
+
29
+ # After
30
+ <user>Question: What color is the sky?\nAnswer:<assistant>blue
31
+ ```
lm-evaluation-harness/docs/decontamination.md ADDED
@@ -0,0 +1,76 @@
1
+ # Decontamination
2
+
3
+ ## Usage
4
+
5
+ The provided directory should contain
6
+ the ngram files and info.json produced in "Pile Ngram Generation" further down.
7
+
8
+ ```bash
9
+ python -m lm_eval \
10
+ --model gpt2 \
11
+ --device 0 \
12
+ --tasks sciq
13
+ ```
14
+
15
+ ## Background
16
+
17
+ Downstream evaluations test model generalization, and are less useful when test set data also exists in the training set, referred to as leakage or contamination.
18
+
19
+ Filtering your training set against the test set is a good first step, however this isn't always possible, as in the case of a new benchmark or one that wasn't considered prior to model training. When training set filtering isn't possible, it is useful to measure the impact of test set leakage by detecting the contaminated test examples and producing a clean version of the benchmark.
20
+
21
+ The basis for our decontamination procedure can be found in Appendix C of "Language Models are Few-Shot Learners". OpenAI defined a test document as contaminated if any N-gram overlap existed with any training document. They used a range of N values between 8 and 13 depending on dataset, while we just used 13 for simplicity.
22
+
23
+ ## Implementation
24
+
25
+ Contamination detection can be found in `lm_eval/decontaminate.py` with supporting code in `lm_eval/decontamination/`.
26
+
27
+ decontaminate.py does the following:
28
+
29
+ 1. Build dictionaries of all ngrams and their corresponding evaluation/document ids.
30
+ 2. Scan through sorted files containing training set n-grams.
31
+ 3. If a match is found, the corresponding evaluation/document combinations are marked as contaminated.
32
+
33
+ `lm_eval/evaluator.py` can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix.
34
+
35
+ This is disabled by default for new tasks, to support decontamination on a task override the "should_decontaminate" and "doc_to_decontamination_query" methods. For more details see the [task guide](task_guide.md).
36
+
37
+ ## Pile Ngram Generation
38
+
39
+ The relevant scripts can be found in `scripts/clean_training_data`, which also import from
40
+ `lm_eval/decontamination/`
41
+
42
+ 1. git clone https://github.com/EleutherAI/lm-evaluation-harness.git
43
+ 2. pip install -r requirements.txt
44
+ 3. Download The Pile from [The Eye](https://the-eye.eu/public/AI/pile/train/)
45
+ 4. Place pile files in "pile" directory under "lm-evaluation-harness" (or create a symlink)
46
+ 5. Run generate_13_grams.
47
+
48
+ ```bash
49
+ export PYTHONHASHSEED=0
50
+ python -m scripts/clean_training_data/generate_13_grams \
51
+ -dir path/to/working/directory \
52
+ -n 13 \
53
+ -buckets 500
54
+ ```
55
+
56
+ Took approximately 4 days for us. We had the time to wait, but this could be scaled out by doing partial pile scans on multiple instances of this script and merging the relevant buckets. We fixed PYTHONHASHSEED to ensure reproducibility of bucket hashing in case you need to stop and start.
57
+
58
+ 6. Sort the generated 13-grams.
59
+
60
+ ```bash
61
+ python -m scripts/clean_training_data/sort_13_gram_buckets \
62
+ -dir path/to/working/directory/output
63
+ ```
64
+
65
+ Took approximately 5 days for us. You could speed this up by spreading the files around to different machines and running the sort script before gathering them together.
66
+
67
+ 7. Compress the sorted 13 grams files and place them together with info.json.
68
+
69
+ This step only takes a few hours.
70
+
71
+ ```bash
72
+ python -m scripts/clean_training_data/compress_and_package \
73
+ -dir path/to/working/directory \
74
+ -output path/to/final/directory \
75
+ -procs 8
76
+ ```
lm-evaluation-harness/docs/footguns.md ADDED
1
+ # Common Pitfalls and Troubleshooting Guide
2
+
3
+ This document highlights common pitfalls and troubleshooting tips when using this library. We'll continue to add more tips as we discover them.
4
+
5
+ ## YAML Configuration Issues
6
+
7
+ ### Newline Characters in YAML (`\n`)
8
+
9
+ **Problem:** When specifying newline characters in YAML, they may be interpreted incorrectly depending on how you format them.
10
+
11
+ ```yaml
12
+ # ❌ WRONG: Single quotes don't process escape sequences
13
+ generation_kwargs:
14
+ until: ['\n'] # Gets parsed as the literal characters '\' and 'n' i.e "\\n"
15
+
16
+ ```
17
+ ```yaml
18
+ # ✅ RIGHT: Use double quotes for escape sequences
19
+ generation_kwargs:
20
+ until: ["\n"] # Gets parsed as an actual newline character
21
+
22
+ ```
23
+
24
+ **Solutions:**
25
+ - Use double quotes for strings containing escape sequences
26
+ - For multiline content, use YAML's block scalars (`|` or `>`)
27
+ - When generating YAML programmatically, be careful with how template engines handle escape sequences
28
+
29
+ ### Quoting in YAML
30
+
31
+ **When to use different types of quotes:**
32
+
33
+ - **No quotes**: Simple values (numbers, booleans, alphanumeric strings without special characters)
34
+ ```yaml
35
+ simple_value: plain text
36
+ number: 42
37
+
38
+ ```
39
+
40
+ - **Single quotes (')**:
41
+ - Preserves literal values
42
+ - Use when you need special characters to be treated literally
43
+ - Escape single quotes by doubling them: `'It''s working'`
44
+ ```yaml
45
+ literal_string: 'The newline character \n is not processed here'
46
+ path: 'C:\Users\name' # Backslashes preserved
47
+
48
+ ```
49
+
50
+ - **Double quotes (")**:
51
+ - Processes escape sequences like `\n`, `\t`, etc.
52
+ - Use for strings that need special characters interpreted
53
+ - Escape double quotes with backslash: `"He said \"Hello\""`
54
+ ```yaml
55
+ processed_string: "First line\nSecond line" # Creates actual newline
56
+ unicode: "Copyright symbol: \u00A9" # Unicode character
57
+
58
+ ```
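+
+ ### Checking how a value will be parsed
+
+ If you are unsure how a YAML scalar will be interpreted, it can help to round-trip it through a YAML loader before committing the config. A minimal sketch using PyYAML (assumed to be available in your environment):
+
+ ```python
+ import yaml
+
+ single = yaml.safe_load("until: ['\\n']")   # YAML source: until: ['\n']
+ double = yaml.safe_load('until: ["\\n"]')   # YAML source: until: ["\n"]
+
+ print(single["until"])  # ['\\n'] -- a literal backslash followed by 'n'
+ print(double["until"])  # ['\n']  -- an actual newline character
+ ```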
lm-evaluation-harness/docs/img/fewshot_example_gpt3.png ADDED

Git LFS Details

  • SHA256: 6af5dc2196248b29260ba443e882725dd6cfc51ef17ad5a4dbab4f8ce6850c75
  • Pointer size: 131 Bytes
  • Size of remote file: 316 kB
lm-evaluation-harness/docs/interface.md ADDED
@@ -0,0 +1,170 @@
1
+ # User Guide
2
+
3
+ This document details the interface exposed by `lm-eval` and provides details on what flags are available to users.
4
+
5
+ ## Command-line Interface
6
+
7
+ A majority of users run the library by cloning it from Github, installing the package as editable, and running the `python -m lm_eval` script.
8
+
9
+ Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line.
10
+
11
+ This mode supports a number of command-line arguments, the details of which can also be seen via running with `-h` or `--help`:
12
+
13
+ - `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs.
14
+
15
+ - `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example, `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66).
16
+
17
+ - `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. A list of supported tasks can be viewed with `--tasks list`.
18
+
19
+ - `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
20
+
21
+ - `--gen_kwargs` : Takes an argument string in the same format as `--model_args` and creates a dictionary of keyword arguments. These will be passed to the model for all `generate_until` (free-form or greedy generation) tasks, to set options such as the sampling temperature or `top_p` / `top_k`. For a list of which args are supported for each model type, reference the respective library's documentation (for example, the documentation for `transformers.AutoModelForCausalLM.generate()`). These kwargs will be applied to all `generate_until` tasks called--we do not currently support unique gen_kwargs or batch_size values per task in a single run of the library. To control these on a per-task level, set them in that task's YAML file.
22
+
23
+ - `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
24
+
25
+ - `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed.
26
+
27
+ - `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type.
28
+
29
+ - `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
30
+
31
+ - `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
32
+
33
+ - `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
34
+
35
+ - `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that there is no need to re-run results in order to re-score or re-run a given (model, task) pair again.
36
+
37
+ - `--cache_requests` : Can be "true", "refresh", or "delete". "true" means that the cache should be used. "refresh" means that you wish to regenerate the cache, which you should run if you change your dataset configuration for a given task. "delete" will delete the cache. Cached files are stored under lm_eval/cache/.cache unless you specify a different path via the environment variable: `LM_HARNESS_CACHE_PATH`. e.g. `LM_HARNESS_CACHE_PATH=~/Documents/cache_for_lm_harness`.
38
+
39
+ - `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity.
40
+
41
+ - `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, the prompt and gold target string for the first document of each task are printed.
42
+
43
+ - `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings in the task YAML file) for each task which was run, at the completion of an evaluation. Useful when one is modifying a task's configuration YAML locally, to transmit the exact configuration used for debugging or for reproducibility purposes.
44
+
45
+ - `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
46
+
47
+ - `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
48
+
49
+ - `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways:
50
+ - `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied.
51
+ - `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt.
52
+
53
+ For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer.
54
+
55
+ - `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.
56
+
57
+ - `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
58
+
59
+ - `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.
60
+
61
+ - `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```. Also allows for the passing of the step to log things at (passed to `wandb.run.log`), e.g., `--wandb_args step=123`.
62
+
63
+ - `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
64
+ - `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
65
+ - `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`,
66
+ - `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`,
67
+ - `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`,
68
+ - `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
69
+ - `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
70
+ - `public_repo` - whether the repository is public, can be `True` or `False`,
71
+ - `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
72
+ - `point_of_contact` - Point of contact for the results dataset, e.g., `[email protected]`.
73
+ - `gated` - whether to gate the details dataset, can be `True` or `False`.
74
+
75
+ - `--metadata`: JSON string to pass to TaskConfig. Used for some tasks which require additional metadata to be passed for processing. E.g., `--metadata '{"key": "value"}'`.
76
+
77
+ ## External Library Usage
78
+
79
+ We also support using the library's external API for use within model training loops or other scripts.
80
+
81
+ `lm_eval` supplies two functions for external import and use: `lm_eval.evaluate()` and `lm_eval.simple_evaluate()`.
82
+
83
+ `simple_evaluate()` can be used by simply creating an `lm_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs/model_guide.md), and wrapping your custom model in that class as follows:
84
+
85
+ ```python
86
+ import lm_eval
87
+ from lm_eval.utils import setup_logging
88
+ ...
89
+ # initialize logging
90
+ setup_logging("DEBUG") # optional, but recommended; or you can set up logging yourself
91
+ my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
92
+ ...
93
+ # instantiate an LM subclass that takes your initialized model and can run
94
+ # - `Your_LM.loglikelihood()`
95
+ # - `Your_LM.loglikelihood_rolling()`
96
+ # - `Your_LM.generate_until()`
97
+ lm_obj = Your_LM(model=my_model, batch_size=16)
98
+
99
+ # indexes all tasks from the `lm_eval/tasks` subdirectory.
100
+ # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
101
+ # to include a set of tasks in a separate directory.
102
+ task_manager = lm_eval.tasks.TaskManager()
103
+
104
+ # Setting `task_manager` to the one above is optional and should generally be done
105
+ # if you want to include tasks from paths other than ones in `lm_eval/tasks`.
106
+ # `simple_evaluate` will instantiate its own task_manager if it is set to None here.
107
+ results = lm_eval.simple_evaluate( # call simple_evaluate
108
+ model=lm_obj,
109
+ tasks=["taskname1", "taskname2"],
110
+ num_fewshot=0,
111
+ task_manager=task_manager,
112
+ ...
113
+ )
114
+ ```
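+
+ The returned `results` object is a nested dictionary of scores and run metadata. The exact layout can vary between versions, but as a rough sketch, per-task metrics can typically be inspected like this:
+
+ ```python
+ # results may be None on non-primary ranks in distributed runs
+ if results is not None:
+     for task_name, metrics in results["results"].items():
+         print(task_name, metrics)
+ ```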
115
+
116
+ See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
117
+
118
+ Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
119
+
120
+ As a brief example usage of `evaluate()`:
121
+
122
+ ```python
123
+ import lm_eval
124
+
125
+ # suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase
126
+ from my_tasks import MyTask1
127
+ ...
128
+
129
+ # create your model (could be running finetuning with some custom modeling code)
130
+ my_model = initialize_my_model()
131
+ ...
132
+
133
+ # instantiate an LM subclass that takes your initialized model and can run
134
+ # - `Your_LM.loglikelihood()`
135
+ # - `Your_LM.loglikelihood_rolling()`
136
+ # - `Your_LM.generate_until()`
137
+ lm_obj = Your_LM(model=my_model, batch_size=16)
138
+
139
+ # optional: the task_manager indexes tasks including ones
140
+ # specified by the user through `include_path`.
141
+ task_manager = lm_eval.tasks.TaskManager(
142
+ include_path="/path/to/custom/yaml"
143
+ )
144
+
145
+ # To get a task dict for `evaluate`
146
+ task_dict = lm_eval.tasks.get_task_dict(
147
+ [
148
+ "mmlu", # A stock task
149
+ "my_custom_task", # A custom task
150
+ {
151
+ "task": ..., # A dict that configures a task
152
+ "doc_to_text": ...,
153
+ },
154
+ MyTask1 # A task object from `lm_eval.task.Task`
155
+ ],
156
+ task_manager # A task manager that allows lm_eval to
157
+ # load the task during evaluation.
158
+ # If none is provided, `get_task_dict`
159
+ # will instantiate one itself, but this
160
+ # only includes the stock tasks so users
161
+ # will need to set this if including
162
+ # custom paths is required.
163
+ )
164
+
165
+ results = lm_eval.evaluate(
166
+ lm=lm_obj,
167
+ task_dict=task_dict,
168
+ ...
169
+ )
170
+ ```
lm-evaluation-harness/docs/model_guide.md ADDED
@@ -0,0 +1,192 @@
1
+ # New Model Guide
2
+
3
+ This guide may be of special interest to users who are using the library outside of the repository, i.e. installing the library from PyPI and calling `lm_eval.evaluator.evaluate()` to evaluate an existing model.
4
+
5
+ In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lm_eval.api.model.LM` class, that defines how the Evaluation Harness should interface with your model. This guide walks through how to write this `LM` subclass via adding it to the library!
6
+
7
+ ## Setup
8
+
9
+ To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:
10
+
11
+ ```sh
12
+ # After forking...
13
+ git clone https://github.com/<YOUR-USERNAME>/lm-evaluation-harness.git
14
+ cd lm-evaluation-harness
15
+ git checkout -b <model-type>
16
+ pip install -e ".[dev]"
17
+ ```
18
+
19
+ Now, we'll create a new file where we'll be adding our model:
20
+
21
+ ```sh
22
+ touch lm_eval/models/<my_model_filename>.py
23
+ ```
24
+
25
+ **Tip: this filename should not shadow package names! For example, naming your file `anthropic.py` is disallowed since the API's name on pypi is `anthropic`, but naming it `anthropic_llms.py` works with no problems.**
26
+
27
+ ## Interface
28
+
29
+ All models must subclass the `lm_eval.api.model.LM` class.
30
+
31
+ The LM class enforces a common interface via which we can extract responses from a model:
32
+
33
+ ```python
34
+ class MyCustomLM(LM):
35
+ #...
36
+ def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
37
+ #...
38
+
39
+
40
+ def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
41
+ #...
42
+
43
+
44
+ def generate_until(self, requests: list[Instance]) -> list[str]:
45
+ #...
46
+ #...
47
+ ```
48
+
49
+ Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/instance.py) with property `args` of request-dependent type signature described below.
50
+
51
+ We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
52
+
53
+ All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name.
54
+
55
+ - `generate_until`
56
+ - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
57
+ - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
58
+ - The generated output text from the model will then be returned.
59
+
60
+ - `loglikelihood`
61
+ - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
62
+ - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` being either the value `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input. )
63
+
64
+ - `loglikelihood_rolling`
65
+ - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
66
+ - This is used to evaluate *perplexity* on a data distribution.
67
+ - It should return `(ll,) : Tuple[float]` , a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input.
68
+
69
+ To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py` ! Additionally, check out `lm_eval.api.model.TemplateLM` for a class that abstracts away some commonly used functions across LM subclasses, or see if your model would lend itself well to subclassing the `lm_eval.models.huggingface.HFLM` class and overriding just the initialization or a couple methods!
70
+
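+ As a purely illustrative sketch (not a useful model), a subclass only needs to return one result per request, in the same order as the incoming `requests` list:
+
+ ```python
+ from lm_eval.api.model import LM
+
+
+ class MyDummyLM(LM):
+     def loglikelihood(self, requests):
+         # requests[i].args == (context, continuation)
+         return [(0.0, False) for _ in requests]
+
+     def loglikelihood_rolling(self, requests):
+         # requests[i].args == (string,); return just the loglikelihood of each string
+         return [0.0 for _ in requests]
+
+     def generate_until(self, requests):
+         # requests[i].args == (context, gen_kwargs)
+         return ["" for _ in requests]
+ ```
+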
71
+ **Tip: be careful of indexing in loglikelihood!**
72
+
73
+ LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`:
74
+
75
+ ```text
76
+ # how this all works (illustrated on a causal decoder-only setup):
77
+ # CTX CONT
78
+ # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
79
+ # model \ \
80
+ # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
81
+ # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
82
+ ```
83
+
84
+ The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942 .
85
+
86
+ ## Registration
87
+
88
+ Congrats on implementing your model! Now it's time to test it out.
89
+
90
+ To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is.
91
+
92
+ This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model <name>` and alert `lm-eval` to the model's existence.
93
+
94
+ ```python
95
+ from lm_eval.api.registry import register_model
96
+
97
+ @register_model("<name1>", "<name2>")
98
+ class MyCustomLM(LM):
99
+ ```
100
+
101
+ Using this decorator results in the class being added to an accounting of the usable LM types maintained internally to the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library!
102
+
103
+ **Tip: be sure to import your model in `lm_eval/models/__init__.py!`**
104
+
105
+ ## Testing
106
+
107
+ We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .
108
+
109
+ ## Chat Templating
110
+
111
+ Many models are fine-tuned with a [Chat Template](https://huggingface.co/docs/transformers/main/en/chat_templating) in order to enable back-and-forth interaction between a "User"'s queries and the model (often called "Assistant")'s responses. It can be desirable to evaluate fine-tuned models on evaluation tasks while wrapped in the conversational format they expect.
112
+
113
+ In order to make your model optionally compatible with a chat format, three additional methods must be implemented:
114
+
115
+ ```python
116
+ class MyCustomLM(LM):
117
+ #...
118
+ @property
119
+ def tokenizer_name(self) -> str:
120
+ """
121
+ Return the name of the model's tokenizer and/or the accompanying chat template.
122
+ The returned string is used to cache requests.
123
+
124
+ Returns:
125
+ str: The name of the model's tokenizer and/or chat template.
126
+ """
127
+
128
+ def chat_template(self, chat_template: Union[bool, str] = False) -> str:
129
+ """
130
+ Get the appropriate chat template for the model based on the `chat_template` argument.
131
+
132
+ This method returns the chat template string to build the prompt from a chat history.
133
+ The chat template is saved in the evaluation results for reproducibility.
134
+ Boolean arguments should be used with models that have only one chat template,
135
+ while string arguments are used with models that have multiple chat templates.
136
+ For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
137
+
138
+ Args:
139
+ chat_template (Union[bool, str]): Specifies whether to apply a chat template:
140
+ - If False: Do not apply any chat template.
141
+ - If True: Apply the default chat template.
142
+ - If str: Apply the specified chat template by name.
143
+
144
+ Returns:
145
+ str: The selected chat template in Jinja format.
146
+ """
147
+
148
+ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
149
+ """
150
+ Process a chat history to create a string that can be tokenized and input into the model.
151
+
152
+ Args:
153
+ chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history,
154
+ where each dictionary has "role" and "content" keys.
155
+
156
+ Returns:
157
+ str: A string representing the chat history that can be tokenized and fed into the model.
158
+ """
159
+ ```
160
+
161
+ - `apply_chat_template`
162
+ - This method performs the bulk of the work required for chat-formatting.
163
+ - As input, a `chat_history: List[Dict[str, str]]` is passed in. This is a transcript of a conversation of a form similar to
164
+
165
+ ```text
166
+ [
167
+ {"system": <user-provided system message such as "You are a helpful math-focused chatbot">},
168
+ {"user": <task example - a few-shot example 'input'>}
169
+ {"assistant": <correct response to the above example>},
170
+ # ... more few-shot examples, potentially
171
+ {"user": <test set query--response on which we will evaluate>},
172
+ ]
173
+ ```
174
+
175
+ which can then be converted into a string input.
176
+ - The output is a string representing this conversation that can be fed into the model.
177
+ - For example, this consists of simply calling `tokenizer.apply_chat_template` for HFLM--see the implementation there for reference.
178
+ - `tokenizer_name`
179
+ - LM Eval Harness supports [caching requests](https://github.com/EleutherAI/lm-evaluation-harness/blob/4902aaaf1f374682f95ac25fe2e13b23faddc91a/lm_eval/__main__.py#L140) that are sent to a model, for faster setup when repeating an already-performed evaluation.
180
+ - However, we don't want to use the cache of chat transcripts rendered using one chat template or system prompt to send to a model with a different template! So, we use this `lm.tokenizer_name` string to distinguish caches for a given model (and chat template) from one another.
181
+ - `chat_template`
182
+ - Chat templates are typically provided as a Jinja template string or a string formatted with str.format to include user and assistant messages in a single prompt. This template string is saved in the evaluation results to ensure reproducibility.
183
+
184
+ If not implemented for a given model type, the flags `--apply_chat_template` , `--fewshot_as_multiturn`, and `--system_instruction` cannot be used.
185
+
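+ For models backed by a Hugging Face tokenizer, `apply_chat_template` can often be a thin wrapper around the tokenizer's own method. A sketch, assuming `self.tokenizer` is a `transformers` tokenizer:
+
+ ```python
+ def apply_chat_template(self, chat_history):
+     # let the tokenizer render the conversation; add_generation_prompt appends the
+     # assistant-turn header so the model continues as the assistant
+     return self.tokenizer.apply_chat_template(
+         chat_history, tokenize=False, add_generation_prompt=True
+     )
+ ```
+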
186
+ ## Other
187
+
188
+ **Pro tip**: In order to make the Evaluation Harness overestimate total runtimes rather than underestimate them, HuggingFace models have built-in support for responding to data points in *descending order by total input length*, via `lm_eval.utils.Reorderer`. Take a look at `lm_eval.models.huggingface.HFLM` to see how this is done, and see if you can implement it in your own model!
189
+
190
+ ## Conclusion
191
+
192
+ After reading this guide, you should be able to add new model APIs or implementations to the Eval Harness library!
lm-evaluation-harness/docs/new_task_guide.md ADDED
@@ -0,0 +1,521 @@
1
+ # New Task Guide
2
+
3
+ `lm-evaluation-harness` is a framework that strives to support a wide range of zero- and few-shot evaluation tasks on autoregressive language models (LMs).
4
+
5
+ This documentation page provides a walkthrough to get started creating your own task, in `lm-eval` versions v0.4.0 and later.
6
+
7
+ A more interactive tutorial is available as a Jupyter notebook [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/examples/lm-eval-overview.ipynb).
8
+
9
+ ## Setup
10
+
11
+ If you haven't already, go ahead and fork the main repo, clone it, create a branch with the name of your task, and install the project requirements in your environment:
12
+
13
+ ```sh
14
+ # After forking...
15
+ git clone https://github.com/<YOUR-USERNAME>/lm-evaluation-harness.git
16
+ cd lm-evaluation-harness
17
+ git checkout -b <task-name>
18
+ pip install -e ".[dev]"
19
+ ```
20
+
21
+ In this document, we'll walk through the basics of implementing a static benchmark evaluation in two formats: a *generative* task which requires sampling text from a model, such as [`gsm8k`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml), and a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices, such as [`sciq`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/sciq/sciq.yaml).
22
+
23
+ ## Creating a YAML file
24
+
25
+ To implement a new standard task, we'll need to write a YAML file which configures our task logic. We start by making a new empty YAML file. This file can have any name, but we recommend placing it in a subfolder of `lm_eval/tasks` titled by the dataset or task's shorthand name: for example,
26
+
27
+ ```sh
28
+ touch lm_eval/tasks/<dataset_name>/<my_new_task_name>.yaml
29
+ ```
30
+
31
+ Or, copy the template subfolder we provide from `templates/new_yaml_task`:
32
+
33
+ ```sh
34
+ cp -r templates/new_yaml_task lm_eval/tasks/
35
+ ```
36
+
37
+ and rename the folders and YAML file(s) as desired.
38
+
39
+ ### Selecting and configuring a dataset
40
+
41
+ All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).
43
+ > [!TIP]
44
+ > To test your task, we recommend enabling verbose logging by setting `export LOGLEVEL=DEBUG` in your shell before running the evaluation script. This will help you debug any issues that may arise.
45
+
+ Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:
46
+
47
+ ```yaml
48
+ dataset_path: ... # the name of the dataset on the HF Hub.
49
+ dataset_name: ... # the dataset configuration to use. Leave `null` if your dataset does not require a config to be passed. See https://huggingface.co/docs/datasets/load_hub#configurations for more info.
50
+ dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`.
51
+ ```
52
+
53
+ Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist:
54
+
55
+ ```yaml
56
+ training_split: <split name of training set, or `null`>
57
+ validation_split: <split name of val. set, or `null`>
58
+ test_split: <split name of test set, or `null`>
59
+ ```
60
+
61
+ Tests will run on the `test_split` if it is available, and otherwise evaluate on the `validation_split`.
62
+
63
+ We can also specify from which split the task should retrieve few-shot examples via:
64
+
65
+ ```yaml
66
+ fewshot_split: <split name to draw fewshot examples from, or `null`>
67
+ ```
68
+
69
+ or by hardcoding them, either using the following in the yaml file:
70
+
71
+ ```yaml
72
+ fewshot_config:
+   sampler: first_n
+   samples: [
+     {<sample 1>},
+     {<sample 2>},
+   ]
78
+ ```
79
+
80
+ or by adding the function `list_fewshot_samples` in the associated utils.py file:
81
+
82
+ ```python
83
+ def list_fewshot_samples() -> list[dict]:
+     return [{<sample 1>}, {<sample 2>}]
85
+ ```
86
+
87
+ See `lm_eval/tasks/minerva_math/minerva_math_algebra.yaml` for an example of the latter, and `lm_eval/tasks/gsm8k/gsm8k-cot.yaml` for an example of the former.
88
+
89
+ In this case, each sample must contain the same fields as the samples in the above sets--for example, if `doc_to_text` expects an `input` field when rendering input prompts, these provided samples must include an `input` key.
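+
+ For instance, if a (hypothetical) task's `doc_to_text` reads an `input` column and its `doc_to_target` reads a `target` column, hardcoded samples could look like the following sketch (the field names and contents here are illustrative, not required by the harness):
+
+ ```python
+ # Illustrative only: the field names ("input", "target") are hypothetical and
+ # must match whatever your own doc_to_text / doc_to_target reference.
+ def list_fewshot_samples() -> list[dict]:
+     return [
+         {"input": "What is 2 + 2?", "target": "4"},
+         {"input": "What is 3 * 5?", "target": "15"},
+     ]
+ ```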
90
+
91
+ If neither of the above options is set, we will default to train/validation/test sets, in that order.
92
+
93
+ Finally, our dataset may not be already in the exact format we want. Maybe we have to strip whitespace and special characters via a regex from our dataset's "question" field! Or maybe we just want to rename its columns to match a convention we'll be using for our prompts.
94
+
95
+ Let's create a python file in the directory where we're writing our YAML file:
96
+
97
+ ```bash
98
+ touch lm_eval/tasks/<dataset_name>/utils.py
99
+ ```
100
+
101
+ Now, in `utils.py` we'll write a function to process each split of our dataset (the following example is drawn from [the `hellaswag` task](../lm_eval/tasks/hellaswag/utils.py)):
102
+
103
+ ```python
104
+ import datasets
+
+
+ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+     def _process_doc(doc):
+         # `preprocess` is a text-cleaning helper defined earlier in the same utils.py.
+         ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
+         out_doc = {
+             "query": preprocess(doc["activity_label"] + ": " + ctx),
+             "choices": [preprocess(ending) for ending in doc["endings"]],
+             "gold": int(doc["label"]),
+         }
+         return out_doc
+
+     return dataset.map(_process_doc)
115
+ ```
116
+
117
+ Now, in our YAML config file we'll use the `!function` constructor, and tell the config where our imported Python function will come from. At runtime, before doing anything else we will preprocess our dataset according to this function!
118
+
119
+ ```yaml
120
+ process_docs: !function utils.process_docs
121
+ ```
122
+
123
+ ### Using Local Datasets
124
+
125
+ To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files:
126
+
127
+ ```yaml
128
+ dataset_path: json
129
+ dataset_name: null
130
+ dataset_kwargs:
+   data_files: /path/to/my/json
132
+ ```
133
+
134
+ Or with files already split into separate directories:
135
+
136
+ ```yaml
137
+ dataset_path: arrow
138
+ dataset_kwargs:
+   data_files:
+     train: /path/to/arrow/train/data-00000-of-00001.arrow
+     validation: /path/to/arrow/validation/data-00000-of-00001.arrow
142
+ ```
143
+
144
+ Alternatively, if you have previously downloaded a dataset from huggingface hub (using `save_to_disk()`) and wish to use the local files, you will need to use `data_dir` under `dataset_kwargs` to point to where the directory is.
145
+
146
+ ```yaml
147
+ dataset_path: hellaswag
148
+ dataset_kwargs:
+   data_dir: hellaswag_local/
150
+ ```
151
+
152
+ You can also set `dataset_path` as a directory path in your local system. This will assume that there is a loading script with the same name as the directory. [See datasets docs](https://huggingface.co/docs/datasets/loading#local-loading-script).
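+
+ Under the hood, `dataset_path`, `dataset_name`, and `dataset_kwargs` are essentially forwarded to `datasets.load_dataset`, so a quick way to sanity-check a local-data config is to try the roughly equivalent call yourself (the paths below are placeholders):
+
+ ```python
+ # Sanity-check a local-data config by making the (roughly) equivalent
+ # `datasets.load_dataset` call yourself. The paths are placeholders.
+ import datasets
+
+ # dataset_path: json / dataset_kwargs.data_files: /path/to/my/json
+ ds = datasets.load_dataset("json", data_files="/path/to/my/data.json")
+ print(ds)
+
+ # dataset_path: arrow / dataset_kwargs.data_files: {train: ..., validation: ...}
+ ds = datasets.load_dataset(
+     "arrow",
+     data_files={
+         "train": "/path/to/arrow/train/data-00000-of-00001.arrow",
+         "validation": "/path/to/arrow/validation/data-00000-of-00001.arrow",
+     },
+ )
+ print(ds)
+ ```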
153
+
154
+ ## Writing a Prompt Template
155
+
156
+ The next thing we need to do is decide what format to use when presenting the data to the LM. This is our **prompt**, where we'll define both an input and output format.
157
+
158
+ To write a prompt, users will use `doc_to_text`, `doc_to_target`, and `doc_to_choice` (Optional when certain conditions are met).
159
+
160
+ `doc_to_text` defines the input string a model will be given while `doc_to_target` and `doc_to_choice` will be used to generate the target text. `doc_to_target` can be either a text string that refers to the target string or an integer that refers to the index of the correct label. When it is set as an index, `doc_to_choice` must also be set with the appropriate list of possible choice strings.
161
+
162
+ ### Basic prompts
163
+
164
+ If a dataset is straightforward enough, users can enter the feature name directly. This assumes that no preprocessing is required. For example, in [Swag](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/swag/swag.yaml#L10-L11), `doc_to_text` and `doc_to_target` are each given the name of a feature.
165
+
166
+ ```yaml
167
+ doc_to_text: startphrase
168
+ doc_to_target: label
169
+ ```
170
+
171
+ Hard-coding is also possible as is the case in [SciQ](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/sciq/sciq.yaml#L11).
172
+
173
+ ```yaml
174
+ doc_to_target: 3
175
+ ```
176
+
177
+ `doc_to_choice` can be directly given a list of strings as options (See [Toxigen](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/toxigen/toxigen.yaml#L11)).
178
+
179
+ ```yaml
180
+ doc_to_choice: ['No', 'Yes']
181
+ ```
182
+
183
+ If a dataset feature is already a list, you can set the name of the feature as `doc_to_choice` (See [Hellaswag](https://github.com/EleutherAI/lm-evaluation-harness/blob/e0eda4d3ffa10e5f65e0976161cd134bec61983a/lm_eval/tasks/hellaswag/hellaswag.yaml#L13)).
184
+
185
+ ```yaml
186
+ doc_to_choice: choices
187
+ ```
188
+
189
+ ### Writing a prompt with Jinja 2
190
+
191
+ We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format.
192
+
193
+ Take for example the dataset `super_glue/boolq`. As input, we'd like to use the features `passage` and `question` and string them together so that for a sample line `doc`, the model sees something in the format of:
194
+
195
+ ```text
196
+ doc["passage"]
197
+ Question: doc["question"]?
198
+ Answer:
199
+ ```
200
+
201
+ We do this by [writing](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/super_glue/boolq/default.yaml#L9C1-L9C61)
202
+
203
+ ```yaml
204
+ doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
205
+ ```
206
+
207
+ Here, `{{passage}}` will be replaced by `doc["passage"]` and `{{question}}` by `doc["question"]` when rendering the prompt template.
208
+
209
+ Our intended output is for the model to predict a single whitespace, and then the answer to the question. We do this via:
210
+
211
+ ```yaml
212
+ doc_to_target: "{{answer}}"
213
+ ```
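+
+ If you want to eyeball what such a template produces before wiring it into a task, you can render it directly with `jinja2` on a toy document (the doc contents below are made up; the harness adds some extra filters of its own, but plain `jinja2` suffices for simple templates like this one):
+
+ ```python
+ # Quick check of what a Jinja prompt template renders to, using a made-up doc.
+ from jinja2 import Template
+
+ doc = {
+     "passage": "The harness supports Jinja templates.",
+     "question": "does the harness support Jinja templates",
+     "answer": "yes",
+ }
+
+ doc_to_text = Template("{{passage}}\nQuestion: {{question}}?\nAnswer:")
+ doc_to_target = Template("{{answer}}")
+
+ print(doc_to_text.render(**doc))
+ # The harness supports Jinja templates.
+ # Question: does the harness support Jinja templates?
+ # Answer:
+ print(doc_to_target.render(**doc))  # yes
+ ```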
214
+
215
+ #### Multiple choice format
216
+
217
+ For tasks which are multiple choice (a fixed, finite set of label words for each document) and evaluated via comparing loglikelihoods of all label words (the `multiple_choice` task output type), we enforce a particular convention on prompt format.
218
+
219
+ > [!WARNING]
220
+ > We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter.
221
+
222
+ An annotated example in the case of SciQ is as follows:
223
+
224
+ ```yaml
225
+ doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:" # This is the input portion of the prompt for this doc. It will have " {{choice}}" appended to it as target for each choice in answer_choices.
226
+ doc_to_target: 3 # this contains the index into the answer choice list of the correct answer.
227
+ doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
228
+ ```
229
+
230
+ Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.
231
+
232
+ The label index can also be sourced from a feature directly. For example, in `super_glue/boolq`, the label index is defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index.
233
+
234
+ ```yaml
235
+ doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
236
+ doc_to_target: label
237
+ doc_to_choice: ["no", "yes"]
238
+ ```
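+
+ To make the convention above concrete, here is a small sketch of the (context, continuation) pairs whose loglikelihoods get compared for a `multiple_choice` task, using the default `target_delimiter` of a single space (the doc contents are made up):
+
+ ```python
+ # Sketch of the strings compared by loglikelihood for a multiple_choice task:
+ # context = doc_to_text(doc), and each candidate = target_delimiter + choice.
+ doc = {"passage": "Owls are mostly nocturnal.", "question": "are owls nocturnal", "label": 1}
+ choices = ["no", "yes"]
+ target_delimiter = " "  # the default
+
+ context = f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"
+ candidates = [(context, target_delimiter + choice) for choice in choices]
+
+ for ctx, continuation in candidates:
+     # The harness scores log P(continuation | ctx) for each pair and picks the
+     # highest-scoring choice; doc["label"] indexes the gold answer.
+     print(repr(ctx), "->", repr(continuation))
+ ```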
239
+
240
+ ### Using Python Functions for Prompts
241
+
242
+ There may be cases where the prompt we want to implement is more easily expressed in Python than in Jinja 2. For this, we can use Python helper functions that are defined in the YAML config. Note that the script containing the function must be in the same directory as the yaml.
243
+
244
+ A good example is WikiText, which requires many regex rules to clean the samples.
245
+
246
+ ```python
247
+ import re
+
+
+ def wikitext_detokenizer(doc):
+     string = doc["page"]
+     # contractions
+     string = string.replace("s '", "s'")
+     string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
+     ...
+     string = string.replace(" 's", "'s")
+
+     return string
256
+ ```
257
+
258
+ We can load this function in `doc_to_target` by using the `!function` operator after `doc_to_target`, followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/wikitext/wikitext.yaml) we write:
259
+
260
+ ```yaml
261
+ doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
262
+ ```
263
+
264
+ ### Importing a Prompt from Promptsource
265
+
266
+ [Promptsource](https://github.com/bigscience-workshop/promptsource/tree/main/promptsource) is a great repository for crowdsourced prompts for many datasets. We can load these prompts easily by using the `use_prompt` argument and filling it with the format `"promptsource:<name of prompt template>"`. To use this, `doc_to_text` and `doc_to_target` should be left undefined. This will fetch the template of the dataset defined in the YAML file.
267
+
268
+ For example, for Super Glue BoolQ, if we want to use the prompt template `GPT-3 Style`, we can add this to the YAML file.
269
+
270
+ ```yaml
271
+ use_prompt: "promptsource:GPT-3 Style"
272
+ ```
273
+
274
+ If you would like to run evaluation on all prompt templates, you can simply call it this way.
275
+
276
+ ```yaml
277
+ use_prompt: "promptsource:*"
278
+ ```
279
+
280
+ ### Setting metrics
281
+
282
+ You're almost done! Now we need to choose how to score our task.
283
+
284
+ - *If this is a multiple choice task:* do you just want to check your model's accuracy in choosing the correct answer choice?
285
+ - *If this is a generation task:* do you just want to check how often your model outputs *exactly the ground-truth output string provided*?
286
+
287
+ If the answer to the above is no: you'll need to record what scoring metrics to use! Metrics can be listed in the following format:
288
+
289
+ ```yaml
290
+ metric_list:
+   - metric: <name of the metric here>
+     aggregation: <name of the aggregation fn here>
+     higher_is_better: <true or false>
+   - metric: !function script.function
+     aggregation: ...
+     higher_is_better: ...
297
+ ```
298
+
299
+ `aggregation` and `higher_is_better` can optionally be left out, defaulting to the values pre-registered for natively supported metrics; otherwise, they must be defined explicitly (for example, when using a custom metric implemented as a function).
300
+
301
+ For a full list of natively supported metrics and aggregation functions see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md). All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`.
302
+
303
+ ### Optional, More Advanced Setup
304
+
305
+ Some tasks may require more advanced processing logic than is described in this guide.
306
+
307
+ As a heuristic check:
308
+
309
+ - Does your task require generating multiple free-form outputs per input document?
310
+ - Does your task require complex, multi-step post-processing of generated model outputs?
311
+ - Does your task require subsetting documents on the fly based on their content?
312
+ - Do you expect to compute metrics after applying multiple such processing steps on your model outputs?
313
+ - Does your task rely on metrics that need a custom implementation?
314
+
315
+ For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md). If none of the above applies to your task, it's time to continue on to checking your task's performance!
316
+
317
+ ### Task name + tags (registering a task)
318
+
319
+ To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists!
320
+
321
+ If you're writing your YAML file inside the `lm_eval/tasks` folder, you just need to give your task a name! You can do this inside your YAML file:
322
+
323
+ ```yaml
324
+ task: <name of the task>
325
+ ```
326
+
327
+ Including a task name is mandatory.
328
+
329
+ It is often also convenient to label your task with several `tag` values, though this field is optional:
330
+
331
+ ```yaml
332
+ tag:
+   - tag1
+   - tag2
335
+ ```
336
+
337
+ This will add your task to the `tag1` and `tag2` tags, helping users categorize your task and, if desired, run all tasks under one of these tags at once, your task included.
338
+
339
+ If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files.
340
+
341
+ You can do this via the `--include_path` argument in `__main__.py`. This command will be used to initialize the `TaskManager` object which you can also use for your custom scripts.
342
+
343
+ ```python
344
+ task_manager = TaskManager(args.verbosity, include_path=args.include_path)
345
+ ```
346
+
347
+ Passing `--tasks /path/to/yaml/file` is also accepted.
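+
+ For example, a custom evaluation script might look roughly like the sketch below (the YAML directory, task name, and model are placeholders, and exact keyword arguments may differ slightly between `lm-eval` versions):
+
+ ```python
+ # Rough sketch of using a TaskManager with an extra YAML directory in a custom
+ # script. Paths, task names, and the model are placeholders; keyword arguments
+ # may differ slightly between lm-eval versions.
+ import lm_eval
+ from lm_eval.tasks import TaskManager
+
+ task_manager = TaskManager(include_path="/path/to/my/yaml/dir")
+
+ results = lm_eval.simple_evaluate(
+     model="hf",
+     model_args="pretrained=EleutherAI/pythia-160m",
+     tasks=["<my_new_task_name>"],
+     num_fewshot=0,
+     task_manager=task_manager,
+ )
+ print(results["results"])
+ ```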
348
+
349
+ ### Advanced Group Configs
350
+
351
+ While `tag` values are helpful when you want to be able to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, often, we wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'.
352
+
353
+ Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone.
354
+
355
+ We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly to, but slightly differently from, tasks' YAML configs.
356
+
357
+ The most basic form of group can be defined via a YAML config similar to the following:
358
+
359
+ ```yaml
360
+ group: nli_tasks
+ task:
+   - cb
+   - anli_r1
+   - rte
+ metadata:
+   version: 1.0
367
+ ```
368
+
369
+ This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader.
370
+
371
+ Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following:
372
+
373
+ ```yaml
374
+ group: nli_tasks
+ task:
+   - cb
+   - anli_r1
+   - rte
+ aggregate_metric_list:
+   - metric: acc
+     aggregation: mean
+     weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean).
+ metadata:
+   version: 1.0
385
+ ```
386
+
387
+ Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys.
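+
+ The micro/macro distinction is easy to see with a toy example (the subtask names, accuracies, and sizes below are invented):
+
+ ```python
+ # Toy illustration (invented numbers) of micro vs. macro averaging across
+ # subtasks, i.e. weight_by_size: true vs. false.
+ subtasks = {
+     "subtask_a": {"acc": 0.50, "n_docs": 100},
+     "subtask_b": {"acc": 0.90, "n_docs": 900},
+ }
+
+ # Micro average (weight_by_size: true): weight each subtask by its size.
+ micro = sum(s["acc"] * s["n_docs"] for s in subtasks.values()) / sum(
+     s["n_docs"] for s in subtasks.values()
+ )
+
+ # Macro average (weight_by_size: false): plain mean of subtask accuracies.
+ macro = sum(s["acc"] for s in subtasks.values()) / len(subtasks)
+
+ print(micro)  # ~0.86
+ print(macro)  # ~0.70
+ ```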
388
+
389
+ > [!TIP]
+ > Currently, we predominantly support only the aggregation of group metrics that use `mean` (either micro- or macro-averaged) over their subtasks. If you require more complex aggregation rules, you may want to perform aggregation offline.
390
+
391
+ Group configs can be fairly complex! We can perform various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config values, or nesting existing groups within our new group.
392
+
393
+ For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name for the benchmark as a subtask written under `task`. You can configure the parameters such as `num_fewshot`. If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of the subtasks.
394
+
395
+ ```yaml
396
+ group: nli_and_mmlu
+ task:
+   - group: nli_tasks
+     task:
+       - cb
+       - anli_r1
+       - rte
+     aggregate_metric_list:
+       - metric: acc
+         aggregation: mean
+         higher_is_better: true
+   - task: mmlu
+     num_fewshot: 2
409
+ ```
410
+
411
+ ### Configuring python classes
412
+
413
+ There can be occasions when yaml-based tasks cannot accommodate how a task is handled. LM-Eval supports manually implementing tasks, as was done prior to `0.4.x`. To register the task, you can simply make a yaml with the name of the task in `task` and the class object in `class` using the `!function` prefix.
414
+
415
+ ```yaml
416
+ task: squadv2
417
+ class: !function task.SQuAD2
418
+ ```
419
+
420
+ This also applies to building group configurations with subtasks that are python classes.
421
+
422
+ ```yaml
423
+ group: scrolls
+ task:
+   - task: scrolls_qasper
+     class: !function task.Qasper
+   - task: scrolls_quality
+     class: !function task.QuALITY
+   - task: scrolls_narrativeqa
+     class: !function task.NarrativeQA
+   ...
432
+ ```
433
+
434
+ You can also pass a custom argument to your class by accepting `config` in the custom class constructor.
435
+ Here's how to do it:
436
+
437
+ ```yaml
438
+ task: 20_newsgroups
439
+ class: !function task.Unitxt
440
+ recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
441
+ ```
442
+
443
+ In this example, `recipe` is the custom argument for the `Unitxt` class.
444
+
445
+ ## Beautifying Table Display
446
+
447
+ To avoid conflicts, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending an additional label that refers to the variation, as in MMLU, where the tasks evaluated with the FLAN template are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed instead. For example, in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
448
+
449
+ ```yaml
450
+ "dataset_name": "abstract_algebra"
451
+ "description": "The following are multiple choice questions (with answers) about abstract\
452
+ \ algebra.\n\n"
453
+ "include": "_default_template_yaml"
454
+ "task": "mmlu_abstract_algebra"
455
+ "task_alias": "abstract_algebra"
456
+ ```
457
+
458
+ ## Checking validity
459
+
460
+ After registering your task, you can now check on your data downloading and verify that the few-shot samples look as intended. Run the following command with your desired args:
461
+
462
+ ```bash
463
+ python -m scripts.write_out \
464
+ --output_base_path <path> \
465
+ --tasks <your-task-name> \
466
+ --sets <train | val | test> \
467
+ --num_fewshot K \
468
+ --num_examples N \
469
+ ```
470
+
471
+ Open the file specified at the `--output_base_path <path>` and ensure it passes
472
+ a simple eye test.
473
+
474
+ ## Versioning
475
+
476
+ One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
477
+
478
+ This version info can be provided by adding the following to your new task or group config file:
479
+
480
+ ```yaml
481
+ metadata:
+   version: 0
483
+ ```
484
+
485
+ Now, whenever a change needs to be made to your task in the future, please increase the version number by 1 so that users can differentiate the different task iterations and versions.
486
+
487
+ If you are incrementing a task's version, please also consider adding a changelog to the task's README.md noting the date, PR number, what version you have updated to, and a one-liner describing the change.
488
+
489
+ For example:
490
+
491
+ - \[Dec 25, 2023\] (PR #999) Version 0.0 -> 1.0: Fixed a bug with answer extraction that led to underestimated performance.
492
+
493
+ ## Checking performance + equivalence
494
+
495
+ It's now time to check models' performance on your task! In the evaluation harness, we intend to support a wide range of evaluation tasks and setups, but prioritize the inclusion of already-proven benchmarks following the precise evaluation setups in the literature where possible.
496
+
497
+ To enable this, we provide a checklist that should be completed when contributing a new task, to enable accurate book-keeping and to ensure that tasks added to the library are well-tested and, where applicable, precedented.
498
+
499
+ ### Task Validity Checklist
500
+
501
+ The checklist is the following:
502
+
503
+ For adding novel benchmarks/datasets to the library:
504
+
505
+ - [ ] Is the task an existing benchmark in the literature?
506
+ - [ ] Have you referenced the original paper that introduced the task?
507
+ - [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
508
+
509
+ If other tasks on this dataset are already supported:
510
+
511
+ - [ ] Is the "Main" variant of this task clearly denoted?
512
+ - [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
513
+ - [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
514
+
515
+ It is recommended to include a filled-out copy of this checklist in the README.md for the subfolder you are creating, if you have created a new subfolder in `lm_eval/tasks`.
516
+
517
+ **Finally, please add a short description of your task(s), along with a link to its subfolder in lm_eval/tasks, to [`lm_eval/tasks/README.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md) so that users can discover your task in the library, and follow the link to your README for more information about the variants supported, their task names, and the original source of the dataset and/or evaluation setup.**
518
+
519
+ ## Submitting your task
520
+
521
+ You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
lm-evaluation-harness/docs/task_guide.md ADDED
@@ -0,0 +1,335 @@
1
+ # Task Configuration
2
+
3
+ The `lm-evaluation-harness` is meant to be an extensible and flexible framework within which many different evaluation tasks can be defined. All tasks in the new version of the harness are built around a YAML configuration file format.
4
+
5
+ These YAML configuration files, along with the current codebase commit hash, are intended to be shareable such that providing the YAML config enables another researcher to precisely replicate the evaluation setup used by another, in the case that the prompt or setup differs from standard `lm-eval` task implementations.
6
+
7
+ While adding a standard evaluation task on a new dataset can be occasionally as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.
8
+
9
+ If your intended task relies on features beyond what is described in this guide, we'd love to hear about it! Feel free to open an issue describing the scenario on Github, create a PR to the project with a proposed implementation, or ask in the `#lm-thunderdome` channel on the EleutherAI discord.
10
+
11
+ ## Configurations
12
+
13
+ Tasks are configured via the `TaskConfig` object. Below, we describe all fields usable within the object, and their role in defining a task.
14
+
15
+ ### Parameters
16
+
17
+ Task naming + registration:
18
+
19
+ - **task** (`str`, defaults to None) — name of the task.
20
+ - **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results.
21
+ - **tag** (`str`, *optional*) — name of the task tag(s) a task belongs to. Enables one to run all tasks with a specified tag name at once.
22
+
23
+ Dataset configuration options:
24
+
25
+ - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
26
+ - **dataset_name** (`str`, *optional*, defaults to None) — The name of what HF calls a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
27
+ - **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv.
28
+ - **custom_dataset** (`Callable`, *optional*) - A function that returns a `dict[str, datasets.Dataset]` (<split_name>, dataset) object. This can be used to load a dataset from a custom source or to preprocess the dataset in a way that is not supported by the `datasets` library. Will have access to the `metadata` field if defined (from config and passed to TaskManager), and `model_args` from runtime (if using `evaluate`).
29
+ - **training_split** (`str`, *optional*) — Split in the dataset to use as the training split.
30
+ - **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split.
31
+ - **test_split** (`str`, *optional*) — Split in the dataset to use as the test split.
32
+ - **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. It is asserted that this is not `None` if `num_fewshot` > 0.
33
+ - **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to that expected by a prompt template.
34
+
35
+ Prompting / in-context formatting options:
36
+
37
+ - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, this will overwrite `doc_to_text`, `doc_to_target`, and `doc_to_choice`.
38
+ - **description** (`str`, *optional*) — An optional prepended Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
39
+ - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model.
40
+ - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list of the correct answer.
41
+ - **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
42
+ - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
43
+ - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
44
+ - **gen_prefix** (`str`, *optional*) — String to append after the <|assistant|> token. For example, if the task is to generate a question, the gen_prefix could be "The answer is: " to prompt the model to generate an answer to the question. If not using a chat template then this string will be appended to the end of the prompt.
45
+
46
+ Runtime configuration options:
47
+
48
+ - **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input.
49
+ - **batch_size** (`int`, *optional*, defaults to 1) — Batch size.
50
+
51
+ Scoring details:
52
+
53
+ - **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format.
54
+ - **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
55
+ - **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
56
+ - **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through model for each sample. Can be used for cases such as self-consistency.
57
+ - **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
58
+ - **should_decontaminate** (`bool`, *optional*, defaults to False) - Whether to decontaminate or not.
59
+ - **doc_to_decontamination_query** (`str`, *optional*) — Query for decontamination if `should_decontaminate` is True. If `should_decontaminate` is True but `doc_to_decontamination_query` is `None`, `doc_to_decontamination_query` will follow `doc_to_text`.
60
+
61
+ Other:
62
+
63
+ - **metadata** (`dict`, *optional*) — An optional field where arbitrary metadata can be passed. Most tasks should include a `version` key in this field that is used to denote the version of the yaml config. Other special metadata keys are: `num_fewshot`, to override the printed `n-shot` table column for a task. Will also be passed to the `custom_dataset` function if defined.
64
+
65
+ ## Filters
66
+
67
+ A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring).
68
+
69
+ After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user.
70
+
71
+ However, certain tasks may require more complex behavior than directly turning over model outputs to a metric function. For example, we may want to post-process our output text by truncating it or extracting a model's answer, we may want to ensemble over multiple "takes" on a given document, et cetera.
72
+
73
+ **Detailed Aside**:
74
+ We do such post-processing by operating on *responses*, which are stored after running an LM on an `Instance` from the task in `Instance.resps`.
75
+
76
+ `resps` is a `List[str]` for each instance, and we pass a `List[List[<expected return type from model>]]` to our filters that is a list of `[instance.resps for instance in instances]`.
77
+
78
+ Our filters, after completing a pipeline, must return a `List[<expected return type from model>]`, whose elements we then unpack and store in `Instance.filtered_resps` for the corresponding instances. Thus, we take as input a list of responses from our model for each doc, and must return a single response *not wrapped in a list* for each doc.
79
+ **End Aside**
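+
+ To make those shapes concrete, here is a plain-Python illustration (not the actual `Filter` base class) of a pipeline step that keeps only the first response per document:
+
+ ```python
+ # Plain-Python illustration of the filter data shapes (not the actual Filter
+ # base class): input is one list of responses per document, output is one
+ # (unwrapped) entry per document.
+ from typing import List
+
+
+ def take_first(resps: List[List[str]]) -> List[str]:
+     # resps == [instance.resps for instance in instances]
+     return [per_doc_responses[0] for per_doc_responses in resps]
+
+
+ model_outputs = [
+     ["The answer is 42", "The answer is 41"],  # doc 0: two samples
+     ["The answer is 7"],                       # doc 1: one sample
+ ]
+ print(take_first(model_outputs))  # ['The answer is 42', 'The answer is 7']
+ ```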
80
+
81
+ A full list of supported filter operations can be found in `lm_eval/filters/__init__.py`. Contributions of new filter types are welcome!
82
+
83
+ ### Multiple Filter Pipelines
84
+
85
+ Tasks need not be limited to a single filter pipeline. We enable users to run multiple, distinct, filter pipelines on *the same model outputs* generated in one run on a task.
86
+
87
+ As a case study, let's look at an implementation of solving the Gsm8k math word problem benchmark in `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`. Here, we are emulating the setup used by [Self-Consistency Improves Chain of Thought Prompting](https://arxiv.org/abs/2203.11171), in which evaluation is performed by generating N chain-of-thought outputs from a model via temperature-based sampling, then selecting the answers output by the model at the end of the chains of thought, then majority voting across all those numeric answers.
88
+
89
+ Within our YAML file:
90
+
91
+ ```yaml
92
+ ...
+ repeats: 64
+ filter_list:
+   - name: "score-first"
+     filter:
+       - function: "regex"
+         regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
+       - function: "take_first"
+   - name: "maj@64"
+     filter:
+       - function: "regex"
+         regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
+       - function: "majority_vote"
+       - function: "take_first"
+   - name: "maj@8"
+     filter:
+       - function: "take_first_k"
+         k: 8
+       - function: "regex"
+         regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
+       - function: "majority_vote"
+       - function: "take_first"
114
+ ```
115
+
116
+ We are able to provide multiple different filter pipelines, each with their own name and list of filters to apply in sequence.
117
+
118
+ Our first filter pipeline implements
119
+
120
+ - applying a regex to the model generations (extracting the number within the phrase "The answer is (number)")
121
+ - selecting only the first out of the 64 model answers
122
+
123
+ Then scoring this single answer.
124
+
125
+ ```yaml
126
+   - name: "score-first"
+     filter:
+       - function: "regex"
+         regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
+       - function: "take_first"
131
+ ```
132
+
133
+ Our second filter pipeline, "maj@64", does majority voting across all 64 answers via:
134
+
135
+ - applying the same regex to all responses, to get the numerical answer from the model for each of the 64 responses per problem
136
+ - applying majority voting to all responses, which then returns a length-1 `[<majority answer>]` list for each
137
+ - taking the first element of this length-1 list, to then score the sole response `<majority answer>` for each document.
138
+
139
+ ```yaml
140
+   - name: "maj@64"
+     filter:
+       - function: "regex"
+         regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
+       - function: "majority_vote"
+       - function: "take_first"
146
+ ```
147
+
148
+ Our final filter pipeline, "maj@8", does majority voting across the first 8 of the model's responses per document via:
149
+
150
+ - subsetting the len-64 list of responses `[answer1, answer2, ..., answer64]` to `[answer1, answer2, ..., answer8]` for each document
151
+ - performing the same sequence of filters on these new sets of 8 responses, for each document.
152
+
153
+ ```yaml
154
+   - name: "maj@8"
+     filter:
+       - function: "take_first_k"
+         k: 8
+       - function: "regex"
+         regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
+       - function: "majority_vote"
+       - function: "take_first"
162
+ ```
163
+
164
+ Thus, given the 64 responses from our LM on each document, we can report metrics on these responses in these 3 different ways, as defined by our filter pipelines.
165
+
166
+ ### Adding a custom filter
167
+
168
+ Just as you can add a custom model with the `register_model` decorator, you can do the same with filters. For example:
169
+
170
+ ```python
171
+ from lm_eval.api.filter import Filter
172
+ from lm_eval.api.registry import register_filter
173
+
174
+ @register_filter("new_filter")
175
+ class NewFilter(Filter):
+     ...
177
+ ```
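+
+ A slightly fuller sketch might look like the following. The filter name and behaviour here are made up; the `apply(resps, docs)` signature mirrors the built-in filters (see `lm_eval/filters/` for real examples):
+
+ ```python
+ # A made-up example filter. The `apply(resps, docs)` signature mirrors the
+ # built-in filters in lm_eval/filters/.
+ from lm_eval.api.filter import Filter
+ from lm_eval.api.registry import register_filter
+
+
+ @register_filter("strip_whitespace")
+ class StripWhitespaceFilter(Filter):
+     def apply(self, resps, docs):
+         # resps: one list of model responses per document.
+         return [[resp.strip() for resp in per_doc] for per_doc in resps]
+ ```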
178
+
179
+ ## Embedded Python Code
180
+
181
+ You can use Python functions for certain arguments by using the `!function` operator after the argument name, followed by `<filename>.<pythonfunctionname>`. This feature can be used for the following arguments:
182
+
183
+ 1. `doc_to_text`
184
+ 2. `doc_to_target`
185
+ 3. `doc_to_choice`
186
+ 4. `aggregation` for a `metric` in `metric_list`
187
+
188
+ ## (No Longer Recommended) Direct `Task` Subclassing
189
+
190
+ The prior implementation method of new tasks was to subclass `Task`. While we intend to migrate all tasks to the new YAML implementation option going forward, it remains possible to subclass the Task class and implement custom logic. For more information, see `docs/task_guide.md` in v0.3.0 of the `lm-evaluation-harness`.
191
+
192
+ ## Including a Base YAML
193
+
194
+ You can base a YAML on another YAML file as a template. This can be handy when you just need to change the prompt for `doc_to_text` but keep the rest the same, or change `filters` to compare which works better. Simply use `include` in the YAML file and write the name of the template you want to base it on. This assumes that the base template is in the same directory; otherwise, you will need to define the full path.
195
+
196
+ ```yaml
197
+ include: <YAML filename or with full path>
198
+ ...
199
+ ```
200
+
201
+ You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml), where it is based on [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml).
202
+
203
+ ## Passing Arguments to Metrics
204
+
205
+ Metrics can be defined in the `metric_list` argument when building the YAML config. Multiple metrics can be listed along with any auxiliary arguments. For example, setting the [`exact_match` metric](https://github.com/huggingface/evaluate/tree/main/metrics/exact_match), auxiliary arguments such as `ignore_case`, `ignore_punctuation`, `regexes_to_ignore` can be listed as well. They will be added to the metric function as `kwargs`. Some metrics have predefined values for `aggregation` and `higher_is_better` so listing the metric name only can be sufficient.
206
+
207
+ ```yaml
208
+ metric_list:
+   - metric: acc
+   - metric: exact_match
+     aggregation: mean
+     higher_is_better: true
+     ignore_case: true
+     ignore_punctuation: false
+     regexes_to_ignore:
+       - ","
+       - "\\$"
218
+ ```
219
+
220
+ ### Natively Supported Metrics
221
+
222
+ Here we list all metrics currently supported natively in `lm-eval`:
223
+
224
+ Metrics:
225
+
226
+ - `acc` (accuracy)
227
+ - `acc_norm` (length-normalized accuracy)
228
+ - `acc_mutual_info` (baseline loglikelihood - normalized accuracy)
229
+ - `perplexity`
230
+ - `word_perplexity` (perplexity per word)
231
+ - `byte_perplexity` (perplexity per byte)
232
+ - `bits_per_byte`
233
+ - `matthews_corrcoef` (Matthews correlation coefficient)
234
+ - `f1` (F1 score)
235
+ - `bleu`
236
+ - `chrf`
237
+ - `ter`
238
+
239
+ Aggregation functions:
240
+
241
+ - `mean`
242
+ - `median`
243
+ - `perplexity`
244
+ - `weighted_perplexity`
245
+ - `bits_per_byte`
246
+
247
+ ### Adding a Multiple Choice Metric
248
+
249
+ Adding a multiple choice metric has a few steps. To get it working you need to:
250
+
251
+ 1. register a metric function
252
+ 2. register an aggregation function
253
+ 3. update the `Task` definition to make sure the correct arguments are passed
254
+
255
+ The default metric and aggregation functions are in `lm_eval/api/metrics.py`, and you can add a function there if it's for general use. The metrics are towards the bottom of the file and look like this:
256
+
257
+ ```python
258
+ @register_metric(
+     metric="mcc",
+     higher_is_better=True,
+     output_type="multiple_choice",
+     aggregation="matthews_corrcoef",
+ )
+ def mcc_fn(items):  # This is a passthrough function
+     return items
266
+ ```
267
+
268
+ Note that many of these are passthrough functions, and for multiple choice (at least) this function is never actually called.
269
+
270
+ Aggregation functions are defined towards the top of the file, here's an example:
271
+
272
+ ```python
273
+ @register_aggregation("matthews_corrcoef")
+ def matthews_corrcoef(items):
+     unzipped_list = list(zip(*items))
+     golds = unzipped_list[0]
+     preds = unzipped_list[1]
+     return sklearn.metrics.matthews_corrcoef(golds, preds)
279
+ ```
280
+
281
+ This function returns a single numeric value. The input is defined in `Task.process_results` in `lm_eval/api/task.py`. There's a section that looks like this:
282
+
283
+ ```python
284
+ result_dict = {
+     **({"acc": acc} if "acc" in use_metric else {}),
+     **({"f1": (gold, pred)} if "f1" in use_metric else {}),
+     **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
+     **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+     **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
+ }
291
+ ```
292
+
293
+ The value here determines the input to the aggregation function, though the name used matches the metric function. These metrics all have simple needs and just need the accuracy or gold and predicted values, but immediately below this there are examples of metrics with more complicated needs you can use as reference.
294
+
295
+ ## Good Reference Tasks
296
+
297
+ Contributing a new task can be daunting! Luckily, much of the work has often been done for you in a different, similarly evaluated task. Good examples of task implementations to study include:
298
+
299
+ Multiple choice tasks:
300
+
301
+ - SciQ (`lm_eval/tasks/sciq/sciq.yaml`)
302
+
303
+ Corpus perplexity evaluations:
304
+
305
+ - Wikitext (`lm_eval/tasks/wikitext/wikitext.yaml`)
306
+
307
+ Generative tasks:
308
+
309
+ - GSM8k (`lm_eval/tasks/gsm8k/gsm8k.yaml`)
310
+
311
+ Tasks using complex filtering:
312
+
313
+ - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)
314
+
315
+ # Group Configuration
316
+
317
+ When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be cumbersome to have to list the set of tasks or add a new group name to each yaml of each individual task.
318
+
319
+ To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml]. See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs.
320
+
321
+ ## Configurations
322
+
323
+ Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a task.
324
+
325
+ ### Parameters
326
+
327
+ - **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line.
328
+ - **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output.
329
+ - **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group.
330
+ - **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are:
331
+ - `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name.)
332
+ - `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.**
333
+ - `weight_by_size: bool = True` whether to perform micro- averaging (`True`) or macro- (`False`) averaging of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset.
334
+ - `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`.
335
+ - **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. Set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example.
lm-evaluation-harness/examples/lm-eval-overview.ipynb ADDED
@@ -0,0 +1,1240 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "Qw83KAePAhaS"
7
+ },
8
+ "source": [
9
+ "# Releasing LM-Evaluation-Harness v0.4.0"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "Z7k2vq1iAdqr"
16
+ },
17
+ "source": [
18
+ "With the vast amount of work done in the field today, it helps to have a tool that people can use easily to share their results and use to check others to ensure reported numbers are valid. The LM Evaluation Harness is one such tool the community has used extensively. We want to continue to support the community and with that in mind, we’re excited to announce a major update on the LM Evaluation Harness to further our goal for open and accessible AI research."
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {
24
+ "id": "0gDoM0AJAvEc"
25
+ },
26
+ "source": [
27
+ "Our refactor stems from our desires to make the following believed best practices easier to carry out. \n",
28
+ "\n",
29
+ "1. Never copy results from other papers\n",
30
+ "2. Always share your exact prompts\n",
31
+ "3. Always provide model outputs\n",
32
+ "4. Qualitatively review a small batch of outputs before running evaluation jobs at scale\n",
33
+ "\n",
34
+ "We also wanted to make the library a better experience to use and to contribute or design evaluations within. New features in the new release that serve this purpose include:\n",
35
+ "\n",
36
+ "1. Faster Evaluation Runtimes (accelerated data-parallel inference with HF Transformers + Accelerate, and commonly used or faster inference libraries such as vLLM and Llama-CPP)\n",
37
+ "2. Easier addition and sharing of new tasks (YAML-based task config formats, allowing single-file sharing of custom tasks)\n",
38
+ "3. More configurability, for more advanced workflows and easier operation with modifying prompts\n",
39
+ "4. Better logging of data at runtime and post-hoc"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "metadata": {
45
+ "id": "nnwsOpjda_YW"
46
+ },
47
+ "source": [
48
+ "In this notebook we will be going through a short tutorial on how things work."
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "metadata": {
54
+ "id": "zAov81vTbL2K"
55
+ },
56
+ "source": [
57
+ "## Install LM-Eval"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 1,
63
+ "metadata": {
64
+ "colab": {
65
+ "base_uri": "https://localhost:8080/"
66
+ },
67
+ "id": "8hiosGzq_qZg",
68
+ "outputId": "6ab73e5e-1f54-417e-a388-07e0d870b132"
69
+ },
70
+ "outputs": [
71
+ {
72
+ "name": "stdout",
73
+ "output_type": "stream",
74
+ "text": [
75
+ "Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor\n",
76
+ " Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision big-refactor) to /tmp/pip-req-build-tnssql5s\n",
77
+ " Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-tnssql5s\n",
78
+ " Running command git checkout -b big-refactor --track origin/big-refactor\n",
79
+ " Switched to a new branch 'big-refactor'\n",
80
+ " Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
81
+ " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
82
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
83
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
84
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
85
+ "Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
86
+ " Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
87
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
88
+ "\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
89
+ " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
90
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
91
+ "\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
92
+ " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
93
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
94
+ "\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
95
+ " Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
96
+ "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
97
+ "Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
98
+ " Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
99
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
100
+ "\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
101
+ " Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
102
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
103
+ "\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
104
+ " Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
105
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
106
+ "\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
107
+ " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
108
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
109
+ "Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
110
+ " Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
111
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
112
+ "\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
113
+ "Collecting sqlitedict (from lm-eval==1.0.0)\n",
114
+ " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
115
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
116
+ "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
117
+ "Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
118
+ " Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
119
+ "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
120
+ "Collecting zstandard (from lm-eval==1.0.0)\n",
121
+ " Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
122
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
123
+ "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
124
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
125
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
126
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
127
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (0.19.4)\n",
128
+ "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (9.0.0)\n",
129
+ "Collecting pyarrow-hotfix (from datasets>=2.0.0->lm-eval==1.0.0)\n",
130
+ " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
131
+ "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
132
+ " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
133
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
134
+ "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
135
+ "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
136
+ "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
137
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
138
+ "Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
139
+ " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
140
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
141
+ "\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
142
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
143
+ "Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
144
+ " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
145
+ "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft>=0.2.0->lm-eval==1.0.0) (0.4.0)\n",
146
+ "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.4.0)\n",
147
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (3.8.1)\n",
148
+ "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.16.0)\n",
149
+ "Collecting portalocker (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
150
+ " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
151
+ "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (2023.6.3)\n",
152
+ "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (0.9.0)\n",
153
+ "Collecting colorama (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
154
+ " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
155
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (4.9.3)\n",
156
+ "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.11.3)\n",
157
+ "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.3.2)\n",
158
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (3.2.0)\n",
159
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.13.1)\n",
160
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (4.5.0)\n",
161
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (1.12)\n",
162
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.2.1)\n",
163
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.1.2)\n",
164
+ "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (2.1.0)\n",
165
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==1.0.0) (0.15.0)\n",
166
+ "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==1.0.0) (23.1.0)\n",
167
+ "Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm-eval==1.0.0) (67.7.2)\n",
168
+ "Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==1.0.0)\n",
169
+ " Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n",
170
+ "Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==1.0.0)\n",
171
+ " Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n",
172
+ "Collecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm-eval==1.0.0)\n",
173
+ " Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n",
174
+ "Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==1.0.0)\n",
175
+ " Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n",
176
+ "Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==1.0.0)\n",
177
+ " Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n",
178
+ "Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==1.0.0)\n",
179
+ " Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n",
180
+ "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (3.3.2)\n",
181
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (6.0.4)\n",
182
+ "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (4.0.3)\n",
183
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.9.2)\n",
184
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.4.0)\n",
185
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.3.1)\n",
186
+ "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==1.0.0) (5.2.0)\n",
187
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (3.4)\n",
188
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2.0.7)\n",
189
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2023.7.22)\n",
190
+ "Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2.8.2)\n",
191
+ "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2023.3.post1)\n",
192
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm-eval==1.0.0) (2.1.3)\n",
193
+ "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
194
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
195
+ "Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
196
+ " Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
197
+ " Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
198
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
199
+ " Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
200
+ " Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
201
+ " Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
202
+ " Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
203
+ " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
204
+ " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
205
+ "Successfully built lm-eval rouge-score sqlitedict\n",
206
+ "Installing collected packages: sqlitedict, zstandard, tcolorpy, pybind11, pyarrow-hotfix, portalocker, pathvalidate, mbstrdecoder, jsonlines, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, responses, multiprocess, accelerate, datasets, DataProperty, tabledata, peft, evaluate, pytablewriter, lm-eval\n",
207
+ "Successfully installed DataProperty-1.0.1 accelerate-0.24.1 colorama-0.4.6 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 jsonlines-4.0.0 lm-eval-1.0.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 pathvalidate-3.2.0 peft-0.6.2 portalocker-2.8.2 pyarrow-hotfix-0.6 pybind11-2.11.1 pytablewriter-1.2.0 responses-0.18.0 rouge-score-0.1.2 sacrebleu-2.3.2 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tqdm-multiprocess-0.0.11 typepy-1.3.2 zstandard-0.22.0\n"
208
+ ]
209
+ }
210
+ ],
211
+ "source": [
212
+ "# Install LM-Eval\n",
213
+ "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": 2,
219
+ "metadata": {
220
+ "colab": {
221
+ "base_uri": "https://localhost:8080/",
222
+ "height": 0,
223
+ "referenced_widgets": [
224
+ "a1d3a8aa016544a78e8821c8f6199e06",
225
+ "f61ed33fad754146bdd2ac9db1ba1c48",
226
+ "bfa0af6aeff344c6845e1080a878e92e",
227
+ "fd1ad9e0367d4004aae853b91c3a7617",
228
+ "6b2d90209ec14230b3d58a74ac9b83bf",
229
+ "a73f357065d34d7baf0453ae4a8d75e2",
230
+ "46f521b73fd943c081c648fd873ebc0a",
231
+ "7c5689bc13684db8a22681f41863dddd",
232
+ "48763b6233374554ae76035c0483066f",
233
+ "4986a21eb560448fa79f4b25cde48951",
234
+ "aed3acd2f2d74003b44079c333a0698e"
235
+ ]
236
+ },
237
+ "id": "uyO5MaKkZyah",
238
+ "outputId": "d46e8096-5086-4e49-967e-ea33d4a2a335"
239
+ },
240
+ "outputs": [
241
+ {
242
+ "data": {
243
+ "application/vnd.jupyter.widget-view+json": {
244
+ "model_id": "a1d3a8aa016544a78e8821c8f6199e06",
245
+ "version_major": 2,
246
+ "version_minor": 0
247
+ },
248
+ "text/plain": [
249
+ "Downloading builder script: 0%| | 0.00/5.67k [00:00<?, ?B/s]"
250
+ ]
251
+ },
252
+ "metadata": {},
253
+ "output_type": "display_data"
254
+ }
255
+ ],
256
+ "source": []
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "metadata": {
261
+ "id": "8rfUeX6n_wkK"
262
+ },
263
+ "source": [
264
+ "## Create new evaluation tasks with config-based tasks\n",
265
+ "\n",
266
+ "Even within the same task, many works have reported numbers based on different evaluation choices. Some report on the test set, some on the validation set, and some even on a subset of the training set. Others use specialized prompts and verbalizers. We introduce YAML configs to let users easily create such variations. By configuring evaluations through YAML, the refactored LM-Eval takes the methods of the `Task` object and makes them configurable via attributes in the config file. There, users can define the tasks they want by setting the name of the HF dataset (local tasks are also possible), the dataset splits to use, and much more. Key prompting configurations such as `doc_to_text`, previously implemented as a method of the same name, are now configurable with Jinja2, allowing high-level scripting to transform an HF dataset row into the text string fed to the model.\n",
267
+ "\n"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "markdown",
272
+ "metadata": {
273
+ "id": "HYFUhhfOSJKe"
274
+ },
275
+ "source": [
276
+ "A core feature of LM-Eval is configuring tasks with YAML configs. With configs, you can fill in preset fields to easily set up a task.\n",
277
+ "\n",
278
+ "Here, we write a demo YAML config for a multiple-choice evaluation of BoolQ:"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 3,
284
+ "metadata": {
285
+ "id": "bg3dGROW-V39"
286
+ },
287
+ "outputs": [],
288
+ "source": [
289
+ "YAML_boolq_string = \"\"\"\n",
290
+ "task: demo_boolq\n",
291
+ "dataset_path: super_glue\n",
292
+ "dataset_name: boolq\n",
293
+ "output_type: multiple_choice\n",
294
+ "training_split: train\n",
295
+ "validation_split: validation\n",
296
+ "doc_to_text: \"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\"\n",
297
+ "doc_to_target: label\n",
298
+ "doc_to_choice: [\"no\", \"yes\"]\n",
299
+ "should_decontaminate: true\n",
300
+ "doc_to_decontamination_query: passage\n",
301
+ "metric_list:\n",
302
+ " - metric: acc\n",
303
+ "\"\"\"\n",
304
+ "with open(\"boolq.yaml\", \"w\") as f:\n",
305
+ " f.write(YAML_boolq_string)"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "markdown",
310
+ "metadata": {},
311
+ "source": [
312
+ "We can now run evaluation on this task by pointing to the config file we've just created:"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 4,
318
+ "metadata": {
319
+ "id": "LOUHK7PtQfq4"
320
+ },
321
+ "outputs": [
322
+ {
323
+ "name": "stdout",
324
+ "output_type": "stream",
325
+ "text": [
326
+ "2023-11-29:11:54:55,156 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
327
+ "2023-11-29 11:54:55.942051: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
328
+ "2023-11-29 11:54:55.942108: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
329
+ "2023-11-29 11:54:55.942142: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
330
+ "2023-11-29 11:54:57.066802: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
331
+ "2023-11-29:11:55:00,954 INFO [__main__.py:132] Verbosity set to INFO\n",
332
+ "2023-11-29:11:55:11,038 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
333
+ "2023-11-29:11:55:11,038 INFO [__main__.py:143] Including path: ./\n",
334
+ "2023-11-29:11:55:11,046 INFO [__main__.py:205] Selected Tasks: ['demo_boolq']\n",
335
+ "2023-11-29:11:55:11,047 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
336
+ "2023-11-29:11:55:11,110 INFO [huggingface.py:120] Using device 'cuda'\n",
337
+ "config.json: 100% 571/571 [00:00<00:00, 2.87MB/s]\n",
338
+ "model.safetensors: 100% 5.68G/5.68G [00:32<00:00, 173MB/s]\n",
339
+ "tokenizer_config.json: 100% 396/396 [00:00<00:00, 2.06MB/s]\n",
340
+ "tokenizer.json: 100% 2.11M/2.11M [00:00<00:00, 11.6MB/s]\n",
341
+ "special_tokens_map.json: 100% 99.0/99.0 [00:00<00:00, 555kB/s]\n",
342
+ "2023-11-29:11:56:18,658 WARNING [task.py:614] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
343
+ "2023-11-29:11:56:18,658 WARNING [task.py:626] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
344
+ "Downloading builder script: 100% 30.7k/30.7k [00:00<00:00, 59.0MB/s]\n",
345
+ "Downloading metadata: 100% 38.7k/38.7k [00:00<00:00, 651kB/s]\n",
346
+ "Downloading readme: 100% 14.8k/14.8k [00:00<00:00, 37.3MB/s]\n",
347
+ "Downloading data: 100% 4.12M/4.12M [00:00<00:00, 55.1MB/s]\n",
348
+ "Generating train split: 100% 9427/9427 [00:00<00:00, 15630.89 examples/s]\n",
349
+ "Generating validation split: 100% 3270/3270 [00:00<00:00, 20002.56 examples/s]\n",
350
+ "Generating test split: 100% 3245/3245 [00:00<00:00, 20866.19 examples/s]\n",
351
+ "2023-11-29:11:56:22,315 INFO [task.py:355] Building contexts for task on rank 0...\n",
352
+ "2023-11-29:11:56:22,322 INFO [evaluator.py:319] Running loglikelihood requests\n",
353
+ "100% 20/20 [00:04<00:00, 4.37it/s]\n",
354
+ "fatal: not a git repository (or any of the parent directories): .git\n",
355
+ "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
356
+ "| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
357
+ "|----------|-------|------|-----:|------|----:|---|-----:|\n",
358
+ "|demo_boolq|Yaml |none | 0|acc | 1|± | 0|\n",
359
+ "\n"
360
+ ]
361
+ }
362
+ ],
363
+ "source": [
364
+ "%env LOGLEVEL=DEBUG\n",
365
+ "!lm_eval \\\n",
366
+ " --model hf \\\n",
367
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
368
+ " --include_path ./ \\\n",
369
+ " --tasks demo_boolq \\\n",
370
+ " --limit 10"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "markdown",
375
+ "metadata": {
376
+ "id": "LOUHK7PtQfq4"
377
+ },
378
+ "source": [
379
+ "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can emerge that mix and match new and older tasks alike. In LM-Eval, we can also group tasks and then call the group name to easily evaluate a whole set of tasks. In this instance, let's evaluate the tag `yes_or_no_tasks`, which comprises the tasks `demo_boolq` and `demo_cola`: multiple-choice tasks whose options are `yes` and `no`, as the name suggests.\n",
380
+ "\n",
381
+ "<!-- Making new groups is easier than ever, allowing users to work bottom-up by making individual tasks and linking them to a group, or top-down by making a new group that lists existing tasks.\n",
382
+ "\n",
383
+ "We also show the aggregate across samples, in addition to the aggregation across subtasks. This may come in handy when certain groups should be aggregated as a single task. -->\n",
384
+ "\n",
385
+ "\n"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 5,
391
+ "metadata": {
392
+ "id": "fthNg3ywO-kA"
393
+ },
394
+ "outputs": [],
395
+ "source": [
396
+ "YAML_cola_string = \"\"\"\n",
397
+ "tag: yes_or_no_tasks\n",
398
+ "task: demo_cola\n",
399
+ "dataset_path: glue\n",
400
+ "dataset_name: cola\n",
401
+ "output_type: multiple_choice\n",
402
+ "training_split: train\n",
403
+ "validation_split: validation\n",
404
+ "doc_to_text: \"{{sentence}}\\nQuestion: Does this sentence make sense?\\nAnswer:\"\n",
405
+ "doc_to_target: label\n",
406
+ "doc_to_choice: [\"no\", \"yes\"]\n",
407
+ "should_decontaminate: true\n",
408
+ "doc_to_decontamination_query: sentence\n",
409
+ "metric_list:\n",
410
+ " - metric: acc\n",
411
+ "\"\"\"\n",
412
+ "with open(\"cola.yaml\", \"w\") as f:\n",
413
+ " f.write(YAML_cola_string)"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": 6,
419
+ "metadata": {
420
+ "id": "XceRKCuuDtbn"
421
+ },
422
+ "outputs": [
423
+ {
424
+ "name": "stdout",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "2023-11-29:11:56:33,016 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
428
+ "2023-11-29 11:56:33.852995: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
429
+ "2023-11-29 11:56:33.853050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
430
+ "2023-11-29 11:56:33.853087: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
431
+ "2023-11-29 11:56:35.129047: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
432
+ "2023-11-29:11:56:38,546 INFO [__main__.py:132] Verbosity set to INFO\n",
433
+ "2023-11-29:11:56:47,509 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
434
+ "2023-11-29:11:56:47,509 INFO [__main__.py:143] Including path: ./\n",
435
+ "2023-11-29:11:56:47,517 INFO [__main__.py:205] Selected Tasks: ['yes_or_no_tasks']\n",
436
+ "2023-11-29:11:56:47,520 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
437
+ "2023-11-29:11:56:47,550 INFO [huggingface.py:120] Using device 'cuda'\n",
438
+ "2023-11-29:11:57:08,743 WARNING [task.py:614] [Task: demo_cola] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
439
+ "2023-11-29:11:57:08,743 WARNING [task.py:626] [Task: demo_cola] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
440
+ "Downloading builder script: 100% 28.8k/28.8k [00:00<00:00, 52.7MB/s]\n",
441
+ "Downloading metadata: 100% 28.7k/28.7k [00:00<00:00, 51.9MB/s]\n",
442
+ "Downloading readme: 100% 27.9k/27.9k [00:00<00:00, 48.0MB/s]\n",
443
+ "Downloading data: 100% 377k/377k [00:00<00:00, 12.0MB/s]\n",
444
+ "Generating train split: 100% 8551/8551 [00:00<00:00, 19744.58 examples/s]\n",
445
+ "Generating validation split: 100% 1043/1043 [00:00<00:00, 27057.01 examples/s]\n",
446
+ "Generating test split: 100% 1063/1063 [00:00<00:00, 22705.17 examples/s]\n",
447
+ "2023-11-29:11:57:11,698 INFO [task.py:355] Building contexts for task on rank 0...\n",
448
+ "2023-11-29:11:57:11,704 INFO [evaluator.py:319] Running loglikelihood requests\n",
449
+ "100% 20/20 [00:03<00:00, 5.15it/s]\n",
450
+ "fatal: not a git repository (or any of the parent directories): .git\n",
451
+ "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
452
+ "| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
453
+ "|---------------|-------|------|-----:|------|----:|---|-----:|\n",
454
+ "|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
455
+ "| - demo_cola |Yaml |none | 0|acc | 0.7|± |0.1528|\n",
456
+ "\n",
457
+ "| Groups |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
458
+ "|---------------|-------|------|-----:|------|----:|---|-----:|\n",
459
+ "|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
460
+ "\n"
461
+ ]
462
+ }
463
+ ],
464
+ "source": [
465
+ "# !accelerate launch --no_python\n",
466
+ "%env LOGLEVEL=DEBUG\n",
467
+ "!lm_eval \\\n",
468
+ " --model hf \\\n",
469
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
470
+ " --include_path ./ \\\n",
471
+ " --tasks yes_or_no_tasks \\\n",
472
+ " --limit 10 \\\n",
473
+ " --output output/yes_or_no_tasks/ \\\n",
474
+ " --log_samples"
475
+ ]
476
+ },
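+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that in this demo only `demo_cola` carries the `tag: yes_or_no_tasks` field, which is why the group table above lists a single subtask. As a minimal, untested sketch, prepending the same `tag` line to the `boolq.yaml` we wrote earlier should pull `demo_boolq` into the group on the next run as well:\n",
+ "\n",
+ "```python\n",
+ "# Sketch: add the group tag to the existing boolq.yaml so that both\n",
+ "# demo_boolq and demo_cola are reported under yes_or_no_tasks.\n",
+ "with open(\"boolq.yaml\") as f:\n",
+ " boolq_yaml = f.read()\n",
+ "with open(\"boolq.yaml\", \"w\") as f:\n",
+ " f.write(\"tag: yes_or_no_tasks\\n\" + boolq_yaml)\n",
+ "```"
+ ]
+ },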
477
+ {
478
+ "cell_type": "markdown",
479
+ "metadata": {
480
+ "id": "XceRKCuuDtbn"
481
+ },
482
+ "source": [
483
+ "## Edit Prompt Templates Quickly\n",
484
+ "\n",
485
+ "The following is a YAML config for evaluating the specific subtask `high_school_geography` from MMLU. It uses the standard prompt, where we take the answer letter with the highest likelihood as the model's prediction."
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": 7,
491
+ "metadata": {
492
+ "id": "GTFvdt9kSlBG"
493
+ },
494
+ "outputs": [],
495
+ "source": [
496
+ "YAML_mmlu_geo_string = \"\"\"\n",
497
+ "task: demo_mmlu_high_school_geography\n",
498
+ "dataset_path: cais/mmlu\n",
499
+ "dataset_name: high_school_geography\n",
500
+ "description: \"The following are multiple choice questions (with answers) about high school geography.\\n\\n\"\n",
501
+ "test_split: test\n",
502
+ "fewshot_split: dev\n",
503
+ "fewshot_config:\n",
504
+ " sampler: first_n\n",
505
+ "output_type: multiple_choice\n",
506
+ "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
507
+ "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
508
+ "doc_to_target: answer\n",
509
+ "metric_list:\n",
510
+ " - metric: acc\n",
511
+ " aggregation: mean\n",
512
+ " higher_is_better: true\n",
513
+ " - metric: acc_norm\n",
514
+ " aggregation: mean\n",
515
+ " higher_is_better: true\n",
516
+ "\"\"\"\n",
517
+ "with open(\"mmlu_high_school_geography.yaml\", \"w\") as f:\n",
518
+ " f.write(YAML_mmlu_geo_string)"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": 8,
524
+ "metadata": {
525
+ "id": "jyKOfCsKb-xy"
526
+ },
527
+ "outputs": [
528
+ {
529
+ "name": "stdout",
530
+ "output_type": "stream",
531
+ "text": [
532
+ "2023-11-29:11:57:23,598 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
533
+ "2023-11-29 11:57:24.719750: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
534
+ "2023-11-29 11:57:24.719806: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
535
+ "2023-11-29 11:57:24.719847: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
536
+ "2023-11-29 11:57:26.656125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
537
+ "2023-11-29:11:57:31,563 INFO [__main__.py:132] Verbosity set to INFO\n",
538
+ "2023-11-29:11:57:40,541 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
539
+ "2023-11-29:11:57:40,541 INFO [__main__.py:143] Including path: ./\n",
540
+ "2023-11-29:11:57:40,558 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography']\n",
541
+ "2023-11-29:11:57:40,559 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
542
+ "2023-11-29:11:57:40,589 INFO [huggingface.py:120] Using device 'cuda'\n",
543
+ "Downloading builder script: 100% 5.84k/5.84k [00:00<00:00, 17.7MB/s]\n",
544
+ "Downloading metadata: 100% 106k/106k [00:00<00:00, 892kB/s] \n",
545
+ "Downloading readme: 100% 39.7k/39.7k [00:00<00:00, 631kB/s]\n",
546
+ "Downloading data: 100% 166M/166M [00:01<00:00, 89.0MB/s]\n",
547
+ "Generating auxiliary_train split: 100% 99842/99842 [00:07<00:00, 12536.83 examples/s]\n",
548
+ "Generating test split: 100% 198/198 [00:00<00:00, 1439.20 examples/s]\n",
549
+ "Generating validation split: 100% 22/22 [00:00<00:00, 4181.76 examples/s]\n",
550
+ "Generating dev split: 100% 5/5 [00:00<00:00, 36.25 examples/s]\n",
551
+ "2023-11-29:11:58:09,798 INFO [task.py:355] Building contexts for task on rank 0...\n",
552
+ "2023-11-29:11:58:09,822 INFO [evaluator.py:319] Running loglikelihood requests\n",
553
+ "100% 40/40 [00:05<00:00, 7.86it/s]\n",
554
+ "fatal: not a git repository (or any of the parent directories): .git\n",
555
+ "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
556
+ "| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
557
+ "|-------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
558
+ "|demo_mmlu_high_school_geography|Yaml |none | 0|acc | 0.3|± |0.1528|\n",
559
+ "| | |none | 0|acc_norm| 0.3|± |0.1528|\n",
560
+ "\n"
561
+ ]
562
+ }
563
+ ],
564
+ "source": [
565
+ "# !accelerate launch --no_python\n",
566
+ "%env LOGLEVEL=DEBUG\n",
567
+ "!lm_eval \\\n",
568
+ " --model hf \\\n",
569
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
570
+ " --include_path ./ \\\n",
571
+ " --tasks demo_mmlu_high_school_geography \\\n",
572
+ " --limit 10 \\\n",
573
+ " --output output/mmlu_high_school_geography/ \\\n",
574
+ " --log_samples"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "markdown",
579
+ "metadata": {
580
+ "id": "jyKOfCsKb-xy"
581
+ },
582
+ "source": [
583
+ "We could also evaluate this task in a different way. For example, instead of scoring the loglikelihood of the answer letters, we can evaluate the choices themselves as the continuation. This is done by simply changing `doc_to_choice` from a list of letters to the corresponding `choices` field from the HF dataset. We write `\"{{choices}}\"` so that the string is interpreted as a Jinja2 template that acquires the list from the HF dataset directly.\n",
584
+ "\n",
585
+ "Another convenient feature here: since we're only modifying `doc_to_choice` and the rest of the config is the same as the task above, we can use the above configuration as a template by adding `include: mmlu_high_school_geography.yaml` to load the config from that file. We'll need to give the new task a unique name so it doesn't collide with the existing YAML config we're including; in this case we'll simply name it `demo_mmlu_high_school_geography_continuation`. `doc_to_text` is repeated here just for the sake of clarity."
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 9,
591
+ "metadata": {
592
+ "id": "lqElwU54TaK-"
593
+ },
594
+ "outputs": [],
595
+ "source": [
596
+ "YAML_mmlu_geo_string = \"\"\"\n",
597
+ "include: mmlu_high_school_geography.yaml\n",
598
+ "task: demo_mmlu_high_school_geography_continuation\n",
599
+ "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
600
+ "doc_to_choice: \"{{choices}}\"\n",
601
+ "\"\"\"\n",
602
+ "with open(\"mmlu_high_school_geography_continuation.yaml\", \"w\") as f:\n",
603
+ " f.write(YAML_mmlu_geo_string)"
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": 10,
609
+ "metadata": {
610
+ "id": "-_CVnDirdy7j"
611
+ },
612
+ "outputs": [
613
+ {
614
+ "name": "stdout",
615
+ "output_type": "stream",
616
+ "text": [
617
+ "2023-11-29:11:58:21,284 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
618
+ "2023-11-29 11:58:22.850159: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
619
+ "2023-11-29 11:58:22.850219: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
620
+ "2023-11-29 11:58:22.850254: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
621
+ "2023-11-29 11:58:24.948103: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
622
+ "2023-11-29:11:58:28,460 INFO [__main__.py:132] Verbosity set to INFO\n",
623
+ "2023-11-29:11:58:37,935 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
624
+ "2023-11-29:11:58:37,935 INFO [__main__.py:143] Including path: ./\n",
625
+ "2023-11-29:11:58:37,969 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_continuation']\n",
626
+ "2023-11-29:11:58:37,972 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
627
+ "2023-11-29:11:58:38,008 INFO [huggingface.py:120] Using device 'cuda'\n",
628
+ "2023-11-29:11:58:59,758 INFO [task.py:355] Building contexts for task on rank 0...\n",
629
+ "2023-11-29:11:58:59,777 INFO [evaluator.py:319] Running loglikelihood requests\n",
630
+ "100% 40/40 [00:02<00:00, 16.23it/s]\n",
631
+ "fatal: not a git repository (or any of the parent directories): .git\n",
632
+ "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
633
+ "| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
634
+ "|--------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
635
+ "|demo_mmlu_high_school_geography_continuation|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
636
+ "| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
637
+ "\n"
638
+ ]
639
+ }
640
+ ],
641
+ "source": [
642
+ "# !accelerate launch --no_python\n",
643
+ "%env LOGLEVEL=DEBUG\n",
644
+ "!lm_eval \\\n",
645
+ " --model hf \\\n",
646
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
647
+ " --include_path ./ \\\n",
648
+ " --tasks demo_mmlu_high_school_geography_continuation \\\n",
649
+ " --limit 10 \\\n",
650
+ " --output output/mmlu_high_school_geography_continuation/ \\\n",
651
+ " --log_samples"
652
+ ]
653
+ },
654
+ {
655
+ "cell_type": "markdown",
656
+ "metadata": {
657
+ "id": "-_CVnDirdy7j"
658
+ },
659
+ "source": [
660
+ "If we take a look at the samples, we can see that it is in fact evaluating the continuation based on the choices rather than the letters."
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": 11,
666
+ "metadata": {
667
+ "id": "duBDqC6PAdjL"
668
+ },
669
+ "outputs": [
670
+ {
671
+ "data": {
672
+ "application/javascript": "\n ((filepath) => {{\n if (!google.colab.kernel.accessAllowed) {{\n return;\n }}\n google.colab.files.view(filepath);\n }})(\"/content/output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")",
673
+ "text/plain": [
674
+ "<IPython.core.display.Javascript object>"
675
+ ]
676
+ },
677
+ "metadata": {},
678
+ "output_type": "display_data"
679
+ }
680
+ ],
681
+ "source": [
682
+ "from google.colab import files\n",
683
+ "\n",
684
+ "\n",
685
+ "files.view(\n",
686
+ " \"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\"\n",
687
+ ")"
688
+ ]
689
+ },
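+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you are not running in Colab, the logged samples are ordinary JSON-lines files, so a plain-Python sketch like the following works anywhere (the exact filename depends on the model and task names used in the run above):\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "\n",
+ "path = (\n",
+ " \"output/mmlu_high_school_geography_continuation/\"\n",
+ " \"pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\"\n",
+ ")\n",
+ "# Each line of the file is one evaluated document.\n",
+ "with open(path) as f:\n",
+ " first_sample = json.loads(f.readline())\n",
+ "print(list(first_sample.keys()))\n",
+ "print(str(first_sample)[:500])\n",
+ "```"
+ ]
+ },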
690
+ {
691
+ "cell_type": "markdown",
692
+ "metadata": {
693
+ "id": "6p0-KPwAgK5j"
694
+ },
695
+ "source": [
696
+ "## Closer Look at YAML Fields\n",
697
+ "\n",
698
+ "To prepare a task we can simply fill in a YAML config with the relevant information.\n",
699
+ "\n",
700
+ "`output_type`\n",
701
+ "The currently provided evaluation types are the following:\n",
702
+ "1. `loglikelihood`: Evaluates the loglikelihood of a continuation, conditioned on some input string.\n",
703
+ "2. `loglikelihood_rolling`: Evaluates the loglikelihood of producing a string, conditioned on the empty string. (Used for perplexity evaluations.)\n",
704
+ "3. `multiple_choice`: Compares the loglikelihood the model assigns to each of a fixed set of answer choices.\n",
705
+ "4. `greedy_until`: The model generates output greedily (this can be configured to use beam search and other generation-related parameters).\n",
706
+ "\n",
707
+ "The core prompt revolves around three fields:\n",
708
+ "1. `doc_to_text`: Denotes the prompt template that will be used as input to the model.\n",
709
+ "2. `doc_to_choice`: The available choices that will be used as continuations for the model. This is used when the `output_type` is `multiple_choice`, and can otherwise be left as `None`.\n",
710
+ "3. `doc_to_target`: When `output_type` is `multiple_choice`, this can be an index that corresponds to the correct answer, or the answer string itself (which must be one of the entries in `doc_to_choice`). For other tasks, this is expected to be a string. You can fill this field with a feature name from the HF dataset, so long as the resulting feature follows the conditions described above.\n",
711
+ "\n",
712
+ "These three fields can be expressed as strings, column names from the source dataset, or as Jinja2 templates that can use fields from the source dataset as variables.\n"
713
+ ]
714
+ },
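+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To make these fields concrete for a generation-style task, here is a rough, untested sketch of a `greedy_until` config written in the same style as the YAMLs above. The task name is hypothetical and the metric choice is illustrative only (GSM8K answers normally need an answer-extraction filter before `exact_match` is meaningful); generation parameters such as stop sequences can also be configured, so check the task guide in `docs/` for the exact schema.\n",
+ "\n",
+ "```python\n",
+ "# Illustrative sketch only; not executed in this notebook.\n",
+ "YAML_gen_sketch = \"\"\"\n",
+ "task: demo_gsm8k_greedy\n",
+ "dataset_path: gsm8k\n",
+ "dataset_name: main\n",
+ "output_type: greedy_until\n",
+ "training_split: train\n",
+ "test_split: test\n",
+ "doc_to_text: \"Question: {{question}}\\nAnswer:\"\n",
+ "doc_to_target: \"{{answer}}\"\n",
+ "metric_list:\n",
+ " - metric: exact_match\n",
+ "\"\"\"\n",
+ "with open(\"demo_gsm8k_greedy.yaml\", \"w\") as f:\n",
+ " f.write(YAML_gen_sketch)\n",
+ "```"
+ ]
+ },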
715
+ {
716
+ "cell_type": "markdown",
717
+ "metadata": {
718
+ "id": "6p0-KPwAgK5j"
719
+ },
720
+ "source": [
721
+ "## What if Jinja is not Sufficient?\n",
722
+ "\n",
723
+ "There can be times when the Jinja2 templating language is not enough to build the prompt we have in mind. There are a few ways to work around this limitation:\n",
724
+ "\n",
725
+ "1. Use the `!function` operator in the prompt-related fields to pass a Python function that takes the dataset row as input and outputs the prompt component.\n",
726
+ "2. Perform a transformation on the dataset beforehand."
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "markdown",
731
+ "metadata": {},
732
+ "source": [
733
+ "Below, we show an example of using `!function` to create `doc_to_text` from a Python function:"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": 12,
739
+ "metadata": {
740
+ "colab": {
741
+ "base_uri": "https://localhost:8080/"
742
+ },
743
+ "id": "DYZ5c0JhR1lJ",
744
+ "outputId": "ca945235-fb9e-4f17-8bfa-78e7d6ec1490"
745
+ },
746
+ "outputs": [
747
+ {
748
+ "name": "stdout",
749
+ "output_type": "stream",
750
+ "text": [
751
+ "2023-11-29:11:59:08,312 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
752
+ "2023-11-29 11:59:09.348327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
753
+ "2023-11-29 11:59:09.348387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
754
+ "2023-11-29 11:59:09.348421: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
755
+ "2023-11-29 11:59:10.573752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
756
+ "2023-11-29:11:59:14,044 INFO [__main__.py:132] Verbosity set to INFO\n",
757
+ "2023-11-29:11:59:23,654 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
758
+ "2023-11-29:11:59:23,654 INFO [__main__.py:143] Including path: ./\n",
759
+ "2023-11-29:11:59:23,678 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_function_prompt']\n",
760
+ "2023-11-29:11:59:23,679 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
761
+ "2023-11-29:11:59:23,708 INFO [huggingface.py:120] Using device 'cuda'\n",
762
+ "2023-11-29:11:59:44,516 INFO [task.py:355] Building contexts for task on rank 0...\n",
763
+ "2023-11-29:11:59:44,524 INFO [evaluator.py:319] Running loglikelihood requests\n",
764
+ "100% 40/40 [00:02<00:00, 15.41it/s]\n",
765
+ "fatal: not a git repository (or any of the parent directories): .git\n",
766
+ "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
767
+ "| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
768
+ "|-----------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
769
+ "|demo_mmlu_high_school_geography_function_prompt|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
770
+ "| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
771
+ "\n"
772
+ ]
773
+ }
774
+ ],
775
+ "source": [
776
+ "YAML_mmlu_geo_string = \"\"\"\n",
777
+ "include: mmlu_high_school_geography.yaml\n",
778
+ "task: demo_mmlu_high_school_geography_function_prompt\n",
779
+ "doc_to_text: !function utils.doc_to_text\n",
780
+ "doc_to_choice: \"{{choices}}\"\n",
781
+ "\"\"\"\n",
782
+ "with open(\"demo_mmlu_high_school_geography_function_prompt.yaml\", \"w\") as f:\n",
783
+ " f.write(YAML_mmlu_geo_string)\n",
784
+ "\n",
785
+ "DOC_TO_TEXT = \"\"\"\n",
786
+ "def doc_to_text(x):\n",
787
+ " question = x[\"question\"].strip()\n",
788
+ " choices = x[\"choices\"]\n",
789
+ " option_a = choices[0]\n",
790
+ " option_b = choices[1]\n",
791
+ " option_c = choices[2]\n",
792
+ " option_d = choices[3]\n",
793
+ " return f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
794
+ "\"\"\"\n",
795
+ "with open(\"utils.py\", \"w\") as f:\n",
796
+ " f.write(DOC_TO_TEXT)\n",
797
+ "\n",
798
+ "!lm_eval \\\n",
799
+ " --model hf \\\n",
800
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
801
+ " --include_path ./ \\\n",
802
+ " --tasks demo_mmlu_high_school_geography_function_prompt \\\n",
803
+ " --limit 10 \\\n",
804
+ " --output output/demo_mmlu_high_school_geography_function_prompt/ \\\n",
805
+ " --log_samples"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "markdown",
810
+ "metadata": {},
811
+ "source": [
812
+ "Next, we'll also show how to do this by preprocessing the dataset as needed, using the `process_docs` config field.\n",
813
+ "\n",
814
+ "We will write a function that modifies each document in our evaluation split, adding a field that is suitable to use in `doc_to_text`."
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "code",
819
+ "execution_count": null,
820
+ "metadata": {},
821
+ "outputs": [],
822
+ "source": [
823
+ "YAML_mmlu_geo_string = \"\"\"\n",
824
+ "include: mmlu_high_school_geography.yaml\n",
825
+ "task: demo_mmlu_high_school_geography_function_prompt_2\n",
826
+ "process_docs: !function utils_process_docs.process_docs\n",
827
+ "doc_to_text: \"{{input}}\"\n",
828
+ "doc_to_choice: \"{{choices}}\"\n",
829
+ "\"\"\"\n",
830
+ "with open(\"demo_mmlu_high_school_geography_process_docs.yaml\", \"w\") as f:\n",
831
+ " f.write(YAML_mmlu_geo_string)\n",
832
+ "\n",
833
+ "DOC_TO_TEXT = \"\"\"\n",
834
+ "def process_docs(dataset):\n",
835
+ " def _process_doc(x):\n",
836
+ " question = x[\"question\"].strip()\n",
837
+ " choices = x[\"choices\"]\n",
838
+ " option_a = choices[0]\n",
839
+ " option_b = choices[1]\n",
840
+ " option_c = choices[2]\n",
841
+ " option_d = choices[3]\n",
842
+ " x[\"input\"] = f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
843
+ " return x\n",
844
+ "\n",
845
+ " return dataset.map(_process_doc)\n",
846
+ "\"\"\"\n",
847
+ "\n",
848
+ "with open(\"utils_process_docs.py\", \"w\") as f:\n",
849
+ " f.write(DOC_TO_TEXT)\n",
850
+ "\n",
851
+ "!lm_eval \\\n",
852
+ " --model hf \\\n",
853
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
854
+ " --include_path ./ \\\n",
855
+ " --tasks demo_mmlu_high_school_geography_function_prompt_2 \\\n",
856
+ " --limit 10 \\\n",
857
+ " --output output/demo_mmlu_high_school_geography_function_prompt_2/ \\\n",
858
+ " --log_samples"
859
+ ]
860
+ },
861
+ {
862
+ "cell_type": "markdown",
863
+ "metadata": {},
864
+ "source": [
865
+ "We hope that this explainer gives you a sense of what can be done with, and how to work with, LM-Evaluation-Harness v0.4.0!\n",
866
+ "\n",
867
+ "For more information, check out our documentation pages in the `docs/` folder, and if you have questions, please raise them in GitHub issues, or in #lm-thunderdome or #release-discussion on the EleutherAI discord server."
868
+ ]
869
+ }
870
+ ],
871
+ "metadata": {
872
+ "accelerator": "GPU",
873
+ "colab": {
874
+ "collapsed_sections": [
875
+ "zAov81vTbL2K"
876
+ ],
877
+ "gpuType": "T4",
878
+ "provenance": []
879
+ },
880
+ "kernelspec": {
881
+ "display_name": "Python 3",
882
+ "name": "python3"
883
+ },
884
+ "language_info": {
885
+ "name": "python"
886
+ },
887
+ "widgets": {
888
+ "application/vnd.jupyter.widget-state+json": {
889
+ "state": {
890
+ "46f521b73fd943c081c648fd873ebc0a": {
891
+ "model_module": "@jupyter-widgets/controls",
892
+ "model_module_version": "1.5.0",
893
+ "model_name": "DescriptionStyleModel",
894
+ "state": {
895
+ "_model_module": "@jupyter-widgets/controls",
896
+ "_model_module_version": "1.5.0",
897
+ "_model_name": "DescriptionStyleModel",
898
+ "_view_count": null,
899
+ "_view_module": "@jupyter-widgets/base",
900
+ "_view_module_version": "1.2.0",
901
+ "_view_name": "StyleView",
902
+ "description_width": ""
903
+ }
904
+ },
905
+ "48763b6233374554ae76035c0483066f": {
906
+ "model_module": "@jupyter-widgets/controls",
907
+ "model_module_version": "1.5.0",
908
+ "model_name": "ProgressStyleModel",
909
+ "state": {
910
+ "_model_module": "@jupyter-widgets/controls",
911
+ "_model_module_version": "1.5.0",
912
+ "_model_name": "ProgressStyleModel",
913
+ "_view_count": null,
914
+ "_view_module": "@jupyter-widgets/base",
915
+ "_view_module_version": "1.2.0",
916
+ "_view_name": "StyleView",
917
+ "bar_color": null,
918
+ "description_width": ""
919
+ }
920
+ },
921
+ "4986a21eb560448fa79f4b25cde48951": {
922
+ "model_module": "@jupyter-widgets/base",
923
+ "model_module_version": "1.2.0",
924
+ "model_name": "LayoutModel",
925
+ "state": {
926
+ "_model_module": "@jupyter-widgets/base",
927
+ "_model_module_version": "1.2.0",
928
+ "_model_name": "LayoutModel",
929
+ "_view_count": null,
930
+ "_view_module": "@jupyter-widgets/base",
931
+ "_view_module_version": "1.2.0",
932
+ "_view_name": "LayoutView",
933
+ "align_content": null,
934
+ "align_items": null,
935
+ "align_self": null,
936
+ "border": null,
937
+ "bottom": null,
938
+ "display": null,
939
+ "flex": null,
940
+ "flex_flow": null,
941
+ "grid_area": null,
942
+ "grid_auto_columns": null,
943
+ "grid_auto_flow": null,
944
+ "grid_auto_rows": null,
945
+ "grid_column": null,
946
+ "grid_gap": null,
947
+ "grid_row": null,
948
+ "grid_template_areas": null,
949
+ "grid_template_columns": null,
950
+ "grid_template_rows": null,
951
+ "height": null,
952
+ "justify_content": null,
953
+ "justify_items": null,
954
+ "left": null,
955
+ "margin": null,
956
+ "max_height": null,
957
+ "max_width": null,
958
+ "min_height": null,
959
+ "min_width": null,
960
+ "object_fit": null,
961
+ "object_position": null,
962
+ "order": null,
963
+ "overflow": null,
964
+ "overflow_x": null,
965
+ "overflow_y": null,
966
+ "padding": null,
967
+ "right": null,
968
+ "top": null,
969
+ "visibility": null,
970
+ "width": null
971
+ }
972
+ },
973
+ "6b2d90209ec14230b3d58a74ac9b83bf": {
974
+ "model_module": "@jupyter-widgets/base",
975
+ "model_module_version": "1.2.0",
976
+ "model_name": "LayoutModel",
977
+ "state": {
978
+ "_model_module": "@jupyter-widgets/base",
979
+ "_model_module_version": "1.2.0",
980
+ "_model_name": "LayoutModel",
981
+ "_view_count": null,
982
+ "_view_module": "@jupyter-widgets/base",
983
+ "_view_module_version": "1.2.0",
984
+ "_view_name": "LayoutView",
985
+ "align_content": null,
986
+ "align_items": null,
987
+ "align_self": null,
988
+ "border": null,
989
+ "bottom": null,
990
+ "display": null,
991
+ "flex": null,
992
+ "flex_flow": null,
993
+ "grid_area": null,
994
+ "grid_auto_columns": null,
995
+ "grid_auto_flow": null,
996
+ "grid_auto_rows": null,
997
+ "grid_column": null,
998
+ "grid_gap": null,
999
+ "grid_row": null,
1000
+ "grid_template_areas": null,
1001
+ "grid_template_columns": null,
1002
+ "grid_template_rows": null,
1003
+ "height": null,
1004
+ "justify_content": null,
1005
+ "justify_items": null,
1006
+ "left": null,
1007
+ "margin": null,
1008
+ "max_height": null,
1009
+ "max_width": null,
1010
+ "min_height": null,
1011
+ "min_width": null,
1012
+ "object_fit": null,
1013
+ "object_position": null,
1014
+ "order": null,
1015
+ "overflow": null,
1016
+ "overflow_x": null,
1017
+ "overflow_y": null,
1018
+ "padding": null,
1019
+ "right": null,
1020
+ "top": null,
1021
+ "visibility": null,
1022
+ "width": null
1023
+ }
1024
+ },
1025
+ "7c5689bc13684db8a22681f41863dddd": {
1026
+ "model_module": "@jupyter-widgets/base",
1027
+ "model_module_version": "1.2.0",
1028
+ "model_name": "LayoutModel",
1029
+ "state": {
1030
+ "_model_module": "@jupyter-widgets/base",
1031
+ "_model_module_version": "1.2.0",
1032
+ "_model_name": "LayoutModel",
1033
+ "_view_count": null,
1034
+ "_view_module": "@jupyter-widgets/base",
1035
+ "_view_module_version": "1.2.0",
1036
+ "_view_name": "LayoutView",
1037
+ "align_content": null,
1038
+ "align_items": null,
1039
+ "align_self": null,
1040
+ "border": null,
1041
+ "bottom": null,
1042
+ "display": null,
1043
+ "flex": null,
1044
+ "flex_flow": null,
1045
+ "grid_area": null,
1046
+ "grid_auto_columns": null,
1047
+ "grid_auto_flow": null,
1048
+ "grid_auto_rows": null,
1049
+ "grid_column": null,
1050
+ "grid_gap": null,
1051
+ "grid_row": null,
1052
+ "grid_template_areas": null,
1053
+ "grid_template_columns": null,
1054
+ "grid_template_rows": null,
1055
+ "height": null,
1056
+ "justify_content": null,
1057
+ "justify_items": null,
1058
+ "left": null,
1059
+ "margin": null,
1060
+ "max_height": null,
1061
+ "max_width": null,
1062
+ "min_height": null,
1063
+ "min_width": null,
1064
+ "object_fit": null,
1065
+ "object_position": null,
1066
+ "order": null,
1067
+ "overflow": null,
1068
+ "overflow_x": null,
1069
+ "overflow_y": null,
1070
+ "padding": null,
1071
+ "right": null,
1072
+ "top": null,
1073
+ "visibility": null,
1074
+ "width": null
1075
+ }
1076
+ },
1077
+ "a1d3a8aa016544a78e8821c8f6199e06": {
1078
+ "model_module": "@jupyter-widgets/controls",
1079
+ "model_module_version": "1.5.0",
1080
+ "model_name": "HBoxModel",
1081
+ "state": {
1082
+ "_dom_classes": [],
1083
+ "_model_module": "@jupyter-widgets/controls",
1084
+ "_model_module_version": "1.5.0",
1085
+ "_model_name": "HBoxModel",
1086
+ "_view_count": null,
1087
+ "_view_module": "@jupyter-widgets/controls",
1088
+ "_view_module_version": "1.5.0",
1089
+ "_view_name": "HBoxView",
1090
+ "box_style": "",
1091
+ "children": [
1092
+ "IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48",
1093
+ "IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e",
1094
+ "IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617"
1095
+ ],
1096
+ "layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf"
1097
+ }
1098
+ },
1099
+ "a73f357065d34d7baf0453ae4a8d75e2": {
1100
+ "model_module": "@jupyter-widgets/base",
1101
+ "model_module_version": "1.2.0",
1102
+ "model_name": "LayoutModel",
1103
+ "state": {
1104
+ "_model_module": "@jupyter-widgets/base",
1105
+ "_model_module_version": "1.2.0",
1106
+ "_model_name": "LayoutModel",
1107
+ "_view_count": null,
1108
+ "_view_module": "@jupyter-widgets/base",
1109
+ "_view_module_version": "1.2.0",
1110
+ "_view_name": "LayoutView",
1111
+ "align_content": null,
1112
+ "align_items": null,
1113
+ "align_self": null,
1114
+ "border": null,
1115
+ "bottom": null,
1116
+ "display": null,
1117
+ "flex": null,
1118
+ "flex_flow": null,
1119
+ "grid_area": null,
1120
+ "grid_auto_columns": null,
1121
+ "grid_auto_flow": null,
1122
+ "grid_auto_rows": null,
1123
+ "grid_column": null,
1124
+ "grid_gap": null,
1125
+ "grid_row": null,
1126
+ "grid_template_areas": null,
1127
+ "grid_template_columns": null,
1128
+ "grid_template_rows": null,
1129
+ "height": null,
1130
+ "justify_content": null,
1131
+ "justify_items": null,
1132
+ "left": null,
1133
+ "margin": null,
1134
+ "max_height": null,
1135
+ "max_width": null,
1136
+ "min_height": null,
1137
+ "min_width": null,
1138
+ "object_fit": null,
1139
+ "object_position": null,
1140
+ "order": null,
1141
+ "overflow": null,
1142
+ "overflow_x": null,
1143
+ "overflow_y": null,
1144
+ "padding": null,
1145
+ "right": null,
1146
+ "top": null,
1147
+ "visibility": null,
1148
+ "width": null
1149
+ }
1150
+ },
1151
+ "aed3acd2f2d74003b44079c333a0698e": {
1152
+ "model_module": "@jupyter-widgets/controls",
1153
+ "model_module_version": "1.5.0",
1154
+ "model_name": "DescriptionStyleModel",
1155
+ "state": {
1156
+ "_model_module": "@jupyter-widgets/controls",
1157
+ "_model_module_version": "1.5.0",
1158
+ "_model_name": "DescriptionStyleModel",
1159
+ "_view_count": null,
1160
+ "_view_module": "@jupyter-widgets/base",
1161
+ "_view_module_version": "1.2.0",
1162
+ "_view_name": "StyleView",
1163
+ "description_width": ""
1164
+ }
1165
+ },
1166
+ "bfa0af6aeff344c6845e1080a878e92e": {
1167
+ "model_module": "@jupyter-widgets/controls",
1168
+ "model_module_version": "1.5.0",
1169
+ "model_name": "FloatProgressModel",
1170
+ "state": {
1171
+ "_dom_classes": [],
1172
+ "_model_module": "@jupyter-widgets/controls",
1173
+ "_model_module_version": "1.5.0",
1174
+ "_model_name": "FloatProgressModel",
1175
+ "_view_count": null,
1176
+ "_view_module": "@jupyter-widgets/controls",
1177
+ "_view_module_version": "1.5.0",
1178
+ "_view_name": "ProgressView",
1179
+ "bar_style": "success",
1180
+ "description": "",
1181
+ "description_tooltip": null,
1182
+ "layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd",
1183
+ "max": 5669,
1184
+ "min": 0,
1185
+ "orientation": "horizontal",
1186
+ "style": "IPY_MODEL_48763b6233374554ae76035c0483066f",
1187
+ "value": 5669
1188
+ }
1189
+ },
1190
+ "f61ed33fad754146bdd2ac9db1ba1c48": {
1191
+ "model_module": "@jupyter-widgets/controls",
1192
+ "model_module_version": "1.5.0",
1193
+ "model_name": "HTMLModel",
1194
+ "state": {
1195
+ "_dom_classes": [],
1196
+ "_model_module": "@jupyter-widgets/controls",
1197
+ "_model_module_version": "1.5.0",
1198
+ "_model_name": "HTMLModel",
1199
+ "_view_count": null,
1200
+ "_view_module": "@jupyter-widgets/controls",
1201
+ "_view_module_version": "1.5.0",
1202
+ "_view_name": "HTMLView",
1203
+ "description": "",
1204
+ "description_tooltip": null,
1205
+ "layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2",
1206
+ "placeholder": "​",
1207
+ "style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a",
1208
+ "value": "Downloading builder script: 100%"
1209
+ }
1210
+ },
1211
+ "fd1ad9e0367d4004aae853b91c3a7617": {
1212
+ "model_module": "@jupyter-widgets/controls",
1213
+ "model_module_version": "1.5.0",
1214
+ "model_name": "HTMLModel",
1215
+ "state": {
1216
+ "_dom_classes": [],
1217
+ "_model_module": "@jupyter-widgets/controls",
1218
+ "_model_module_version": "1.5.0",
1219
+ "_model_name": "HTMLModel",
1220
+ "_view_count": null,
1221
+ "_view_module": "@jupyter-widgets/controls",
1222
+ "_view_module_version": "1.5.0",
1223
+ "_view_name": "HTMLView",
1224
+ "description": "",
1225
+ "description_tooltip": null,
1226
+ "layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951",
1227
+ "placeholder": "​",
1228
+ "style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e",
1229
+ "value": " 5.67k/5.67k [00:00&lt;00:00, 205kB/s]"
1230
+ }
1231
+ }
1232
+ },
1233
+ "version_major": 2,
1234
+ "version_minor": 0
1235
+ }
1236
+ }
1237
+ },
1238
+ "nbformat": 4,
1239
+ "nbformat_minor": 0
1240
+ }
lm-evaluation-harness/examples/transformer-lens.py ADDED
@@ -0,0 +1,59 @@
1
+ import warnings
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformer_lens import HookedTransformer
6
+ from transformers import AutoConfig
7
+
8
+ from lm_eval import evaluator
9
+ from lm_eval.models.huggingface import HFLM
10
+
11
+
12
+ def evaluate_lm_eval(lens_model: HookedTransformer, tasks: list[str], **kwargs):
13
+ class HFLikeModelAdapter(nn.Module):
14
+ """Adapts HookedTransformer to match the HuggingFace interface expected by lm-eval"""
15
+
16
+ def __init__(self, model: HookedTransformer):
17
+ super().__init__()
18
+ self.model = model
19
+ self.tokenizer = model.tokenizer
20
+ self.config = AutoConfig.from_pretrained(model.cfg.tokenizer_name)
21
+ self.device = model.cfg.device
22
+ self.tie_weights = lambda: self
23
+
24
+ def forward(self, input_ids=None, attention_mask=None, **kwargs):
25
+ output = self.model(input_ids, attention_mask=attention_mask, **kwargs)
26
+ # Make sure output has the expected .logits attribute
27
+ if not hasattr(output, "logits"):
28
+ if isinstance(output, torch.Tensor):
29
+ output.logits = output
30
+ return output
31
+
32
+ # Only delegate specific attributes we know we need
33
+ def to(self, *args, **kwargs):
34
+ return self.model.to(*args, **kwargs)
35
+
36
+ def eval(self):
37
+ self.model.eval()
38
+ return self
39
+
40
+ def train(self, mode=True):
41
+ self.model.train(mode)
42
+ return self
43
+
44
+ model = HFLikeModelAdapter(lens_model)
45
+ warnings.filterwarnings("ignore", message="Failed to get model SHA for")
46
+ results = evaluator.simple_evaluate(
47
+ model=HFLM(pretrained=model, tokenizer=model.tokenizer),
48
+ tasks=tasks,
49
+ verbosity="WARNING",
50
+ **kwargs,
51
+ )
52
+ return results
53
+
54
+
55
+ if __name__ == "__main__":
56
+ # Load base model
57
+ model = HookedTransformer.from_pretrained("pythia-70m")
58
+ res = evaluate_lm_eval(model, tasks=["arc_easy"])
59
+ print(res["results"])
lm-evaluation-harness/examples/visualize-wandb.ipynb ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "fc477b96-adee-4829-a9d7-a5eb990df358",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Visualizing Results in Weights and Biases\n",
9
+ "\n",
10
+ "With the Weights and Biases integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.\n",
11
+ "\n",
12
+ "The integration provide functionalities\n",
13
+ "\n",
14
+ "- to automatically log the evaluation results,\n",
15
+ "- log the samples as W&B Tables for easy visualization,\n",
16
+ "- log the `results.json` file as an artifact for version control,\n",
17
+ "- log the `<task_name>_eval_samples.json` file if the samples are logged,\n",
18
+ "- generate a comprehensive report for analysis and visualization with all the important metric,\n",
19
+ "- log task and cli configs,\n",
20
+ "- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.\n",
21
+ "\n",
22
+ "The integration is super easy to use with the eval harness. Let's see how!"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "id": "3851439a-bff4-41f2-bf21-1b3d8704913b",
29
+ "metadata": {
30
+ "scrolled": true
31
+ },
32
+ "outputs": [],
33
+ "source": [
34
+ "# Install this project if you did not already have it.\n",
35
+ "# This is all that is needed to be installed to start using Weights and Biases\n",
36
+ "\n",
37
+ "!pip -qq install -e ..[wandb]"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "markdown",
42
+ "id": "8507fd7e-3b99-4a92-89fa-9eaada74ba91",
43
+ "metadata": {},
44
+ "source": [
45
+ "# Run the Eval Harness\n",
46
+ "\n",
47
+ "Run the eval harness as usual with a `wandb_args` flag. This flag is used to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments.\n",
48
+ "\n",
49
+ "If `wandb_args` flag is used, the metrics and all other goodness will be automatically logged to Weights and Biases. In the stdout, you will find the link to the W&B run page as well as link to the generated report."
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "markdown",
54
+ "id": "eec5866e-f01e-42f8-8803-9d77472ef991",
55
+ "metadata": {},
56
+ "source": [
57
+ "## Set your API Key\n",
58
+ "\n",
59
+ "Before you can use W&B, you need to authenticate your machine with an authentication key. Visit https://wandb.ai/authorize to get one."
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "d824d163-71a9-4313-935d-f1d56397841c",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "import wandb\n",
70
+ "\n",
71
+ "\n",
72
+ "wandb.login()"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "id": "124e4a34-1547-4bed-bc09-db012bacbda6",
78
+ "metadata": {},
79
+ "source": [
80
+ "> Note that if you are using command line you can simply authenticate your machine by doing `wandb login` in your terminal. For more info check out the [documentation](https://docs.wandb.ai/quickstart#2-log-in-to-wb)."
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "markdown",
85
+ "id": "abc6f6b6-179a-4aff-ada9-f380fb74df6e",
86
+ "metadata": {},
87
+ "source": [
88
+ "## Run and log to W&B"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "bd0a8130-a97b-451a-acd2-3f9885b88643",
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "!lm_eval \\\n",
99
+ " --model hf \\\n",
100
+ " --model_args pretrained=microsoft/phi-2,trust_remote_code=True \\\n",
101
+ " --tasks hellaswag,mmlu_abstract_algebra \\\n",
102
+ " --device cuda:0 \\\n",
103
+ " --batch_size 8 \\\n",
104
+ " --output_path output/phi-2 \\\n",
105
+ " --limit 10 \\\n",
106
+ " --wandb_args project=lm-eval-harness-integration \\\n",
107
+ " --log_samples"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "id": "e974cabdbe70b667",
113
+ "metadata": {},
114
+ "source": []
115
+ },
116
+ {
117
+ "cell_type": "markdown",
118
+ "id": "5178ca9445b844e4",
119
+ "metadata": {},
120
+ "source": [
121
+ "W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": null,
127
+ "id": "c6a421b2cf3ddac5",
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "import lm_eval\n",
132
+ "from lm_eval.loggers import WandbLogger\n",
133
+ "\n",
134
+ "\n",
135
+ "results = lm_eval.simple_evaluate(\n",
136
+ " model=\"hf\",\n",
137
+ " model_args=\"pretrained=microsoft/phi-2,trust_remote_code=True\",\n",
138
+ " tasks=\"hellaswag,mmlu_abstract_algebra\",\n",
139
+ " log_samples=True,\n",
140
+ ")\n",
141
+ "\n",
142
+ "wandb_logger = WandbLogger(\n",
143
+ " project=\"lm-eval-harness-integration\", job_type=\"eval\"\n",
144
+ ") # or empty if wandb.init(...) already called before\n",
145
+ "wandb_logger.post_init(results)\n",
146
+ "wandb_logger.log_eval_result()\n",
147
+ "wandb_logger.log_eval_samples(results[\"samples\"]) # if log_samples"
148
+ ]
149
+ }
150
+ ],
151
+ "metadata": {
152
+ "kernelspec": {
153
+ "display_name": "Python 3 (ipykernel)",
154
+ "language": "python",
155
+ "name": "python3"
156
+ },
157
+ "language_info": {
158
+ "codemirror_mode": {
159
+ "name": "ipython",
160
+ "version": 3
161
+ },
162
+ "file_extension": ".py",
163
+ "mimetype": "text/x-python",
164
+ "name": "python",
165
+ "nbconvert_exporter": "python",
166
+ "pygments_lexer": "ipython3",
167
+ "version": "3.10.12"
168
+ }
169
+ },
170
+ "nbformat": 4,
171
+ "nbformat_minor": 5
172
+ }
lm-evaluation-harness/examples/visualize-zeno.ipynb ADDED
@@ -0,0 +1,115 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Visualizing Results in Zeno\n",
8
+ "\n",
9
+ "Benchmarking your models is the first step towards making sure your model performs well.\n",
10
+ "However, looking at the data behind the benchmark, slicing the data into subsets, and comparing models on individual instances can help you even more in evaluating and quantifying the behavior of your AI system.\n",
11
+ "\n",
12
+ "All of this can be done in [Zeno](https://zenoml.com)!\n",
13
+ "Zeno is super easy to use with the eval harness, let's explore how you can easily upload and visualize your eval results.\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "# Install this project if you did not already do that. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n",
23
+ "!pip install -e ..\n",
24
+ "!pip install -e ..[zeno]"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "metadata": {},
30
+ "source": [
31
+ "# Run the Eval Harness\n",
32
+ "\n",
33
+ "To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.\n"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "!lm_eval \\\n",
43
+ " --model hf \\\n",
44
+ " --model_args pretrained=EleutherAI/gpt-neo-2.7B \\\n",
45
+ " --tasks hellaswag,wikitext \\\n",
46
+ " --batch_size 8 \\\n",
47
+ " --device mps \\\n",
48
+ " --log_samples \\\n",
49
+ " --output_path output/gpt-neo-2.7B \\\n",
50
+ " --limit 10"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "metadata": {},
56
+ "source": [
57
+ "# Set your API Key\n",
58
+ "\n",
59
+ "This is so you can be authenticated with Zeno.\n",
60
+ "If you don't already have a Zeno account, first create an account on [Zeno Hub](https://hub.zenoml.com).\n",
61
+ "After logging in to Zeno Hub, generate your API key by clicking on your profile at the bottom left to navigate to your account page.\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "%env ZENO_API_KEY=YOUR_API_KEY"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "markdown",
75
+ "metadata": {},
76
+ "source": [
77
+ "# Visualize Eval Results\n",
78
+ "\n",
79
+ "You can now use the `zeno_visualize` script to upload the results to Zeno.\n",
80
+ "\n",
81
+ "This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.\n"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": null,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "!python ../scripts/zeno_visualize.py --data_path output --project_name \"Zeno Upload Test\""
91
+ ]
92
+ }
93
+ ],
94
+ "metadata": {
95
+ "kernelspec": {
96
+ "display_name": "zeno_projects",
97
+ "language": "python",
98
+ "name": "python3"
99
+ },
100
+ "language_info": {
101
+ "codemirror_mode": {
102
+ "name": "ipython",
103
+ "version": 3
104
+ },
105
+ "file_extension": ".py",
106
+ "mimetype": "text/x-python",
107
+ "name": "python",
108
+ "nbconvert_exporter": "python",
109
+ "pygments_lexer": "ipython3",
110
+ "version": "3.10.11"
111
+ }
112
+ },
113
+ "nbformat": 4,
114
+ "nbformat_minor": 2
115
+ }
lm-evaluation-harness/lm_eval/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ import logging
2
+ import os
3
+
4
+
5
+ __version__ = "0.4.9.1"
6
+
7
+
8
+ # Lazy-load .evaluator module to improve CLI startup
9
+ def __getattr__(name):
10
+ if name == "evaluate":
11
+ from .evaluator import evaluate
12
+
13
+ return evaluate
14
+ elif name == "simple_evaluate":
15
+ from .evaluator import simple_evaluate
16
+
17
+ return simple_evaluate
18
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
19
+
20
+
21
+ __all__ = ["evaluate", "simple_evaluate", "__version__"]
lm-evaluation-harness/lm_eval/__main__.py ADDED
@@ -0,0 +1,536 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import sys
6
+ from functools import partial
7
+ from pathlib import Path
8
+ from typing import Union
9
+
10
+
11
+ def try_parse_json(value: str) -> Union[str, dict, None]:
12
+ if value is None:
13
+ return None
14
+ try:
15
+ return json.loads(value)
16
+ except json.JSONDecodeError:
17
+ if "{" in value:
18
+ raise argparse.ArgumentTypeError(
19
+ f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
20
+ )
21
+ return value
22
+
23
+
24
+ def _int_or_none_list_arg_type(
25
+ min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
26
+ ):
27
+ def parse_value(item):
28
+ item = item.strip().lower()
29
+ if item == "none":
30
+ return None
31
+ try:
32
+ return int(item)
33
+ except ValueError:
34
+ raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
35
+
36
+ items = [parse_value(v) for v in value.split(split_char)]
37
+ num_items = len(items)
38
+
39
+ if num_items == 1:
40
+ # Makes downstream handling the same for single and multiple values
41
+ items = items * max_len
42
+ elif num_items < min_len or num_items > max_len:
43
+ raise argparse.ArgumentTypeError(
44
+ f"Argument requires {max_len} integers or None, separated by '{split_char}'"
45
+ )
46
+ elif num_items != max_len:
47
+ logging.warning(
48
+ f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
49
+ "Missing values will be filled with defaults."
50
+ )
51
+ default_items = [parse_value(v) for v in defaults.split(split_char)]
52
+ items.extend(
53
+ default_items[num_items:]
54
+ ) # extend items list with missing defaults
55
+
56
+ return items
57
+
58
+
59
+ def check_argument_types(parser: argparse.ArgumentParser):
60
+ """
61
+ Check to make sure all CLI args are typed, raises error if not
62
+ """
63
+ for action in parser._actions:
64
+ if action.dest != "help" and not action.const:
65
+ if action.type is None:
66
+ raise ValueError(
67
+ f"Argument '{action.dest}' doesn't have a type specified."
68
+ )
69
+ else:
70
+ continue
71
+
72
+
73
+ def setup_parser() -> argparse.ArgumentParser:
74
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
75
+ parser.add_argument(
76
+ "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
77
+ )
78
+ parser.add_argument(
79
+ "--tasks",
80
+ "-t",
81
+ default=None,
82
+ type=str,
83
+ metavar="task1,task2",
84
+ help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
85
+ )
86
+ parser.add_argument(
87
+ "--model_args",
88
+ "-a",
89
+ default="",
90
+ type=try_parse_json,
91
+ help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""",
92
+ )
93
+ parser.add_argument(
94
+ "--num_fewshot",
95
+ "-f",
96
+ type=int,
97
+ default=None,
98
+ metavar="N",
99
+ help="Number of examples in few-shot context",
100
+ )
101
+ parser.add_argument(
102
+ "--batch_size",
103
+ "-b",
104
+ type=str,
105
+ default=1,
106
+ metavar="auto|auto:N|N",
107
+ help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
108
+ )
109
+ parser.add_argument(
110
+ "--max_batch_size",
111
+ type=int,
112
+ default=None,
113
+ metavar="N",
114
+ help="Maximal batch size to try with --batch_size auto.",
115
+ )
116
+ parser.add_argument(
117
+ "--device",
118
+ type=str,
119
+ default=None,
120
+ help="Device to use (e.g. cuda, cuda:0, cpu).",
121
+ )
122
+ parser.add_argument(
123
+ "--output_path",
124
+ "-o",
125
+ default=None,
126
+ type=str,
127
+ metavar="DIR|DIR/file.json",
128
+ help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
129
+ )
130
+ parser.add_argument(
131
+ "--limit",
132
+ "-L",
133
+ type=float,
134
+ default=None,
135
+ metavar="N|0<N<1",
136
+ help="Limit the number of examples per task. "
137
+ "If <1, limit is a percentage of the total number of examples.",
138
+ )
139
+ parser.add_argument(
140
+ "--samples",
141
+ "-E",
142
+ default=None,
143
+ type=str,
144
+ metavar="/path/to/json",
145
+ help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
146
+ )
147
+ parser.add_argument(
148
+ "--use_cache",
149
+ "-c",
150
+ type=str,
151
+ default=None,
152
+ metavar="DIR",
153
+ help="A path to a sqlite db file for caching model responses. `None` if not caching.",
154
+ )
155
+ parser.add_argument(
156
+ "--cache_requests",
157
+ type=str,
158
+ default=None,
159
+ choices=["true", "refresh", "delete"],
160
+ help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
161
+ )
162
+ parser.add_argument(
163
+ "--check_integrity",
164
+ action="store_true",
165
+ help="Whether to run the relevant part of the test suite for the tasks.",
166
+ )
167
+ parser.add_argument(
168
+ "--write_out",
169
+ "-w",
170
+ action="store_true",
171
+ default=False,
172
+ help="Prints the prompt for the first few documents.",
173
+ )
174
+ parser.add_argument(
175
+ "--log_samples",
176
+ "-s",
177
+ action="store_true",
178
+ default=False,
179
+ help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
180
+ )
181
+ parser.add_argument(
182
+ "--system_instruction",
183
+ type=str,
184
+ default=None,
185
+ help="System instruction to be used in the prompt",
186
+ )
187
+ parser.add_argument(
188
+ "--apply_chat_template",
189
+ type=str,
190
+ nargs="?",
191
+ const=True,
192
+ default=False,
193
+ help=(
194
+ "If True, apply chat template to the prompt. "
195
+ "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
196
+ "To apply a specific template from the available list of templates, provide the template name as an argument. "
197
+ "E.g. `--apply_chat_template template_name`"
198
+ ),
199
+ )
200
+ parser.add_argument(
201
+ "--fewshot_as_multiturn",
202
+ action="store_true",
203
+ default=False,
204
+ help="If True, uses the fewshot as a multi-turn conversation",
205
+ )
206
+ parser.add_argument(
207
+ "--show_config",
208
+ action="store_true",
209
+ default=False,
210
+ help="If True, shows the the full config of all tasks at the end of the evaluation.",
211
+ )
212
+ parser.add_argument(
213
+ "--include_path",
214
+ type=str,
215
+ default=None,
216
+ metavar="DIR",
217
+ help="Additional path to include if there are external tasks to include.",
218
+ )
219
+ parser.add_argument(
220
+ "--gen_kwargs",
221
+ type=try_parse_json,
222
+ default=None,
223
+ help=(
224
+ "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
225
+ """ e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
226
+ ),
227
+ )
228
+ parser.add_argument(
229
+ "--verbosity",
230
+ "-v",
231
+ type=str.upper,
232
+ default=None,
233
+ metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
234
+ help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
235
+ )
236
+ parser.add_argument(
237
+ "--wandb_args",
238
+ type=str,
239
+ default="",
240
+ help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
241
+ )
242
+ parser.add_argument(
243
+ "--wandb_config_args",
244
+ type=str,
245
+ default="",
246
+ help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3",
247
+ )
248
+ parser.add_argument(
249
+ "--hf_hub_log_args",
250
+ type=str,
251
+ default="",
252
+ help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
253
+ )
254
+ parser.add_argument(
255
+ "--predict_only",
256
+ "-x",
257
+ action="store_true",
258
+ default=False,
259
+ help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
260
+ )
261
+ default_seed_string = "0,1234,1234,1234"
262
+ parser.add_argument(
263
+ "--seed",
264
+ type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
265
+ default=default_seed_string, # for backward compatibility
266
+ help=(
267
+ "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
268
+ "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
269
+ "respectively, or a single integer to set the same seed for all four.\n"
270
+ f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
271
+ "(for backward compatibility).\n"
272
+ "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
273
+ "Here numpy's seed is not set since the second value is `None`.\n"
274
+ "E.g, `--seed 42` sets all four seeds to 42."
275
+ ),
276
+ )
277
+ parser.add_argument(
278
+ "--trust_remote_code",
279
+ action="store_true",
280
+ help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
281
+ )
282
+ parser.add_argument(
283
+ "--confirm_run_unsafe_code",
284
+ action="store_true",
285
+ help="Confirm that you understand the risks of running unsafe code for tasks that require it",
286
+ )
287
+ parser.add_argument(
288
+ "--metadata",
289
+ type=json.loads,
290
+ default=None,
291
+ help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
292
+ )
293
+ return parser
294
+
295
+
296
+ def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
297
+ check_argument_types(parser)
298
+ return parser.parse_args()
299
+
300
+
301
+ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
302
+ if not args:
303
+ # we allow for args to be passed externally, else we parse them ourselves
304
+ parser = setup_parser()
305
+ args = parse_eval_args(parser)
306
+
307
+ # defer loading `lm_eval` submodules for faster CLI load
308
+ from lm_eval import evaluator, utils
309
+ from lm_eval.evaluator import request_caching_arg_to_dict
310
+ from lm_eval.loggers import EvaluationTracker, WandbLogger
311
+ from lm_eval.tasks import TaskManager
312
+ from lm_eval.utils import (
313
+ handle_non_serializable,
314
+ make_table,
315
+ simple_parse_args_string,
316
+ )
317
+
318
+ if args.wandb_args:
319
+ wandb_args_dict = simple_parse_args_string(args.wandb_args)
320
+ wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)
321
+ wandb_logger = WandbLogger(wandb_args_dict, wandb_config_args_dict)
322
+
323
+ utils.setup_logging(args.verbosity)
324
+ eval_logger = logging.getLogger(__name__)
325
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
326
+
327
+ # update the evaluation tracker args with the output path and the HF token
328
+ if args.output_path:
329
+ args.hf_hub_log_args += f",output_path={args.output_path}"
330
+ if os.environ.get("HF_TOKEN", None):
331
+ args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
332
+ evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
333
+ evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
334
+
335
+ if args.predict_only:
336
+ args.log_samples = True
337
+ if (args.log_samples or args.predict_only) and not args.output_path:
338
+ raise ValueError(
339
+ "Specify --output_path if providing --log_samples or --predict_only"
340
+ )
341
+
342
+ if args.fewshot_as_multiturn and args.apply_chat_template is False:
343
+ raise ValueError(
344
+ "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
345
+ )
346
+
347
+ if args.include_path is not None:
348
+ eval_logger.info(f"Including path: {args.include_path}")
349
+ metadata = (
350
+ simple_parse_args_string(args.model_args)
351
+ if isinstance(args.model_args, str)
352
+ else args.model_args
353
+ if isinstance(args.model_args, dict)
354
+ else {}
355
+ ) | (
356
+ args.metadata
357
+ if isinstance(args.metadata, dict)
358
+ else simple_parse_args_string(args.metadata)
359
+ )
360
+
361
+ task_manager = TaskManager(include_path=args.include_path, metadata=metadata)
362
+
363
+ if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
364
+ eval_logger.warning(
365
+ "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
366
+ )
367
+
368
+ if args.limit:
369
+ eval_logger.warning(
370
+ " --limit SHOULD ONLY BE USED FOR TESTING."
371
+ "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
372
+ )
373
+ if args.samples:
374
+ assert args.limit is None, (
375
+ "If --samples is not None, then --limit must be None."
376
+ )
377
+ if (samples := Path(args.samples)).is_file():
378
+ args.samples = json.loads(samples.read_text())
379
+ else:
380
+ args.samples = json.loads(args.samples)
381
+
382
+ if args.tasks is None:
383
+ eval_logger.error("Need to specify task to evaluate.")
384
+ sys.exit()
385
+ elif args.tasks == "list":
386
+ print(task_manager.list_all_tasks())
387
+ sys.exit()
388
+ elif args.tasks == "list_groups":
389
+ print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
390
+ sys.exit()
391
+ elif args.tasks == "list_tags":
392
+ print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
393
+ sys.exit()
394
+ elif args.tasks == "list_subtasks":
395
+ print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
396
+ sys.exit()
397
+ else:
398
+ if os.path.isdir(args.tasks):
399
+ import glob
400
+
401
+ task_names = []
402
+ yaml_path = os.path.join(args.tasks, "*.yaml")
403
+ for yaml_file in glob.glob(yaml_path):
404
+ config = utils.load_yaml_config(yaml_file)
405
+ task_names.append(config)
406
+ else:
407
+ task_list = args.tasks.split(",")
408
+ task_names = task_manager.match_tasks(task_list)
409
+ for task in [task for task in task_list if task not in task_names]:
410
+ if os.path.isfile(task):
411
+ config = utils.load_yaml_config(task)
412
+ task_names.append(config)
413
+ task_missing = [
414
+ task for task in task_list if task not in task_names and "*" not in task
415
+ ] # we don't want errors if a wildcard ("*") task name was used
416
+
417
+ if task_missing:
418
+ missing = ", ".join(task_missing)
419
+ eval_logger.error(
420
+ f"Tasks were not found: {missing}\n"
421
+ f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
422
+ )
423
+ raise ValueError(
424
+ f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
425
+ )
426
+
427
+ # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
428
+ if args.trust_remote_code:
429
+ eval_logger.info(
430
+ "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
431
+ )
432
+ # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
433
+ # because it's already been determined based on the prior env var before launching our
434
+ # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
435
+ import datasets
436
+ from packaging.version import parse as vparse
437
+
438
+ if vparse(datasets.__version__) < vparse("4.0.0"):
439
+ datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
440
+
441
+ if isinstance(args.model_args, dict):
442
+ args.model_args["trust_remote_code"] = True
443
+ else:
444
+ args.model_args = args.model_args + ",trust_remote_code=True"
445
+ (
446
+ eval_logger.info(f"Selected Tasks: {task_names}")
447
+ if eval_logger.getEffectiveLevel() >= logging.INFO
448
+ else print(f"Selected Tasks: {task_names}")
449
+ )
450
+
451
+ request_caching_args = request_caching_arg_to_dict(
452
+ cache_requests=args.cache_requests
453
+ )
454
+
455
+ results = evaluator.simple_evaluate(
456
+ model=args.model,
457
+ model_args=args.model_args,
458
+ tasks=task_names,
459
+ num_fewshot=args.num_fewshot,
460
+ batch_size=args.batch_size,
461
+ max_batch_size=args.max_batch_size,
462
+ device=args.device,
463
+ use_cache=args.use_cache,
464
+ limit=args.limit,
465
+ samples=args.samples,
466
+ check_integrity=args.check_integrity,
467
+ write_out=args.write_out,
468
+ log_samples=args.log_samples,
469
+ evaluation_tracker=evaluation_tracker,
470
+ system_instruction=args.system_instruction,
471
+ apply_chat_template=args.apply_chat_template,
472
+ fewshot_as_multiturn=args.fewshot_as_multiturn,
473
+ gen_kwargs=args.gen_kwargs,
474
+ task_manager=task_manager,
475
+ predict_only=args.predict_only,
476
+ random_seed=args.seed[0],
477
+ numpy_random_seed=args.seed[1],
478
+ torch_random_seed=args.seed[2],
479
+ fewshot_random_seed=args.seed[3],
480
+ confirm_run_unsafe_code=args.confirm_run_unsafe_code,
481
+ metadata=metadata,
482
+ **request_caching_args,
483
+ )
484
+
485
+ if results is not None:
486
+ if args.log_samples:
487
+ samples = results.pop("samples")
488
+ dumped = json.dumps(
489
+ results, indent=2, default=handle_non_serializable, ensure_ascii=False
490
+ )
491
+ if args.show_config:
492
+ print(dumped)
493
+
494
+ batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
495
+
496
+ # Add W&B logging
497
+ if args.wandb_args:
498
+ try:
499
+ wandb_logger.post_init(results)
500
+ wandb_logger.log_eval_result()
501
+ if args.log_samples:
502
+ wandb_logger.log_eval_samples(samples)
503
+ except Exception as e:
504
+ eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
505
+
506
+ evaluation_tracker.save_results_aggregated(
507
+ results=results, samples=samples if args.log_samples else None
508
+ )
509
+
510
+ if args.log_samples:
511
+ for task_name, config in results["configs"].items():
512
+ evaluation_tracker.save_results_samples(
513
+ task_name=task_name, samples=samples[task_name]
514
+ )
515
+
516
+ if (
517
+ evaluation_tracker.push_results_to_hub
518
+ or evaluation_tracker.push_samples_to_hub
519
+ ):
520
+ evaluation_tracker.recreate_metadata_card()
521
+
522
+ print(
523
+ f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
524
+ f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
525
+ )
526
+ print(make_table(results))
527
+ if "groups" in results:
528
+ print(make_table(results, "groups"))
529
+
530
+ if args.wandb_args:
531
+ # Tear down wandb run once all the logging is done.
532
+ wandb_logger.run.finish()
533
+
534
+
535
+ if __name__ == "__main__":
536
+ cli_evaluate()
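
The `--seed` flag above is parsed by `_int_or_none_list_arg_type` via `partial(..., 3, 4, default_seed_string)`: a single value is broadcast to all four seeds, and missing trailing values are filled from the defaults. A small sketch of that behaviour, assuming the private helper is imported from `lm_eval.__main__` (illustrative only):

```python
from functools import partial

from lm_eval.__main__ import _int_or_none_list_arg_type

# Mirrors the CLI wiring: partial(_int_or_none_list_arg_type, 3, 4, "0,1234,1234,1234")
seed_parser = partial(_int_or_none_list_arg_type, 3, 4, "0,1234,1234,1234")

print(seed_parser("42"))           # [42, 42, 42, 42] -- single value broadcast to all four seeds
print(seed_parser("0,None,8,52"))  # [0, None, 8, 52]
print(seed_parser("0,None,8"))     # [0, None, 8, 1234] -- missing value filled from the defaults
```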
lm-evaluation-harness/lm_eval/api/__init__.py ADDED
File without changes
lm-evaluation-harness/lm_eval/api/filter.py ADDED
@@ -0,0 +1,56 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Iterable, List, Union
4
+
5
+ from lm_eval.api.instance import Instance
6
+
7
+
8
+ class Filter(ABC):
9
+ """
10
+ Filter classes operate on a per-task level.
11
+ They take all model outputs (`instance.resps` for all `task.instances`)
12
+ across all instances of a task, and perform operations.
13
+ In a single run, one can configure any number of separate filters or lists of filters.
14
+
15
+ """
16
+
17
+ def __init__(self, **kwargs) -> None:
18
+ """
19
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
20
+ """
21
+
22
+ @abstractmethod
23
+ def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable:
24
+ """
25
+ Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
26
+ Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
27
+ if passed [<inst.resps for instance 0>, <inst.resps for instance 1>], it should return
28
+ [<filtered resps for instance 0>, <filtered resps for instance 1>]
29
+ """
30
+ return resps
31
+
32
+
33
+ @dataclass
34
+ class FilterEnsemble:
35
+ """
36
+ FilterEnsemble creates a pipeline applying multiple filters.
37
+ Its intended usage is to stack multiple post-processing steps in order.
38
+ `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
39
+ pipeline separately.
40
+ """
41
+
42
+ name: str
43
+ filters: List[Callable[[], Filter]]
44
+
45
+ def apply(self, instances: List[Instance]) -> None:
46
+ resps, docs = zip(*((inst.resps, inst.doc) for inst in instances))
47
+ resps, docs = list(resps), list(docs)
48
+
49
+ for f in self.filters:
50
+ # apply filters in sequence
51
+ resps = f().apply(resps, docs)
52
+
53
+ # add the end results after filtering to filtered_requests of their respective source instances.
54
+ # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
55
+ for inst, resp in zip(instances, resps):
56
+ inst.filtered_resps[self.name] = resp
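
To make the `Filter`/`FilterEnsemble` contract above concrete, here is a hypothetical custom filter and how an ensemble would chain it; `LowercaseFilter` and the ensemble name are invented for illustration:

```python
from lm_eval.api.filter import Filter, FilterEnsemble


class LowercaseFilter(Filter):
    def apply(self, resps, docs):
        # `resps` holds one list of model responses per instance, in input order.
        return [[r.lower() for r in instance_resps] for instance_resps in resps]


# `filters` expects zero-argument callables returning Filter objects
# (List[Callable[[], Filter]]), so passing the class itself works.
ensemble = FilterEnsemble(name="lowercase", filters=[LowercaseFilter])

# ensemble.apply(task.instances) would then populate
# inst.filtered_resps["lowercase"] for every instance.
```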
lm-evaluation-harness/lm_eval/api/group.py ADDED
@@ -0,0 +1,115 @@
1
+ import abc
2
+ from dataclasses import asdict, dataclass
3
+ from inspect import getsource
4
+ from typing import Any, Callable, List, Optional, Union
5
+
6
+
7
+ @dataclass
8
+ class AggMetricConfig(dict):
9
+ metric: Optional[str] = None
10
+ aggregation: Optional[str] = "mean"
11
+ weight_by_size: Optional[str] = False
12
+ # list of filter names which should be incorporated into the aggregated metric.
13
+ filter_list: Optional[Union[str, list]] = "none"
14
+
15
+ def __post_init__(self):
16
+ if self.aggregation != "mean" and not callable(self.aggregation):
17
+ raise ValueError(
18
+ f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'."
19
+ )
20
+
21
+ if isinstance(self.filter_list, str):
22
+ self.filter_list = [self.filter_list]
23
+
24
+
25
+ @dataclass
26
+ class GroupConfig(dict):
27
+ group: Optional[str] = None
28
+ group_alias: Optional[str] = None
29
+ task: Optional[Union[str, list]] = None
30
+ aggregate_metric_list: Optional[
31
+ Union[List[AggMetricConfig], AggMetricConfig, dict]
32
+ ] = None
33
+ metadata: Optional[dict] = (
34
+ None # by default, not used in the code. allows for users to pass arbitrary info to tasks
35
+ )
36
+
37
+ def __getitem__(self, item):
38
+ return getattr(self, item)
39
+
40
+ def __setitem__(self, item, value):
41
+ return setattr(self, item, value)
42
+
43
+ def __post_init__(self):
44
+ if self.aggregate_metric_list is not None:
45
+ if isinstance(self.aggregate_metric_list, dict):
46
+ self.aggregate_metric_list = [self.aggregate_metric_list]
47
+
48
+ self.aggregate_metric_list = [
49
+ AggMetricConfig(**item) if isinstance(item, dict) else item
50
+ for item in self.aggregate_metric_list
51
+ ]
52
+
53
+ def to_dict(self, keep_callable: bool = False) -> dict:
54
+ """dumps the current config as a dictionary object, as a printable format.
55
+ null fields will not be printed.
56
+ Used for dumping results alongside full task configuration
57
+
58
+ :return: dict
59
+ A printable dictionary version of the GroupConfig object.
60
+
61
+ # TODO: should any default value in the TaskConfig not be printed?
62
+ """
63
+ cfg_dict = asdict(self)
64
+ # serialize any callable values (e.g. custom aggregation functions) so the config stays printable
65
+ for k, v in list(cfg_dict.items()):
66
+ if callable(v):
67
+ cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
68
+ return cfg_dict
69
+
70
+ def serialize_function(
71
+ self, value: Union[Callable, str], keep_callable=False
72
+ ) -> Union[Callable, str]:
73
+ """Serializes a given function or string.
74
+
75
+ If 'keep_callable' is True, the original callable is returned.
76
+ Otherwise, attempts to return the source code of the callable using 'getsource'.
77
+ """
78
+ if keep_callable:
79
+ return value
80
+ else:
81
+ try:
82
+ return getsource(value)
83
+ except (TypeError, OSError):
84
+ return str(value)
85
+
86
+
87
+ class ConfigurableGroup(abc.ABC):
88
+ def __init__(
89
+ self,
90
+ config: Optional[dict] = None,
91
+ ) -> None:
92
+ self._config = GroupConfig(**config)
93
+
94
+ @property
95
+ def group(self):
96
+ return self._config.group
97
+
98
+ @property
99
+ def group_alias(self):
100
+ return self._config.group_alias
101
+
102
+ @property
103
+ def version(self):
104
+ return self._config.version
105
+
106
+ @property
107
+ def config(self):
108
+ return self._config.to_dict()
109
+
110
+ @property
111
+ def group_name(self) -> Any:
112
+ return self._config.group
113
+
114
+ def __repr__(self):
115
+ return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})"
lm-evaluation-harness/lm_eval/api/instance.py ADDED
@@ -0,0 +1,38 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Literal, Optional, Tuple
3
+
4
+
5
+ OutputType = Literal[
6
+ "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice"
7
+ ]
8
+
9
+
10
+ @dataclass
11
+ class Instance:
12
+ request_type: OutputType
13
+ doc: dict
14
+ arguments: tuple
15
+ idx: int
16
+ metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
17
+ default_factory=lambda: (None, None, None)
18
+ )
19
+ resps: list = field(default_factory=list)
20
+ filtered_resps: dict = field(default_factory=dict)
21
+
22
+ # initialized after init
23
+ task_name: Optional[str] = None
24
+ doc_id: Optional[int] = None
25
+ repeats: Optional[int] = None
26
+
27
+ def __post_init__(self) -> None:
28
+ # unpack metadata field
29
+ self.task_name, self.doc_id, self.repeats = self.metadata
30
+
31
+ @property
32
+ def args(self):
33
+ """
34
+ Returns (string,) where `string` is the string to calculate loglikelihood over
35
+ """
36
+ return (
37
+ self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
38
+ )
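
A small illustrative example of the `Instance` dataclass above, showing how the `metadata` tuple is unpacked in `__post_init__` and what the `args` property returns (all values are made up):

```python
from lm_eval.api.instance import Instance

inst = Instance(
    request_type="loglikelihood",
    doc={"question": "2+2=?", "answer": "4"},
    arguments=("2+2=?", " 4"),   # (context, continuation) for a loglikelihood request
    idx=0,
    metadata=("my_task", 0, 1),  # unpacked into task_name, doc_id, repeats
)

print(inst.task_name, inst.doc_id, inst.repeats)  # my_task 0 1
print(inst.args)                                  # ('2+2=?', ' 4')
```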
lm-evaluation-harness/lm_eval/api/metrics.py ADDED
@@ -0,0 +1,629 @@
1
+ import logging
2
+ import math
3
+ import os
4
+ import random
5
+ import re
6
+ import string
7
+ from collections.abc import Iterable
8
+ from typing import Callable, List, Optional, Sequence, TypeVar
9
+
10
+ import numpy as np
11
+ import sacrebleu
12
+
13
+ from lm_eval.api.registry import register_aggregation, register_metric
14
+
15
+
16
+ T = TypeVar("T")
17
+
18
+ eval_logger = logging.getLogger(__name__)
19
+
20
+
21
+ # Register Aggregations First
22
+ @register_aggregation("bypass")
23
+ def bypass_agg(arr):
24
+ return 999
25
+
26
+
27
+ @register_aggregation("nanmean")
28
+ def nanmean(arr):
29
+ if len(arr) == 0 or all(np.isnan(arr)):
30
+ return np.nan
31
+ return np.nanmean(arr)
32
+
33
+
34
+ @register_aggregation("mean")
35
+ def mean(arr):
36
+ return sum(arr) / len(arr)
37
+
38
+
39
+ @register_aggregation("median")
40
+ def median(arr):
41
+ return arr[len(arr) // 2]
42
+
43
+
44
+ # Certain metrics must be calculated across all documents in a benchmark.
45
+ # We use them as aggregation metrics, paired with no-op passthrough metric fns.
46
+ @register_aggregation("perplexity")
47
+ def perplexity(items):
48
+ return math.exp(-mean(items))
49
+
50
+
51
+ @register_aggregation("weighted_perplexity")
52
+ def weighted_perplexity(items):
53
+ return math.exp(-weighted_mean(items))
54
+
55
+
56
+ @register_aggregation("bits_per_byte")
57
+ def bits_per_byte(items):
58
+ return -weighted_mean(items) / math.log(2)
59
+
60
+
61
+ @register_aggregation("f1")
62
+ def f1_score(items):
63
+ from sklearn.metrics import f1_score
64
+
65
+ unzipped_list = list(zip(*items))
66
+ golds = unzipped_list[0]
67
+ preds = unzipped_list[1]
68
+ fscore = f1_score(golds, preds)
69
+
70
+ return np.max(fscore)
71
+
72
+
73
+ @register_aggregation("matthews_corrcoef")
74
+ def matthews_corrcoef(items):
75
+ from sklearn.metrics import matthews_corrcoef
76
+
77
+ unzipped_list = list(zip(*items))
78
+ golds = unzipped_list[0]
79
+ preds = unzipped_list[1]
80
+ return matthews_corrcoef(golds, preds)
81
+
82
+
83
+ @register_aggregation("bleu")
84
+ def bleu(items):
85
+ """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
86
+ for evaluating a generated sentence to a reference sentence. It counts matching
87
+ n-grams in the candidate translation to n-grams in the reference text, where
88
+ 1-gram or unigram would be each token and a bigram comparison would be each
89
+ word pair. The comparison is made regardless of word order
90
+ Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
91
+ Paper: https://www.aclweb.org/anthology/P02-1040/
92
+
93
+ Higher is better
94
+ """
95
+ refs = list(zip(*items))[0]
96
+ preds = list(zip(*items))[1]
97
+ refs, preds = _sacreformat(refs, preds)
98
+ return sacrebleu.corpus_bleu(preds, refs).score
99
+
100
+
101
+ @register_aggregation("chrf")
102
+ def chrf(items):
103
+ """chrF++ is a tool for automatic evaluation of machine translation output
104
+ based on character n-gram precision and recall enhanced with word n-grams.
105
+ Source: https://github.com/m-popovic/chrF
106
+ Paper: https://www.aclweb.org/anthology/W15-3049.pdf
107
+
108
+ Higher is better # TODO I think
109
+ """
110
+ refs = list(zip(*items))[0]
111
+ preds = list(zip(*items))[1]
112
+ refs, preds = _sacreformat(refs, preds)
113
+ return sacrebleu.corpus_chrf(preds, refs).score
114
+
115
+
116
+ @register_aggregation("ter")
117
+ def ter(items):
118
+ """Translation Error Rate is an error metric for machine translation that
119
+ measures the number of edits required to change a system output into one
120
+ of the references
121
+ Source: http://www.cs.umd.edu/~snover/tercom/
122
+ Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
123
+
124
+ Lower is better
125
+ """
126
+ refs = list(zip(*items))[0]
127
+ preds = list(zip(*items))[1]
128
+ refs, preds = _sacreformat(refs, preds)
129
+ return sacrebleu.corpus_ter(preds, refs).score
130
+
131
+
132
+ @register_aggregation("brier_score")
133
+ def brier_score(items): # Aggregation: computes the Brier score from (gold, probability-vector) pairs
134
+ gold, predictions = list(zip(*items))
135
+ bs, num_class = np.array(predictions).shape
136
+
137
+ gold = list(gold)
138
+ gold_one_hot = np.eye(num_class)[gold]
139
+ return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1))
140
+
141
+
142
+ @register_metric(
143
+ metric="brier_score",
144
+ higher_is_better=False,
145
+ output_type=["multiple_choice"],
146
+ aggregation="brier_score",
147
+ )
148
+ def brier_score_fn(items): # This is a passthrough function
149
+ return items
150
+
151
+
152
+ @register_metric(
153
+ metric="acc",
154
+ higher_is_better=True,
155
+ output_type=["loglikelihood", "multiple_choice"],
156
+ aggregation="mean",
157
+ )
158
+ def acc_fn(items): # This is a passthrough function
159
+ return items
160
+
161
+
162
+ @register_metric(
163
+ metric="acc_norm",
164
+ higher_is_better=True,
165
+ output_type=["loglikelihood", "multiple_choice"],
166
+ aggregation="mean",
167
+ )
168
+ def acc_norm_fn(items): # This is a passthrough function
169
+ return items
170
+
171
+
172
+ @register_metric(
173
+ metric="acc_mutual_info",
174
+ higher_is_better=True,
175
+ output_type="multiple_choice",
176
+ aggregation="mean",
177
+ )
178
+ def acc_mutual_info_fn(items): # This is a passthrough function
179
+ return items
180
+
181
+
182
+ ### the code used in the `exact_match_hf_evaluate` function is ported from
183
+ ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
184
+ ### which is under the apache license.
185
+
186
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
187
+
188
+ # Licensed under the Apache License, Version 2.0 (the "License");
189
+ # you may not use this file except in compliance with the License.
190
+ # You may obtain a copy of the License at
191
+
192
+ # http://www.apache.org/licenses/LICENSE-2.0
193
+
194
+
195
+ # Unless required by applicable law or agreed to in writing, software
196
+ # distributed under the License is distributed on an "AS IS" BASIS,
197
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
198
+ # See the License for the specific language governing permissions and
199
+ # limitations under the License.
200
+ def exact_match_hf_evaluate(
201
+ predictions,
202
+ references,
203
+ regexes_to_ignore=None,
204
+ ignore_case=False,
205
+ ignore_punctuation=False,
206
+ ignore_numbers=False,
207
+ ):
208
+ if regexes_to_ignore is not None:
209
+ for s in regexes_to_ignore:
210
+ predictions = np.array([re.sub(s, "", x) for x in predictions])
211
+ references = np.array([re.sub(s, "", x) for x in references])
212
+ else:
213
+ predictions = np.asarray(predictions)
214
+ references = np.asarray(references)
215
+
216
+ if ignore_case:
217
+ predictions = np.char.lower(predictions)
218
+ references = np.char.lower(references)
219
+
220
+ if ignore_punctuation:
221
+ repl_table = string.punctuation.maketrans("", "", string.punctuation)
222
+ predictions = np.char.translate(predictions, table=repl_table)
223
+ references = np.char.translate(references, table=repl_table)
224
+
225
+ if ignore_numbers:
226
+ repl_table = string.digits.maketrans("", "", string.digits)
227
+ predictions = np.char.translate(predictions, table=repl_table)
228
+ references = np.char.translate(references, table=repl_table)
229
+
230
+ score_list = predictions == references
231
+
232
+ return {"exact_match": np.mean(score_list)}
233
+
234
+
235
+ ###
236
+
237
+
238
+ @register_metric(
239
+ metric="exact_match",
240
+ higher_is_better=True,
241
+ output_type="generate_until",
242
+ aggregation="mean",
243
+ )
244
+ def exact_match_fn(**kwargs):
245
+ return exact_match_hf_evaluate(**kwargs)
246
+
247
+
248
+ @register_metric(
249
+ metric="perplexity",
250
+ higher_is_better=False,
251
+ output_type="loglikelihood",
252
+ aggregation="perplexity",
253
+ )
254
+ def perplexity_fn(items): # This is a passthrough function
255
+ return items
256
+
257
+
258
+ @register_metric(
259
+ metric="word_perplexity",
260
+ higher_is_better=False,
261
+ output_type="loglikelihood_rolling",
262
+ aggregation="weighted_perplexity",
263
+ )
264
+ def word_perplexity_fn(items): # This is a passthrough function
265
+ return items
266
+
267
+
268
+ @register_metric(
269
+ metric="byte_perplexity",
270
+ higher_is_better=False,
271
+ output_type="loglikelihood_rolling",
272
+ aggregation="weighted_perplexity",
273
+ )
274
+ def byte_perplexity_fn(items): # This is a passthrough function
275
+ return items
276
+
277
+
278
+ @register_metric(
279
+ metric="bits_per_byte",
280
+ higher_is_better=False,
281
+ output_type="loglikelihood_rolling",
282
+ aggregation="bits_per_byte",
283
+ )
284
+ def bits_per_byte_fn(items): # This is a passthrough function
285
+ return items
286
+
287
+
288
+ def pop_stddev(arr):
289
+ mu = mean(arr)
290
+ return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
291
+
292
+
293
+ def sample_stddev(arr: Sequence[T]) -> float:
294
+ mu = mean(arr)
295
+ return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
296
+
297
+
298
+ def mean_stderr(arr):
299
+ return sample_stddev(arr) / math.sqrt(len(arr))
300
+
301
+
302
+ @register_metric(
303
+ metric="bypass",
304
+ higher_is_better=True,
305
+ output_type=["loglikelihood", "multiple_choice", "generate_until"],
306
+ aggregation="bypass",
307
+ )
308
+ def bypass(items):
309
+ return None
310
+
311
+
312
+ @register_metric(
313
+ metric="mcc",
314
+ higher_is_better=True,
315
+ output_type="multiple_choice",
316
+ aggregation="matthews_corrcoef",
317
+ )
318
+ def mcc_fn(items): # This is a passthrough function
319
+ return items
320
+
321
+
322
+ @register_metric(
323
+ metric="f1",
324
+ higher_is_better=True,
325
+ output_type="multiple_choice",
326
+ aggregation="f1",
327
+ )
328
+ def f1_fn(items): # This is a passthrough function
329
+ return items
330
+
331
+
332
+ @register_metric(
333
+ metric="bleu",
334
+ higher_is_better=True,
335
+ output_type="generate_until",
336
+ aggregation="bleu",
337
+ )
338
+ def bleu_fn(items): # This is a passthrough function
339
+ return items
340
+
341
+
342
+ @register_metric(
343
+ metric="chrf",
344
+ higher_is_better=True,
345
+ output_type="generate_until",
346
+ aggregation="chrf",
347
+ )
348
+ def chrf_fn(items): # This is a passthrough function
349
+ return items
350
+
351
+
352
+ @register_metric(
353
+ metric="ter",
354
+ higher_is_better=True,
355
+ output_type="generate_until",
356
+ aggregation="ter",
357
+ )
358
+ def ter_fn(items): # This is a passthrough function
359
+ return items
360
+
361
+
362
+ @register_metric(
363
+ metric="acc_all",
364
+ higher_is_better=True,
365
+ output_type="loglikelihood",
366
+ aggregation="mean",
367
+ )
368
+ def acc_all(items):
369
+ # Only count as correct if all answers are labeled correctly for each question
370
+ question_scoring_dict = {}
371
+ preds = list(zip(*items))[0]
372
+ docs = list(zip(*items))[1]
373
+
374
+ for doc, pred in zip(docs, preds):
375
+ paragraph_id = doc["idx"]["paragraph"]
376
+ question_id = doc["idx"]["question"]
377
+ if (paragraph_id, question_id) not in question_scoring_dict:
378
+ question_scoring_dict[(paragraph_id, question_id)] = []
379
+
380
+ gold_label = doc["label"] == 1
381
+
382
+ question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
383
+ acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
384
+ return acc
385
+
386
+
387
+ def acc_all_stderr(items):
388
+ # Only count as correct if all answers are labeled correctly for each question
389
+ question_scoring_dict = {}
390
+ preds = list(zip(*items))[0]
391
+ docs = list(zip(*items))[1]
392
+
393
+ for doc, pred in zip(docs, preds):
394
+ question_id = doc["idx"]["question"]
395
+ if question_id not in question_scoring_dict:
396
+ question_scoring_dict[question_id] = []
397
+
398
+ gold_label = doc["label"] == 1
399
+ question_scoring_dict[question_id].append(gold_label == pred)
400
+
401
+ acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
402
+ return acc
403
+
404
+
405
+ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
406
+ """Compute max metric between prediction and each ground truth."""
407
+ scores_for_ground_truths = []
408
+ for ground_truth in ground_truths:
409
+ score = metric_fn(prediction, ground_truth)
410
+ scores_for_ground_truths.append(score)
411
+ return max(scores_for_ground_truths)
412
+
413
+
414
+ def weighted_mean(items):
415
+ a, b = zip(*items)
416
+ return sum(a) / sum(b)
417
+
418
+
419
+ def is_non_str_iterable(obj):
420
+ return isinstance(obj, Iterable) and not isinstance(obj, str)
421
+
422
+
423
+ def _sacreformat(refs, preds):
424
+ """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
425
+ # Sacrebleu expects (List[str], List[List[str]])
426
+ # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
427
+
428
+ # Note [ref1_stream] is the first reference for each pred.
429
+ # So lists are size N and (M, N) for N preds and M possible refs for each pred
430
+ # This is a different order of dimensions than I would expect
431
+
432
+ # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
433
+ # Must become List[List[str]] with the inner list corresponding to preds
434
+ if not is_non_str_iterable(refs):
435
+ refs = list(refs)
436
+ if not is_non_str_iterable(refs[0]):
437
+ refs = [[ref] for ref in refs]
438
+ refs = list(zip(*refs))
439
+ # Note the number of refs in each ref list must match the number of preds
440
+
441
+ # We expect preds to be List[str] or List[List[str]]. Must become List[str]
442
+ if not is_non_str_iterable(preds):
443
+ preds = list(preds)
444
+ if is_non_str_iterable(preds[0]):
445
+ assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
446
+ preds = [pred[0] for pred in preds]
447
+
448
+ return refs, preds
449
+
450
+
451
+ # stderr stuff
452
+
453
+
454
+ class _bootstrap_internal:
455
+ """
456
+ Pool worker: `(i, xs)` → `n` bootstrap replicates
457
+ of `f(xs)` using an RNG seeded with `i`.
458
+ """
459
+
460
+ def __init__(self, f: Callable[[Sequence[T]], float], n: int) -> None:
461
+ self.f = f
462
+ self.n = n
463
+
464
+ def __call__(self, v: tuple[int, Sequence[T]]) -> list[float]:
465
+ i, xs = v
466
+ rnd = random.Random()
467
+ rnd.seed(i)
468
+ res = []
469
+ for _ in range(self.n):
470
+ res.append(self.f(rnd.choices(xs, k=len(xs))))
471
+ return res
472
+
473
+
474
+ def _bootstrap_internal_no_mp(
475
+ f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
476
+ ) -> list[float]:
477
+ """
478
+ Single-process fallback: compute `iters` bootstrap replicates
479
+ of the statistic `f(xs)`, chunked (≤ 1000 draws).
480
+ """
481
+ res = []
482
+ chunk_size = min(1000, iters)
483
+ from tqdm import tqdm
484
+
485
+ print(f"bootstrapping for stddev: {f.__name__}")
486
+
487
+ # A single loop replaces the multiprocessing pool.
488
+ for i in tqdm(range(iters // chunk_size)):
489
+ rnd = random.Random(i)
490
+ for _ in range(chunk_size):
491
+ res.append(f(rnd.choices(xs, k=len(xs))))
492
+
493
+ return res
494
+
495
+
496
+ def bootstrap_stderr(
497
+ f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
498
+ ) -> float:
499
+ """
500
+ Bootstrap estimate of the standard error of statistic `f(xs)`
501
+ using up to `iters` resamples, chunked (≤ 1000 draws).
502
+
503
+ Executes in parallel unless the env var `DISABLE_MULTIPROC` is set.
504
+ """
505
+ if not os.getenv("DISABLE_MULTIPROC"):
506
+ import multiprocessing as mp
507
+
508
+ # this gives a biased estimate of the stderr (i.e. with the mean, it gives something
509
+ # equivalent to stderr calculated without Bessel's correction in the stddev.
510
+ # Unfortunately, I haven't been able to figure out what the right correction is
511
+ # to make the bootstrap unbiased - I considered multiplying by sqrt(n/(n-1)) but
512
+ # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
513
+ # Thankfully, shouldn't matter because our samples are pretty big usually anyways
514
+ res = []
515
+ chunk_size = min(1000, iters)
516
+ from tqdm import tqdm
517
+
518
+ print("bootstrapping for stddev:", f.__name__)
519
+ with mp.Pool(mp.cpu_count()) as pool:
520
+ for bootstrap in tqdm(
521
+ pool.imap(
522
+ _bootstrap_internal(f, chunk_size),
523
+ [(i, xs) for i in range(iters // chunk_size)],
524
+ ),
525
+ total=iters // chunk_size,
526
+ ):
527
+ # sample w replacement
528
+ res.extend(bootstrap)
529
+ else:
530
+ res = _bootstrap_internal_no_mp(f, xs, iters)
531
+
532
+ return sample_stddev(res)
533
+
534
+
535
+ def stderr_for_metric(
536
+ metric: Callable[[Sequence[T]], float], bootstrap_iters: int
537
+ ) -> Optional[Callable[[Sequence[T]], float]]:
538
+ """
539
+ Return a function that estimates the standard error of `metric(xs)`.
540
+
541
+ * If `bootstrap_iters > 0` and the metric is in the pre-approved
542
+ bootstrappable list, use `bootstrap_stderr` with that many draws.
543
+ * If the metric has a closed-form SE (e.g. `mean`, `acc_all`), use it.
544
+ * Otherwise, return `None`.
545
+ """
546
+
547
+ if bootstrap_iters <= 0:
548
+ # return no function (don't compute stderr) if bootstrap iters = 0
549
+ return None
550
+
551
+ bootstrappable = [
552
+ median,
553
+ matthews_corrcoef,
554
+ f1_score,
555
+ perplexity,
556
+ bleu,
557
+ chrf,
558
+ ter,
559
+ nanmean,
560
+ ]
561
+
562
+ if metric in bootstrappable:
563
+ return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
564
+
565
+ stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
566
+
567
+ return stderr.get(metric, None)
568
+
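To make the dispatch in `stderr_for_metric` concrete, a small hedged usage sketch (the scores are invented; `mean` is the aggregation defined earlier in this module):

```python
from lm_eval.api.metrics import mean, stderr_for_metric

scores = [1, 0, 1, 1, 0, 1, 1, 1]  # per-sample 0/1 accuracies, made up

# `mean` has a closed-form standard error, so a mean_stderr function is returned
# instead of a bootstrap; bootstrap_iters only matters for bootstrappable metrics.
se_fn = stderr_for_metric(metric=mean, bootstrap_iters=100_000)
if se_fn is not None:
    print(se_fn(scores))  # sample stddev / sqrt(n)
```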
569
+
570
+ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
571
+ # Used to aggregate bootstrapped stderrs across subtasks in a group,
572
+ # when we are weighting by the size of each subtask.
573
+ #
574
+
575
+ assert len(stderrs) == len(sizes)
576
+
577
+ # formula source: https://en.wikipedia.org/wiki/Pooled_variance
578
+ # and: https://stats.stackexchange.com/a/4841331
579
+ # this empirically seems to match running `stderr_for_metric` on all instances
580
+ # from the subtasks concatenated with each other.
581
+ pooled_sample_var = (
582
+ sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)])
583
+ ) / (sum(sizes) - len(sizes))
584
+
585
+ return np.sqrt(pooled_sample_var / sum(sizes))
586
+
587
+
588
+ def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
589
+ assert metrics is not None, (
590
+ "Need to pass a list of each subtask's metric for this stderr aggregation"
591
+ )
592
+ assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
593
+
594
+ # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
595
+ # This formula depends on sample means.
596
+ # removed because it seems to give erroneously huge stderrs for groupings of tasks
597
+ # and does not seem to match up with bootstrap-calculated stderrs for groups.
598
+
599
+ ### don't use this unless a statistician has told you it's the right thing to do ###
600
+
601
+ # accumulators: we'll aggregate pairwise N - 1 times
602
+ variance = stderrs[0] ** 2
603
+ curr_size = sizes[0]
604
+ curr_score = metrics[0]
605
+
606
+ for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
607
+ curr_score = ((curr_score * curr_size) + (score * size)) / (
608
+ curr_size + size
609
+ ) # NOTE: this assumes our aggregation fn is "mean"
610
+
611
+ variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
612
+ curr_size + size - 1
613
+ ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
614
+ curr_score - score
615
+ ) ** 2
616
+
617
+ return np.sqrt(variance)
618
+
619
+
620
+ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
621
+ # A helper function that is used to aggregate
622
+ # subtask scores cross-task.
623
+ # TODO: does not hold for non-mean aggregations
624
+ if not weight_by_size:
625
+ sizes = [1] * len(sizes)
626
+
627
+ assert len(metrics) == len(sizes)
628
+
629
+ return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
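As a quick numeric illustration of the two group-level helpers above (all values invented), the size-weighted group score and pooled stderr come out as follows:

```python
import numpy as np

metrics = [0.70, 0.50, 0.90]      # per-subtask scores (made up)
sizes = [100, 200, 50]            # per-subtask sample counts
stderrs = [0.045, 0.035, 0.042]   # per-subtask stderrs

# Same formula as aggregate_subtask_metrics(..., weight_by_size=True)
group_score = sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)
print(round(group_score, 4))  # 0.6143

# Same formula as pooled_sample_stderr
pooled_var = sum((n - 1) * se**2 * n for n, se in zip(sizes, stderrs)) / (
    sum(sizes) - len(sizes)
)
print(round(float(np.sqrt(pooled_var / sum(sizes))), 4))  # ~0.0245
```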
lm-evaluation-harness/lm_eval/api/model.py ADDED
@@ -0,0 +1,502 @@
1
+ import abc
2
+ import hashlib
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union
7
+
8
+ from tqdm import tqdm
9
+
10
+ from lm_eval import utils
11
+
12
+
13
+ if TYPE_CHECKING:
14
+ from sqlitedict import SqliteDict
15
+
16
+ from lm_eval.api.instance import Instance
17
+
18
+
19
+ eval_logger = logging.getLogger(__name__)
20
+
21
+ T = TypeVar("T", bound="LM")
22
+
23
+
24
+ class LM(abc.ABC):
25
+ def __init__(self) -> None:
26
+ """Defines the interface that should be implemented by all LM subclasses.
27
+ LMs are assumed to take text (strings) as input and yield strings as output
28
+ (inputs/outputs should be tokenization-agnostic.)
29
+
30
+ """
31
+ # set rank and world size to a single process, by default.
32
+ self._rank = 0
33
+ self._world_size = 1
34
+ self.cache_hook: "CacheHook" = CacheHook(None)
35
+
36
+ @abc.abstractmethod
37
+ def loglikelihood(self, requests) -> list[tuple[float, bool]]:
38
+ """Compute log-likelihood of generating a continuation from a context.
39
+ Downstream tasks should attempt to use loglikelihood instead of other
40
+ LM calls whenever possible.
41
+
42
+ :param requests: list[Instance]
43
+ A list of Instance objects, with property `args` which returns a tuple (context, continuation).
44
+ `context: str`
45
+ Context string. Implementations of LM must be able to handle an
46
+ empty context string.
47
+ `continuation: str`
48
+ The continuation over which log likelihood will be calculated. If
49
+ there is a word boundary, the space should be in the continuation.
50
+ For example, context="hello" continuation=" world" is correct.
51
+
52
+ :return: list[tuple[float, bool]]
53
+ A list of pairs (logprob, isgreedy)
54
+ `logprob: float`
55
+ The log probability of `continuation`.
56
+ `isgreedy`:
57
+ Whether `continuation` would be generated by greedy sampling from `context`.
58
+ """
59
+ pass
60
+
61
+ @abc.abstractmethod
62
+ def loglikelihood_rolling(self, requests) -> list[float]:
63
+ """Compute full log-likelihood of a string, with no truncation, for perplexity computation
64
+ - We will use the full max context length of the model.
65
+ - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
66
+ the max context length.
67
+ - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
68
+ which may simply concatenate multiple documents together.
69
+ - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
70
+ multiple chunks, the last input will still have a full-sized context.
71
+ Example:
72
+ Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
73
+ Prefix: BOS/EOS
74
+ Max context length: 4
75
+ Resulting input/prediction pairs:
76
+
77
+ INPUT: BOS 0 1 2
78
+ PRED: 0 1 2 3
79
+
80
+ INPUT: 3 4 5 6
81
+ PRED: 4 5 6 7
82
+
83
+ INPUT: 5 6 7 8
84
+ PRED: 8 9
85
+
86
+ Observe that:
87
+ 1. Each token is predicted exactly once
88
+ 2. For the last pair, we provide the full context, but only score the last two tokens
89
+
90
+ :param requests: list[Instance]
91
+ A list of Instance objects with property `args` which returns a tuple (context,).
92
+ string: str
93
+ String for which we are computing overall loglikelihood
94
+ :return: list[tuple[float]]
95
+ A list of tuples (logprob,)
96
+ logprob: float
97
+ The log probability of `context` conditioned on the BOS/EOS token.
98
+ Can also be overridden for custom cases by `prefix_token_id`.
99
+ """
100
+ pass
101
+
102
+ # TODO: Add an optional max length
103
+ @abc.abstractmethod
104
+ def generate_until(self, requests) -> list[str]:
105
+ """Generate greedily until a stopping sequence
106
+
107
+ :param requests: list[Instance]
108
+ A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs).
109
+ context: str
110
+ Context string
111
+ gen_kwargs: dict
112
+ A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc.
113
+ :return: list[str]
114
+ A list of model generated continuations.
115
+ continuation: str
116
+ The generated continuation.
117
+ """
118
+ pass
119
+
120
+ def apply_chat_template(
121
+ self, chat_history: list[dict[str, str]], add_generation_prompt=True
122
+ ) -> str:
123
+ """
124
+ Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
125
+
126
+ :param chat_history: list[dict[str, str]]
127
+ A list of dictionaries with keys 'role' and 'content'.
128
+ Values are strings representing the role name and the content of the message, respectively.
129
+ :param add_generation_prompt: bool
130
+ Whether to append an assistant generation prefix (e.g. <|assistant|>) to the chat history. Set to False when prefilling an assistant message.
131
+ :return: str
132
+ A string representing the chat history in a format that can be used as input to the LM.
133
+ """
134
+ raise NotImplementedError(
135
+ "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type."
136
+ )
137
+
138
+ @classmethod
139
+ def create_from_arg_string(
140
+ cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
141
+ ) -> T:
142
+ """
143
+ Creates an instance of the LM class using the given argument string and additional config.
144
+
145
+ Parameters:
146
+ - arg_string: A string containing arguments in the format key1=value1,key2=value2.
147
+ - additional_config: Optional dictionary containing additional configuration parameters.
148
+
149
+ Returns:
150
+ - Instance of the LM class.
151
+ """
152
+ additional_config = {} if additional_config is None else additional_config
153
+ args = utils.simple_parse_args_string(arg_string)
154
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
155
+ return cls(**args, **args2)
156
+
157
+ @classmethod
158
+ def create_from_arg_obj(
159
+ cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
160
+ ) -> T:
161
+ """
162
+ Creates an instance of the LM class using the given arg_obj
163
+
164
+ Parameters:
165
+ - arg_obj: A dict containing arguments in the format key1=value1,key2=value2.
166
+ - additional_config: Optional dictionary containing additional configuration parameters.
167
+
168
+ Returns:
169
+ - Instance of the LM class.
170
+ """
171
+
172
+ additional_config = {} if additional_config is None else additional_config
+ additional_config = {
+ k: v for k, v in additional_config.items() if v is not None
+ }
175
+
176
+ return cls(**arg_dict, **additional_config)
177
+
178
+ @property
179
+ def rank(self):
180
+ # used in the case of parallelism. Hardcoded to
181
+ # ensure no errors arise using API models which do
182
+ # not support multi-device parallelism nor expect it.
183
+ return self._rank
184
+
185
+ @property
186
+ def world_size(self):
187
+ # used in the case of parallelism. Hardcoded to
188
+ # ensure no errors arise using API models which do
189
+ # not support multi-device parallelism nor expect it.
190
+ return self._world_size
191
+
192
+ @property
193
+ def tokenizer_name(self) -> str:
194
+ """Must be defined for LM subclasses which implement Chat Templating.
195
+ Should return the name of the tokenizer or chat template used.
196
+ Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
197
+ """
198
+ raise NotImplementedError(
199
+ "To use this model with chat templates, please implement the 'tokenizer_name' property."
200
+ )
201
+
202
+ def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
203
+ """Returns the chat template structure for user/assistant messages if a template is provided.
204
+ This method is intended to be overridden in a subclass to define a specific chat template format.
205
+ For models that do not support chat templates, this method returns an empty string by default.
206
+ """
207
+
208
+ return ""
209
+
210
+ def set_cache_hook(self, cache_hook: "CacheHook") -> None:
211
+ self.cache_hook = cache_hook
212
+
213
+
214
+ ### SQLite-based caching of LM responses
215
+ def hash_args(attr: str, args: Iterable[Any]) -> str:
216
+ dat = json.dumps([attr] + list(args))
217
+ return hashlib.sha256(dat.encode("utf-8")).hexdigest()
218
+
219
+
220
+ class CacheHook:
221
+ def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
222
+ if cachinglm is None:
223
+ self.dbdict: Optional["SqliteDict"] = None
224
+ return
225
+
226
+ self.dbdict = cachinglm.dbdict
227
+
228
+ def add_partial(self, attr: str, req: Iterable[Any], res: Any) -> None:
229
+ if self.dbdict is None:
230
+ return
231
+ hsh = hash_args(attr, req)
232
+ self.dbdict[hsh] = res
233
+
234
+
235
+ class CachingLM:
236
+ def __init__(self, lm: LM, cache_db: str) -> None:
237
+ """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
238
+
239
+ :param lm: LM
240
+ Underlying LM
241
+ :param cache_db: str
242
+ Path to cache db
243
+ """
244
+ from sqlitedict import SqliteDict
245
+
246
+ self.lm: LM = lm
247
+ self.cache_db: str = cache_db
248
+ if os.path.dirname(cache_db):
249
+ os.makedirs(os.path.dirname(cache_db), exist_ok=True)
250
+ self.dbdict = SqliteDict(cache_db, autocommit=True)
251
+
252
+ # add hook to lm
253
+ lm.set_cache_hook(self.get_cache_hook())
254
+
255
+ def __getattr__(self, attr: str) -> Any:
256
+ lm_attr = getattr(self.lm, attr)
257
+ if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
258
+ eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
259
+ return lm_attr
260
+
261
+ def _fn(requests: list["Instance"]) -> list["Instance"]:
262
+ res = []
263
+ remaining_reqs = []
264
+ warned = False
265
+ # figure out which ones are cached and which ones are new
266
+ eval_logger.info(
267
+ f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..."
268
+ )
269
+ for req in tqdm(requests, desc="Checking cached requests"):
270
+ hsh = hash_args(attr, req.args)
271
+ if attr == "generate_until" and req.args[1].get("do_sample", False):
272
+ # when we are doing non-greedy generation, don't use the cache
273
+ # (else every "randomly sampled" generation would be identical for repeats > 1).
274
+ if not warned:
275
+ eval_logger.warning(
276
+ f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
277
+ )
278
+ warned = True
279
+ res.append(None)
280
+ remaining_reqs.append(req)
281
+ elif hsh in self.dbdict:
282
+ ob = self.dbdict[hsh]
283
+
284
+ assert ob is not None
285
+
286
+ res.append(ob)
287
+ else:
288
+ res.append(None)
289
+ remaining_reqs.append(req)
290
+ eval_logger.info(
291
+ f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
292
+ )
293
+ if remaining_reqs:
294
+ # actually run the LM on the requests that do not have cached results
295
+ rem_res = getattr(self.lm, attr)(remaining_reqs)
296
+ else:
297
+ rem_res = []
298
+
299
+ # stick the new ones back into the list and also cache any of the new ones
300
+ resptr = 0
301
+ for req, r in zip(remaining_reqs, rem_res):
302
+ while res[resptr] is not None:
303
+ resptr += 1
304
+
305
+ res[resptr] = r
306
+
307
+ # caching
308
+ hsh = hash_args(attr, req.args)
309
+ self.dbdict[hsh] = r
310
+ self.dbdict.commit()
311
+
312
+ return res
313
+
314
+ return _fn
315
+
316
+ def get_cache_hook(self) -> "CacheHook":
317
+ return CacheHook(self)
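A hedged sketch of how `CachingLM` is meant to wrap a model. The cache path is arbitrary, and `DummyLM` is assumed to be the trivial placeholder model shipped in `lm_eval/models/dummy.py`:

```python
from lm_eval.api.model import CachingLM
from lm_eval.models.dummy import DummyLM  # assumed built-in placeholder model

# Wrapping installs a CacheHook on the underlying model and opens the SQLite db.
lm = CachingLM(DummyLM(), "eval_cache/dummy.db")  # arbitrary cache path

# Cached request types (loglikelihood, loglikelihood_rolling, generate_until) are
# served from the cache where possible; other attributes pass straight through
# to the wrapped model via __getattr__.
# results = lm.generate_until(requests)   # requests: list[Instance], built by a task
```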
318
+
319
+
320
+ class TemplateLM(LM):
321
+ """
322
+ A class acting as intermediary between the LM base class
323
+ and boilerplate often included in other LM subclasses.
324
+ """
325
+
326
+ tokenizer = None
327
+
328
+ @property
329
+ @abc.abstractmethod
330
+ def eot_token_id(self):
331
+ pass
332
+
333
+ @property
334
+ def prefix_token_id(self):
335
+ # it is used as prefix for loglikelihood
336
+ return self.eot_token_id
337
+
338
+ @abc.abstractmethod
339
+ def tok_encode(self, string: str, **kwargs) -> list[int]:
340
+ """
341
+ Tokenize a string using the model's tokenizer and return a list of token IDs.
342
+ """
343
+ pass
344
+
345
+ @abc.abstractmethod
346
+ def _loglikelihood_tokens(
347
+ self, requests: list["Instance"], **kwargs
348
+ ) -> list[tuple[float, bool]]:
349
+ pass
350
+
351
+ def _encode_pair(
352
+ self, context: str, continuation: str
353
+ ) -> tuple[list[int], list[int]]:
354
+ import transformers
355
+
356
+ n_spaces = len(context) - len(context.rstrip())
357
+ if n_spaces > 0:
358
+ continuation = context[-n_spaces:] + continuation
359
+ context = context[:-n_spaces]
360
+
361
+ model_class = getattr(self, "AUTO_MODEL_CLASS", None)
362
+
363
+ if model_class == transformers.AutoModelForSeq2SeqLM:
364
+ context_enc = self.tok_encode(context)
365
+ continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
366
+ else:
367
+ whole_enc = self.tok_encode(context + continuation)
368
+ context_enc = self.tok_encode(context)
369
+
370
+ context_enc_len = len(context_enc)
371
+ continuation_enc = whole_enc[context_enc_len:]
372
+
373
+ return context_enc, continuation_enc
374
+
375
+ def loglikelihood(
376
+ self, requests: list["Instance"], disable_tqdm: bool = False
377
+ ) -> list[tuple[float, bool]]:
378
+ new_reqs = []
379
+ for context, continuation in [req.args for req in requests]:
380
+ if context == "":
381
+ # BOS or EOS as context
382
+ context_enc, continuation_enc = (
383
+ [self.prefix_token_id],
384
+ self.tok_encode(continuation),
385
+ )
386
+ else:
387
+ context_enc, continuation_enc = self._encode_pair(context, continuation)
388
+
389
+ new_reqs.append(((context, continuation), context_enc, continuation_enc))
390
+
391
+ return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
392
+
393
+ @abc.abstractmethod
394
+ def loglikelihood_rolling(
395
+ self, requests, disable_tqdm: bool = False
396
+ ) -> list[float]:
397
+ pass
398
+
399
+ @abc.abstractmethod
400
+ def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
401
+ pass
402
+
403
+ def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
404
+ """
405
+ Set and get the appropriate chat template for the model.
406
+ This method sets the tokenizer's chat_template and returns the template string for reproducibility.
407
+
408
+ The template selection logic is adapted from the Transformers library's `apply_chat_template`
409
+ method in the Tokenizer class. The original implementation can be found at:
410
+ https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
411
+
412
+ This method ensures that the right template is chosen based on the following:
413
+ 0. If the model has no 'tokenizer' attribute: assumes that there is only a single possible chat template, handled on the model provider side internally. Returns the empty string.
414
+ 1. If the model's tokenizer has multiple templates:
415
+ a. Use the specified template if it exists in the dictionary.
416
+ b. Use the default template from the list if no specific template is provided.
417
+ c. Raise an error if no default template exists and no specific template is provided.
418
+ 2. If the model's tokenizer has a single template or no template:
419
+ a. Use the tokenizer's chat template if available.
420
+ b. Fall back to the default chat template if no tokenizer chat template exists.
421
+
422
+ Args:
423
+ chat_template (Union[bool, str]): Specifies the chat template to use.
424
+ - If False or None, no template is applied.
425
+ - If True, the default or only available template is used.
426
+ - If a string, the template with the matching name is used.
427
+
428
+ Returns:
429
+ Optional[str]: The selected chat template, or None if no template is applied.
430
+ """
431
+ if self.tokenizer is None:
432
+ return ""
433
+
434
+ if chat_template is False or chat_template is None:
435
+ eval_logger.warning(
436
+ "model.chat_template was called with the chat_template set to False or None. "
437
+ "Therefore no chat template will be applied. Make sure this is an intended behavior."
438
+ )
439
+ return None
440
+
441
+ # Convert boolean chat_template to None to ensure compatibility with the adapted logic
442
+ if isinstance(chat_template, bool):
443
+ chat_template = None
444
+ using_default_template = False
445
+
446
+ # First, handle the cases when the model has a dict of multiple templates
447
+ try:
448
+ template = (
449
+ self.tokenizer.chat_template or self.tokenizer.default_chat_template
450
+ )
451
+ except AttributeError:
452
+ return None
453
+
454
+ if isinstance(template, dict):
455
+ using_default_dict = self.tokenizer.chat_template is None
456
+
457
+ if chat_template is not None:
458
+ if chat_template in template:
459
+ selected_template = template[chat_template]
460
+ if using_default_dict:
461
+ using_default_template = True
462
+ else:
463
+ raise ValueError(
464
+ f"The specified chat template '{chat_template}' is not available. "
465
+ f"Available template names are {sorted(template.keys())}."
466
+ )
467
+ else:
468
+ # If user didn't pass a chat template, use the default template from the dict
469
+ if "default" in template:
470
+ selected_template = template["default"]
471
+ using_default_template = True
472
+ else:
473
+ raise ValueError(
474
+ "This model has multiple chat templates with no default specified! Please either pass a chat "
475
+ "template or the name of the template you wish to use to the `chat_template` argument. Available "
476
+ f"template names are {sorted(template.keys())}."
477
+ )
478
+
479
+ # Cases when the model has a single template or no template
480
+ else:
481
+ # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
482
+ if isinstance(chat_template, str):
483
+ eval_logger.warning(
484
+ "Chat template name provided, but the tokenizer's chat template is not a dictionary. "
485
+ "Using the tokenizer's chat template or the default template instead."
486
+ )
487
+ if self.tokenizer.chat_template is not None:
488
+ selected_template = self.tokenizer.chat_template
489
+ else:
490
+ selected_template = self.tokenizer.default_chat_template
491
+ using_default_template = True
492
+
493
+ if using_default_template:
494
+ eval_logger.warning(
495
+ "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
496
+ "very error-prone, because models are often trained with templates different from the class default! "
497
+ "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
498
+ "point any code depending on them will stop working. We recommend setting a valid chat template before "
499
+ "then to ensure that this model continues working without issues."
500
+ )
501
+
502
+ return selected_template
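To show what the abstract `LM` interface above expects from a subclass, here is a deliberately trivial sketch (the class name and constant outputs are inventions for illustration, not part of the harness):

```python
import random

from lm_eval.api.model import LM


class ToyRandomLM(LM):
    """Illustrative only: random log-likelihoods and a fixed generation per request."""

    def __init__(self, seed: int = 1234) -> None:
        super().__init__()
        self.rng = random.Random(seed)

    def loglikelihood(self, requests):
        # one (logprob, is_greedy) pair per request
        return [(-self.rng.random(), False) for _ in requests]

    def loglikelihood_rolling(self, requests):
        # one summed logprob per request
        return [-self.rng.random() for _ in requests]

    def generate_until(self, requests):
        # one continuation string per request
        return ["lorem ipsum" for _ in requests]
```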
lm-evaluation-harness/lm_eval/api/registry.py ADDED
@@ -0,0 +1,196 @@
1
+ import logging
2
+ from typing import Callable, Dict, Union
3
+
4
+ import evaluate as hf_evaluate
5
+
6
+ from lm_eval.api.model import LM
7
+
8
+
9
+ eval_logger = logging.getLogger(__name__)
10
+
11
+ MODEL_REGISTRY = {}
12
+
13
+
14
+ def register_model(*names):
15
+ # either pass a list or a single alias.
16
+ # function receives them as a tuple of strings
17
+
18
+ def decorate(cls):
19
+ for name in names:
20
+ assert issubclass(cls, LM), (
21
+ f"Model '{name}' ({cls.__name__}) must extend LM class"
22
+ )
23
+
24
+ assert name not in MODEL_REGISTRY, (
25
+ f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
26
+ )
27
+
28
+ MODEL_REGISTRY[name] = cls
29
+ return cls
30
+
31
+ return decorate
32
+
33
+
34
+ def get_model(model_name):
35
+ try:
36
+ return MODEL_REGISTRY[model_name]
37
+ except KeyError:
38
+ raise ValueError(
39
+ f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}"
40
+ )
41
+
42
+
43
+ TASK_REGISTRY = {}
44
+ GROUP_REGISTRY = {}
45
+ ALL_TASKS = set()
46
+ func2task_index = {}
47
+
48
+
49
+ def register_task(name):
50
+ def decorate(fn):
51
+ assert name not in TASK_REGISTRY, (
52
+ f"task named '{name}' conflicts with existing registered task!"
53
+ )
54
+
55
+ TASK_REGISTRY[name] = fn
56
+ ALL_TASKS.add(name)
57
+ func2task_index[fn.__name__] = name
58
+ return fn
59
+
60
+ return decorate
61
+
62
+
63
+ def register_group(name):
64
+ def decorate(fn):
65
+ func_name = func2task_index[fn.__name__]
66
+ if name in GROUP_REGISTRY:
67
+ GROUP_REGISTRY[name].append(func_name)
68
+ else:
69
+ GROUP_REGISTRY[name] = [func_name]
70
+ ALL_TASKS.add(name)
71
+ return fn
72
+
73
+ return decorate
74
+
75
+
76
+ OUTPUT_TYPE_REGISTRY = {}
77
+ METRIC_REGISTRY = {}
78
+ METRIC_AGGREGATION_REGISTRY = {}
79
+ AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
80
+ HIGHER_IS_BETTER_REGISTRY = {}
81
+ FILTER_REGISTRY = {}
82
+
83
+ DEFAULT_METRIC_REGISTRY = {
84
+ "loglikelihood": [
85
+ "perplexity",
86
+ "acc",
87
+ ],
88
+ "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
89
+ "multiple_choice": ["acc", "acc_norm"],
90
+ "generate_until": ["exact_match"],
91
+ }
92
+
93
+
94
+ def register_metric(**args):
95
+ # TODO: do we want to enforce a certain interface to registered metrics?
96
+ def decorate(fn):
97
+ assert "metric" in args
98
+ name = args["metric"]
99
+
100
+ for key, registry in [
101
+ ("metric", METRIC_REGISTRY),
102
+ ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
103
+ ("aggregation", METRIC_AGGREGATION_REGISTRY),
104
+ ]:
105
+ if key in args:
106
+ value = args[key]
107
+ assert value not in registry, (
108
+ f"{key} named '{value}' conflicts with existing registered {key}!"
109
+ )
110
+
111
+ if key == "metric":
112
+ registry[name] = fn
113
+ elif key == "aggregation":
114
+ registry[name] = AGGREGATION_REGISTRY[value]
115
+ else:
116
+ registry[name] = value
117
+
118
+ return fn
119
+
120
+ return decorate
121
+
122
+
123
+ def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
124
+ if not hf_evaluate_metric:
125
+ if name in METRIC_REGISTRY:
126
+ return METRIC_REGISTRY[name]
127
+ else:
128
+ eval_logger.warning(
129
+ f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
130
+ )
131
+
132
+ try:
133
+ metric_object = hf_evaluate.load(name)
134
+ return metric_object.compute
135
+ except Exception:
136
+ eval_logger.error(
137
+ f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
138
+ )
139
+
140
+
141
+ def register_aggregation(name: str):
142
+ def decorate(fn):
143
+ assert name not in AGGREGATION_REGISTRY, (
144
+ f"aggregation named '{name}' conflicts with existing registered aggregation!"
145
+ )
146
+
147
+ AGGREGATION_REGISTRY[name] = fn
148
+ return fn
149
+
150
+ return decorate
151
+
152
+
153
+ def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
154
+ try:
155
+ return AGGREGATION_REGISTRY[name]
156
+ except KeyError:
157
+ eval_logger.warning(f"{name} not a registered aggregation metric!")
158
+
159
+
160
+ def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
161
+ try:
162
+ return METRIC_AGGREGATION_REGISTRY[name]
163
+ except KeyError:
164
+ eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
165
+
166
+
167
+ def is_higher_better(metric_name) -> bool:
168
+ try:
169
+ return HIGHER_IS_BETTER_REGISTRY[metric_name]
170
+ except KeyError:
171
+ eval_logger.warning(
172
+ f"higher_is_better not specified for metric '{metric_name}'!"
173
+ )
174
+
175
+
176
+ def register_filter(name):
177
+ def decorate(cls):
178
+ if name in FILTER_REGISTRY:
179
+ eval_logger.info(
180
+ f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
181
+ )
182
+ FILTER_REGISTRY[name] = cls
183
+ return cls
184
+
185
+ return decorate
186
+
187
+
188
+ def get_filter(filter_name: Union[str, Callable]) -> Callable:
189
+ try:
190
+ return FILTER_REGISTRY[filter_name]
191
+ except KeyError as e:
192
+ if callable(filter_name):
193
+ return filter_name
194
+ else:
195
+ eval_logger.warning(f"filter `{filter_name}` is not registered!")
196
+ raise e
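A hedged sketch of how the registries above compose. The aggregation and metric names here are invented; note that the aggregation must be registered before any metric that refers to it, since `register_metric` resolves the name via `AGGREGATION_REGISTRY` at decoration time:

```python
from lm_eval.api.registry import register_aggregation, register_metric


@register_aggregation("median_of_items")  # hypothetical aggregation name
def median_of_items(items):
    items = sorted(items)
    mid = len(items) // 2
    return items[mid] if len(items) % 2 else (items[mid - 1] + items[mid]) / 2


@register_metric(
    metric="toy_score",  # hypothetical metric name
    higher_is_better=True,
    output_type="generate_until",
    aggregation="median_of_items",
)
def toy_score(items):
    # passthrough, like most per-sample metrics: aggregation happens later
    return items
```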
lm-evaluation-harness/lm_eval/api/samplers.py ADDED
@@ -0,0 +1,232 @@
1
+ import logging
2
+ import warnings
3
+ from functools import partial
4
+ from typing import TYPE_CHECKING, Iterable, Optional, Union
5
+
6
+ import datasets
7
+
8
+
9
+ if TYPE_CHECKING:
10
+ from random import Random
11
+
12
+ from lm_eval.api.task import ConfigurableTask, Task
13
+
14
+ eval_logger = logging.getLogger("lm-eval")
15
+
16
+
17
+ class ContextSampler:
18
+ def __init__(
19
+ self,
20
+ docs: list[dict],
21
+ task: Union["Task", "ConfigurableTask"],
22
+ fewshot_indices: Optional[Iterable] = None,
23
+ rnd: Optional["Random"] = None,
24
+ ) -> None:
25
+ self.rnd = rnd
26
+ if not self.rnd:
27
+ raise ValueError(
28
+ "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
29
+ )
30
+
31
+ self.task = task
32
+ self.config = task._config
33
+
34
+ self.target_delimiter = self.config.target_delimiter
35
+ self.fewshot_delimiter = self.config.fewshot_delimiter
36
+
37
+ if (
38
+ self.config.fewshot_config is not None
39
+ and self.config.fewshot_config.get("doc_to_text", None) is not None
40
+ ):
41
+ self.doc_to_text = partial(
42
+ self.task.doc_to_text,
43
+ doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
44
+ )
45
+ else:
46
+ self.doc_to_text = self.task.doc_to_text
47
+
48
+ if (
49
+ self.config.fewshot_config is not None
50
+ and self.config.fewshot_config.get("doc_to_target", None) is not None
51
+ ):
52
+ self.doc_to_target = partial(
53
+ self.task.doc_to_target,
54
+ doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
55
+ )
56
+ else:
57
+ self.doc_to_target = self.task.doc_to_target
58
+
59
+ if (
60
+ self.config.fewshot_config is not None
61
+ and self.config.fewshot_config.get("doc_to_choice", None) is not None
62
+ ):
63
+ self.doc_to_choice = partial(
64
+ self.task.doc_to_choice,
65
+ doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
66
+ )
67
+ else:
68
+ self.doc_to_choice = self.task.doc_to_choice
69
+
70
+ self.docs = docs # HF dataset split, provided by task._fewshot_docs()
71
+ if fewshot_indices: # subset few-shot docs from
72
+ if not isinstance(self.docs, datasets.Dataset):
73
+ raise ValueError(
74
+ "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
75
+ )
76
+ self.docs = self.docs.select(fewshot_indices)
77
+
78
+ def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str = None):
79
+ # draw an extra fewshot sample if using same split as evaluating on
80
+ prefix = gen_prefix + " " if gen_prefix else ""
81
+ n_samples = (
82
+ num_fewshot + 1
83
+ if self.config.fewshot_split == self.config.test_split
84
+ else num_fewshot
85
+ )
86
+
87
+ # draw `n_samples` docs from fewshot_docs
88
+ fewshotex = self.sample(n_samples)
89
+
90
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
91
+ # TODO: should we just stop people from using fewshot from same split as evaluating?
92
+ selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
93
+
94
+ labeled_examples = ""
95
+ for doc in selected_docs:
96
+ doc_content = self.doc_to_text(doc)
97
+ doc_target = self.doc_to_target(doc)
98
+ if self.config.doc_to_choice is None or isinstance(doc_content, str):
99
+ labeled_examples += doc_content
100
+ else:
101
+ labeled_examples += self.doc_to_choice(doc)[doc_content]
102
+
103
+ if doc_target != "":
104
+ if self.target_delimiter.isspace() and str(doc_target)[0].isspace():
105
+ # TODO: add logger warn once here.
106
+ warnings.warn(
107
+ "Both target_delimiter and target start with a space. This may cause issues.",
108
+ Warning,
109
+ stacklevel=2,
110
+ )
111
+ labeled_examples += self.target_delimiter
112
+ labeled_examples += prefix
113
+ labeled_examples += (
114
+ str(doc_target[0])
115
+ if isinstance(doc_target, list)
116
+ else doc_target
117
+ if self.config.doc_to_choice is None or isinstance(doc_target, str)
118
+ else str(self.doc_to_choice(doc)[doc_target])
119
+ )
120
+ labeled_examples += self.fewshot_delimiter
121
+
122
+ return labeled_examples
123
+
124
+ def get_chat_context(
125
+ self,
126
+ doc: dict,
127
+ num_fewshot: int,
128
+ fewshot_as_multiturn: bool = False,
129
+ gen_prefix: Optional[str] = None,
130
+ ):
131
+ # TODO: Do we need any other delimiter
132
+ prefix = gen_prefix + " " if gen_prefix else ""
133
+ chat_history = []
134
+ # draw an extra fewshot sample if using same split as evaluating on
135
+ n_samples = (
136
+ num_fewshot + 1
137
+ if self.config.fewshot_split == self.config.test_split
138
+ else num_fewshot
139
+ )
140
+ # draw `n_samples` docs from fewshot_docs
141
+ fewshotex = self.sample(n_samples)
142
+
143
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
144
+ # TODO: should we just stop people from using fewshot from same split as evaluating?
145
+ selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
146
+
147
+ if fewshot_as_multiturn:
148
+ for doc in selected_docs:
149
+ doc_content = self.doc_to_text(doc)
150
+ doc_target = self.doc_to_target(doc)
151
+ chat_history.append(
152
+ {
153
+ "role": "user",
154
+ "content": doc_content
155
+ if self.config.doc_to_choice is None
156
+ or isinstance(doc_content, str)
157
+ else self.doc_to_choice(doc)[doc_content],
158
+ }
159
+ )
160
+ chat_history.append(
161
+ {
162
+ "role": "assistant",
163
+ "content": prefix + str(doc_target[0])
164
+ if isinstance(doc_target, list)
165
+ else prefix + doc_target
166
+ if self.config.doc_to_choice is None
167
+ or isinstance(doc_target, str)
168
+ else prefix + str(self.doc_to_choice(doc)[doc_target]),
169
+ }
170
+ )
171
+ else:
172
+ # get fewshot context as one user turn
173
+ chat_history.append(
174
+ {
175
+ "role": "user",
176
+ "content": self.get_context(
177
+ doc, num_fewshot, gen_prefix=gen_prefix
178
+ ),
179
+ }
180
+ )
181
+
182
+ return chat_history
183
+
184
+ def sample(self, n: int):
185
+ """
186
+ Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
187
+ """
188
+
189
+ return self.rnd.sample(self.docs, n)
190
+
191
+
192
+ class FirstNSampler(ContextSampler):
193
+ def sample(self, n: int):
194
+ """
195
+ Draw the first `n` samples in order from the specified split.
196
+ Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
197
+ """
198
+ assert n <= len(self.docs), (
199
+ f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
200
+ )
201
+ return self.docs[:n]
202
+
203
+
204
+ class BalancedSampler(ContextSampler):
205
+ def sample(self, n: int) -> None:
206
+ """
207
+ TODO: this should return approximately class-balanced samples from our fewshot examples.
208
+ TODO: what order should they be in? maybe random?
209
+ """
210
+
211
+ pass
212
+
213
+
214
+ class ManualSampler(ContextSampler):
215
+ def sample(self, n: int) -> None:
216
+ """ """
217
+ pass
218
+
219
+
220
+ SAMPLER_REGISTRY = {
221
+ "default": ContextSampler,
222
+ "first_n": FirstNSampler,
223
+ }
224
+
225
+
226
+ def get_sampler(name: str):
227
+ try:
228
+ return SAMPLER_REGISTRY[name]
229
+ except KeyError:
230
+ raise ValueError(
231
+ f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}"
232
+ )
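Finally, a short hedged sketch of how the sampler registry above is consumed; in the harness the docs, task object, and RNG come from `ConfigurableTask`, so they are only placeholders here:

```python
import random

from lm_eval.api.samplers import get_sampler

# "first_n" keeps a canonical ordering of shots (MMLU-style); "default" samples randomly.
sampler_cls = get_sampler("first_n")

# Constructing one requires a task object and its few-shot docs, e.g. (placeholders):
# sampler = sampler_cls(docs=fewshot_docs, task=task, rnd=random.Random(1234))
# context = sampler.get_context(doc, num_fewshot=5)
```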
lm-evaluation-harness/lm_eval/api/task.py ADDED
@@ -0,0 +1,1885 @@
1
+ import abc
2
+ import ast
3
+ import logging
4
+ import random
5
+ import re
6
+ from collections.abc import Callable
7
+ from copy import deepcopy
8
+ from dataclasses import asdict, dataclass
9
+ from inspect import getsource
10
+ from typing import (
11
+ Any,
12
+ Dict,
13
+ Iterable,
14
+ Iterator,
15
+ List,
16
+ Literal,
17
+ Mapping,
18
+ Optional,
19
+ Tuple,
20
+ Union,
21
+ )
22
+
23
+ import datasets
24
+ import numpy as np
25
+ from tqdm import tqdm
26
+
27
+ from lm_eval import utils
28
+ from lm_eval.api import samplers
29
+ from lm_eval.api.instance import Instance, OutputType
30
+ from lm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity
31
+ from lm_eval.api.registry import (
32
+ AGGREGATION_REGISTRY,
33
+ DEFAULT_METRIC_REGISTRY,
34
+ get_aggregation,
35
+ get_metric,
36
+ get_metric_aggregation,
37
+ is_higher_better,
38
+ )
39
+ from lm_eval.caching.cache import load_from_cache, save_to_cache
40
+ from lm_eval.filters import build_filter_ensemble
41
+ from lm_eval.prompts import get_prompt
42
+
43
+
44
+ ALL_OUTPUT_TYPES = [
45
+ "loglikelihood",
46
+ "multiple_choice",
47
+ "loglikelihood_rolling",
48
+ "generate_until",
49
+ ]
50
+
51
+ eval_logger = logging.getLogger(__name__)
52
+
53
+
54
+ @dataclass
55
+ class TaskConfig(dict):
56
+ # task naming/registry
57
+ task: Optional[str] = None
58
+ task_alias: Optional[str] = None
59
+ tag: Optional[Union[str, list]] = None
60
+ # HF dataset options.
61
+ # which dataset to use,
62
+ # and what splits for what purpose
63
+ custom_dataset: Optional[Callable] = None
64
+ dataset_path: Optional[str] = None
65
+ dataset_name: Optional[str] = None
66
+ dataset_kwargs: Optional[dict] = None
67
+ training_split: Optional[str] = None
68
+ validation_split: Optional[str] = None
69
+ test_split: Optional[str] = None
70
+ fewshot_split: Optional[str] = (
71
+ None # TODO: assert that this is not None if num_fewshot > 0 (?); assert whether this is the same split as the one being evaluated (?)
72
+ )
73
+ # formatting / prompting options.
74
+ # see docs/advanced_task_guide.md for more info
75
+ process_docs: Optional[Callable] = None
76
+ doc_to_text: Optional[Union[Callable, str]] = None
77
+ doc_to_target: Optional[Union[Callable, str]] = None
78
+ doc_to_image: Union[Callable, str] = None
79
+ doc_to_audio: Union[Callable, str] = None
80
+ unsafe_code: bool = False
81
+ doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
82
+ process_results: Optional[Union[Callable, str]] = None
83
+ use_prompt: Optional[str] = None
84
+ description: str = ""
85
+ target_delimiter: str = " "
86
+ fewshot_delimiter: str = "\n\n"
87
+ fewshot_config: Optional[dict] = None
88
+ # runtime configuration options
89
+ num_fewshot: Optional[int] = None
90
+ # scoring options
91
+ metric_list: Optional[list] = None
92
+ output_type: OutputType = "generate_until"
93
+ generation_kwargs: Optional[dict] = None
94
+ repeats: int = 1
95
+ filter_list: Optional[Union[str, list]] = None
96
+ should_decontaminate: bool = False
97
+ doc_to_decontamination_query: Optional[str] = None
98
+ gen_prefix: Optional[str] = None
99
+ metadata: Optional[dict] = (
100
+ None # by default, not used in the code. allows for users to pass arbitrary info to tasks
101
+ )
102
+
103
+ def __post_init__(self) -> None:
104
+ if self.generation_kwargs is not None:
105
+ if self.output_type != "generate_until":
106
+ eval_logger.warning(
107
+ f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
108
+ )
109
+
110
+ if "temperature" in self.generation_kwargs:
111
+ self.generation_kwargs["temperature"] = float(
112
+ self.generation_kwargs["temperature"]
113
+ )
114
+
115
+ if "until" not in self.generation_kwargs:
116
+ eval_logger.warning(
117
+ f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={repr(self.fewshot_delimiter)}"
118
+ )
119
+ self.generation_kwargs["until"] = [self.fewshot_delimiter]
120
+ else:
121
+ if self.output_type == "generate_until":
122
+ # ensure that we greedily generate in absence of explicit arguments otherwise
123
+ self.generation_kwargs = {
124
+ "until": (
125
+ None
126
+ if self.fewshot_delimiter is None
127
+ else [self.fewshot_delimiter]
128
+ ),
129
+ "do_sample": False,
130
+ "temperature": 0,
131
+ }
132
+ eval_logger.warning(
133
+ f"{self.task}: No `generation_kwargs` specified in task config, defaulting to {self.generation_kwargs}"
134
+ )
135
+
136
+ def __getitem__(self, item):
137
+ return getattr(self, item)
138
+
139
+ def __setitem__(self, item, value):
140
+ return setattr(self, item, value)
141
+
142
+ def to_dict(self, keep_callable: bool = False) -> dict:
143
+ """dumps the current config as a dictionary object, as a printable format.
144
+ null fields will not be printed.
145
+ Used for dumping results alongside full task configuration
146
+
147
+ :return: dict
148
+ A printable dictionary version of the TaskConfig object.
149
+
150
+ # TODO: should any default value in the TaskConfig not be printed?
151
+ """
152
+ cfg_dict = asdict(self)
153
+ # remove values that are `None`
154
+ for k, v in list(cfg_dict.items()):
155
+ if v is None:
156
+ cfg_dict.pop(k)
157
+ elif k == "metric_list":
158
+ for metric_dict in v:
159
+ for metric_key, metric_value in metric_dict.items():
160
+ if callable(metric_value):
161
+ metric_dict[metric_key] = self.serialize_function(
162
+ metric_value, keep_callable=keep_callable
163
+ )
164
+ cfg_dict[k] = v
165
+ elif callable(v):
166
+ cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
167
+ return cfg_dict
168
+
169
+ def serialize_function(
170
+ self, value: Union[Callable, str], keep_callable=False
171
+ ) -> Union[Callable, str]:
172
+ """Serializes a given function or string.
173
+
174
+ If 'keep_callable' is True, the original callable is returned.
175
+ Otherwise, attempts to return the source code of the callable using 'getsource'.
176
+ """
177
+ if keep_callable:
178
+ return value
179
+ else:
180
+ try:
181
+ return getsource(value)
182
+ except (TypeError, OSError):
183
+ return str(value)
184
+
185
+
186
+ class Task(abc.ABC):
187
+ """A task represents an entire benchmark including its dataset, problems,
188
+ answers, and evaluation methods. See BoolQ for a simple example implementation
189
+
190
+ A `doc` can be any python object which represents one instance of evaluation.
191
+ This is usually a dictionary e.g.
192
+ {"question": ..., "answer": ...} or
193
+ {"question": ..., question, answer)
194
+ """
195
+
196
+ VERSION: Optional[Union[int, str]] = None
197
+
198
+ # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
199
+ # or a path to a custom `datasets` loading script.
200
+ DATASET_PATH: Optional[str] = None
201
+
202
+ # The name of a subset within `DATASET_PATH`.
203
+ DATASET_NAME: Optional[str] = None
204
+
205
+ OUTPUT_TYPE: Optional[OutputType] = None
206
+
207
+ def __init__(
208
+ self,
209
+ data_dir: Optional[str] = None,
210
+ cache_dir: Optional[str] = None,
211
+ download_mode: Optional[datasets.DownloadMode] = None,
212
+ config: Optional[Mapping] = None, # Union[dict, TaskConfig]
213
+ ) -> None:
214
+ """
215
+ :param data_dir: str
216
+ Stores the path to a local folder containing the `Task`'s data files.
217
+ Use this to specify the path to manually downloaded data (usually when
218
+ the dataset is not publicly accessible).
219
+ :param cache_dir: str
220
+ The directory to read/write the `Task` dataset. This follows the
221
+ HuggingFace `datasets` API with the default cache directory located at:
222
+ `~/.cache/huggingface/datasets`
223
+ NOTE: You can change the cache location globally for a given process
224
+ to another directory:
225
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
226
+ :param download_mode: datasets.DownloadMode
227
+ How to treat pre-existing `Task` downloads and data.
228
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
229
+ Reuse download and reuse dataset.
230
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
231
+ Reuse download with fresh dataset.
232
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
233
+ Fresh download and fresh dataset.
234
+ """
235
+ self.download(data_dir, cache_dir, download_mode)
236
+ self._training_docs: Optional[list] = None
237
+ self._fewshot_docs: Optional[list] = None
238
+ self._instances: Optional[List[Instance]] = None
239
+
240
+ self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
241
+
242
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
243
+ self.fewshot_rnd: Optional[random.Random] = (
244
+ None # purposely induce errors in case of improper usage
245
+ )
246
+
247
+ def download(
248
+ self,
249
+ data_dir: Optional[str] = None,
250
+ cache_dir: Optional[str] = None,
251
+ download_mode=None,
252
+ ) -> None:
253
+ """Downloads and returns the task dataset.
254
+ Override this method to download the dataset from a custom API.
255
+
256
+ :param data_dir: str
257
+ Stores the path to a local folder containing the `Task`'s data files.
258
+ Use this to specify the path to manually downloaded data (usually when
259
+ the dataset is not publicly accessible).
260
+ :param cache_dir: str
261
+ The directory to read/write the `Task` dataset. This follows the
262
+ HuggingFace `datasets` API with the default cache directory located at:
263
+ `~/.cache/huggingface/datasets`
264
+ NOTE: You can change the cache location globally for a given process
265
+ by setting the shell environment variable, `HF_DATASETS_CACHE`,
266
+ to another directory:
267
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
268
+ :param download_mode: datasets.DownloadMode
269
+ How to treat pre-existing `Task` downloads and data.
270
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
271
+ Reuse download and reuse dataset.
272
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
273
+ Reuse download with fresh dataset.
274
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
275
+ Fresh download and fresh dataset.
276
+ """
277
+ self.dataset = datasets.load_dataset(
278
+ path=self.DATASET_PATH,
279
+ name=self.DATASET_NAME,
280
+ data_dir=data_dir,
281
+ cache_dir=cache_dir,
282
+ download_mode=download_mode,
283
+ )
284
+
285
+ @property
286
+ def config(self) -> TaskConfig:
287
+ """Returns the TaskConfig associated with this class."""
288
+ return self._config
289
+
290
+ @abc.abstractmethod
291
+ def has_training_docs(self):
292
+ """Whether the task has a training set"""
293
+ pass
294
+
295
+ @abc.abstractmethod
296
+ def has_validation_docs(self):
297
+ """Whether the task has a validation set"""
298
+ pass
299
+
300
+ @abc.abstractmethod
301
+ def has_test_docs(self):
302
+ """Whether the task has a test set"""
303
+ pass
304
+
305
+ def training_docs(self) -> Iterable:
306
+ """
307
+ :return: Iterable[obj]
308
+ An iterable of any object that doc_to_text can handle
309
+ """
310
+ return []
311
+
312
+ def validation_docs(self) -> Iterable:
313
+ """
314
+ :return: Iterable[obj]
315
+ An iterable of any object that doc_to_text can handle
316
+ """
317
+ return []
318
+
319
+ def test_docs(self) -> Iterable:
320
+ """
321
+ :return: Iterable[obj]
322
+ An iterable of any object that doc_to_text can handle
323
+ """
324
+ return []
325
+
326
+ def fewshot_docs(self) -> Iterable:
327
+ """
328
+ :return: Iterable[obj]
329
+ An iterable of any object that doc_to_text can handle
330
+ """
331
+ if self.has_training_docs():
332
+ return self.training_docs()
333
+ elif self.has_validation_docs():
334
+ return self.validation_docs()
335
+ else:
336
+ if self.config.get("num_fewshot", 0) > 0:
337
+ eval_logger.warning(
338
+ f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
339
+ ", using test_docs as fewshot_docs but this is not recommended."
340
+ )
341
+ return self.test_docs()
342
+
343
+ def _process_doc(self, doc: dict) -> dict:
344
+ """
345
+ Override this to process (detokenize, strip, replace, etc.) individual
346
+ documents. This can be used in a map over documents of a data split.
347
+ E.g. `map(self._process_doc, self.dataset["validation"])`
348
+
349
+ :return: dict
350
+ The processed version of the specified `doc`.
351
+ """
352
+ return doc
353
+
354
+ @property
355
+ def instances(self) -> List[Instance]:
356
+ """After calling `task.build_all_requests()`, tasks
357
+ maintain a list of the dataset instances which will be evaluated.
358
+ """
359
+ return self._instances
360
+
361
+ def fewshot_examples(self, k, rnd):
362
+ if self._training_docs is None:
363
+ self._training_docs = list(self.training_docs())
364
+
365
+ return rnd.sample(self._training_docs, k)
366
+
367
+ def doc_to_decontamination_query(self, doc):
368
+ raise NotImplementedError(
369
+ "Override doc_to_decontamination_query with document specific decontamination query."
370
+ )
371
+
372
+ @abc.abstractmethod
373
+ def doc_to_text(self, doc):
374
+ pass
375
+
376
+ @abc.abstractmethod
377
+ def doc_to_target(self, doc):
378
+ pass
379
+
380
+ # not an abstractmethod because not every language-only task has to implement this
381
+ def doc_to_image(self, doc):
382
+ raise NotImplementedError
383
+
384
+ def doc_to_audio(self, doc):
385
+ raise NotImplementedError
386
+
387
+ def doc_to_prefix(self, doc):
388
+ return ""
389
+
390
+ def build_all_requests(
391
+ self,
392
+ *,
393
+ limit: Union[int, None] = None,
394
+ samples: Optional[List[int]] = None,
395
+ rank: int = 0,
396
+ world_size: int = 1,
397
+ cache_requests: bool = False,
398
+ rewrite_requests_cache: bool = False,
399
+ system_instruction: Optional[str] = None,
400
+ apply_chat_template: bool = False,
401
+ fewshot_as_multiturn: bool = False,
402
+ chat_template: Optional[Callable] = None,
403
+ tokenizer_name: str = "",
404
+ ) -> None:
405
+ """Build a set of Instances for a task, and store them in task.instances"""
406
+
407
+ # used with caching
408
+ og_limit = limit
409
+
410
+ cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
411
+ cache_key += "-chat_template" if apply_chat_template else ""
412
+ cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
413
+ cache_key += (
414
+ f"-system_prompt_hash{utils.hash_string(system_instruction)}"
415
+ if system_instruction is not None
416
+ else ""
417
+ )
418
+ cache_key += f"-tokenizer{tokenizer_name}"
419
+
420
+ cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)
421
+
422
+ if cache_requests and cached_instances and not rewrite_requests_cache:
423
+ cached_instances = cached_instances[:limit]
424
+
425
+ flattened_instances = [
426
+ instance
427
+ for instance_group in cached_instances
428
+ for instance in instance_group
429
+ ]
430
+
431
+ self._instances = flattened_instances
432
+ return
433
+
434
+ eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
435
+
436
+ instances = []
437
+
438
+ # process all documents when caching is specified for simplicity
439
+ if (
440
+ cache_requests
441
+ and (not cached_instances or rewrite_requests_cache)
442
+ and limit is not None
443
+ ):
444
+ limit = None
445
+
446
+ doc_id_docs = list(
447
+ self.doc_iterator(
448
+ rank=rank, limit=limit, samples=samples, world_size=world_size
449
+ )
450
+ )
451
+
452
+ num_docs = len(doc_id_docs)
453
+
454
+ for doc_id, doc in tqdm(
455
+ doc_id_docs,
456
+ total=num_docs,
457
+ ):
458
+ # sample fewshot context #TODO: need to offset doc_id by rank now!
459
+ fewshot_ctx = self.fewshot_context(
460
+ doc,
461
+ num_fewshot=0
462
+ if self.config.num_fewshot is None
463
+ else self.config.num_fewshot,
464
+ system_instruction=system_instruction,
465
+ apply_chat_template=apply_chat_template,
466
+ fewshot_as_multiturn=fewshot_as_multiturn,
467
+ chat_template=chat_template,
468
+ gen_prefix=self.doc_to_prefix(doc),
469
+ )
470
+
471
+ # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
472
+ inst = self.construct_requests(
473
+ doc=doc,
474
+ ctx=fewshot_ctx,
475
+ metadata=(self.config["task"], doc_id, self.config.repeats),
476
+ apply_chat_template=apply_chat_template,
477
+ chat_template=chat_template,
478
+ )
479
+
480
+ if not isinstance(inst, list):
481
+ inst = [inst]
482
+
483
+ instances.append(inst)
484
+
485
+ # now flatten; this allows slicing to work with pickles
486
+
487
+ sliced_instances = instances[:og_limit]
488
+
489
+ flattened_instances = [
490
+ instance
491
+ for instance_group in sliced_instances
492
+ for instance in instance_group
493
+ ]
494
+
495
+ self._instances = flattened_instances
496
+
497
+ if len(self._instances) == 0:
498
+ raise ValueError("task.build_requests() did not find any docs!")
499
+
500
+ if cache_requests and (not cached_instances or rewrite_requests_cache):
501
+ save_to_cache(file_name=cache_key, obj=instances)
502
+
503
+ @abc.abstractmethod
504
+ def construct_requests(self, doc, ctx, **kwargs):
505
+ """Uses RequestFactory to construct Requests and returns an iterable of
506
+ Requests which will be sent to the LM.
507
+
508
+ :param doc:
509
+ The document as returned from training_docs, validation_docs, or test_docs.
510
+ :param ctx: str
511
+ The context string, generated by fewshot_context. This includes the natural
512
+ language description, as well as the few shot examples, and the question
513
+ part of the document for `doc`.
514
+ :param doc_idx: int
515
+ The index of a document within `self.test_docs()` or `self.validation_docs()`,
516
+ whichever is the main split used.
517
+ :param repeats: int
518
+ TODO: update this docstring
519
+ The number of times each instance in a dataset is inferred on. Defaults to 1,
520
+ can be increased for techniques like majority voting.
521
+ """
522
+ pass
523
+
524
+ @abc.abstractmethod
525
+ def process_results(self, doc, results):
526
+ """Take a single document and the LM results and evaluates, returning a
527
+ dict where keys are the names of submetrics and values are the values of
528
+ the metric for that one document
529
+
530
+ :param doc:
531
+ The document as returned from training_docs, validation_docs, or test_docs.
532
+ :param results:
533
+ The results of the requests created in construct_requests.
534
+ """
535
+ pass
536
+
537
+ @abc.abstractmethod
538
+ def aggregation(self):
539
+ """
540
+ :returns: {str: [metric_score] -> float}
541
+ A dictionary where keys are the names of submetrics and values are
542
+ functions that aggregate a list of metric scores
543
+ """
544
+ pass
545
+
546
+ @abc.abstractmethod
547
+ def higher_is_better(self):
548
+ """
549
+ :returns: {str: bool}
550
+ A dictionary where keys are the names of submetrics and values are
551
+ whether a higher value of the submetric is better
552
+ """
553
+ pass
554
+
555
+ def get_config(self, key: str) -> Any:
556
+ return getattr(self._config, key, None)
557
+
558
+ @classmethod
559
+ def count_bytes(cls, doc):
560
+ """Used for byte-level perplexity metrics in rolling loglikelihood"""
561
+ return len(doc.encode("utf-8"))
562
+
563
+ @classmethod
564
+ def count_words(cls, doc):
565
+ """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
566
+ return len(re.split(r"\s+", doc))
567
+
568
+ @utils.positional_deprecated
569
+ def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs):
570
+ """Returns a fewshot context string that is made up of a prepended description
571
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
572
+
573
+ :param doc: str
574
+ The document as returned from training_docs, validation_docs, or test_docs.
575
+ :param num_fewshot: int
576
+ The number of fewshot examples to provide in the returned context string.
577
+ :param rnd: random.Random
578
+ The pseudo-random number generator used to randomly sample examples.
579
+ WARNING: This is currently a required arg although it's optionalized with a default `None`.
580
+ :param description: str
581
+ The task's description that will be prepended to the fewshot examples.
582
+ :returns: str
583
+ The fewshot context.
584
+ """
585
+ if rnd is None:
586
+ if self.fewshot_rnd is not None:
587
+ rnd = self.fewshot_rnd
588
+ else:
589
+ raise ValueError(
590
+ "A `random.Random` generator argument must be provided to `rnd`"
591
+ )
592
+
593
+ description = description if description else ""
594
+
595
+ if num_fewshot == 0:
596
+ labeled_examples = ""
597
+ else:
598
+ # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
599
+ if self.has_training_docs():
600
+ fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
601
+ else:
602
+ if self._fewshot_docs is None:
603
+ self._fewshot_docs = list(
604
+ self.validation_docs()
605
+ if self.has_validation_docs()
606
+ else self.test_docs()
607
+ )
608
+
609
+ fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
610
+
611
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
612
+ fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
613
+
614
+ labeled_examples = (
615
+ "\n\n".join(
616
+ [
617
+ self.doc_to_text(doc) + self.doc_to_target(doc)
618
+ for doc in fewshotex
619
+ ]
620
+ )
621
+ + "\n\n"
622
+ )
623
+
624
+ example = self.doc_to_text(doc)
625
+ return description + labeled_examples + example
626
+
627
+ def apply_filters(self) -> Optional[List[Instance]]:
628
+ """Iterates over FilterEnsembles and applies them to instances"""
629
+ if hasattr(self, "_filters"):
630
+ for f in self._filters:
631
+ f.apply(self._instances)
632
+ else:
633
+ eval_logger.warning("No filter defined, passing through instances")
634
+ return self._instances
635
+
636
+ def dump_config(self) -> dict:
637
+ """Returns the config as a dictionary."""
638
+ # TODO: this should only return the overrides applied to a non-YAML task's configuration.
639
+ # (num_fewshot)
640
+ return self.config.to_dict()
641
+
642
+ def set_config(self, key: str, value: Any, update: bool = False) -> None:
643
+ """Set or update the configuration for a given key."""
644
+ if key is None:
645
+ raise ValueError("Key must be provided.")
646
+
647
+ if update:
648
+ current_value = getattr(self._config, key, {})
649
+ if not isinstance(current_value, dict):
650
+ raise TypeError(
651
+ f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
652
+ )
653
+ current_value.update(value)
654
+ else:
655
+ setattr(self._config, key, value)
656
+
657
+ def override_metric(self, metric_name: str) -> None:
658
+ """
659
+ Override the default metrics used for evaluation with custom metrics.
660
+
661
+ Parameters:
662
+ - metric_name (str): The name of the custom metric to use as the override. It must be registered in api.metrics.
663
+ """
664
+ (
665
+ self._metric_fn_list,
666
+ self._aggregation_list,
667
+ self._metric_fn_kwargs,
668
+ self._higher_is_better,
669
+ ) = ({}, {}, {}, {})
670
+ self._metric_fn_list[metric_name] = get_metric(metric_name)
671
+ self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
672
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
673
+ self._metric_fn_kwargs[metric_name] = {}
674
+ if not isinstance(self, ConfigurableTask):
675
+ self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
676
+ self.aggregation = lambda: {
677
+ metric_name: get_metric_aggregation(metric_name)
678
+ }
679
+ setattr(self._config, "metric_list", [{"metric": metric_name}])
680
+ setattr(self._config, "process_results", None)
681
+
682
+ def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
683
+ self.fewshot_rnd = random.Random(seed)
684
+ if hasattr(self, "sampler"):
685
+ self.sampler.rnd = self.fewshot_rnd
686
+
687
+ @property
688
+ def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
689
+ if self.has_test_docs():
690
+ return self.test_docs()
691
+ elif self.has_validation_docs():
692
+ return self.validation_docs()
693
+ else:
694
+ raise ValueError(
695
+ f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
696
+ )
697
+
698
+ def doc_iterator(
699
+ self,
700
+ *,
701
+ rank: int = 0,
702
+ limit: Union[int, None] = None,
703
+ world_size: int = 1,
704
+ samples: Optional[List[int]] = None,
705
+ ) -> Iterator[Tuple[int, Any]]:
706
+ if samples:
707
+ n = len(self.eval_docs)
708
+ assert all([e < n for e in samples]), (
709
+ f"Elements of --samples should be in the interval [0,k-1] where k is the number of total examples. In this case, k={n}."
710
+ )
711
+ eval_logger.info(
712
+ f"{self.config.task}: Evaluating on {len(samples)} examples"
713
+ )
714
+ doc_iterator = utils.create_iterator(
715
+ enumerate(x for i, x in enumerate(self.eval_docs) if i in samples),
716
+ rank=int(rank),
717
+ limit=None, # limit does not matter here since we are selecting samples directly
718
+ world_size=int(world_size),
719
+ )
720
+ else:
721
+ limit = int(limit) if limit else None
722
+ doc_iterator = utils.create_iterator(
723
+ enumerate(self.eval_docs),
724
+ rank=int(rank),
725
+ limit=limit,
726
+ world_size=int(world_size),
727
+ )
728
+ return doc_iterator
729
+
730
+
731
+ class ConfigurableTask(Task):
732
+ VERSION = "Yaml"
733
+ OUTPUT_TYPE = None
734
+ CONFIG = None
735
+
736
+ def __init__(
737
+ self,
738
+ data_dir=None,
739
+ cache_dir=None,
740
+ download_mode=None,
741
+ config: Optional[dict] = None,
742
+ ) -> None: # TODO no super() call here
743
+ # Get pre-configured attributes
744
+ self._config = self.CONFIG
745
+
746
+ # Use new configurations if there was no preconfiguration
747
+ if self.config is None:
748
+ self._config = TaskConfig(**config)
749
+ # Overwrite configs
750
+ else:
751
+ if config is not None:
752
+ self._config.__dict__.update(config)
753
+
754
+ if self.config is None:
755
+ raise ValueError(
756
+ "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
757
+ )
758
+
759
+ if isinstance(self.config.metadata, dict):
760
+ if "version" in self.config.metadata:
761
+ self.VERSION = self.config.metadata["version"]
762
+
763
+ if self.config.output_type is not None:
764
+ if self.config.output_type not in ALL_OUTPUT_TYPES:
765
+ raise ValueError(
766
+ f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'"
767
+ )
768
+ self.OUTPUT_TYPE = self.config.output_type
769
+
770
+ if self.config.doc_to_image is not None:
771
+ # mark the task as requiring multimodality.
772
+ self.MULTIMODAL = True
773
+
774
+ if self.config.doc_to_audio:
775
+ # mark the task as requiring multimodality.
776
+ self.MULTIMODAL = True
777
+
778
+ if self.config.unsafe_code is not False:
779
+ self.UNSAFE_CODE = True
780
+
781
+ if self.config.dataset_path is not None:
782
+ self.DATASET_PATH = self.config.dataset_path
783
+
784
+ if self.config.dataset_name is not None:
785
+ self.DATASET_NAME = self.config.dataset_name
786
+
787
+ self._metric_fn_list = {}
788
+ self._metric_fn_kwargs = {}
789
+ self._aggregation_list = {}
790
+ self._higher_is_better = {}
791
+
792
+ if self.config.metric_list is None:
793
+ # TODO: handle this in TaskConfig.__post_init__ ?
794
+ _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
795
+
796
+ for metric_name in _metric_list:
797
+ self._metric_fn_list[metric_name] = get_metric(metric_name)
798
+ self._metric_fn_kwargs[metric_name] = {}
799
+ self._aggregation_list[metric_name] = get_metric_aggregation(
800
+ metric_name
801
+ )
802
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
803
+ else:
804
+ for metric_config in self.config.metric_list:
805
+ if "metric" not in metric_config:
806
+ raise ValueError(
807
+ "'metric' key not provided for an entry in 'metric_list', must be specified!"
808
+ )
809
+ metric_name = metric_config["metric"]
810
+ kwargs = {
811
+ key: metric_config[key]
812
+ for key in metric_config
813
+ if key
814
+ not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
815
+ }
816
+ hf_evaluate_metric = (
817
+ "hf_evaluate" in metric_config
818
+ and metric_config["hf_evaluate"] is True
819
+ )
820
+
821
+ if self.config.process_results is not None:
822
+ self._metric_fn_list[metric_name] = None
823
+ self._metric_fn_kwargs[metric_name] = {}
824
+ elif callable(metric_name):
825
+ metric_fn = metric_name.__call__
826
+ metric_name = metric_name.__name__
827
+ self._metric_fn_list[metric_name] = metric_fn
828
+ self._metric_fn_kwargs[metric_name] = kwargs
829
+ else:
830
+ self._metric_fn_list[metric_name] = get_metric(
831
+ metric_name, hf_evaluate_metric
832
+ )
833
+ self._metric_fn_kwargs[metric_name] = kwargs
834
+
835
+ if "aggregation" in metric_config:
836
+ agg_name = metric_config["aggregation"]
837
+ if isinstance(agg_name, str):
838
+ self._aggregation_list[metric_name] = get_aggregation(agg_name)
839
+ elif callable(agg_name): # noqa: E721
840
+ self._aggregation_list[metric_name] = metric_config[
841
+ "aggregation"
842
+ ]
843
+ else:
844
+ INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
845
+ metric_agg = get_metric_aggregation(metric_name)
846
+ eval_logger.warning(
847
+ f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. "
848
+ f"using default "
849
+ f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
850
+ )
851
+ self._aggregation_list[metric_name] = metric_agg
852
+
853
+ if "higher_is_better" in metric_config:
854
+ self._higher_is_better[metric_name] = metric_config[
855
+ "higher_is_better"
856
+ ]
857
+ else:
858
+ eval_logger.warning(
859
+ f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. "
860
+ f"using default "
861
+ f"higher_is_better={is_higher_better(metric_name)}"
862
+ )
863
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
864
+
865
+ self.download(self.config.dataset_kwargs)
866
+ self._training_docs = None
867
+ self._fewshot_docs = None
868
+
869
+ if self.config.filter_list is not None:
870
+ self._filters = []
871
+ for filter_config in self.config.filter_list:
872
+ filter_name = filter_config["name"]
873
+ filter_functions = filter_config["filter"]
874
+ components = []
875
+ for function in filter_functions:
876
+ kwargs = {
877
+ key: function[key] for key in function if key != "function"
878
+ }
879
+ components.append([function["function"], kwargs])
880
+ filter_pipeline = build_filter_ensemble(filter_name, components)
881
+ self._filters.append(filter_pipeline)
882
+ else:
883
+ # TODO: handle repeats in a more general way rather than just discarding
884
+ eval_logger.debug(
885
+ "No custom filters defined. Using default 'take_first' filter for handling repeats."
886
+ )
887
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
888
+
889
+ if self.config.use_prompt is not None:
890
+ eval_logger.info(f"loading prompt {self.config.use_prompt}")
891
+ self.prompt = get_prompt(
892
+ self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
893
+ )
894
+ else:
895
+ self.prompt = None
896
+
897
+ if self.fewshot_docs() is not None:
898
+ self.fewshot_rnd = (
899
+ random.Random()
900
+ ) # setting with no seed, to be overridden at a later time
901
+ config_sampler: Union[str, Callable] = (
902
+ self.config.fewshot_config.get("sampler", "default")
903
+ if self.config.fewshot_config
904
+ else "default"
905
+ )
906
+ if isinstance(config_sampler, str):
907
+ self.sampler = samplers.get_sampler(config_sampler)(
908
+ list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
909
+ )
910
+ elif callable(config_sampler) and issubclass(
911
+ config_sampler, samplers.ContextSampler
912
+ ):
913
+ self.sampler = config_sampler(
914
+ docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
915
+ )
916
+ else:
917
+ raise TypeError(
918
+ f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
919
+ f"not {type(config_sampler)}"
920
+ )
921
+
922
+ self.task_docs = self.eval_docs
923
+
924
+ # Test One Doc
925
+ self.features = list(self.task_docs.features.keys())
926
+ self.multiple_input = 0
927
+ self.multiple_target = 0
928
+ test_doc = self.task_docs[0]
929
+ test_text = self.doc_to_text(test_doc)
930
+ test_target = self.doc_to_target(test_doc)
931
+
932
+ if self.config.doc_to_choice is not None:
933
+ test_choice = self.doc_to_choice(test_doc)
934
+ if not isinstance(test_choice, list):
935
+ eval_logger.error("doc_to_choice must return list")
936
+ else:
937
+ num_choice = len(test_choice)
938
+
939
+ if isinstance(test_text, int):
940
+ eval_logger.debug(
941
+ "doc_to_text returned an int. Assuming multiple inputs."
942
+ )
943
+ self.multiple_input = num_choice
944
+ else:
945
+ test_choice = None
946
+
947
+ if isinstance(test_target, list):
948
+ eval_logger.debug(
949
+ "doc_to_target returned a list. Assuming multiple targets."
950
+ )
951
+ self.multiple_target = len(test_target)
952
+ else:
953
+ if (isinstance(test_target, int)) and (test_choice is not None):
954
+ test_target = test_choice[test_target]
955
+ else:
956
+ test_target = str(test_target)
957
+
958
+ if test_choice is not None:
959
+ check_choices = test_choice
960
+ else:
961
+ check_choices = [test_target]
962
+ if self.config.doc_to_choice is not None:
963
+ for choice in check_choices:
964
+ choice_has_whitespace = True if choice[0].isspace() else False
965
+ delimiter_has_whitespace = (
966
+ True
967
+ if self.config.target_delimiter.rstrip()
968
+ != self.config.target_delimiter
969
+ else False
970
+ )
971
+
972
+ if delimiter_has_whitespace and choice_has_whitespace:
973
+ eval_logger.debug(
974
+ f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace'
975
+ )
976
+ elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
977
+ eval_logger.debug(
978
+ f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
979
+ )
980
+
981
+ def download(
982
+ self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
983
+ ) -> None:
984
+ from packaging.version import parse as vparse
985
+
986
+ if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
987
+ dataset_kwargs.pop("trust_remote_code", None)
988
+ if isinstance(self.config.custom_dataset, Callable):
989
+ eval_logger.warning(
990
+ f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
991
+ + "\nFor example --metadata='{\"max_seq_lengths\":[4096, 8192]}'. For details see task Readme."
992
+ )
993
+ self.dataset = self.config.custom_dataset(
994
+ **(self.config.metadata or {}), **(self.config.dataset_kwargs or {})
995
+ )
996
+ else:
997
+ self.dataset = datasets.load_dataset(
998
+ path=self.DATASET_PATH,
999
+ name=self.DATASET_NAME,
1000
+ **dataset_kwargs if dataset_kwargs is not None else {},
1001
+ )
1002
+
1003
+ def has_training_docs(self) -> bool:
1004
+ if self.config.training_split is not None:
1005
+ return True
1006
+ else:
1007
+ return False
1008
+
1009
+ def has_validation_docs(self) -> bool:
1010
+ if self.config.validation_split is not None:
1011
+ return True
1012
+ else:
1013
+ return False
1014
+
1015
+ def has_test_docs(self) -> bool:
1016
+ if self.config.test_split is not None:
1017
+ return True
1018
+ else:
1019
+ return False
1020
+
1021
+ def training_docs(self) -> datasets.Dataset:
1022
+ if self.has_training_docs():
1023
+ if self.config.process_docs is not None:
1024
+ return self.config.process_docs(
1025
+ self.dataset[self.config.training_split]
1026
+ )
1027
+ return self.dataset[self.config.training_split]
1028
+
1029
+ def validation_docs(self) -> datasets.Dataset:
1030
+ if self.has_validation_docs():
1031
+ if self.config.process_docs is not None:
1032
+ return self.config.process_docs(
1033
+ self.dataset[self.config.validation_split]
1034
+ )
1035
+ return self.dataset[self.config.validation_split]
1036
+
1037
+ def test_docs(self) -> datasets.Dataset:
1038
+ if self.has_test_docs():
1039
+ if self.config.process_docs is not None:
1040
+ return self.config.process_docs(self.dataset[self.config.test_split])
1041
+ return self.dataset[self.config.test_split]
1042
+
1043
+ def fewshot_docs(self):
1044
+ if self.config.fewshot_split is not None:
1045
+ if self.config.process_docs is not None:
1046
+ return self.config.process_docs(self.dataset[self.config.fewshot_split])
1047
+ return self.dataset[self.config.fewshot_split]
1048
+ elif (
1049
+ self.config.fewshot_config is not None
1050
+ and self.config.fewshot_config.get("samples", None) is not None
1051
+ ):
1052
+ if isinstance(self.config.fewshot_config["samples"], list):
1053
+ return self.config.fewshot_config["samples"]
1054
+ elif callable(self.config.fewshot_config["samples"]):
1055
+ return self.config.fewshot_config["samples"]()
1056
+ else:
1057
+ raise Exception(
1058
+ "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list."
1059
+ )
1060
+ else:
1061
+ if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
1062
+ eval_logger.warning(
1063
+ f"[Task: {self.config.task}] "
1064
+ "num_fewshot > 0 but fewshot_split is None. "
1065
+ "using preconfigured rule."
1066
+ )
1067
+ return super().fewshot_docs()
1068
+
1069
+ @staticmethod
1070
+ def append_target_question(
1071
+ labeled_examples: List[Dict[str, str]],
1072
+ question: str,
1073
+ fewshot_as_multiturn: bool = False,
1074
+ gen_prefix: Optional[str] = None,
1075
+ ) -> None:
1076
+ """Adds a target question to the labeled examples list.
1077
+ If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
1078
+ Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
1079
+ """
1080
+ if not fewshot_as_multiturn:
1081
+ # if no messages or last message is system, append as new user entry
1082
+ if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
1083
+ labeled_examples.append({"role": "user", "content": question})
1084
+ # if last message is user, append to it to avoid two user messages in a row
1085
+ else:
1086
+ labeled_examples[-1]["content"] += question
1087
+ else:
1088
+ # if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
1089
+ labeled_examples.append({"role": "user", "content": question})
1090
+ if gen_prefix:
1091
+ labeled_examples.append({"role": "assistant", "content": gen_prefix})
1092
+
1093
+ @utils.positional_deprecated
1094
+ def fewshot_context(
1095
+ self,
1096
+ doc: dict,
1097
+ num_fewshot: int,
1098
+ system_instruction: Optional[str] = None,
1099
+ apply_chat_template: bool = False,
1100
+ fewshot_as_multiturn: bool = False,
1101
+ chat_template: Optional[Callable] = None,
1102
+ gen_prefix: Optional[str] = None,
1103
+ ) -> Union[str, List[str]]:
1104
+ """Returns a fewshot context string that is made up of a prepended description
1105
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
1106
+
1107
+ :param doc: str
1108
+ The document as returned from training_docs, validation_docs, or test_docs.
1109
+ :param num_fewshot: int
1110
+ The number of fewshot examples to provide in the returned context string.
1111
+ :param system_instruction: str
1112
+ System instruction to be applied to the prompt.
1113
+ :param apply_chat_template: bool
1114
+ Whether to apply the chat template to the fewshot context.
1115
+ :param fewshot_as_multiturn: bool
1116
+ Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
1117
+ :param chat_template:
1118
+ callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
1119
+ :param gen_prefix:
1120
+ String to append after the <|assistant|> token.
1121
+ :returns: str
1122
+ The fewshot context.
1123
+ """
1124
+ if apply_chat_template:
1125
+ labeled_examples = []
1126
+ else:
1127
+ labeled_examples = ""
1128
+
1129
+ # get task description
1130
+ if description := self.config.description:
1131
+ description = utils.apply_template(self.config.description, doc)
1132
+
1133
+ # create system prompt based on the provided system instruction and description
1134
+ if system_instruction is not None and description:
1135
+ system_prompt = (
1136
+ f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
1137
+ )
1138
+ elif system_instruction is not None:
1139
+ system_prompt = system_instruction
1140
+ elif description:
1141
+ system_prompt = description
1142
+ else:
1143
+ system_prompt = ""
1144
+
1145
+ # add system prompt if specified
1146
+ if system_prompt:
1147
+ if apply_chat_template:
1148
+ labeled_examples.append({"role": "system", "content": system_prompt})
1149
+ else:
1150
+ labeled_examples = system_prompt
1151
+ # if few-shot - append examples after the system prompt
1152
+ if num_fewshot > 0:
1153
+ if apply_chat_template:
1154
+ labeled_examples.extend(
1155
+ self.sampler.get_chat_context(
1156
+ doc,
1157
+ num_fewshot,
1158
+ fewshot_as_multiturn,
1159
+ gen_prefix=gen_prefix,
1160
+ )
1161
+ )
1162
+ else:
1163
+ labeled_examples += self.sampler.get_context(
1164
+ doc, num_fewshot, gen_prefix=gen_prefix
1165
+ )
1166
+
1167
+ example = self.doc_to_text(doc)
1168
+ if apply_chat_template:
1169
+ if self.multiple_input:
1170
+ # TODO: append prefill?
1171
+ if not labeled_examples:
1172
+ return ""
1173
+ return chat_template(labeled_examples)
1174
+ if isinstance(example, str):
1175
+ self.append_target_question(
1176
+ labeled_examples,
1177
+ example,
1178
+ fewshot_as_multiturn,
1179
+ gen_prefix=gen_prefix,
1180
+ )
1181
+ # for loglikelihood create a list of questions with appended choices
1182
+ elif isinstance(example, list):
1183
+ labeled_examples_list = []
1184
+ # copy chat history for each example and append the answer
1185
+ for ex in example:
1186
+ chat = deepcopy(labeled_examples)
1187
+ self.append_target_question(
1188
+ chat,
1189
+ ex,
1190
+ fewshot_as_multiturn,
1191
+ gen_prefix=gen_prefix,
1192
+ )
1193
+ # TODO: append prefill?
1194
+ labeled_examples_list.append(
1195
+ chat_template(
1196
+ chat,
1197
+ add_generation_prompt=False if gen_prefix else True,
1198
+ )
1199
+ )
1200
+ return labeled_examples_list
1201
+ # if example is an integer, append the choice or convert to string
1202
+ elif isinstance(example, int):
1203
+ if self.config.doc_to_choice is not None:
1204
+ choices = self.doc_to_choice(doc)
1205
+ self.append_target_question(
1206
+ labeled_examples,
1207
+ choices[example],
1208
+ fewshot_as_multiturn,
1209
+ gen_prefix=gen_prefix,
1210
+ )
1211
+ else:
1212
+ self.append_target_question(
1213
+ labeled_examples,
1214
+ str(example),
1215
+ fewshot_as_multiturn,
1216
+ gen_prefix=gen_prefix,
1217
+ )
1218
+ # return lm.apply_chat_template(labeled_examples)
1219
+ return chat_template(
1220
+ labeled_examples,
1221
+ add_generation_prompt=False if gen_prefix else True,
1222
+ )
1223
+ else:
1224
+ prefix = (
1225
+ self.config.target_delimiter + gen_prefix
1226
+ if gen_prefix is not None
1227
+ else ""
1228
+ )
1229
+ if self.multiple_input:
1230
+ return labeled_examples
1231
+ if isinstance(example, str):
1232
+ return labeled_examples + example + prefix
1233
+ elif isinstance(example, list):
1234
+ return [labeled_examples + ex + prefix for ex in example]
1235
+ elif isinstance(example, int):
1236
+ if self.config.doc_to_choice is not None:
1237
+ choices = self.doc_to_choice(doc)
1238
+ return labeled_examples + choices[example] + prefix
1239
+ else:
1240
+ return labeled_examples + str(example) + prefix
1241
+
1242
+ def apply_filters(self) -> Optional[List[Instance]]:
1243
+ """Iterates over FilterEnsembles and applies them to instances"""
1244
+ if hasattr(self, "_filters"):
1245
+ for f in self._filters:
1246
+ f.apply(self._instances)
1247
+ else:
1248
+ eval_logger.warning("No filter defined, passing through instances")
1249
+ return self._instances
1250
+
1251
+ def should_decontaminate(self):
1252
+ return self.config.should_decontaminate
1253
+
1254
+ def doc_to_decontamination_query(self, doc: dict):
1255
+ if self.config.should_decontaminate:
1256
+ if self.config.doc_to_decontamination_query is None:
1257
+ return self.doc_to_text(doc)
1258
+ else:
1259
+ doc_to_decontamination_query = self.config.doc_to_decontamination_query
1260
+ if doc_to_decontamination_query in self.features:
1261
+ return doc[doc_to_decontamination_query]
1262
+ elif callable(doc_to_decontamination_query):
1263
+ return doc_to_decontamination_query(doc)
1264
+ else:
1265
+ return ast.literal_eval(
1266
+ utils.apply_template(
1267
+ self.config.doc_to_decontamination_query, doc
1268
+ )
1269
+ )
1270
+
1271
+ def _process_doc(self, doc: dict) -> dict:
1272
+ """
1273
+ Override this to process (detokenize, strip, replace, etc.) individual
1274
+ documents. This can be used in a map over documents of a data split.
1275
+ E.g. `map(self._process_doc, self.dataset["validation"])`
1276
+
1277
+ :return: dict
1278
+ The processed version of the specified `doc`.
1279
+ """
1280
+ return doc
1281
+
1282
+ def doc_to_text(self, doc, doc_to_text=None):
1283
+ if self.prompt is not None:
1284
+ doc_to_text = self.prompt
1285
+ elif doc_to_text is not None:
1286
+ doc_to_text = doc_to_text
1287
+ else:
1288
+ doc_to_text = self.config.doc_to_text
1289
+
1290
+ if isinstance(doc_to_text, int):
1291
+ return doc_to_text
1292
+ elif isinstance(doc_to_text, str):
1293
+ if doc_to_text in self.features:
1294
+ # if self.config.doc_to_choice is not None:
1295
+ # return self.doc_to_choice(doc)[doc[doc_to_text]]
1296
+ # else:
1297
+ return doc[doc_to_text]
1298
+ else:
1299
+ text_string = utils.apply_template(doc_to_text, doc)
1300
+ if text_string.isdigit() and self._config.doc_to_choice is not None:
1301
+ return ast.literal_eval(text_string)
1302
+ else:
1303
+ return text_string
1304
+ elif callable(doc_to_text):
1305
+ return doc_to_text(doc)
1306
+ # Used when applying a Promptsource template
1307
+ elif hasattr(doc_to_text, "apply"):
1308
+ applied_prompt = doc_to_text.apply(doc)
1309
+ if len(applied_prompt) == 2:
1310
+ return applied_prompt[0]
1311
+ else:
1312
+ eval_logger.warning("Applied prompt returns empty string")
1313
+ return self.config.fewshot_delimiter
1314
+ else:
1315
+ print(type(doc_to_text))
1316
+ raise TypeError
1317
+
1318
+ def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
1319
+ if self.prompt is not None:
1320
+ doc_to_target = self.prompt
1321
+ elif doc_to_target is not None:
1322
+ doc_to_target = doc_to_target
1323
+ else:
1324
+ doc_to_target = self.config.doc_to_target
1325
+
1326
+ if isinstance(doc_to_target, int):
1327
+ return doc_to_target
1328
+ elif isinstance(doc_to_target, str):
1329
+ if doc_to_target in self.features:
1330
+ # if self.config.doc_to_choice is not None:
1331
+ # return self.doc_to_choice(doc)[doc[doc_to_target]]
1332
+ # else:
1333
+ return doc[doc_to_target]
1334
+ else:
1335
+ target_string = utils.apply_template(doc_to_target, doc)
1336
+ if target_string.isdigit() and self._config.doc_to_choice is not None:
1337
+ return ast.literal_eval(target_string)
1338
+ elif (
1339
+ len(target_string) >= 2
1340
+ and (target_string[0] == "[")
1341
+ and (target_string[-1] == "]")
1342
+ ):
1343
+ try:
1344
+ return ast.literal_eval(target_string)
1345
+ except (SyntaxError, ValueError):
1346
+ return target_string
1347
+ else:
1348
+ return target_string
1349
+ elif isinstance(doc_to_target, list):
1350
+ return doc_to_target
1351
+ elif callable(doc_to_target):
1352
+ return doc_to_target(doc)
1353
+ # Used when applying a Promptsource template
1354
+ elif hasattr(doc_to_target, "apply"):
1355
+ applied_prompt = doc_to_target.apply(doc)
1356
+ if len(applied_prompt) == 2:
1357
+ return applied_prompt[1]
1358
+ else:
1359
+ eval_logger.warning("Applied prompt returns empty string")
1360
+ return self.config.fewshot_delimiter
1361
+ else:
1362
+ raise TypeError
1363
+
1364
+ def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
1365
+ if self.prompt is not None:
1366
+ doc_to_choice = self.prompt
1367
+ elif doc_to_choice is not None:
1368
+ doc_to_choice = doc_to_choice
1369
+ elif self.config.doc_to_choice is None:
1370
+ eval_logger.error("doc_to_choice was called but not set in config")
1371
+ else:
1372
+ doc_to_choice = self.config.doc_to_choice
1373
+
1374
+ if isinstance(doc_to_choice, str):
1375
+ if doc_to_choice in self.features:
1376
+ return doc[doc_to_choice]
1377
+ else:
1378
+ return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
1379
+ elif isinstance(doc_to_choice, list):
1380
+ return doc_to_choice
1381
+ elif isinstance(doc_to_choice, dict):
1382
+ return list(doc_to_choice.values())
1383
+ elif callable(doc_to_choice):
1384
+ return doc_to_choice(doc)
1385
+ elif hasattr(doc_to_choice, "get_answer_choices_list"):
1386
+ return doc_to_choice.get_answer_choices_list(doc)
1387
+ else:
1388
+ raise TypeError
1389
+
1390
+ def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
1391
+ if doc_to_image is not None:
1392
+ doc_to_image = doc_to_image
1393
+ elif self.config.doc_to_image is not None:
1394
+ doc_to_image = self.config.doc_to_image
1395
+ else:
1396
+ return None
1397
+
1398
+ if isinstance(doc_to_image, list):
1399
+ image_feature = [
1400
+ self.doc_to_image(doc, feature) for feature in doc_to_image
1401
+ ]
1402
+ return [feature for feature in image_feature if feature is not None]
1403
+ elif isinstance(doc_to_image, str):
1404
+ if doc_to_image in self.features:
1405
+ return doc[doc_to_image]
1406
+ else:
1407
+ return ast.literal_eval(utils.apply_template(doc_to_image, doc))
1408
+ elif callable(doc_to_image):
1409
+ return doc_to_image(doc)
1410
+ else:
1411
+ return None
1412
+
1413
+ def doc_to_audio(self, doc: Any, doc_to_audio=None) -> Union[int, str, list]:
1414
+ if doc_to_audio is not None:
1415
+ doc_to_audio = doc_to_audio
1416
+ elif self.config.doc_to_audio is not None:
1417
+ doc_to_audio = self.config.doc_to_audio
1418
+ else:
1419
+ return None
1420
+
1421
+ if isinstance(doc_to_audio, list):
1422
+ audio_feature = [
1423
+ self.doc_to_audio(doc, feature) for feature in doc_to_audio
1424
+ ]
1425
+ return [feature for feature in audio_feature if feature is not None]
1426
+ elif isinstance(doc_to_audio, str):
1427
+ if doc_to_audio in self.features:
1428
+ return doc[doc_to_audio]
1429
+ else:
1430
+ return ast.literal_eval(utils.apply_template(doc_to_audio, doc))
1431
+ elif callable(doc_to_audio):
1432
+ return doc_to_audio(doc)
1433
+ else:
1434
+ return None
1435
+
1436
+ def doc_to_prefix(self, doc):
1437
+ if (gen_prefix := self.config.gen_prefix) is not None:
1438
+ if gen_prefix in self.features:
1439
+ return doc[gen_prefix]
1440
+ else:
1441
+ return utils.apply_template(gen_prefix, doc)
1442
+ return None
1443
+
1444
+ def construct_requests(
1445
+ self, doc: dict, ctx: str, **kwargs
1446
+ ) -> Union[List[Instance], Instance]:
1447
+ apply_chat_template = kwargs.pop("apply_chat_template", False)
1448
+ chat_template: Callable | None = kwargs.pop("chat_template", None)
1449
+
1450
+ aux_arguments = None
1451
+
1452
+ if self.OUTPUT_TYPE == "loglikelihood":
1453
+ arguments = (ctx, self.doc_to_target(doc))
1454
+ elif self.OUTPUT_TYPE == "loglikelihood_rolling":
1455
+ arguments = (self.doc_to_target(doc),)
1456
+ elif self.OUTPUT_TYPE == "multiple_choice":
1457
+ choices = self.doc_to_choice(doc)
1458
+ target_delimiter = self.config.target_delimiter
1459
+ if apply_chat_template:
1460
+ target_delimiter = ""
1461
+ if self.multiple_input:
1462
+ # If there are multiple inputs, choices are placed in the ctx
1463
+ # apply chat_template to choices if apply_chat_template
1464
+ cont = self.doc_to_target(doc)
1465
+
1466
+ arguments = [
1467
+ (
1468
+ ctx
1469
+ + (
1470
+ chat_template([{"role": "user", "content": choice}])
1471
+ if apply_chat_template
1472
+ else choice
1473
+ ),
1474
+ f"{target_delimiter}{cont}",
1475
+ )
1476
+ for choice in choices
1477
+ ]
1478
+ else:
1479
+ # Otherwise they are placed in the continuation
1480
+ arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
1481
+
1482
+ # TODO: we should raise a warning telling users this will at most ~2x runtime.
1483
+ if "acc_mutual_info" in self._metric_fn_list.keys():
1484
+ # if we are calculating multiple choice accuracy
1485
+ # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
1486
+
1487
+ # here mutual info refers to calculating
1488
+ # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
1489
+ # in other words normalizing by subtracting the unconditional logprob of each choice.
1490
+ # TODO: should these be strided? will have to modify the processing in process_results if so
1491
+ aux_arguments = [
1492
+ ("", f"{target_delimiter}{choice}") for choice in choices
1493
+ ]
1494
+
1495
+ arguments.extend(aux_arguments)
1496
+
1497
+ elif self.OUTPUT_TYPE == "generate_until":
1498
+ arguments = (ctx, deepcopy(self.config.generation_kwargs))
1499
+
1500
+ multimodal_arg = {}
1501
+ if (
1502
+ self.config.doc_to_image
1503
+ ): # TODO: ensure that non-multimodal tasks aren't getting visual args
1504
+ multimodal_arg = {
1505
+ **multimodal_arg,
1506
+ **{"visual": self.doc_to_image(doc)},
1507
+ }
1508
+
1509
+ if (
1510
+ self.config.doc_to_audio
1511
+ ): # TODO: ensure that non-multimodal tasks aren't getting audio args
1512
+ multimodal_arg = {
1513
+ **multimodal_arg,
1514
+ **{"audio": self.doc_to_audio(doc)},
1515
+ }
1516
+
1517
+ if bool(multimodal_arg):
1518
+ if isinstance(arguments, list):
1519
+ arguments = [arg + (multimodal_arg,) for arg in arguments]
1520
+ else:
1521
+ arguments = arguments + (multimodal_arg,)
1522
+
1523
+ if self.OUTPUT_TYPE == "multiple_choice":
1524
+ request_list = [
1525
+ Instance(
1526
+ request_type="loglikelihood",
1527
+ doc=doc,
1528
+ arguments=arg,
1529
+ idx=i,
1530
+ **kwargs,
1531
+ )
1532
+ for i, arg in enumerate(arguments)
1533
+ ]
1534
+
1535
+ return request_list
1536
+
1537
+ return Instance(
1538
+ request_type=self.OUTPUT_TYPE,
1539
+ doc=doc,
1540
+ arguments=arguments,
1541
+ idx=0,
1542
+ **kwargs,
1543
+ )
1544
+
1545
+ def process_results(self, doc, results):
1546
+ if callable(self.config.process_results):
1547
+ return self.config.process_results(doc, results)
1548
+
1549
+ result_dict = {}
1550
+ use_metric = list(self._metric_fn_list.keys())
1551
+ if self.OUTPUT_TYPE == "loglikelihood":
1552
+ results = results[0]
1553
+ ll, is_greedy = results
1554
+ return {
1555
+ **({"perplexity": ll} if "perplexity" in use_metric else {}),
1556
+ **({"acc": int(is_greedy)} if "acc" in use_metric else {}),
1557
+ }
1558
+ elif self.OUTPUT_TYPE == "loglikelihood_rolling":
1559
+ (loglikelihood,) = results
1560
+ _words = self.count_words(self.doc_to_target(doc))
1561
+ _bytes = self.count_bytes(self.doc_to_target(doc))
1562
+ return {
1563
+ **(
1564
+ {"word_perplexity": (loglikelihood, _words)}
1565
+ if "word_perplexity" in use_metric
1566
+ else {}
1567
+ ),
1568
+ **(
1569
+ {"byte_perplexity": (loglikelihood, _bytes)}
1570
+ if "byte_perplexity" in use_metric
1571
+ else {}
1572
+ ),
1573
+ **(
1574
+ {"bits_per_byte": (loglikelihood, _bytes)}
1575
+ if "bits_per_byte" in use_metric
1576
+ else {}
1577
+ ),
1578
+ }
1579
+ elif self.OUTPUT_TYPE == "multiple_choice":
1580
+ lls, is_greedy = zip(*results)
1581
+
1582
+ # retrieve choices in List[str] form, to compute choice lengths, etc.
1583
+ choices = self.doc_to_choice(doc)
1584
+ completion_len = np.array([float(len(i)) for i in choices])
1585
+
1586
+ if (
1587
+ 2 * len(choices) == len(lls)
1588
+ and "acc_mutual_info" in self._metric_fn_list.keys()
1589
+ ):
1590
+ # then we are doing mutual info.
1591
+ # this stores the "dryrun" / unconditional answer loglikelihoods
1592
+ # as we extend the args list with unconditional ("", continuation) pairs
1593
+ lls_unconditional = lls[len(choices) :]
1594
+ if len(lls_unconditional) != len(choices):
1595
+ raise ValueError
1596
+ # and this stores our "regular" conditional loglikelihoods
1597
+ lls = lls[: len(choices)]
1598
+
1599
+ pred = np.argmax(lls)
1600
+ pred_norm = np.argmax(lls / completion_len)
1601
+
1602
+ if self.multiple_input:
1603
+ gold = self.doc_to_text(doc)
1604
+ else:
1605
+ gold = self.doc_to_target(doc)
1606
+
1607
+ gold_index_error = False
1608
+ if isinstance(gold, list):
1609
+ gold = [i if i < len(choices) else -100 for i in gold]
1610
+ if -100 in gold:
1611
+ gold_index_error = True
1612
+ else:
1613
+ if isinstance(gold, int):
1614
+ gold = gold if gold < len(choices) else -100
1615
+ elif isinstance(gold, str):
1616
+ gold = choices.index(gold) if gold in choices else -100
1617
+
1618
+ if gold == -100:
1619
+ gold_index_error = True
1620
+
1621
+ if gold_index_error:
1622
+ eval_logger.warning(
1623
+ f"Label index was not in within range of available choices,"
1624
+ f"Sample:\n\n{doc}\n\n"
1625
+ )
1626
+
1627
+ if self.multiple_target:
1628
+ acc = 1.0 if pred in gold else 0.0
1629
+ acc_norm = 1.0 if pred_norm in gold else 0.0
1630
+ exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
1631
+ else:
1632
+ acc = 1.0 if pred == gold else 0.0
1633
+ acc_norm = 1.0 if pred_norm == gold else 0.0
1634
+ # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
1635
+ exact_match = int(is_greedy[gold]) if gold != -100 else 0
1636
+
1637
+ prob_norm = utils.softmax(lls)
1638
+
1639
+ # TODO use keyword arguments to the metric?
1640
+ # gold, pred, norm stuff, the original lls,
1641
+ result_dict = {
1642
+ **({"acc": acc} if "acc" in use_metric else {}),
1643
+ **({"f1": (gold, pred)} if "f1" in use_metric else {}),
1644
+ **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
1645
+ **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
1646
+ **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
1647
+ **(
1648
+ {"brier_score": (gold, prob_norm)}
1649
+ if "brier_score" in use_metric
1650
+ else {}
1651
+ ),
1652
+ }
1653
+
1654
+ if "acc_mutual_info" in use_metric:
1655
+ lls_mutual_info = [
1656
+ ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
1657
+ ]
1658
+ acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
1659
+ result_dict["acc_mutual_info"] = acc_mutual_info
1660
+
1661
+ elif self.OUTPUT_TYPE == "generate_until":
1662
+ gold = self.doc_to_target(doc)
1663
+ result = results[0]
1664
+ if self.config.doc_to_choice is not None:
1665
+ # If you set doc_to_choice,
1666
+ # it assumes that doc_to_target returns a number.
1667
+ choices = self.doc_to_choice(doc)
1668
+ gold = choices[gold]
1669
+ # we expect multiple_targets to be a list.
1670
+ elif self.multiple_target:
1671
+ gold = list(gold)
1672
+ # TODO: handle this better
1673
+ elif type(gold) is not type(result) and not (
1674
+ "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
1675
+ ):
1676
+ # cast gold to the same type as result
1677
+ gold = type(result)(gold)
1678
+
1679
+ for metric in self._metric_fn_list.keys():
1680
+ if self.multiple_target:
1681
+ # in the case where we have multiple targets,
1682
+ # return true if any are true
1683
+ # TODO: this may break for multiple_target, non zero-or-1 metrics
1684
+ scores = []
1685
+ if not isinstance(gold, list):
1686
+ # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
1687
+ # print(gold)
1688
+ gold = [gold]
1689
+ if metric == "exact_match":
1690
+ result = [result for _ in range(len(gold))]
1691
+ scores = self._metric_fn_list[metric](
1692
+ references=gold,
1693
+ predictions=result,
1694
+ **self._metric_fn_kwargs[metric],
1695
+ )[metric]
1696
+ result_score = 1.0 if scores > 0.0 else 0.0
1697
+ else:
1698
+ for gold_option in gold:
1699
+ try:
1700
+ result_score = self._metric_fn_list[metric](
1701
+ references=[gold_option],
1702
+ predictions=[result],
1703
+ **self._metric_fn_kwargs[metric],
1704
+ )
1705
+ except (
1706
+ TypeError
1707
+ ): # TODO: this is hacky and I don't want to do it
1708
+ result_score = self._metric_fn_list[metric](
1709
+ [gold_option, result]
1710
+ )
1711
+ if isinstance(result_score, dict):
1712
+ # TODO: this handles the case where HF evaluate returns a dict.
1713
+ result_score = result_score[metric]
1714
+ scores.append(result_score)
1715
+ if any(scores):
1716
+ result_score = 1.0
1717
+ else:
1718
+ result_score = 0.0
1719
+ else:
1720
+ try:
1721
+ result_score = self._metric_fn_list[metric](
1722
+ references=[gold],
1723
+ predictions=[result],
1724
+ **self._metric_fn_kwargs[metric],
1725
+ )
1726
+ except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
1727
+ result_score = self._metric_fn_list[metric]([gold, result])
1728
+ if isinstance(result_score, dict):
1729
+ # TODO: this handles the case where HF evaluate returns a dict.
1730
+ # This allows for multiple metrics to be returned from the same function
1731
+ for k, v in result_score.items():
1732
+ result_dict[k] = v
1733
+ else:
1734
+ result_dict[metric] = result_score
1735
+ else:
1736
+ raise ValueError(
1737
+ f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
1738
+ "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
1739
+ )
1740
+
1741
+ return result_dict
1742
+
1743
+ def aggregation(self) -> dict:
1744
+ return self._aggregation_list
1745
+
1746
+ def higher_is_better(self) -> dict:
1747
+ return self._higher_is_better
1748
+
1749
+ def get_config(self, key: str) -> Any:
1750
+ return getattr(self._config, key, None)
1751
+
1752
+ @property
1753
+ def task_name(self) -> Any:
1754
+ return getattr(self.config, "task", None)
1755
+
1756
+ def __repr__(self):
1757
+ return (
1758
+ f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
1759
+ f"output_type={self.OUTPUT_TYPE},"
1760
+ f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
1761
+ f"num_samples={len(self.eval_docs)})"
1762
+ )
1763
+
1764
+
1765
+ class MultipleChoiceTask(Task):
1766
+ OUTPUT_TYPE = "loglikelihood"
1767
+
1768
+ def doc_to_target(self, doc: dict) -> str:
1769
+ return " " + doc["choices"][doc["gold"]]
1770
+
1771
+ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]:
1772
+ # TODO: add mutual info here?
1773
+ return [
1774
+ Instance(
1775
+ request_type="loglikelihood",
1776
+ doc=doc,
1777
+ arguments=(ctx, " {}".format(choice)),
1778
+ idx=i,
1779
+ **kwargs,
1780
+ )
1781
+ for i, choice in enumerate(doc["choices"])
1782
+ ]
1783
+
1784
+ def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict:
1785
+ results = [
1786
+ res[0] for res in results
1787
+ ] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere?
1788
+ gold = doc["gold"]
1789
+
1790
+ acc = 1.0 if np.argmax(results) == gold else 0.0
1791
+ completion_len = np.array([float(len(i)) for i in doc["choices"]])
1792
+ acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
1793
+
1794
+ return {
1795
+ "acc": acc,
1796
+ "acc_norm": acc_norm,
1797
+ }
1798
+
1799
+ def higher_is_better(self) -> dict:
1800
+ return {
1801
+ "acc": True,
1802
+ "acc_norm": True,
1803
+ }
1804
+
1805
+ def aggregation(self) -> dict:
1806
+ return {
1807
+ "acc": mean,
1808
+ "acc_norm": mean,
1809
+ }
1810
+
1811
+
1812
+ class PerplexityTask(Task):
1813
+ OUTPUT_TYPE = "loglikelihood_rolling"
1814
+
1815
+ def has_training_docs(self) -> bool:
1816
+ return False
1817
+
1818
+ def fewshot_examples(self, k: int, rnd) -> List:
1819
+ if k != 0:
1820
+ raise ValueError(
1821
+ "The number of fewshot examples must be 0 for perplexity tasks."
1822
+ )
1823
+ return []
1824
+
1825
+ def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]:
1826
+ if num_fewshot != 0:
1827
+ raise ValueError(
1828
+ "The number of fewshot examples must be 0 for perplexity tasks."
1829
+ )
1830
+
1831
+ return ""
1832
+
1833
+ def higher_is_better(self) -> dict:
1834
+ return {
1835
+ "word_perplexity": False,
1836
+ "byte_perplexity": False,
1837
+ "bits_per_byte": False,
1838
+ }
1839
+
1840
+ def doc_to_decontamination_query(self, doc):
1841
+ return doc
1842
+
1843
+ def doc_to_text(self, doc) -> str:
1844
+ return ""
1845
+
1846
+ def doc_to_target(self, doc):
1847
+ return doc
1848
+
1849
+ def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs):
1850
+ if bool(ctx):
1851
+ raise ValueError
1852
+
1853
+ return Instance(
1854
+ request_type=self.OUTPUT_TYPE,
1855
+ doc=doc,
1856
+ arguments=(self.doc_to_target(doc),),
1857
+ idx=0,
1858
+ **kwargs,
1859
+ )
1860
+
1861
+ def process_results(self, doc: dict, results: Tuple[float]) -> dict:
1862
+ (loglikelihood,) = results
1863
+ words = self.count_words(self.doc_to_target(doc))
1864
+ bytes_ = self.count_bytes(self.doc_to_target(doc))
1865
+ return {
1866
+ "word_perplexity": (loglikelihood, words),
1867
+ "byte_perplexity": (loglikelihood, bytes_),
1868
+ "bits_per_byte": (loglikelihood, bytes_),
1869
+ }
1870
+
1871
+ def aggregation(self) -> dict:
1872
+ return {
1873
+ "word_perplexity": weighted_perplexity,
1874
+ "byte_perplexity": weighted_perplexity,
1875
+ "bits_per_byte": bits_per_byte,
1876
+ }
1877
+
1878
+ @classmethod
1879
+ def count_bytes(cls, doc) -> int:
1880
+ return len(doc.encode("utf-8"))
1881
+
1882
+ @classmethod
1883
+ def count_words(cls, doc) -> int:
1884
+ """Downstream tasks with custom word boundaries should override this!"""
1885
+ return len(re.split(r"\s+", doc))
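The acc / acc_norm scoring in MultipleChoiceTask.process_results is easiest to follow on toy numbers. The sketch below uses made-up loglikelihoods and choices (not taken from any real task) to show how the length normalization can change which choice is selected:

    import numpy as np

    # Hypothetical document and per-choice summed loglikelihoods.
    doc = {"choices": ["Paris", "Lyon", "the city of Marseille"], "gold": 0}
    loglikelihoods = np.array([-6.0, -6.5, -9.0])

    acc = 1.0 if np.argmax(loglikelihoods) == doc["gold"] else 0.0
    # acc_norm divides each loglikelihood by the choice's character length,
    # so longer continuations are not penalized simply for being longer.
    completion_len = np.array([float(len(c)) for c in doc["choices"]])
    acc_norm = 1.0 if np.argmax(loglikelihoods / completion_len) == doc["gold"] else 0.0
    print(acc, acc_norm)  # 1.0 0.0: normalization favors the long third choice here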
lm-evaluation-harness/lm_eval/caching/__init__.py ADDED
File without changes
lm-evaluation-harness/lm_eval/caching/cache.py ADDED
@@ -0,0 +1,59 @@
1
+ import hashlib
2
+ import logging
3
+ import os
4
+
5
+ import dill
6
+
7
+
8
+ eval_logger = logging.getLogger(__name__)
9
+
10
+
11
+ MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
12
+
13
+ OVERRIDE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")
14
+
15
+
16
+ PATH = OVERRIDE_PATH if OVERRIDE_PATH else f"{MODULE_DIR}/.cache"
17
+
18
+ # This should be sufficient for uniqueness
19
+ HASH_INPUT = "EleutherAI-lm-evaluation-harness"
20
+
21
+ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
22
+
23
+ FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
24
+
25
+
26
+ def load_from_cache(file_name: str, cache: bool = False):
27
+ if not cache:
28
+ return
29
+ try:
30
+ path = f"{PATH}/{file_name}{FILE_SUFFIX}"
31
+
32
+ with open(path, "rb") as file:
33
+ cached_task_dict = dill.loads(file.read())
34
+ return cached_task_dict
35
+
36
+ except Exception:
37
+ eval_logger.debug(f"{file_name} is not cached, generating...")
38
+ pass
39
+
40
+
41
+ def save_to_cache(file_name, obj):
42
+ if not os.path.exists(PATH):
43
+ os.mkdir(PATH)
44
+
45
+ file_path = f"{PATH}/{file_name}{FILE_SUFFIX}"
46
+
47
+ eval_logger.debug(f"Saving {file_path} to cache...")
48
+ with open(file_path, "wb") as file:
49
+ file.write(dill.dumps(obj))
50
+
51
+
52
+ # NOTE the "key" param is to allow for flexibility
53
+ def delete_cache(key: str = ""):
54
+ files = os.listdir(PATH)
55
+
56
+ for file in files:
57
+ if file.startswith(key) and file.endswith(FILE_SUFFIX):
58
+ file_path = f"{PATH}/{file}"
59
+ os.unlink(file_path)
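A minimal usage sketch for the cache helpers above; the file name and cached object are arbitrary placeholders:

    from lm_eval.caching.cache import load_from_cache, save_to_cache

    save_to_cache(file_name="example_task_requests", obj={"docs": [1, 2, 3]})

    # load_from_cache only attempts a read when cache=True and returns None on a miss.
    cached = load_from_cache(file_name="example_task_requests", cache=True)
    print(cached)  # {'docs': [1, 2, 3]}

The cache directory defaults to a .cache folder next to the module and can be redirected with the LM_HARNESS_CACHE_PATH environment variable.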
lm-evaluation-harness/lm_eval/decontamination/__init__.py ADDED
File without changes
lm-evaluation-harness/lm_eval/decontamination/archiver.py ADDED
@@ -0,0 +1,174 @@
1
+ import datetime
2
+ import io
3
+ import json
4
+ import mmap
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import jsonlines
10
+ import tqdm
11
+ import zstandard
12
+
13
+
14
+ def json_serial(obj: Any) -> str:
15
+ """JSON serializer for objects not serializable by default json code"""
16
+
17
+ if isinstance(obj, (datetime.datetime,)):
18
+ return obj.isoformat()
19
+ raise TypeError("Type %s not serializable" % type(obj))
20
+
21
+
22
+ # Modified version of lm_dataformat Archive for single file.
23
+ class Archive:
24
+ def __init__(self, file_path: str, compression_level: int = 3) -> None:
25
+ self.file_path = file_path
26
+ dir_name = os.path.dirname(file_path)
27
+ if dir_name:
28
+ os.makedirs(dir_name, exist_ok=True)
29
+ self.fh = open(self.file_path, "wb")
30
+ self.cctx = zstandard.ZstdCompressor(level=compression_level)
31
+ self.compressor = self.cctx.stream_writer(self.fh)
32
+
33
+ def add_data(self, data, meta=None) -> None:
34
+ if meta is None:
35
+ meta = {}
36
+ self.compressor.write(
37
+ json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
38
+ "UTF-8"
39
+ )
40
+ + b"\n"
41
+ )
42
+
43
+ def commit(self) -> None:
44
+ self.compressor.flush(zstandard.FLUSH_FRAME)
45
+ self.fh.flush()
46
+ self.fh.close()
47
+
48
+
49
+ # Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
50
+ class Reader:
51
+ def __init__(self) -> None:
52
+ pass
53
+
54
+ def read(
55
+ self,
56
+ file,
57
+ get_meta: bool = False,
58
+ autojoin_paragraphs: bool = True,
59
+ para_joiner: str = "\n\n",
60
+ ):
61
+ with open(file, "rb") as fh:
62
+ self.fh = fh
63
+ cctx = zstandard.ZstdDecompressor()
64
+ reader = io.BufferedReader(cctx.stream_reader(fh))
65
+ rdr = jsonlines.Reader(reader)
66
+ for ob in rdr:
67
+ # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
68
+ if isinstance(ob, str):
69
+ assert not get_meta
70
+ yield ob
71
+ continue
72
+
73
+ text = ob["text"]
74
+
75
+ if autojoin_paragraphs and isinstance(text, list):
76
+ text = para_joiner.join(text)
77
+
78
+ if get_meta:
79
+ yield text, (ob["meta"] if "meta" in ob else {})
80
+ else:
81
+ yield text
82
+
83
+
84
+ class TextArchive:
85
+ def __init__(self, file_path, mode: str = "rb+") -> None:
86
+ self.file_path = file_path
87
+ dir_name = os.path.dirname(file_path)
88
+ if dir_name:
89
+ os.makedirs(dir_name, exist_ok=True)
90
+
91
+ if not os.path.exists(file_path):
92
+ Path(file_path).touch()
93
+
94
+ self.fh = open(self.file_path, mode)
95
+
96
+ def add_data(self, data) -> None:
97
+ self.fh.write(data.encode("UTF-8") + b"\n")
98
+
99
+ def commit(self) -> None:
100
+ self.fh.flush()
101
+ self.fh.close()
102
+
103
+
104
+ class TextReader:
105
+ def __init__(self, file_path) -> None:
106
+ self.file_path = file_path
107
+
108
+ # Optimized mmap read with infrequent tqdm updates to maintain speed
109
+ # Tested up to 250MB/s.
110
+ def read_tqdm(self, update_frequency: int = 10000):
111
+ current_file_position = 0
112
+ line_counter = 0
113
+ with (
114
+ open(self.file_path, "r", encoding="utf-8") as fh,
115
+ tqdm.tqdm(
116
+ total=os.path.getsize(self.file_path),
117
+ dynamic_ncols=True,
118
+ unit="byte",
119
+ unit_scale=1,
120
+ ) as progress,
121
+ ):
122
+ with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
123
+ for line in iter(mmap_obj.readline, b""):
124
+ line = line.decode("utf-8")
125
+ line_counter += 1
126
+ if line_counter == update_frequency:
127
+ new_file_pos = mmap_obj.tell()
128
+ bytes_read = new_file_pos - current_file_position
129
+ current_file_position = new_file_pos
130
+ progress.update(bytes_read)
131
+ line_counter = 0
132
+ yield line[:-1]
133
+
134
+ def read_and_tell(self):
135
+ current_file_position = 0
136
+ with open(self.file_path, "r", encoding="utf8") as fh:
137
+ with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
138
+ for line in iter(mmap_obj.readline, b""):
139
+ line = line.decode("utf-8")
140
+ new_file_pos = mmap_obj.tell()
141
+ raw_bytes_read = new_file_pos - current_file_position
142
+ current_file_position = new_file_pos
143
+ yield line[:-1], raw_bytes_read
144
+
145
+ def read(self):
146
+ with open(self.file_path, "r", encoding="utf8") as fh:
147
+ with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
148
+ for line in iter(mmap_obj.readline, b""):
149
+ line = line.decode("utf-8")
150
+ yield line[:-1]
151
+
152
+ def read_slow(self):
153
+ with open(self.file_path, "r", encoding="utf8") as fh:
154
+ while True:
155
+ line = fh.readline()
156
+ if line == -1 or line == "":
157
+ break
158
+ else:
159
+ yield line[:-1]
160
+
161
+
162
+ # Optimized for speed. Decompresses the archive in shell before
163
+ # using the mmap'd TextReader.
164
+ class ZStdTextReader:
165
+ def __init__(self, file) -> None:
166
+ self.file = file
167
+
168
+ def read_tqdm(self):
169
+ decompressed_file = self.file[:-4]
170
+ print("Decompressing file, please wait...")
171
+ os.system(f"zstd -d {self.file}") # linux decompress is faster
172
+ reader = TextReader(decompressed_file)
173
+ yield from reader.read_tqdm()
174
+ os.remove(decompressed_file)
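As a rough usage sketch of the archive classes above (the path is a placeholder), a document set can be written with Archive and streamed back with Reader:

    from lm_eval.decontamination.archiver import Archive, Reader

    archive = Archive("data/example.jsonl.zst")
    archive.add_data("first document", meta={"source": "demo"})
    archive.add_data("second document")
    archive.commit()  # flush the zstd frame and close the underlying file

    reader = Reader()
    for text, meta in reader.read("data/example.jsonl.zst", get_meta=True):
        print(text, meta)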
lm-evaluation-harness/lm_eval/decontamination/decontaminate.py ADDED
@@ -0,0 +1,166 @@
1
+ import collections
2
+ import glob
3
+ import json
4
+ import os
5
+ import pickle
6
+ import random
7
+ import time
8
+
9
+ from .archiver import ZStdTextReader
10
+ from .janitor import Janitor, word_ngrams
11
+
12
+
13
+ # Was used for testing the evaluator decoupled from the full logic below
14
+ def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
15
+ simulated_overlap = 0.1
16
+ contaminated = int(len(docs) * simulated_overlap)
17
+ return random.sample(range(len(docs)), contaminated)
18
+
19
+
20
+ # Returns a dictionary containing all overlapping documents in each
21
+ # task. In the standard use case, an overlap occurs when any of the 13-grams
22
+ # found in the task document exist in the training set documents.
23
+ #
24
+ # To generate 13-grams for the pile see scripts/clean_training_data. The final output of these
25
+ # scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
26
+ # files. These should exist in the "ngrams_path" provided to this function.
27
+
28
+
29
+ # Algorithm:
30
+ # 1. Build lookups for each dataset {ngram: list(document_ids)}
31
+ # 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
32
+ # 3. Full scan the 13-grams from the training set against the merged lookup,
33
+ # saving matches in the "duplicates" dictionary {(task_name, task_set): set(doc_ids)}
34
+ # 4. Strip the task_set from the dictionary keys and return
35
+ #
36
+ # We cache the task+set lookups as well as the overlaps.
37
+ def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
38
+ # return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)
39
+
40
+ info_dict_path = os.path.join(ngrams_path, "info.json")
41
+ info_dict = json.load(open(info_dict_path, "r", encoding="utf-8"))
42
+ ngrams_n_size = info_dict["ngram_size"]
43
+
44
+ janitor = Janitor()
45
+
46
+ # Build lookup for each dataset first in case we use different task combinations later
47
+ print("Building Lookups...")
48
+ start = time.perf_counter()
49
+
50
+ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
51
+ return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
52
+
53
+ lookups = {}
54
+ duplicates = {} # (task_name, task_set): set(doc_ids)}
55
+ sets_to_decontaminate = len(docs_by_task_set.keys())
56
+
57
+ for (task_name, task_set), docs in docs_by_task_set.items():
58
+ if not os.path.exists(f"data/{task_name}"):
59
+ os.mkdir(f"data/{task_name}")
60
+
61
+ # Check if we've decontaminated this combination before
62
+ overlaps_dump_path = get_overlaps_dump_path(
63
+ task_name, task_set, ngrams_n_size, limit
64
+ )
65
+ if os.path.exists(overlaps_dump_path):
66
+ duplicates[(task_name, task_set)] = pickle.load(
67
+ open(overlaps_dump_path, "rb")
68
+ )
69
+ sets_to_decontaminate -= 1
70
+ continue
71
+ else:
72
+ duplicates[(task_name, task_set)] = set()
73
+
74
+ # Build/load the task lookup {ngram: set(documents)}.
75
+ task_set_lookup_path = (
76
+ f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup"
77
+ )
78
+ if os.path.exists(task_set_lookup_path):
79
+ print(f"{task_set_lookup_path} available, loading...")
80
+ lookups[(task_name, task_set)] = pickle.load(
81
+ open(task_set_lookup_path, "rb")
82
+ )
83
+ else:
84
+ print(f"{task_set_lookup_path} not available, building...")
85
+ lookup = collections.defaultdict(set)
86
+
87
+ for doc_id, document in enumerate(docs):
88
+ ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size)
89
+ for ngram in ngrams:
90
+ lookup[ngram].add(doc_id)
91
+
92
+ pickle.dump(lookup, open(task_set_lookup_path, "wb"))
93
+ lookups[(task_name, task_set)] = lookup
94
+
95
+ elapsed = time.perf_counter() - start
96
+ print(f"Building lookups took {elapsed:0.5f} seconds.")
97
+
98
+ matched_ngrams = []
99
+
100
+ if sets_to_decontaminate > 0:
101
+ print("Merging lookups...")
102
+ start = time.perf_counter()
103
+ merged_lookup = collections.defaultdict(list)
104
+ for (task_name, task_set), lookup in lookups.items():
105
+ for ngram, doc_ids in lookup.items():
106
+ merged_lookup[ngram].append((task_name, task_set, doc_ids))
107
+
108
+ elapsed = time.perf_counter() - start
109
+ print(f"Merging lookups took {elapsed:0.5f} seconds.")
110
+
111
+ print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
112
+ files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
113
+ print(files)
114
+
115
+ for file in files:
116
+ start = time.perf_counter()
117
+ print(f"Scanning {file}")
118
+ reader = ZStdTextReader(file)
119
+ total_ngrams = 0
120
+ unique_ngrams = 0
121
+ matching_unique = 0
122
+ non_matching_unique = 0
123
+
124
+ current_ngram = ""
125
+ for line in reader.read_tqdm(): # Scan training set ngrams file
126
+ total_ngrams += 1
127
+ [ngram, document_id] = line.rsplit(" ", 1)
128
+ if (
129
+ ngram != current_ngram
130
+ ): # Only need to match the ngram once in training set
131
+ unique_ngrams += 1
132
+ current_ngram = ngram
133
+ if ngram in merged_lookup:
134
+ matched_ngrams.append(ngram) # For logging
135
+ matching_unique += 1
136
+ for task_name, task_set, doc_ids in merged_lookup[ngram]:
137
+ task_doc_set = duplicates[(task_name, task_set)]
138
+ for doc_id in doc_ids: # Record contamination across all relevant task/set combos
139
+ task_doc_set.add(doc_id)
140
+ del merged_lookup[ngram] # No point matching again
141
+ else:
142
+ non_matching_unique += 1
143
+
144
+ print(f"Total Ngrams: {total_ngrams}")
145
+ print(f"Unique Ngrams: {unique_ngrams}")
146
+ print(f"Unique Matching: {matching_unique}")
147
+ print(f"Unique Non Matching: {non_matching_unique}")
148
+ print("Matched ngrams:")
149
+ for ngram in matched_ngrams:
150
+ print(ngram)
151
+
152
+ elapsed = time.perf_counter() - start
153
+ print(f"Read took {elapsed:0.5f} seconds.")
154
+ print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second")
155
+
156
+ print(duplicates)
157
+
158
+ # Dump overlaps separately
159
+ for (task_name, task_set), doc_ids in duplicates.items():
160
+ overlaps_dump_path = get_overlaps_dump_path(
161
+ task_name, task_set, ngrams_n_size, limit
162
+ )
163
+ pickle.dump(doc_ids, open(overlaps_dump_path, "wb"))
164
+
165
+ # Strip task set and return
166
+ return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()}
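Step 1 of the algorithm sketched in the comments above (the per-task {ngram: set(doc_ids)} lookup) can be reproduced on toy documents; the 3-gram size here is only for readability, whereas the real pipeline uses 13-grams:

    import collections

    from lm_eval.decontamination.janitor import Janitor, word_ngrams

    docs = ["the quick brown fox", "the quick brown dog jumps"]
    janitor = Janitor()
    lookup = collections.defaultdict(set)
    for doc_id, document in enumerate(docs):
        for ngram in word_ngrams(janitor.normalize_string(document), 3):
            lookup[ngram].add(doc_id)

    print(dict(lookup))  # e.g. {'the quick brown': {0, 1}, 'quick brown fox': {0}, ...}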
lm-evaluation-harness/lm_eval/decontamination/janitor.py ADDED
@@ -0,0 +1,329 @@
1
+ import pickle
2
+ import re
3
+ import string
4
+ import traceback
5
+ from typing import Iterator, List, Sequence, Tuple, TypeVar
6
+
7
+
8
+ # This is a cpp module.
9
+ # See scripts/clean_training_data/README.md for instructions to compile janitor_util.cpp
10
+
11
+ try:
12
+ import janitor_util
13
+
14
+ JANITOR_CPP = True
15
+ except Exception:
16
+ print("WARNING: C++ module could not be loaded. Janitor running in python mode")
17
+ traceback.print_exc()
18
+ JANITOR_CPP = False
19
+
20
+ T = TypeVar("T")
21
+
22
+
23
+ # Implementation from nltk source
24
+ # https://www.nltk.org/_modules/nltk/util.html
25
+ def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
26
+ history = []
27
+ while n > 1:
28
+ # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
29
+ try:
30
+ next_item = next(sequence)
31
+ except StopIteration:
32
+ # no more data, terminate the generator
33
+ return
34
+ history.append(next_item)
35
+ n -= 1
36
+ for item in sequence:
37
+ history.append(item)
38
+ yield tuple(history)
39
+ del history[0]
40
+
41
+
42
+ def word_ngrams(s: str, n: int) -> Iterator[str]:
43
+ """Splits a string into ngram words"""
44
+ tokens = s.split() # not a generator :(
45
+ ngram_seqs = form_ngrams(iter(tokens), n)
46
+ return (" ".join(ngram) for ngram in ngram_seqs)
47
+
48
+
49
+ # Does character sequences only - combined faster function to play around with later
50
+ # def word_ngrams_indices_combined(sequence, n):
51
+ # current_word = ""
52
+ # history = []
53
+ # gap = False;
54
+ # start = 0
55
+ # end = 0
56
+ # for character in sequence:
57
+ # if character == " ":
58
+ # if not gap:
59
+ # gap = True
60
+ # history.append(current_word)
61
+ # end += len(current_word) - 1
62
+ # current_word = ""
63
+ # if len(history) == n:
64
+ # yield (tuple(history), start, end)
65
+ # del history[0]
66
+ # start = end + 1
67
+ # end = start
68
+ # else:
69
+ # gap = False
70
+ # current_word += character
71
+
72
+
73
+ # https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
74
+ def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
75
+ """Splits a string on whitespaces and records the indices of each in the original string.
76
+ @:return generator((word, (start_idx, end_idx)), ...)
77
+ """
78
+ return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
79
+
80
+
81
+ def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
82
+ """Splits a string into pairs of (ngram words, their start/end indices)"""
83
+ tokens_with_indices = split_indices(s)
84
+
85
+ # Generator of ngrams of (word, idx_pairs)
86
+ # (
87
+ # [(word, (start,end)), (word, (start, end))...],
88
+ # [(word, (start, end)), ...],
89
+ # ...
90
+ # )
91
+ ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)
92
+
93
+ # Generator of pairs of word and index ngrams
94
+ # (
95
+ # ([word, word, ...], [(start,end), (start,end), ...]),
96
+ # ...
97
+ # )
98
+ ngram_indices_pairs = (
99
+ zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices
100
+ )
101
+
102
+ # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...)
103
+ return (
104
+ (" ".join(ngram_seq), (indices[0][0], indices[-1][1]))
105
+ for ngram_seq, indices in ngram_indices_pairs
106
+ )
107
+
108
+
109
+ class Janitor:
110
+ # FIXME delete_chars: Should anything else go here? Special chars?
111
+ def __init__(
112
+ self,
113
+ ngram_n: int = 13,
114
+ window_to_remove: int = 200,
115
+ too_dirty_cutoff: int = 10,
116
+ minimum_slice_length: int = 200,
117
+ delete_chars: str = string.punctuation,
118
+ ) -> None:
119
+ self.ngram_n = ngram_n
120
+ self.window_to_remove = window_to_remove
121
+ self.too_dirty_cutoff = too_dirty_cutoff
122
+ self.minimum_slice_length = minimum_slice_length
123
+ self.delete_chars = delete_chars
124
+
125
+ self.dirt_ngrams = set()
126
+
127
+ # If in python, we'll translate uppercase to lowercase and delete naughty characters.
128
+ # This is fast by python standards
129
+ # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
130
+ self.translation_table = str.maketrans(
131
+ string.ascii_lowercase + string.ascii_uppercase, # These characters
132
+ string.ascii_lowercase * 2, # Become these characters
133
+ self.delete_chars, # These are deleted
134
+ )
135
+
136
+ ##############
137
+ # I/O for saving contamination ngrams
138
+ ##############
139
+
140
+ def save_contamination_ngrams(self, filename: str) -> None:
141
+ with open(filename, "wb") as fp:
142
+ pickle.dump(self.dirt_ngrams, fp)  # persist the registered ngram set, not the file name
143
+
144
+ def load_contamination_ngrams(self, filename: str) -> None:
145
+ with open(filename, "rb") as fp:
146
+ self.dirt_ngrams = pickle.load(fp)
147
+
148
+ ##############
149
+ # Call these :)
150
+ ##############
151
+
152
+ def register_contaminant(self, dirt_string: str) -> None:
153
+ """Register a string as contamination to be removed, e.g. a test set
154
+ This breaks the dirt_string into ngrams to store for future cleaning"""
155
+ if JANITOR_CPP:
156
+ return self.register_contaminant_cpp(dirt_string)
157
+ else:
158
+ print("WARNING: Janitor running in python mode")
159
+ return self.register_contaminant_python(dirt_string)
160
+
161
+ def clean(self, dirty_string: str) -> List[str]:
162
+ """Clean a string (e.g. a training set) by removing all ngrams previously
163
+ registered as contaminants. Returns a list of clean chunks, or empty if
164
+ the string was too dirty"""
165
+ if JANITOR_CPP:
166
+ return self.clean_cpp(dirty_string)
167
+ else:
168
+ print("WARNING: Janitor running in python mode")
169
+ return self.clean_python(dirty_string)
170
+
171
+ def _split_chunks(
172
+ self, dirty_string: str, dirty_parts: Sequence[Tuple]
173
+ ) -> List[str]:
174
+ clean_chunks = []
175
+ splice_idx = 0
176
+ end = -1
177
+ for i, (ngram, start, end) in enumerate(dirty_parts):
178
+ if i >= self.too_dirty_cutoff:
179
+ return []
180
+ start = max(0, start - self.window_to_remove)
181
+ end = min(len(dirty_string), end + self.window_to_remove)
182
+
183
+ if start - splice_idx > self.minimum_slice_length:
184
+ clean_chunks.append(dirty_string[splice_idx:start])
185
+ splice_idx = end
186
+
187
+ if end < len(dirty_string) - self.minimum_slice_length:
188
+ clean_chunks.append(dirty_string[end + 1 :])
189
+
190
+ return clean_chunks
191
+
192
+ ##############
193
+ # Fast C++
194
+ ##############
195
+
196
+ def register_contaminant_cpp(self, dirt_string) -> None:
197
+ self.dirt_ngrams.update(
198
+ janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
199
+ )
200
+
201
+ def clean_cpp(self, dirty_string: str) -> List[str]:
202
+ contamination_indices = janitor_util.clean_ngram_with_indices(
203
+ dirty_string, self.delete_chars, self.ngram_n
204
+ )
205
+ return self._split_chunks(dirty_string, contamination_indices)
206
+
207
+ ##############
208
+ # Slow python
209
+ ##############
210
+
211
+ def normalize_string(self, s: str) -> str:
212
+ return s.translate(self.translation_table)
213
+
214
+ def register_contaminant_python(self, dirt_string: str) -> None:
215
+ self.dirt_ngrams.update(
216
+ word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
217
+ )
218
+
219
+ def clean_python(self, dirty_string: str) -> List[str]:
220
+ contamination_indices = (
221
+ (None, *idx_pair)
222
+ for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
223
+ if self.normalize_string(dirty_ngram) in self.dirt_ngrams
224
+ )
225
+ return self._split_chunks(dirty_string, contamination_indices)
226
+
227
+
228
+ ##################################################################
229
+ # Tests
230
+ #################################################################
231
+
232
+ # def print_cpp():
233
+ # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
234
+
235
+ # for i in range(1, 10, 2):
236
+ # pprint(janitor_util.clean_ngram(source, string.punctuation, i))
237
+ # for ngram, start, end in \
238
+ # janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
239
+ # print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
240
+
241
+
242
+ # def test_cpp():
243
+ # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
244
+ # contaminant = "dirty boy. Clean he he"
245
+
246
+ # jan_python = Janitor()
247
+ # jan_cpp = Janitor()
248
+
249
+ # jan_python.register_contaminant_python(contaminant)
250
+ # jan_cpp.register_contaminant(contaminant)
251
+
252
+ # assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
253
+
254
+ # assert jan_python.clean_python(source) == jan_cpp.clean(source), \
255
+ # (jan_python.clean_python(source), jan_cpp.clean(source))
256
+
257
+ # print("Passed test, python==cpp")
258
+
259
+
260
+ # def benchmark():
261
+ # # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
262
+ # setup = \
263
+ # """
264
+ # with open("data/enwik8", "r") as f:
265
+ # data = f.read()
266
+ # jan = Janitor(too_dirty_cutoff=1000)
267
+ # jan.register_contaminant('''
268
+ # theories is that there is a connection between &quot;geekdom&quot; and autism.
269
+ # This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
270
+ # The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
271
+ # movement{{ref|Wired}}. This article, many professionals assert, is just one example of
272
+ # the media's application of mental disease labels to what is actually variant normal behavior
273
+ # &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
274
+ # interests, even when they seem unusual to others, are not in themselves signs of autism or
275
+ # Asperger's syndrome. Others assert that it is actually the medical profession which is applying
276
+ # mental disease labels to children who in the past would have simply been accepted as a little
277
+ # different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
278
+ # Due to the recent publicity surrounding autism and autis
279
+ # ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
280
+ # oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
281
+ # paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
282
+ # would last, took a cautious approach, preferring to save the revenue rather than investing it in
283
+ # development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
284
+ # to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
285
+ # brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
286
+ # with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
287
+ # ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
288
+ # ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
289
+ # Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
290
+ # [[United Arab Emirates]]. After the Emirates gained independence in 1971,
291
+ # ''')
292
+ # """
293
+
294
+ # n = 1
295
+ # print(f"Timing {n} run on 100 MB")
296
+ # print("Register contaminant")
297
+ # # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
298
+ # print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
299
+
300
+ # print("Clean")
301
+ # # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
302
+ # print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
303
+
304
+
305
+ # def test_janitor_general():
306
+ # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
307
+ # contaminant = "dirty boy. Clean he he"
308
+
309
+ # jan = Janitor(ngram_n=3)
310
+ # jan.register_contaminant(contaminant)
311
+ # cleaned = " ".join(jan.clean(source))
312
+ # for contam in jan.dirt_ngrams:
313
+ # assert contam not in cleaned, contam
314
+
315
+ # filename = "data/saved_contam"
316
+ # jan.save_contamination_ngrams(filename)
317
+
318
+ # jan = Janitor(ngram_n=3)
319
+ # jan.load_contamination_ngrams(filename)
320
+ # cleaned = " ".join(jan.clean(source))
321
+ # for contam in jan.dirt_ngrams:
322
+ # assert contam not in cleaned, contam
323
+
324
+
325
+ # if __name__ == "__main__":
326
+ # test()
327
+ # # print_cpp()
328
+ # # test_cpp()
329
+ # # benchmark()
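A small end-to-end sketch of the Janitor flow above, with illustrative strings and deliberately small window/slice settings so the effect is visible on short text:

    from lm_eval.decontamination.janitor import Janitor

    jan = Janitor(ngram_n=3, window_to_remove=5, minimum_slice_length=5)
    # Register a (toy) test-set string as contamination...
    jan.register_contaminant("the quick brown fox jumps over the lazy dog")

    # ...then clean a (toy) training document that contains it.
    dirty = "header text before the quick brown fox jumps over the lazy dog and trailing text after"
    clean_chunks = jan.clean(dirty)  # contamination-free slices, or [] if too dirty
    print(clean_chunks)

Without the compiled janitor_util extension this falls back to the pure-Python path and prints a warning, as the module-level try/except above indicates.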
lm-evaluation-harness/lm_eval/evaluator.py ADDED
@@ -0,0 +1,787 @@
1
+ import itertools
2
+ import json
3
+ import logging
4
+ import os
5
+ import random
6
+ import time
7
+ from collections import defaultdict
8
+ from typing import TYPE_CHECKING, List, Optional, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ import lm_eval.api.metrics
14
+ import lm_eval.api.registry
15
+ import lm_eval.api.task
16
+ import lm_eval.models
17
+ from lm_eval.caching.cache import delete_cache
18
+ from lm_eval.evaluator_utils import (
19
+ consolidate_group_results,
20
+ consolidate_results,
21
+ get_sample_size,
22
+ get_subtask_list,
23
+ get_task_list,
24
+ prepare_print_tasks,
25
+ print_writeout,
26
+ run_task_tests,
27
+ )
28
+ from lm_eval.loggers import EvaluationTracker
29
+ from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
30
+ from lm_eval.tasks import TaskManager, get_task_dict
31
+ from lm_eval.utils import (
32
+ handle_non_serializable,
33
+ hash_dict_images,
34
+ hash_string,
35
+ positional_deprecated,
36
+ setup_logging,
37
+ simple_parse_args_string,
38
+ wrap_text,
39
+ )
40
+
41
+
42
+ if TYPE_CHECKING:
43
+ from lm_eval.api.model import LM
44
+ from lm_eval.api.task import Task
45
+
46
+ eval_logger = logging.getLogger(__name__)
47
+
48
+
49
+ @positional_deprecated
50
+ def simple_evaluate(
51
+ model,
52
+ model_args: Optional[Union[str, dict]] = None,
53
+ tasks: Optional[List[Union[str, dict, object]]] = None,
54
+ num_fewshot: Optional[int] = None,
55
+ batch_size: Optional[Union[int, str]] = None,
56
+ max_batch_size: Optional[int] = None,
57
+ device: Optional[str] = None,
58
+ use_cache: Optional[str] = None,
59
+ cache_requests: bool = False,
60
+ rewrite_requests_cache: bool = False,
61
+ delete_requests_cache: bool = False,
62
+ limit: Optional[Union[int, float]] = None,
63
+ samples: Optional[dict] = None,
64
+ bootstrap_iters: int = 100000,
65
+ check_integrity: bool = False,
66
+ write_out: bool = False,
67
+ log_samples: bool = True,
68
+ evaluation_tracker: Optional[EvaluationTracker] = None,
69
+ system_instruction: Optional[str] = None,
70
+ apply_chat_template: Union[bool, str] = False,
71
+ fewshot_as_multiturn: bool = False,
72
+ gen_kwargs: Union[str, dict, None] = None,
73
+ task_manager: Optional[TaskManager] = None,
74
+ verbosity=None,
75
+ predict_only: bool = False,
76
+ random_seed: int = 0,
77
+ numpy_random_seed: int = 1234,
78
+ torch_random_seed: int = 1234,
79
+ fewshot_random_seed: int = 1234,
80
+ confirm_run_unsafe_code: bool = False,
81
+ metadata: Optional[dict] = None,
82
+ ):
83
+ """Instantiate and evaluate a model on a list of tasks.
84
+
85
+ :param model: Union[str, LM]
86
+ Name of model or LM object, see lm_eval.models.get_model
87
+ :param model_args: Optional[str, dict]
88
+ String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
89
+ Ignored if `model` argument is a LM object.
90
+ :param tasks: list[Union[str, dict, Task]]
91
+ List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
92
+ :param num_fewshot: int
93
+ Number of examples in few-shot context
94
+ :param batch_size: int or str, optional
95
+ Batch size for model
96
+ :param max_batch_size: int, optional
97
+ Maximal batch size to try with automatic batch size detection
98
+ :param device: str, optional
99
+ PyTorch device (e.g. "cpu" or "cuda:0") for running models
100
+ :param use_cache: str, optional
101
+ A path to a sqlite db file for caching model responses. `None` if not caching.
102
+ :param cache_requests: bool, optional
103
+ Speed up evaluation by caching the building of dataset requests. `None` if not caching.
104
+ :param rewrite_requests_cache: bool, optional
105
+ Rewrites all the request cache if set to `True`. `None` if not desired.
106
+ :param delete_requests_cache: bool, optional
107
+ Deletes all the request cache if set to `True`. `None` if not desired.
108
+ :param limit: int or float, optional
109
+ Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
110
+ :param samples: dictionary, optional
111
+ Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
112
+ :param bootstrap_iters:
113
+ Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed.
114
+ :param check_integrity: bool
115
+ Whether to run the relevant part of the test suite for the tasks
116
+ :param write_out: bool
117
+ If True, write out an example document and model input for checking task integrity
118
+ :param log_samples: bool
119
+ If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
120
+ :param system_instruction: str
121
+ System instruction to be applied to the prompt
122
+ :param apply_chat_template: Union[bool, str]
123
+ Specifies whether to apply a chat template to the prompt.
124
+ - If set to True, the default chat template is applied.
125
+ - If set to a string, applies the specified chat template by name.
126
+ Defaults to False (no chat template applied).
127
+ :param fewshot_as_multiturn: bool
128
+ Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
129
+ :param gen_kwargs: dict or comma-separated string
130
+ Arguments for model generation
131
+ Ignored for all tasks with loglikelihood output_type
132
+ :param verbosity: str
133
+ Verbosity level for logging
134
+ :param predict_only: bool
135
+ If true only model outputs will be generated and returned. Metrics will not be evaluated
136
+ :param random_seed: int
137
+ Random seed for python's random module. If set to None, the seed will not be set.
138
+ :param numpy_random_seed: int
139
+ Random seed for numpy. If set to None, the seed will not be set.
140
+ :param torch_random_seed: int
141
+ Random seed for torch. If set to None, the seed will not be set.
142
+ :param fewshot_random_seed: int
143
+ Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
144
+ :param metadata: dict
145
+ Additional metadata to be added to the task manager. Will get passed to the download function of the task.
146
+ return
147
+ Dictionary of results
148
+ """
149
+ if verbosity is not None:
150
+ setup_logging(verbosity=verbosity)
151
+ start_date = time.time()
152
+
153
+ if limit is not None and samples is not None:
154
+ raise ValueError(
155
+ "Either 'limit' or 'samples' must be None, but both are not None."
156
+ )
157
+
158
+ _NEEDS_CHAT_TEMPLATE = ("inst", "chat")
159
+ if (
160
+ (
161
+ isinstance(model_args, str)
162
+ and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
163
+ )
164
+ or (
165
+ isinstance(model_args, dict)
166
+ and any(
167
+ any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
168
+ for v in model_args.values()
169
+ )
170
+ )
171
+ ) and not apply_chat_template:
172
+ eval_logger.warning(
173
+ wrap_text(
174
+ f"""pretrained={model_args.get("pretrained") if isinstance(model_args, dict) else model_args} appears to be an
175
+ instruct or chat variant but chat template is not applied.
176
+ Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).""",
177
+ )
178
+ )
179
+
180
+ if delete_requests_cache:
181
+ eval_logger.info("Deleting requests cache...")
182
+ delete_cache()
183
+
184
+ seed_message = []
185
+ if random_seed is not None:
186
+ # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
187
+ seed_message.append(f"Setting random seed to {random_seed}")
188
+ random.seed(random_seed)
189
+
190
+ if numpy_random_seed is not None:
191
+ seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
192
+ np.random.seed(numpy_random_seed)
193
+
194
+ if torch_random_seed is not None:
195
+ seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
196
+ torch.manual_seed(torch_random_seed)
197
+
198
+ if fewshot_random_seed is not None:
199
+ seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
200
+
201
+ if seed_message:
202
+ eval_logger.info(" | ".join(seed_message))
203
+
204
+ if tasks is None:
205
+ tasks = []
206
+ if len(tasks) == 0:
207
+ raise ValueError(
208
+ "No tasks specified, or no tasks found. Please verify the task names."
209
+ )
210
+
211
+ if gen_kwargs is not None:
212
+ if isinstance(gen_kwargs, str):
213
+ gen_kwargs = simple_parse_args_string(gen_kwargs)
214
+ eval_logger.warning(
215
+ f"generation_kwargs: {gen_kwargs} specified through cli, these settings will update set parameters in yaml tasks. "
216
+ "Ensure 'do_sample=True' for non-greedy decoding!"
217
+ )
218
+ if not gen_kwargs:
219
+ gen_kwargs = None
220
+
221
+ if isinstance(model, str):
222
+ if model_args is None:
223
+ eval_logger.warning("model_args not specified. Using defaults.")
224
+ model_args = ""
225
+
226
+ if isinstance(model_args, dict):
227
+ eval_logger.info(
228
+ f"Initializing {model} model, with arguments: {model_args}"
229
+ )
230
+ lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
231
+ model_args,
232
+ {
233
+ "batch_size": batch_size,
234
+ "max_batch_size": max_batch_size,
235
+ "device": device,
236
+ },
237
+ )
238
+
239
+ else:
240
+ eval_logger.info(
241
+ wrap_text(
242
+ f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
243
+ )
244
+ )
245
+ lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
246
+ model_args,
247
+ {
248
+ "batch_size": batch_size,
249
+ "max_batch_size": max_batch_size,
250
+ "device": device,
251
+ },
252
+ )
253
+ else:
254
+ if not isinstance(model, lm_eval.api.model.LM):
255
+ raise TypeError(
256
+ f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
257
+ )
258
+ eval_logger.info("Using pre-initialized model")
259
+ lm = model
260
+
261
+ if use_cache is not None:
262
+ eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
263
+ lm = lm_eval.api.model.CachingLM(
264
+ lm,
265
+ use_cache
266
+ # each rank receives a different cache db.
267
+ # necessary to avoid multiple writes to cache at once
268
+ + "_rank"
269
+ + str(lm.rank)
270
+ + ".db",
271
+ )
272
+
273
+ if task_manager is None:
274
+ metadata = (
275
+ simple_parse_args_string(model_args)
276
+ if isinstance(model_args, str)
277
+ else model_args
278
+ if isinstance(model_args, dict)
279
+ else {}
280
+ ) | (metadata or {})
281
+ task_manager = TaskManager(metadata=metadata)
282
+
283
+ task_dict = get_task_dict(
284
+ tasks,
285
+ task_manager,
286
+ )
287
+
288
+ # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
289
+ # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
290
+ def _adjust_config(task_dict):
291
+ adjusted_task_dict = {}
292
+ for task_name, task_obj in task_dict.items():
293
+ if isinstance(task_obj, dict):
294
+ adjusted_task_dict = {
295
+ **adjusted_task_dict,
296
+ **{task_name: _adjust_config(task_obj)},
297
+ }
298
+
299
+ else:
300
+ if task_obj.get_config("output_type") == "generate_until":
301
+ if gen_kwargs is not None:
302
+ task_obj.set_config(
303
+ key="generation_kwargs", value=gen_kwargs, update=True
304
+ )
305
+ eval_logger.info(
306
+ f"{task_obj.config.task}: Using gen_kwargs: {task_obj.config.generation_kwargs}"
307
+ )
308
+
309
+ if predict_only:
310
+ eval_logger.info(
311
+ f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
312
+ )
313
+ # we have to change the class properties post-hoc. This is pretty hacky.
314
+ task_obj.override_metric(metric_name="bypass")
315
+
316
+ # override tasks' fewshot values to the provided num_fewshot arg value
317
+ # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
318
+ if num_fewshot is not None:
319
+ if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
320
+ eval_logger.info(
321
+ f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
322
+ )
323
+ else:
324
+ eval_logger.warning(
325
+ f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
326
+ )
327
+ task_obj.set_config(key="num_fewshot", value=num_fewshot)
328
+ else:
329
+ # if num_fewshot not provided, and the task does not define a default one, default to 0
330
+ if (
331
+ default_num_fewshot := task_obj.get_config("num_fewshot")
332
+ ) is None:
333
+ task_obj.set_config(key="num_fewshot", value=0)
334
+ # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
335
+ task_obj.set_fewshot_seed(seed=fewshot_random_seed)
336
+
337
+ adjusted_task_dict[task_name] = task_obj
338
+
339
+ return adjusted_task_dict
340
+
341
+ task_dict = _adjust_config(task_dict)
342
+
343
+ if check_integrity:
344
+ run_task_tests(task_list=tasks)
345
+
346
+ if evaluation_tracker is not None:
347
+ evaluation_tracker.general_config_tracker.log_experiment_args(
348
+ model_source=model,
349
+ model_args=model_args,
350
+ system_instruction=system_instruction,
351
+ chat_template=lm.chat_template(apply_chat_template)
352
+ if apply_chat_template
353
+ else None,
354
+ fewshot_as_multiturn=fewshot_as_multiturn,
355
+ )
356
+
357
+ results = evaluate(
358
+ lm=lm,
359
+ task_dict=task_dict,
360
+ limit=limit,
361
+ samples=samples,
362
+ cache_requests=cache_requests,
363
+ rewrite_requests_cache=rewrite_requests_cache,
364
+ bootstrap_iters=bootstrap_iters,
365
+ write_out=write_out,
366
+ log_samples=True if predict_only else log_samples,
367
+ system_instruction=system_instruction,
368
+ apply_chat_template=apply_chat_template,
369
+ fewshot_as_multiturn=fewshot_as_multiturn,
370
+ verbosity=verbosity,
371
+ confirm_run_unsafe_code=confirm_run_unsafe_code,
372
+ )
373
+ if verbosity is not None:
374
+ setup_logging(verbosity=verbosity)
375
+
376
+ if lm.rank == 0:
377
+ if isinstance(model, str):
378
+ model_name = model
379
+ elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
380
+ model_name = model.config._name_or_path
381
+ else:
382
+ model_name = type(model).__name__
383
+
384
+ # add info about the model and few shot config
385
+ results["config"] = {
386
+ "model": model_name,
387
+ "model_args": model_args,
388
+ }
389
+ # add more detailed model info if available
390
+ if isinstance(lm, lm_eval.models.huggingface.HFLM):
391
+ results["config"].update(lm.get_model_info())
392
+ # add info about execution
393
+ results["config"].update(
394
+ {
395
+ "batch_size": batch_size,
396
+ "batch_sizes": (
397
+ list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
398
+ ),
399
+ "device": device,
400
+ "use_cache": use_cache,
401
+ "limit": limit,
402
+ "bootstrap_iters": bootstrap_iters,
403
+ "gen_kwargs": gen_kwargs,
404
+ "random_seed": random_seed,
405
+ "numpy_seed": numpy_random_seed,
406
+ "torch_seed": torch_random_seed,
407
+ "fewshot_seed": fewshot_random_seed,
408
+ }
409
+ )
410
+ results["git_hash"] = get_git_commit_hash()
411
+ results["date"] = start_date
412
+ add_env_info(results) # additional environment info to results
413
+ add_tokenizer_info(results, lm) # additional info about tokenizer
414
+ return results
415
+ else:
416
+ return None
417
+
418
+
419
+ @positional_deprecated
420
+ def evaluate(
421
+ lm: "LM",
422
+ task_dict,
423
+ limit: Optional[int] = None,
424
+ samples: Optional[dict] = None,
425
+ cache_requests: bool = False,
426
+ rewrite_requests_cache: bool = False,
427
+ bootstrap_iters: Optional[int] = 100000,
428
+ write_out: bool = False,
429
+ log_samples: bool = True,
430
+ system_instruction: Optional[str] = None,
431
+ apply_chat_template: Union[bool, str] = False,
432
+ fewshot_as_multiturn: bool = False,
433
+ verbosity: str = "INFO",
434
+ confirm_run_unsafe_code: bool = False,
435
+ ):
436
+ """Instantiate and evaluate a model on a list of tasks.
437
+
438
+ :param lm: obj
439
+ Language Model
440
+ :param task_dict: dict[str, Task]
441
+ Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
442
+ :param limit: int, optional
443
+ Limit the number of examples per task (only use this for testing)
444
+ :param samples: dictionary, optional
445
+ Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
446
+ :param cache_requests: bool, optional
447
+ Speed up evaluation by caching the building of dataset requests.
448
+ :param rewrite_requests_cache: bool, optional
449
+ Rewrites all the request cache if set to `True`.
450
+ :param bootstrap_iters:
451
+ Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
452
+ :param write_out: bool
453
+ If True, write out an example document and model input for checking task integrity
454
+ :param log_samples: bool
455
+ If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
456
+ :param system_instruction: str
457
+ System instruction to be applied to the prompt
458
+ :param apply_chat_template: Union[bool, str]
459
+ Specifies whether to apply a chat template to the prompt.
460
+ - If set to True, the default chat template is applied.
461
+ - If set to a string, applies the specified chat template by name.
462
+ Defaults to False (no chat template applied).
463
+ :param fewshot_as_multiturn: bool
464
+ Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
465
+ :param verbosity: str
466
+ Verbosity level for logging
467
+ :param confirm_run_unsafe_code: bool
468
+ Whether to confirm running tasks marked as unsafe.
469
+ :return
470
+ Dictionary of results
471
+ """
472
+
473
+ if limit is not None and samples is not None:
474
+ raise ValueError(
475
+ "Either 'limit' or 'samples' must be None, but both are not None."
476
+ )
477
+ if samples is not None:
478
+ eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
479
+ if apply_chat_template:
480
+ eval_logger.warning(
481
+ "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
482
+ )
483
+ # tracks all Instances/requests a model must generate output on.
484
+ requests = defaultdict(list)
485
+ # stores the amount to pad out reqs per req. type so that
486
+ # number of fwd passes per distributed rank is equal
487
+ padding_requests = defaultdict(int)
488
+
489
+ # get lists of group hierarchy and each type of request
490
+ eval_tasks = get_task_list(task_dict)
491
+ if not log_samples:
492
+ if not all(
493
+ "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
494
+ for task_output in eval_tasks
495
+ ):
496
+ raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
497
+
498
+ # validation checks:
499
+ # 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
500
+ # 2.are we running code that is marked as unsafe.
501
+ incompatible_tasks = []
502
+ for task_output in eval_tasks:
503
+ task: Task = task_output.task
504
+
505
+ if getattr(task, "MULTIMODAL", False) and not getattr(lm, "MULTIMODAL", False):
506
+ incompatible_tasks.append(task_output.task_name)
507
+ elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
508
+ raise ValueError(
509
+ f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
510
+ )
511
+ if len(incompatible_tasks) > 0:
512
+ if not getattr(lm, "MULTIMODAL", False):
513
+ raise ValueError(
514
+ f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
515
+ )
516
+ # end validation check
517
+
518
+ # Cache the limit arg.
519
+ limit_arg = limit
520
+ limits = []
521
+ for task_output in eval_tasks:
522
+ task: Task = task_output.task
523
+
524
+ limit = get_sample_size(task, limit_arg)
525
+ limits.append(limit)
526
+ task.build_all_requests(
527
+ limit=limit,
528
+ samples=samples.get(task_output.task_name, None)
529
+ if samples is not None
530
+ else samples,
531
+ rank=lm.rank,
532
+ world_size=lm.world_size,
533
+ cache_requests=cache_requests,
534
+ rewrite_requests_cache=rewrite_requests_cache,
535
+ system_instruction=system_instruction,
536
+ apply_chat_template=bool(apply_chat_template),
537
+ fewshot_as_multiturn=fewshot_as_multiturn,
538
+ chat_template=getattr(lm, "apply_chat_template")
539
+ if apply_chat_template
540
+ else None,
541
+ tokenizer_name=getattr(lm, "tokenizer_name", "")
542
+ if apply_chat_template
543
+ else "",
544
+ )
545
+ eval_logger.debug(
546
+ f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
547
+ )
548
+ if write_out:
549
+ print_writeout(task)
550
+ # aggregate Instances by LM method requested to get output.
551
+ for instance in task.instances:
552
+ reqtype = instance.request_type
553
+ requests[reqtype].append(instance)
554
+
555
+ if lm.world_size > 1:
556
+ instances_rnk = torch.tensor(len(task._instances), device=lm.device)
557
+ gathered_item = (
558
+ lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
559
+ )
560
+ # "multiple_choice" task types dispatch (several) "loglikelihood" request types
561
+ reqtype = (
562
+ "loglikelihood"
563
+ if task.OUTPUT_TYPE == "multiple_choice"
564
+ else task.OUTPUT_TYPE
565
+ )
566
+ # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
567
+ numpad = max(gathered_item) - gathered_item[lm.rank]
568
+ # todo: may not account for padding in cases like SquadV2 which has multiple req types
569
+ padding_requests[reqtype] += numpad
570
+
571
+ ### Run LM on inputs, get all outputs ###
572
+ # execute each type of request
573
+ for reqtype, reqs in requests.items():
574
+ eval_logger.info(f"Running {reqtype} requests")
575
+ # create `K` copies of each request `req` based off `K = req.repeats`
576
+ cloned_reqs = []
577
+ for req in reqs:
578
+ cloned_reqs.extend([req] * req.repeats)
579
+
580
+ if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
581
+ for _ in range(padding_requests[reqtype]):
582
+ cloned_reqs.extend([req] * req.repeats)
583
+
584
+ # run requests through model
585
+ resps = getattr(lm, reqtype)(cloned_reqs)
586
+
587
+ # put responses from model into a list of length K for each request.
588
+ for x, req in zip(resps, cloned_reqs):
589
+ req.resps.append(x)
590
+
591
+ if lm.world_size > 1:
592
+ lm.accelerator.wait_for_everyone()
593
+
594
+ RANK = lm.rank
595
+ WORLD_SIZE = lm.world_size
596
+ ### Postprocess outputs ###
597
+ # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
598
+ for task_output, limit in zip(eval_tasks, limits):
599
+ task = task_output.task
600
+ task.apply_filters()
601
+
602
+ ### Collect values of metrics on all datapoints ###
603
+ # # unpack results and sort back in order and return control to Task
604
+ # TODO: make it possible to use a different metric per filter
605
+ # Pre-process task.instances to group by doc_id
606
+ instances_by_doc_id = defaultdict(list)
607
+ for instance in task.instances:
608
+ instances_by_doc_id[instance.doc_id].append(instance)
609
+ # Sort instances within each group
610
+ for instances in instances_by_doc_id.values():
611
+ instances.sort(key=lambda x: x.idx)
612
+ # iterate over different filters used
613
+ for filter_key in task.instances[0].filtered_resps.keys():
614
+ indices = (
615
+ samples.get(task_output.task_name, None)
616
+ if samples is not None
617
+ else None
618
+ )
619
+ doc_iterator = task.doc_iterator(
620
+ rank=RANK,
621
+ limit=limit,
622
+ world_size=WORLD_SIZE,
623
+ samples=indices,
624
+ )
625
+ for doc_id, doc in doc_iterator:
626
+ if indices:
627
+ doc_id_true = indices[doc_id]
628
+ else:
629
+ doc_id_true = doc_id
630
+ requests = instances_by_doc_id[doc_id]
631
+ metrics = task.process_results(
632
+ doc, [req.filtered_resps[filter_key] for req in requests]
633
+ )
634
+ if log_samples:
635
+ target = task.doc_to_target(doc)
636
+ example = {
637
+ "doc_id": doc_id_true,
638
+ "doc": doc,
639
+ "target": target,
640
+ "arguments": [req.args for req in requests],
641
+ "resps": [req.resps for req in requests],
642
+ "filtered_resps": [
643
+ req.filtered_resps[filter_key] for req in requests
644
+ ],
645
+ "filter": filter_key,
646
+ "metrics": list(metrics.keys()),
647
+ "doc_hash": hash_string(
648
+ json.dumps(
649
+ requests[0].doc,
650
+ indent=2,
651
+ default=handle_non_serializable,
652
+ ensure_ascii=False,
653
+ )
654
+ ),
655
+ "prompt_hash": hash_string(requests[0].arguments[0]),
656
+ "target_hash": hash_string(str(target)),
657
+ }
658
+ example.update(metrics)
659
+ task_output.logged_samples.append(example)
660
+ for metric, value in metrics.items():
661
+ task_output.sample_metrics[(metric, filter_key)].append(value)
662
+
663
+ if WORLD_SIZE > 1:
664
+ # if multigpu, then gather data across all ranks to rank 0
665
+ # first gather logged samples across all ranks
666
+ for task_output in eval_tasks:
667
+ if log_samples:
668
+ # for task_name, task_samples in list(samples.items()):
669
+ full_samples = [None] * WORLD_SIZE if RANK == 0 else None
670
+ torch.distributed.gather_object(
671
+ obj=task_output.logged_samples,
672
+ object_gather_list=full_samples,
673
+ dst=0,
674
+ )
675
+
676
+ if RANK == 0:
677
+ task_output.logged_samples = list(
678
+ itertools.chain.from_iterable(full_samples)
679
+ )
680
+
681
+ # then collect metrics across all ranks
682
+ for metrics in task_output.sample_metrics:
683
+ metric_list = [None] * WORLD_SIZE if RANK == 0 else None
684
+ torch.distributed.gather_object(
685
+ obj=task_output.sample_metrics[metrics],
686
+ object_gather_list=metric_list,
687
+ dst=0,
688
+ )
689
+ if RANK == 0:
690
+ task_output.sample_metrics[metrics] = list(
691
+ itertools.chain.from_iterable(metric_list)
692
+ )
693
+
694
+ if RANK == 0:
695
+ ### Aggregate results over all datapoints ###
696
+ # aggregate results ; run bootstrap CIs
697
+ for task_output in eval_tasks:
698
+ task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
699
+ (
700
+ results,
701
+ samples,
702
+ configs,
703
+ versions,
704
+ num_fewshot,
705
+ higher_is_better,
706
+ ) = consolidate_results(eval_tasks)
707
+
708
+ ### Calculate group metrics ###
709
+ if bool(results):
710
+ results, versions, show_group_table, *_ = consolidate_group_results(
711
+ results, versions, task_dict
712
+ )
713
+
714
+ results_agg, group_agg = prepare_print_tasks(task_dict, results)
715
+ subtask_list = get_subtask_list(task_dict)
716
+
717
+ # collect all higher_is_better values for metrics
718
+ # in the group's subtasks.
719
+ # TODO: clean this up ; unify with the below metric_list loop?
720
+ _higher_is_better = {}
721
+ for group, task_list in subtask_list.items():
722
+ if (
723
+ len(task_list) != 0
724
+ ): # subtask list will list "task_name": [] for solo tasks
725
+ for task in task_list:
726
+ for m, h in higher_is_better[task].items():
727
+ if m not in _higher_is_better.keys():
728
+ _higher_is_better[m] = h
729
+
730
+ if (
731
+ m in _higher_is_better
732
+ and _higher_is_better[m] is not None
733
+ and _higher_is_better[m] != h
734
+ ):
735
+ eval_logger.warning(
736
+ f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
737
+ )
738
+ _higher_is_better[m] = None
739
+ higher_is_better[group] = _higher_is_better
740
+
741
+ results_dict = {
742
+ "results": dict(results_agg.items()),
743
+ **(
744
+ {"groups": dict(group_agg.items())}
745
+ if (bool(group_agg) & show_group_table)
746
+ else {}
747
+ ),
748
+ "group_subtasks": dict(reversed(subtask_list.items())),
749
+ "configs": dict(sorted(configs.items())),
750
+ "versions": dict(sorted(versions.items())),
751
+ "n-shot": dict(sorted(num_fewshot.items())),
752
+ "higher_is_better": dict(sorted(higher_is_better.items())),
753
+ "n-samples": {
754
+ task_output.task_name: {
755
+ "original": len(task_output.task.eval_docs),
756
+ "effective": min(
757
+ limit if limit else len(task_output.task.eval_docs),
758
+ len(task_output.task.eval_docs),
759
+ ),
760
+ }
761
+ for task_output, limit in zip(eval_tasks, limits)
762
+ },
763
+ }
764
+ if log_samples:
765
+ # default: hash images
766
+ samples = (
767
+ hash_dict_images(samples)
768
+ if os.environ.get("LMEVAL_HASHMM", "1") != "0"
769
+ and (hasattr(lm, "MULTIMODAL"))
770
+ else samples
771
+ )
772
+ results_dict["samples"] = dict(samples)
773
+
774
+ return results_dict
775
+
776
+ else:
777
+ return None
778
+
779
+
780
+ def request_caching_arg_to_dict(cache_requests: str) -> dict:
781
+ request_caching_args = {
782
+ "cache_requests": cache_requests in {"true", "refresh"},
783
+ "rewrite_requests_cache": cache_requests == "refresh",
784
+ "delete_requests_cache": cache_requests == "delete",
785
+ }
786
+
787
+ return request_caching_args
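
As a usage note: the helper above just turns the CLI's `--cache_requests` string into the three boolean flags consumed elsewhere. A minimal sketch, assuming the `lm_eval` package from this commit is installed and that this hunk belongs to `lm_eval/evaluator.py` (as the changed-files list suggests):

from lm_eval.evaluator import request_caching_arg_to_dict

for mode in ("true", "refresh", "delete"):
    # "true": reuse an existing cache, "refresh": rebuild it, "delete": drop it entirely
    print(mode, request_caching_arg_to_dict(cache_requests=mode))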
lm-evaluation-harness/lm_eval/evaluator_utils.py ADDED
@@ -0,0 +1,554 @@
1
+ import collections
2
+ import logging
3
+ import math
4
+ import pathlib
5
+ import sys
6
+ from typing import List, Optional, Tuple, Union
7
+
8
+ from lm_eval.api.group import ConfigurableGroup
9
+ from lm_eval.api.metrics import (
10
+ aggregate_subtask_metrics,
11
+ mean,
12
+ pooled_sample_stderr,
13
+ stderr_for_metric,
14
+ )
15
+ from lm_eval.api.task import Task
16
+ from lm_eval.utils import positional_deprecated
17
+
18
+
19
+ eval_logger = logging.getLogger(__name__)
20
+
21
+
22
+ class TaskOutput:
23
+ """
24
+ Wrapper class for Task outputs. It contains various attributes and methods to manage and calculate metrics for the task.
25
+
26
+ Attributes:
27
+ task (object): The task object.
28
+ task_name (str): The name of the task.
29
+ task_config (dict): The configuration of the task.
30
+ version (str): The version of the task.
31
+ group_name (str): The name of the task group.
32
+ n_shot (int): The number of shots for the task.
33
+ task_alias (str): The alias of the task.
34
+ group_alias (str): The alias of the task group.
35
+ is_group (bool): Indicates if the task is a group.
36
+ logged_samples (list): The list of logged samples.
37
+ sample_len (int): The length of the samples.
38
+ sample_metrics (defaultdict): The dictionary of samples' metrics.
39
+ agg_metrics (defaultdict): The dictionary of aggregate metrics.
40
+
41
+ Methods:
42
+ from_taskdict(cls, task_name: str, task):
43
+ Creates a TaskOutput instance from a task dictionary.
44
+
45
+ calculate_aggregate_metric(bootstrap_iters=100000) -> None:
46
+ Calculates the aggregate metrics for the task.
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ task=None,
52
+ task_name=None,
53
+ task_config=None,
54
+ version=None,
55
+ group_name=None,
56
+ n_shot=None,
57
+ task_alias=None,
58
+ group_alias=None,
59
+ is_group=None,
60
+ ):
61
+ self.task = task
62
+ self.task_config = task_config
63
+ self.task_name = task_name
64
+ self.group_name = group_name
65
+ self.version = version
66
+ self.n_shot = n_shot
67
+ self.task_alias = task_alias
68
+ self.group_alias = group_alias
69
+ self.is_group = is_group
70
+ self.logged_samples = []
71
+ self.sample_len = None
72
+ self.sample_metrics = collections.defaultdict(list)
73
+ self.agg_metrics = collections.defaultdict(list)
74
+
75
+ @classmethod
76
+ def from_taskdict(cls, task_name: str, task):
77
+ if isinstance(task, tuple):
78
+ group_name, task = task
79
+ else:
80
+ group_name = None
81
+ if not task:
82
+ # these get filtered out in get_task_list
83
+ # once they are added to group hierarchy
84
+ is_group = True
85
+ return cls(
86
+ task=task, task_name=task_name, is_group=is_group, group_name=group_name
87
+ )
88
+ version = task.VERSION
89
+ task_config = dict(task.dump_config())
90
+ if (n_shot := task_config.get("num_fewshot")) == 0:
91
+ n_shot = task_config.get("metadata", {}).get("num_fewshot", 0)
92
+ task_alias = task_config.get("alias")
93
+ group_alias = task_config.get("group_alias")
94
+ return cls(
95
+ task=task,
96
+ task_name=task_name,
97
+ task_config=task_config,
98
+ group_name=group_name,
99
+ version=version,
100
+ n_shot=n_shot,
101
+ task_alias=task_alias,
102
+ group_alias=group_alias,
103
+ )
104
+
105
+ def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
106
+ for (metric, filter_key), items in self.sample_metrics.items():
107
+ try:
108
+ agg_fn = self.task.aggregation()[metric]
109
+ except KeyError:
110
+ # This happens when process_results outputs an arbitrary metric
111
+ # TODO: Handle this better and allow other aggregate functions other than mean.
112
+ agg_fn = mean
113
+ metric_key = f"{metric},{filter_key}"
114
+ self.agg_metrics[metric_key] = agg_fn(items)
115
+ self.sample_len = len(items) # TODO: same sample size for each metric?
116
+ if isinstance(bootstrap_iters, int):
117
+ stderr_fn = stderr_for_metric(
118
+ metric=agg_fn,
119
+ bootstrap_iters=min(bootstrap_iters, 100)
120
+ if metric in ["bleu", "chrf", "ter"]
121
+ else bootstrap_iters,
122
+ )
123
+ self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
124
+ stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
125
+ )
126
+ else:
127
+ raise ValueError(
128
+ f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations."
129
+ )
130
+
131
+ def __repr__(self):
132
+ return (
133
+ f"TaskOutput(task_name={self.task_name}, "
134
+ f"group_name={self.group_name}, "
135
+ f"version={self.version}, "
136
+ f"n_shot={self.n_shot}, "
137
+ f"task_alias={self.task_alias}, "
138
+ f"group_alias={self.group_alias})"
139
+ )
140
+
141
+
142
+ def get_task_list(task_dict: dict) -> List[TaskOutput]:
143
+ outputs = []
144
+ for task_name, task_obj in task_dict.items():
145
+ if isinstance(task_obj, dict):
146
+ _outputs = get_task_list(task_obj)
147
+ outputs.extend(_outputs)
148
+ else:
149
+ task_output = TaskOutput.from_taskdict(task_name, task_obj)
150
+ outputs.append(task_output)
151
+
152
+ return outputs
153
+
154
+
155
+ def get_subtask_list(task_dict, task_root=None, depth=0):
156
+ subtask_list = {}
157
+ for group_obj, task_obj in task_dict.items():
158
+ if isinstance(group_obj, ConfigurableGroup):
159
+ # group_name = group_obj.group_name
160
+ group_name = group_obj.group_name
161
+ else:
162
+ group_name = group_obj
163
+ if isinstance(task_obj, dict):
164
+ _subtask_list = get_subtask_list(
165
+ task_obj, task_root=group_name, depth=depth + 1
166
+ )
167
+ if task_root:
168
+ subtask_list.setdefault((task_root, depth), []).extend(
169
+ [
170
+ _task
171
+ for (_task, _depth) in _subtask_list.keys()
172
+ if (_depth - 1) == depth
173
+ ]
174
+ )
175
+
176
+ subtask_list = {**subtask_list, **_subtask_list}
177
+ else:
178
+ if isinstance(task_obj, ConfigurableGroup):
179
+ # group_or_task_name = task_obj.group_name
180
+ group_or_task_name = task_obj.group_name
181
+ elif isinstance(task_obj, Task):
182
+ # group_or_task_name = task_obj.task_name
183
+ group_or_task_name = task_obj.task_name
184
+
185
+ if task_root is None:
186
+ subtask_list.setdefault((group_or_task_name, depth), [])
187
+ else:
188
+ subtask_list.setdefault((task_root, depth), []).append(
189
+ group_or_task_name
190
+ )
191
+
192
+ if depth == 0:
193
+ _subtask_list = {}
194
+ for group_key, task_list in subtask_list.items():
195
+ group_name, depth = group_key
196
+ _subtask_list[group_name] = task_list
197
+ subtask_list = _subtask_list
198
+
199
+ return subtask_list
200
+
201
+
202
+ def print_writeout(task) -> None:
203
+ for inst in task.instances:
204
+ # print the prompt for the first few documents
205
+ if inst.doc_id < 1:
206
+ eval_logger.info(
207
+ f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\
208
+ \n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
209
+ )
210
+ eval_logger.info(f"Request: {str(inst)}")
211
+
212
+
213
+ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
214
+ if limit is not None:
215
+ limit = (
216
+ int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
217
+ )
218
+ return limit
219
+
220
+
221
+ def prepare_print_tasks(
222
+ task_dict: dict,
223
+ results: dict,
224
+ task_depth=0,
225
+ group_depth=0,
226
+ ) -> Tuple[dict, dict]:
227
+ """
228
+ @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
229
+ value is a list of task names.
230
+ @param results: Dictionary containing the results of each task. Each key is a
231
+ group name and its value is a dictionary of task results.
232
+ @param task_depth: The indentation level for printing the task
233
+ hierarchy. Default is 0.
234
+ @param group_depth: The indentation level for printing the group
235
+ hierarchy. Default is 0.
236
+ @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
237
+ aggregated results for each task, and groups_agg contains aggregated results for each group.
238
+
239
+ Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
240
+ """
241
+
242
+ def _sort_task_dict(task_dict):
243
+ """
244
+ Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
245
+ Required so that we end up sorting within each sub-header correctly.
246
+ """
247
+
248
+ return dict(
249
+ sorted(
250
+ task_dict.items(),
251
+ key=lambda item: item[0].group_name
252
+ if isinstance(item[0], ConfigurableGroup)
253
+ else item[0],
254
+ )
255
+ )
256
+
257
+ task_agg = collections.defaultdict(dict)
258
+ group_agg = collections.defaultdict(dict)
259
+ task_dict = _sort_task_dict(task_dict)
260
+ for task_or_group_name, task_or_group_obj in task_dict.items():
261
+ tab_string = " " * task_depth + "- " if task_depth > 0 else ""
262
+ if isinstance(task_or_group_name, ConfigurableGroup):
263
+ # string_name = task_or_group_name.group_name
264
+ name = task_or_group_name.group_name
265
+ from_configurable_group = True
266
+ task_or_group_obj = _sort_task_dict(task_or_group_obj)
267
+ elif isinstance(task_or_group_name, str):
268
+ name = task_or_group_name
269
+ if isinstance(task_or_group_obj, Task):
270
+ # string_name = task_or_group_obj.task_name
271
+ name = task_or_group_obj.task_name
272
+ from_configurable_group = False
273
+
274
+ task_agg[name] = results[name].copy()
275
+ if from_configurable_group:
276
+ if task_or_group_name.group_alias is not None:
277
+ alias = task_or_group_name.group_alias
278
+ else:
279
+ alias = task_or_group_name.group
280
+ else:
281
+ if "alias" in task_agg[name]:
282
+ alias = task_agg[name]["alias"]
283
+ else:
284
+ alias = name
285
+
286
+ task_agg[name]["alias"] = tab_string + alias
287
+ if "samples" in task_agg[name]:
288
+ task_agg[name].pop("samples")
289
+
290
+ if from_configurable_group and (" " not in results[name]):
291
+ group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
292
+ group_agg[name] = results[name].copy()
293
+ group_agg[name]["alias"] = group_tab_string + alias
294
+ if "samples" in group_agg[name]:
295
+ group_agg[name].pop("samples")
296
+
297
+ if isinstance(task_or_group_obj, dict):
298
+ task_depth += 1
299
+ group_depth += 1
300
+ _task_agg, _group_agg = prepare_print_tasks(
301
+ task_or_group_obj, results, task_depth, group_depth
302
+ )
303
+ task_agg = {
304
+ **task_agg,
305
+ **_task_agg,
306
+ }
307
+ group_agg = {**group_agg, **_group_agg}
308
+ task_depth -= 1
309
+ group_depth -= 1
310
+ return task_agg, group_agg
311
+
312
+
313
+ def consolidate_results(
314
+ eval_tasks: List[TaskOutput],
315
+ ) -> Tuple[dict, dict, dict, dict, dict, dict]:
316
+ """
317
+ @param eval_tasks: list(TaskOutput).
318
+ @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
319
+
320
+ Consolidates the results of multiple evaluation tasks into a single structure.
321
+
322
+ The method iterates over each evaluation instance and extracts relevant information to create the consolidated
323
+ results structure. The consolidated results structure has the following properties:
324
+
325
+ - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
326
+ metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task
327
+ aliases specified in the task configuration.
328
+ - samples: A defaultdict with task names as keys and lists of log samples as values.
329
+ - configs: A defaultdict with task names as keys and task configurations as values.
330
+ - versions: A defaultdict with task names as keys and task versions as values.
331
+ - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
332
+ - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
333
+ for each metric as values.
334
+
335
+ The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
336
+ """
337
+ # stores the final result for each task, for each metric/filter pair.
338
+ results = collections.defaultdict(dict)
339
+ # logs info about each document evaluated.
340
+ samples = collections.defaultdict(list)
341
+ # store num-fewshot value per task
342
+ num_fewshot = collections.defaultdict(int)
343
+ # Tracks the YAML configs of all chosen task
344
+ configs = collections.defaultdict(dict)
345
+ # Tracks each task's version.
346
+ versions = collections.defaultdict(dict)
347
+ # Track `higher_is_better` for each metric
348
+ higher_is_better = collections.defaultdict(dict)
349
+
350
+ for task_output in eval_tasks:
351
+ if "task_alias" in (task_config := task_output.task_config):
352
+ results[task_output.task_name]["alias"] = task_config["task_alias"]
353
+ else:
354
+ results[task_output.task_name]["alias"] = task_output.task_name
355
+ if group_alias := task_output.group_alias:
356
+ if group_alias not in results and (group_name := task_output.group_name):
357
+ results[group_name]["alias"] = group_alias
358
+ num_fewshot[task_output.task_name] = task_output.n_shot
359
+ configs[task_output.task_name] = task_output.task_config
360
+ versions[task_output.task_name] = task_output.version
361
+ samples[task_output.task_name] = task_output.logged_samples
362
+ higher_is_better[task_output.task_name] = task_output.task.higher_is_better()
363
+ for (metric, filter_key), items in task_output.sample_metrics.items():
364
+ metric_key = f"{metric},{filter_key}"
365
+ results[task_output.task_name][metric_key] = task_output.agg_metrics[
366
+ metric_key
367
+ ]
368
+ results[task_output.task_name]["samples"] = task_output.sample_len
369
+ results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
370
+ task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
371
+ )
372
+ return results, samples, configs, versions, num_fewshot, higher_is_better
373
+
374
+
375
+ def consolidate_group_results(
376
+ results,
377
+ versions,
378
+ task_dict,
379
+ task_root=None,
380
+ show_group_table=False,
381
+ task_aggregation_list=None,
382
+ ) -> Tuple[dict, dict, bool, Union[None, dict]]:
383
+ """
384
+ (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
385
+
386
+ @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
387
+
388
+ - results: A defaultdict with task names (and, after this function is called, group names of
389
+ groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
390
+ - versions: A defaultdict with task names (and, after this function is called, group names of
391
+ groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
392
+ - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
393
+ - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.
394
+
395
+ The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
396
+ In the top-level invocation of this function, task_aggregation_list is ignored.
397
+ """
398
+ if task_root is None:
399
+ task_root = {}
400
+
401
+ if task_aggregation_list is None:
402
+ task_aggregation_list = {}
403
+
404
+ for group_or_task, group_or_task_info in task_dict.items():
405
+ # Convert to string
406
+ if isinstance(group_or_task, ConfigurableGroup):
407
+ group_config = group_or_task.config
408
+ group_or_task = group_or_task.group_name
409
+ else:
410
+ group_config = None
411
+
412
+ if isinstance(group_or_task_info, Task):
413
+ if task_root:
414
+ task_aggregation_list.setdefault(task_root, []).append(
415
+ group_or_task_info.task_name
416
+ )
417
+ else:
418
+ (
419
+ results,
420
+ versions,
421
+ show_group_table,
422
+ _task_aggregation_list,
423
+ ) = consolidate_group_results(
424
+ results,
425
+ versions,
426
+ group_or_task_info,
427
+ group_or_task,
428
+ show_group_table,
429
+ task_aggregation_list,
430
+ )
431
+ if task_root:
432
+ task_aggregation_list.setdefault(task_root, []).extend(
433
+ task_aggregation_list.get(group_or_task, [])
434
+ )
435
+
436
+ if (group_config is None) or (
437
+ group_config["aggregate_metric_list"] is None
438
+ ):
439
+ results[group_or_task][" "] = " "
440
+ continue
441
+
442
+ if "aggregate_metric_list" in group_config:
443
+ agg_metric_list = group_config["aggregate_metric_list"]
444
+
445
+ show_group_table = show_group_table | bool(
446
+ group_config["aggregate_metric_list"]
447
+ )
448
+
449
+ task_list = _task_aggregation_list[group_or_task]
450
+
451
+ metric_list = list(
452
+ {
453
+ key
454
+ for task in task_list
455
+ for key in results[task].keys()
456
+ if "_stderr" not in key and key not in ["task", "alias", "samples"]
457
+ }
458
+ )
459
+ for metric in metric_list:
460
+ stderr = "_stderr,".join(metric.split(","))
461
+
462
+ # gather metrics, sizes, and stderrs from subtasks
463
+ metrics = [
464
+ results[task][metric]
465
+ for task in task_list
466
+ if metric in results[task]
467
+ ] # TODO: copy?
468
+ stderrs = [
469
+ results[task][stderr]
470
+ for task in task_list
471
+ if stderr in results[task]
472
+ ]
473
+ sizes = [
474
+ results[task]["samples"]
475
+ for task in task_list
476
+ if metric in results[task]
477
+ ]
478
+
479
+ for metric_config in agg_metric_list:
480
+ for filter_name in metric_config["filter_list"]:
481
+ if metric != ",".join([metric_config["metric"], filter_name]):
482
+ continue
483
+
484
+ # compute group's pooled metric and stderr
485
+ if metric_config["aggregation"] == "mean":
486
+ aggregate_fn = aggregate_subtask_metrics
487
+ elif callable(metric_config["aggregation"]):
488
+ aggregate_fn = metric_config["aggregation"]
489
+ else:
490
+ raise ValueError(
491
+ f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
492
+ )
493
+
494
+ results[group_or_task][metric] = aggregate_fn(
495
+ metrics,
496
+ sizes,
497
+ metric_config["weight_by_size"],
498
+ )
499
+ # TODO: calculate groups' metrics using arbitrary agg fns
500
+ if "N/A" in stderrs:
501
+ results[group_or_task][stderr] = "N/A"
502
+ else:
503
+ # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
504
+ results[group_or_task][stderr] = pooled_sample_stderr(
505
+ stderrs, sizes
506
+ )
507
+
508
+ results[group_or_task]["samples"] = sum(sizes)
509
+ group_metadata = group_config.get("metadata", None)
510
+ if group_metadata is not None:
511
+ versions[group_or_task] = group_metadata.get("version", None)
512
+ # print(results)
513
+ return results, versions, show_group_table, task_aggregation_list
514
+
515
+
516
+ @positional_deprecated
517
+ def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
518
+ """
519
+ Search upward in the directory tree to a maximum of three layers
520
+ to find and return the package root (containing the 'tests' folder)
521
+ """
522
+ cur_path = start_path.resolve()
523
+ max_layers = 3
524
+ for _ in range(max_layers):
525
+ if (cur_path / "tests" / "test_version_stable.py").exists():
526
+ return cur_path
527
+ else:
528
+ cur_path = cur_path.parent.resolve()
529
+ raise FileNotFoundError(
530
+ f"Unable to find package root within {max_layers} layers upward of {start_path}"
531
+ )
532
+
533
+
534
+ @positional_deprecated
535
+ def run_task_tests(task_list: List[str]):
536
+ """
537
+ Find the package root and run the tests for the given tasks
538
+ """
539
+ import pytest
540
+
541
+ package_root = find_test_root(start_path=pathlib.Path(__file__))
542
+ task_string = " or ".join(task_list)
543
+ args = [
544
+ f"{package_root}/tests/test_version_stable.py",
545
+ f"--rootdir={package_root}",
546
+ "-k",
547
+ f"{task_string}",
548
+ ]
549
+ sys.path.append(str(package_root))
550
+ pytest_return_val = pytest.main(args)
551
+ if pytest_return_val:
552
+ raise ValueError(
553
+ f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
554
+ )
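
For clarity on `get_sample_size` above: a `limit` below 1.0 is treated as a fraction of the task's evaluation docs, anything else as an absolute count. A self-contained sketch (the `_FakeTask` below is a hypothetical stand-in for a real `lm_eval` Task; only the length of `eval_docs` matters here):

import math

class _FakeTask:
    eval_docs = list(range(200))  # pretend the task has 200 evaluation documents

def get_sample_size(task, limit):
    if limit is not None:
        limit = int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
    return limit

print(get_sample_size(_FakeTask(), None))  # None -> no cap, evaluate everything
print(get_sample_size(_FakeTask(), 0.25))  # 50   -> fractions are scaled by len(eval_docs)
print(get_sample_size(_FakeTask(), 32))    # 32   -> integers are absolute counts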
lm-evaluation-harness/lm_eval/filters/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from functools import partial
2
+ from typing import List
3
+
4
+ from lm_eval.api.filter import FilterEnsemble
5
+ from lm_eval.api.registry import get_filter
6
+
7
+ from . import custom, extraction, selection, transformation
8
+
9
+
10
+ def build_filter_ensemble(
11
+ filter_name: str, components: List[List[str]]
12
+ ) -> FilterEnsemble:
13
+ """
14
+ Create a filtering pipeline.
15
+ """
16
+ filters = []
17
+ for function, kwargs in components:
18
+ if kwargs is None:
19
+ kwargs = {}
20
+ # create a filter given its name in the registry
21
+ f = partial(get_filter(function), **kwargs)
22
+ # add the filter as a pipeline step
23
+ filters.append(f)
24
+
25
+ return FilterEnsemble(name=filter_name, filters=filters)
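
A minimal sketch of driving `build_filter_ensemble` (assuming this commit's `lm_eval` is importable): each component is a `[registered_filter_name, kwargs]` pair, and the result is a `FilterEnsemble` that tasks later apply to their instances.

from lm_eval.filters import build_filter_ensemble

# lowercase every response, then keep only the first response per document
ensemble = build_filter_ensemble(
    "first_lowercased",
    [["lowercase", None], ["take_first", None]],
)
print(ensemble.name, ensemble.filters)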
lm-evaluation-harness/lm_eval/filters/custom.py ADDED
@@ -0,0 +1,17 @@
1
+ from lm_eval.api.filter import Filter
2
+ from lm_eval.api.registry import register_filter
3
+
4
+
5
+ @register_filter("custom")
6
+ class CustomFilter(Filter):
7
+ """
8
+ Custom filter that applies a custom, user-defined function to the model responses.
9
+ """
10
+
11
+ def __init__(self, **kwargs) -> None:
12
+ self.filter_fn = kwargs.pop("filter_fn")
13
+
14
+ super().__init__(**kwargs)
15
+
16
+ def apply(self, resps, docs):
17
+ return self.filter_fn(resps, docs)
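
A minimal sketch of `CustomFilter` in isolation (assuming this commit's `lm_eval` is importable; `keep_longest` is a made-up filter function): `apply` simply forwards `(resps, docs)` to the user-supplied `filter_fn`.

from lm_eval.filters.custom import CustomFilter

def keep_longest(resps, docs):
    # resps: one list of model responses per document
    return [[max(r, key=len)] for r in resps]

f = CustomFilter(filter_fn=keep_longest)
print(f.apply([["short", "a much longer answer"]], [{"question": "..."}]))
# -> [['a much longer answer']]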
lm-evaluation-harness/lm_eval/filters/decontamination.py ADDED
@@ -0,0 +1,25 @@
1
+ from lm_eval.api.filter import Filter
2
+ from lm_eval.api.registry import register_filter
3
+
4
+
5
+ @register_filter("decontaminate")
6
+ class DecontaminationFilter(Filter):
7
+ """
8
+ A filter which evaluates responses for training-set contamination, splitting documents into contaminated and uncontaminated subsets. (Currently a stub.)
9
+ """
10
+
11
+ name = "track_decontamination"
12
+
13
+ def __init__(self, path) -> None:
14
+ """
15
+
16
+ TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
17
+ should further cache result on a given (task_name, doc_id)
18
+ """
19
+ self._decontam_results = None
20
+
21
+ def apply(self, resps, docs) -> None:
22
+ """
23
+ Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
24
+ """
25
+ pass
lm-evaluation-harness/lm_eval/filters/extraction.py ADDED
@@ -0,0 +1,233 @@
1
+ import re
2
+ import sys
3
+ import unicodedata
4
+
5
+ from lm_eval.api.filter import Filter
6
+ from lm_eval.api.registry import register_filter
7
+
8
+
9
+ @register_filter("regex")
10
+ class RegexFilter(Filter):
11
+ """A filter that extracts values from text using regex pattern matching.
12
+
13
+ This filter applies a regex pattern to each model response and extracts matched values.
14
+ If no match is found, returns a fallback value. Useful for extracting structured data
15
+ (like numbers) from unstructured model outputs.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
21
+ group_select: int = 0,
22
+ fallback: str = "[invalid]",
23
+ ) -> None:
24
+ """
25
+ pass a string `regex` to run `re.compile(r"regex")` on.
26
+ `fallback` defines the output returned if no matches for the regex are located.
27
+ """
28
+ self.regex_pattern = regex_pattern
29
+ self.regex = re.compile(regex_pattern)
30
+ self.group_select = group_select
31
+ self.fallback = fallback
32
+
33
+ def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
34
+ # here, we assume we have a list, in which each element is
35
+ # a list of model responses for some particular input/target pair.
36
+ # so we process each of these (same input/target response sets)
37
+ # independently (and keep them a list.)
38
+ def filter_set(inst):
39
+ filtered = []
40
+ for resp in inst:
41
+ match = self.regex.findall(resp)
42
+ if match:
43
+ match = match[self.group_select]
44
+ if isinstance(match, tuple):
45
+ match = [m for m in match if m]
46
+ if match:
47
+ match = match[0]
48
+ else:
49
+ match = self.fallback
50
+ match = match.strip()
51
+ else:
52
+ match = self.fallback
53
+ filtered.append(match)
54
+ return filtered
55
+
56
+ filtered_resps = list(map(lambda x: filter_set(x), resps))
57
+ return filtered_resps
58
+
59
+
60
+ @register_filter("regex_pos")
61
+ class POSFilter(Filter):
62
+ """Extracts part-of-speech (POS) tags from model responses given as lists of (token, POS) tuples."""
63
+
64
+ def __init__(
65
+ self,
66
+ regex_pattern: str = r"\['(.*?)'\]",
67
+ group_select=0,
68
+ fallback=None,
69
+ ) -> None:
70
+ """
71
+ pass a string `regex` to run `re.compile(r"regex")` on.
72
+ `fallback` defines the output returned if no matches for the regex are located.
73
+ """
74
+ if fallback is None:
75
+ fallback = ["invalid"]
76
+ self.regex_pattern = regex_pattern
77
+ self.regex = re.compile(regex_pattern)
78
+ self.group_select = group_select
79
+ self.fallback = fallback
80
+
81
+ def apply(self, resps, docs):
82
+ def extract_tagged_tokens(text):
83
+ # Extract tagged tokens list from text input using regex
84
+ tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
85
+ return [(token, pos) for token, pos in tokens]
86
+
87
+ def extract_pos_tags(result):
88
+ pos_tags = []
89
+ if isinstance(result, str):
90
+ result = extract_tagged_tokens(result)
91
+ pos_tags.extend(pos for _, pos in result)
92
+ return pos_tags if pos_tags else self.fallback
93
+
94
+ def filter_set(inst):
95
+ filtered = []
96
+ for resp in inst:
97
+ match = extract_pos_tags(resp)
98
+ filtered.append(match)
99
+ return filtered
100
+
101
+ filtered_resps = map(lambda x: filter_set(x), resps)
102
+
103
+ return filtered_resps
104
+
105
+
106
+ @register_filter("remove_whitespace")
107
+ class WhitespaceFilter(Filter):
108
+ """Filters out leading whitespace from responses."""
109
+
110
+ def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
111
+ def filter_set(inst):
112
+ filtered_resp = []
113
+ for resp in inst:
114
+ resp = resp.lstrip()
115
+ filtered_resp.append(resp)
116
+ return filtered_resp
117
+
118
+ filtered_resps = [filter_set(resp) for resp in resps]
119
+
120
+ return filtered_resps
121
+
122
+
123
+ @register_filter("multi_choice_regex")
124
+ class MultiChoiceRegexFilter(RegexFilter):
125
+ """
126
+ A filter used to extract a model's answer on multiple choice questions with
127
+ letter answers. assumes each document has a "choices" field
128
+ containing the list of answer choices and that the answer label symbols
129
+ are of the form (A), (B), (C), ... or A, B, C.
130
+ """
131
+
132
+ def __init__(
133
+ self,
134
+ regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
135
+ group_select=0,
136
+ fallback: str = "[invalid]",
137
+ ignore_case=False,
138
+ ignore_punctuation=False,
139
+ regexes_to_ignore=None,
140
+ ) -> None:
141
+ """
142
+ regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
143
+ - step 1 : We parse the choices between parentheses, e.g. ([A-Z]), then try to find these choices in the response.
144
+ - step 2 : We parse the choice with the regex r":\s*([A-?])", where ? is the letter of the last choice (so it varies with the number of choices).
145
+ group_select: Selects the (group_select)th match from the findall result.
146
+ ignore_case: Ignores the case during step 1 matching
147
+ ignore_punctuation: Remove the punctuation during step 1 matching
148
+ regexes_to_ignore: Remove these regexes during step 1 matching
149
+ """
150
+ super().__init__(regex_pattern, group_select, fallback)
151
+ self.ignore_case = ignore_case
152
+ self.ignore_punctuation = ignore_punctuation
153
+ self.regexes_to_ignore = regexes_to_ignore
154
+
155
+ def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
156
+ # here, we assume we have a list, in which each element is
157
+ # a list of model responses for some particular input/target pair.
158
+ # so we process each of these (same input/target response sets)
159
+ # independently (and keep them a list.)
160
+
161
+ def find_match(regex, resp, convert_dict={}):
162
+ match = regex.findall(resp)
163
+ if match:
164
+ match = match[self.group_select]
165
+ if isinstance(match, tuple):
166
+ match = [m for m in match if m][0]
167
+ match = match.strip()
168
+ if match and match in convert_dict:
169
+ match = convert_dict[match]
170
+ return match
171
+
172
+ punct_tbl = dict.fromkeys(
173
+ i
174
+ for i in range(sys.maxunicode)
175
+ if unicodedata.category(chr(i)).startswith("P")
176
+ )
177
+
178
+ def filter_ignores(st):
179
+ if self.regexes_to_ignore is not None:
180
+ for s in self.regexes_to_ignore:
181
+ st = re.sub(s, "", st)
182
+
183
+ if self.ignore_case:
184
+ st = st.lower()
185
+
186
+ if self.ignore_punctuation:
187
+ # https://stackoverflow.com/a/266162
188
+ st = st.translate(punct_tbl)
189
+ return st
190
+
191
+ filtered_resps = []
192
+
193
+ for r, doc in zip(resps, docs):
194
+ fallback_regexes = []
195
+ choice_to_alpha = {}
196
+ next_alpha = "A"
197
+
198
+ without_paren_fallback_regexes = []
199
+ without_paren_to_target = {}
200
+
201
+ choices = doc["choices"]
202
+ for c in choices:
203
+ m = filter_ignores(c.strip())
204
+ fallback_regexes.append(f"{re.escape(m)}")
205
+ choice_to_alpha[m] = f"({next_alpha})"
206
+
207
+ without_paren_fallback_regexes.append(next_alpha)
208
+ without_paren_to_target[next_alpha] = f"({next_alpha})"
209
+
210
+ next_alpha = chr(ord(next_alpha) + 1)
211
+ fallback_regex = re.compile("|".join(fallback_regexes))
212
+ without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
213
+ without_paren_fallback_regex = re.compile(
214
+ rf":[\s]*({without_paren_fallback_regex})"
215
+ )
216
+
217
+ filtered = []
218
+ for resp in r:
219
+ match = find_match(self.regex, resp)
220
+ if not match:
221
+ match = find_match(
222
+ fallback_regex, filter_ignores(resp), choice_to_alpha
223
+ )
224
+ if not match:
225
+ match = find_match(
226
+ without_paren_fallback_regex, resp, without_paren_to_target
227
+ )
228
+ if not match:
229
+ match = self.fallback
230
+ filtered.append(match)
231
+ filtered_resps.append(filtered)
232
+
233
+ return filtered_resps
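
A minimal sketch of `RegexFilter` with its default GSM8K-style pattern (assuming this commit's `lm_eval` is importable); responses without a `#### <number>` marker fall back to "[invalid]":

from lm_eval.filters.extraction import RegexFilter

f = RegexFilter()  # default pattern r"#### (\-?[0-9\.\,]+)", fallback "[invalid]"
resps = [["The answer is\n#### 42", "I am not sure."]]  # one doc, two sampled responses
print(f.apply(resps, docs=[{}]))
# -> [['42', '[invalid]']]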
lm-evaluation-harness/lm_eval/filters/selection.py ADDED
@@ -0,0 +1,61 @@
1
+ from collections import Counter
2
+
3
+ from lm_eval.api.filter import Filter
4
+ from lm_eval.api.registry import register_filter
5
+
6
+
7
+ # TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
8
+ # that takes an input and returns a scalar and then should select the max reward,
9
+ # or should implement different filters for different ways of handling a reward model's inference.
10
+
11
+
12
+ @register_filter("take_first")
13
+ class TakeFirstFilter(Filter):
14
+ def __init__(self) -> None:
15
+ """
16
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
17
+ """
18
+
19
+ def apply(self, resps, docs):
20
+ """
21
+ Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
22
+ """
23
+ return map(lambda r: r[0], resps)
24
+
25
+
26
+ @register_filter("take_first_k")
27
+ class TakeKFilter(Filter):
28
+ def __init__(self, **kwargs) -> None:
29
+ self.k = kwargs.pop("k")
30
+
31
+ super().__init__(**kwargs)
32
+
33
+ def apply(self, resps, docs):
34
+ # need resp to be subscriptable to check below
35
+ resps = list(resps)
36
+ # check we have at least k responses per doc, else we can't take the first k
37
+ assert len(resps[0]) >= self.k, (
38
+ f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
39
+ )
40
+ return map(lambda r: r[: self.k], resps)
41
+
42
+
43
+ @register_filter("majority_vote")
44
+ class MajorityVoteFilter(Filter):
45
+ def __init__(self) -> None:
46
+ """
47
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
48
+ """
49
+
50
+ def apply(self, resps, docs):
51
+ """
52
+ Each entry of `resps` is a list of model responses.
53
+ We select the response that occurs most frequently in each entry of `resps`.
54
+ """
55
+
56
+ def select_majority(resp):
57
+ counts = Counter(resp)
58
+ vote = counts.most_common(1)[0][0]
59
+ return vote
60
+
61
+ return map(lambda r: [select_majority(r)], resps)
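
A minimal sketch of the selection filters on one document with five sampled responses (assuming this commit's `lm_eval` is importable); note that both filters return lazy `map` objects, hence the `list(...)`:

from lm_eval.filters.selection import MajorityVoteFilter, TakeKFilter

resps = [["7", "7", "8", "7", "9"]]  # one doc, five model samples
print(list(MajorityVoteFilter().apply(resps, docs=[{}])))  # -> [['7']]
print(list(TakeKFilter(k=3).apply(resps, docs=[{}])))      # -> [['7', '7', '8']]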
lm-evaluation-harness/lm_eval/filters/transformation.py ADDED
@@ -0,0 +1,122 @@
1
+ import re
2
+
3
+ from lm_eval.api.filter import Filter
4
+ from lm_eval.api.registry import register_filter
5
+
6
+
7
+ @register_filter("lowercase")
8
+ class LowercaseFilter(Filter):
9
+ def __init__(self) -> None:
10
+ pass
11
+
12
+ def apply(self, resps, docs):
13
+ def filter_set(inst):
14
+ return [resp.lower() for resp in inst]
15
+
16
+ return [filter_set(resp) for resp in resps]
17
+
18
+
19
+ @register_filter("uppercase")
20
+ class UppercaseFilter(Filter):
21
+ def __init__(self) -> None:
22
+ pass
23
+
24
+ def apply(self, resps, docs):
25
+ def filter_set(inst):
26
+ return [resp.upper() for resp in inst]
27
+
28
+ return [filter_set(resp) for resp in resps]
29
+
30
+
31
+ @register_filter("map")
32
+ class MapFilter(Filter):
33
+ def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
34
+ """
35
+ Initializes the MapFilter with a given mapping dictionary and default value.
36
+
37
+ Args:
38
+ - mapping_dict (dict): A dictionary containing the key-value mappings.
39
+ Default is an empty dictionary.
40
+ - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
41
+ Default is None.
42
+
43
+ Example:
44
+ mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
45
+ """
46
+ if mapping_dict is None:
47
+ mapping_dict = {}
48
+ assert isinstance(mapping_dict, dict), (
49
+ "Provided mapping_dict is not a dictionary"
50
+ )
51
+ self.mapping_dict = mapping_dict
52
+ self.default_value = default_value
53
+
54
+ def apply(self, resps, docs):
55
+ def filter_set(inst):
56
+ return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
57
+
58
+ return [filter_set(resp) for resp in resps]
59
+
60
+
61
+ @register_filter("format_span")
62
+ class SPANFilter(Filter):
63
+ def __init__(self) -> None:
64
+ pass
65
+
66
+ def apply(self, resps, docs):
67
+ def format_ner_text(text):
68
+ label_dict = {
69
+ "person": "PER",
70
+ "location": "LOC",
71
+ "organization": "ORG",
72
+ "counties": "LOC",
73
+ "places": "LOC",
74
+ "people": "PER",
75
+ "persons": "PER",
76
+ "company": "ORG",
77
+ "country": "LOC",
78
+ "continent": "LOC",
79
+ "time": "DATE",
80
+ "date": "DATE",
81
+ "per": "PER",
82
+ "loc": "LOC",
83
+ "org": "ORG",
84
+ }
85
+ text = text.lower()
86
+ for key, value in label_dict.items():
87
+ text = text.replace(key, value)
88
+
89
+ text = "$".join(i for i in text.split("$$"))
90
+ return text.rstrip("$$")
91
+
92
+ def format_named_entities(text):
93
+ """
94
+ Extract named entities from text and format them as 'label: value $$ label: value'.
95
+ Handles grouped entities (e.g., LOC: kenya, uganda) and excludes 'none' values.
96
+ """
97
+ # Regular expression to match label: entities pattern
98
+ pattern = r"\b(PER|LOC|ORG|DATE):\s*([^$]+)"
99
+ # Normalize newline characters
100
+ text = text.replace("\n", "$").strip()
101
+ matches = re.findall(pattern, text)
102
+
103
+ formatted_entities = []
104
+
105
+ for label, values in matches:
106
+ # Split multiple entities separated by commas and strip whitespace
107
+ entities = [value.strip() for value in values.split(",")]
108
+
109
+ # Exclude 'none' entities
110
+ for entity in entities:
111
+ if entity.lower() != "none":
112
+ formatted_entities.append(f"{label.lower()}: {entity}")
113
+
114
+ # Join entities with the desired separator
115
+ return " $ ".join(formatted_entities)
116
+
117
+ def filter_set(inst):
118
+ return [
119
+ format_named_entities(format_ner_text(resp.lower())) for resp in inst
120
+ ]
121
+
122
+ return [filter_set(resp) for resp in resps]
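
A minimal sketch of `MapFilter`, mapping extracted answer letters onto choice indices (assuming this commit's `lm_eval` is importable); keys missing from the mapping fall back to `default_value`:

from lm_eval.filters.transformation import MapFilter

f = MapFilter({"A": 0, "B": 1, "C": 2, "D": 3}, default_value=-1)
print(f.apply([["B", "E"]], docs=[{}]))
# -> [[1, -1]]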
lm-evaluation-harness/lm_eval/loggers/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .evaluation_tracker import EvaluationTracker
2
+ from .wandb_logger import WandbLogger
lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py ADDED
@@ -0,0 +1,537 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import time
6
+ from collections import defaultdict
7
+ from dataclasses import asdict, dataclass
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+
11
+ from datasets import load_dataset
12
+ from datasets.utils.metadata import MetadataConfigs
13
+ from huggingface_hub import (
14
+ DatasetCard,
15
+ DatasetCardData,
16
+ HfApi,
17
+ hf_hub_url,
18
+ )
19
+ from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
20
+
21
+ from lm_eval.utils import (
22
+ get_file_datetime,
23
+ get_file_task_name,
24
+ get_results_filenames,
25
+ get_sample_results_filenames,
26
+ handle_non_serializable,
27
+ hash_string,
28
+ sanitize_list,
29
+ sanitize_model_name,
30
+ sanitize_task_name,
31
+ )
32
+
33
+
34
+ eval_logger = logging.getLogger(__name__)
35
+
36
+
37
+ @dataclass(init=False)
38
+ class GeneralConfigTracker:
39
+ """
40
+ Tracker for the evaluation parameters.
41
+
42
+ Attributes:
43
+ model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
44
+ model_name (str): Name of the model.
45
+ model_name_sanitized (str): Sanitized model name for directory creation.
46
+ start_time (float): Start time of the experiment. Logged at class init.
47
+ end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`]
48
+ total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
49
+ """
50
+
51
+ model_source: str = None
52
+ model_name: str = None
53
+ model_name_sanitized: str = None
54
+ system_instruction: str = None
55
+ system_instruction_sha: str = None
56
+ fewshot_as_multiturn: bool = None
57
+ chat_template: str = None
58
+ chat_template_sha: str = None
59
+ start_time: float = None
60
+ end_time: float = None
61
+ total_evaluation_time_seconds: str = None
62
+
63
+ def __init__(self) -> None:
64
+ """Starts the evaluation timer."""
65
+ self.start_time = time.perf_counter()
66
+
67
+ @staticmethod
68
+ def _get_model_name(model_args: str) -> str:
69
+ """Extracts the model name from the model arguments."""
70
+
71
+ def extract_model_name(model_args: str, key: str) -> str:
72
+ """Extracts the model name from the model arguments using a key."""
73
+ args_after_key = model_args.split(key)[1]
74
+ return args_after_key.split(",")[0]
75
+
76
+ # order does matter, e.g. peft and delta are provided together with pretrained
77
+ prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
78
+ for prefix in prefixes:
79
+ if prefix in model_args:
80
+ return extract_model_name(model_args, prefix)
81
+ return ""
82
+
83
+ def log_experiment_args(
84
+ self,
85
+ model_source: str,
86
+ model_args: str,
87
+ system_instruction: str,
88
+ chat_template: str,
89
+ fewshot_as_multiturn: bool,
90
+ ) -> None:
91
+ """Logs model parameters and job ID."""
92
+ self.model_source = model_source
93
+ self.model_name = GeneralConfigTracker._get_model_name(model_args)
94
+ self.model_name_sanitized = sanitize_model_name(self.model_name)
95
+ self.system_instruction = system_instruction
96
+ self.system_instruction_sha = (
97
+ hash_string(system_instruction) if system_instruction else None
98
+ )
99
+ self.chat_template = chat_template
100
+ self.chat_template_sha = hash_string(chat_template) if chat_template else None
101
+ self.fewshot_as_multiturn = fewshot_as_multiturn
102
+
103
+ def log_end_time(self) -> None:
104
+ """Logs the end time of the evaluation and calculates the total evaluation time."""
105
+ self.end_time = time.perf_counter()
106
+ self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
107
+
108
+
109
+ class EvaluationTracker:
110
+ """
111
+ Keeps track and saves relevant information of the evaluation process.
112
+ Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
113
+ """
114
+
115
+ def __init__(
116
+ self,
117
+ output_path: str = None,
118
+ hub_results_org: str = "",
119
+ hub_repo_name: str = "",
120
+ details_repo_name: str = "",
121
+ results_repo_name: str = "",
122
+ push_results_to_hub: bool = False,
123
+ push_samples_to_hub: bool = False,
124
+ public_repo: bool = False,
125
+ token: str = "",
126
+ leaderboard_url: str = "",
127
+ point_of_contact: str = "",
128
+ gated: bool = False,
129
+ ) -> None:
130
+ """
131
+ Creates all the necessary loggers for evaluation tracking.
132
+
133
+ Args:
134
+ output_path (str): Path to save the results. If not provided, the results won't be saved.
135
+ hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
136
+ hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
137
+ details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the details will be pushed to `lm-eval-results`.
138
+ results_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed separately and will be found in the details repository.
139
+ push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
140
+ push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
141
+ public_repo (bool): Whether to push the results to a public or private repository.
142
+ token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
143
+ leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
144
+ point_of_contact (str): Contact information on the Hugging Face hub dataset card.
145
+ gated (bool): Whether to gate the repository.
146
+ """
147
+ self.general_config_tracker = GeneralConfigTracker()
148
+
149
+ self.output_path = output_path
150
+ self.push_results_to_hub = push_results_to_hub
151
+ self.push_samples_to_hub = push_samples_to_hub
152
+ self.public_repo = public_repo
153
+ self.leaderboard_url = leaderboard_url
154
+ self.point_of_contact = point_of_contact
155
+ self.api = HfApi(token=token) if token else None
156
+ self.gated_repo = gated
157
+
158
+ if not self.api and (push_results_to_hub or push_samples_to_hub):
159
+ raise ValueError(
160
+ "Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
161
+ "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
162
+ )
163
+
164
+ if (
165
+ self.api
166
+ and hub_results_org == ""
167
+ and (push_results_to_hub or push_samples_to_hub)
168
+ ):
169
+ hub_results_org = self.api.whoami()["name"]
170
+ eval_logger.warning(
171
+ f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
172
+ )
173
+
174
+ if hub_repo_name == "":
175
+ details_repo_name = (
176
+ details_repo_name if details_repo_name != "" else "lm-eval-results"
177
+ )
178
+ results_repo_name = (
179
+ results_repo_name if results_repo_name != "" else details_repo_name
180
+ )
181
+ else:
182
+ details_repo_name = hub_repo_name
183
+ results_repo_name = hub_repo_name
184
+ eval_logger.warning(
185
+ "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
186
+ )
187
+
188
+ self.details_repo = f"{hub_results_org}/{details_repo_name}"
189
+ self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
190
+ self.results_repo = f"{hub_results_org}/{results_repo_name}"
191
+ self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
192
+
193
+ def save_results_aggregated(
194
+ self,
195
+ results: dict,
196
+ samples: dict,
197
+ ) -> None:
198
+ """
199
+ Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
200
+
201
+ Args:
202
+ results (dict): The aggregated results to save.
203
+ samples (dict): The samples results to save.
204
+ """
205
+ self.general_config_tracker.log_end_time()
206
+
207
+ if self.output_path:
208
+ try:
209
+ eval_logger.info("Saving results aggregated")
210
+
211
+ # calculate cumulative hash for each task - only if samples are provided
212
+ task_hashes = {}
213
+ if samples:
214
+ for task_name, task_samples in samples.items():
215
+ sample_hashes = [
216
+ s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
217
+ for s in task_samples
218
+ ]
219
+ task_hashes[task_name] = hash_string("".join(sample_hashes))
220
+
221
+ # update initial results dict
222
+ results.update({"task_hashes": task_hashes})
223
+ results.update(asdict(self.general_config_tracker))
224
+ dumped = json.dumps(
225
+ results,
226
+ indent=2,
227
+ default=handle_non_serializable,
228
+ ensure_ascii=False,
229
+ )
230
+
231
+ path = Path(self.output_path if self.output_path else Path.cwd())
232
+ self.date_id = datetime.now().isoformat().replace(":", "-")
233
+ if path.suffix == ".json":
234
+ path.parent.mkdir(parents=True, exist_ok=True)
235
+ file_results_aggregated = path.with_name(
236
+ f"{path.stem}_{self.date_id}.json"
237
+ )
238
+ else:
239
+ path = path.joinpath(
240
+ self.general_config_tracker.model_name_sanitized
241
+ )
242
+ path.mkdir(parents=True, exist_ok=True)
243
+ file_results_aggregated = path.joinpath(
244
+ f"results_{self.date_id}.json"
245
+ )
246
+
247
+ file_results_aggregated.open("w", encoding="utf-8").write(dumped)
248
+
249
+ if self.api and self.push_results_to_hub:
250
+ repo_id = (
251
+ self.results_repo
252
+ if self.public_repo
253
+ else self.results_repo_private
254
+ )
255
+ self.api.create_repo(
256
+ repo_id=repo_id,
257
+ repo_type="dataset",
258
+ private=not self.public_repo,
259
+ exist_ok=True,
260
+ )
261
+ self.api.upload_file(
262
+ repo_id=repo_id,
263
+ path_or_fileobj=str(file_results_aggregated),
264
+ path_in_repo=os.path.join(
265
+ self.general_config_tracker.model_name,
266
+ file_results_aggregated.name,
267
+ ),
268
+ repo_type="dataset",
269
+ commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
270
+ )
271
+ eval_logger.info(
272
+ "Successfully pushed aggregated results to the Hugging Face Hub. "
273
+ f"You can find them at: {repo_id}"
274
+ )
275
+
276
+ except Exception as e:
277
+ eval_logger.warning("Could not save results aggregated")
278
+ eval_logger.info(repr(e))
279
+ else:
280
+ eval_logger.info(
281
+ "Output path not provided, skipping saving results aggregated"
282
+ )
283
+
284
+ def save_results_samples(
285
+ self,
286
+ task_name: str,
287
+ samples: dict,
288
+ ) -> None:
289
+ """
290
+ Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
291
+
292
+ Args:
293
+ task_name (str): The task name to save the samples for.
294
+ samples (dict): The samples results to save.
295
+ """
296
+ if self.output_path:
297
+ try:
298
+ eval_logger.info(f"Saving per-sample results for: {task_name}")
299
+
300
+ path = Path(self.output_path if self.output_path else Path.cwd())
301
+ if path.suffix == ".json":
302
+ path = path.parent
303
+ else:
304
+ path = path.joinpath(
305
+ self.general_config_tracker.model_name_sanitized
306
+ )
307
+ path.mkdir(parents=True, exist_ok=True)
308
+
309
+ file_results_samples = path.joinpath(
310
+ f"samples_{task_name}_{self.date_id}.jsonl"
311
+ )
312
+
313
+ for sample in samples:
314
+ # we first need to sanitize arguments and resps
315
+ # otherwise we won't be able to load the dataset
316
+ # using the datasets library
317
+ arguments = {}
318
+ for i, arg in enumerate(sample["arguments"]):
319
+ arguments[f"gen_args_{i}"] = {}
320
+ for j, tmp in enumerate(arg):
321
+ arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
322
+
323
+ sample["resps"] = sanitize_list(sample["resps"])
324
+ sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
325
+ sample["arguments"] = arguments
326
+ sample["target"] = str(sample["target"])
327
+
328
+ sample_dump = (
329
+ json.dumps(
330
+ sample,
331
+ default=handle_non_serializable,
332
+ ensure_ascii=False,
333
+ )
334
+ + "\n"
335
+ )
336
+
337
+ with open(file_results_samples, "a", encoding="utf-8") as f:
338
+ f.write(sample_dump)
339
+
340
+ if self.api and self.push_samples_to_hub:
341
+ repo_id = (
342
+ self.details_repo
343
+ if self.public_repo
344
+ else self.details_repo_private
345
+ )
346
+ self.api.create_repo(
347
+ repo_id=repo_id,
348
+ repo_type="dataset",
349
+ private=not self.public_repo,
350
+ exist_ok=True,
351
+ )
352
+ try:
353
+ if self.gated_repo:
354
+ headers = build_hf_headers()
355
+ r = get_session().put(
356
+ url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
357
+ headers=headers,
358
+ json={"gated": "auto"},
359
+ )
360
+ hf_raise_for_status(r)
361
+ except Exception as e:
362
+ eval_logger.warning("Could not gate the repository")
363
+ eval_logger.info(repr(e))
364
+ self.api.upload_folder(
365
+ repo_id=repo_id,
366
+ folder_path=str(path),
367
+ path_in_repo=self.general_config_tracker.model_name_sanitized,
368
+ repo_type="dataset",
369
+ commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
370
+ )
371
+ eval_logger.info(
372
+ f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
373
+ f"You can find them at: {repo_id}"
374
+ )
375
+
376
+ except Exception as e:
377
+ eval_logger.warning("Could not save sample results")
378
+ eval_logger.info(repr(e))
379
+ else:
380
+ eval_logger.info("Output path not provided, skipping saving sample results")
381
+
382
+ def recreate_metadata_card(self) -> None:
383
+ """
384
+ Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
385
+ """
386
+
387
+ eval_logger.info("Recreating metadata card")
388
+ repo_id = self.details_repo if self.public_repo else self.details_repo_private
389
+
390
+ files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
391
+ results_files = get_results_filenames(files_in_repo)
392
+ sample_files = get_sample_results_filenames(files_in_repo)
393
+
394
+ # Build a dictionary to store the latest evaluation datetime for:
395
+ # - Each tested model and its aggregated results
396
+ # - Each task and sample results, if existing
397
+ # i.e. {
398
+ # "org__model_name__gsm8k": "2021-09-01T12:00:00",
399
+ # "org__model_name__ifeval": "2021-09-01T12:00:00",
400
+ # "org__model_name__results": "2021-09-01T12:00:00"
401
+ # }
402
+ latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())
403
+
404
+ for file_path in sample_files:
405
+ file_path = Path(file_path)
406
+ filename = file_path.name
407
+ model_name = file_path.parent
408
+ task_name = get_file_task_name(filename)
409
+ results_datetime = get_file_datetime(filename)
410
+ task_name_sanitized = sanitize_task_name(task_name)
411
+ # Results and sample results for the same model and task will have the same datetime
412
+ samples_key = f"{model_name}__{task_name_sanitized}"
413
+ results_key = f"{model_name}__results"
414
+ latest_datetime = max(
415
+ latest_task_results_datetime[samples_key],
416
+ results_datetime,
417
+ )
418
+ latest_task_results_datetime[samples_key] = latest_datetime
419
+ latest_task_results_datetime[results_key] = max(
420
+ latest_task_results_datetime[results_key],
421
+ latest_datetime,
422
+ )
423
+
424
+ # Create metadata card
425
+ card_metadata = MetadataConfigs()
426
+
427
+ # Add the latest aggregated results to the metadata card for easy access
428
+ for file_path in results_files:
429
+ file_path = Path(file_path)
430
+ results_filename = file_path.name
431
+ model_name = file_path.parent
432
+ eval_date = get_file_datetime(results_filename)
433
+ eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
434
+ results_filename = Path("**") / Path(results_filename).name
435
+ config_name = f"{model_name}__results"
436
+ sanitized_last_eval_date_results = re.sub(
437
+ r"[^\w\.]", "_", latest_task_results_datetime[config_name]
438
+ )
439
+
440
+ if eval_date_sanitized == sanitized_last_eval_date_results:
441
+ # Ensure that all results files are listed in the metadata card
442
+ current_results = card_metadata.get(config_name, {"data_files": []})
443
+ current_results["data_files"].append(
444
+ {"split": eval_date_sanitized, "path": [str(results_filename)]}
445
+ )
446
+ card_metadata[config_name] = current_results
447
+ # If the results file is the newest, update the "latest" field in the metadata card
448
+ card_metadata[config_name]["data_files"].append(
449
+ {"split": "latest", "path": [str(results_filename)]}
450
+ )
451
+
452
+ # Add the tasks details configs
453
+ for file_path in sample_files:
454
+ file_path = Path(file_path)
455
+ filename = file_path.name
456
+ model_name = file_path.parent
457
+ task_name = get_file_task_name(filename)
458
+ eval_date = get_file_datetime(filename)
459
+ task_name_sanitized = sanitize_task_name(task_name)
460
+ eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
461
+ results_filename = Path("**") / Path(filename).name
462
+ config_name = f"{model_name}__{task_name_sanitized}"
463
+ sanitized_last_eval_date_results = re.sub(
464
+ r"[^\w\.]", "_", latest_task_results_datetime[config_name]
465
+ )
466
+ if eval_date_sanitized == sanitized_last_eval_date_results:
467
+ # Ensure that all sample results files are listed in the metadata card
468
+ current_details_for_task = card_metadata.get(
469
+ config_name, {"data_files": []}
470
+ )
471
+ current_details_for_task["data_files"].append(
472
+ {"split": eval_date_sanitized, "path": [str(results_filename)]}
473
+ )
474
+ card_metadata[config_name] = current_details_for_task
475
+ # If the samples results file is the newest, update the "latest" field in the metadata card
476
+ card_metadata[config_name]["data_files"].append(
477
+ {"split": "latest", "path": [str(results_filename)]}
478
+ )
479
+
480
+ # Get latest results and extract info to update metadata card examples
481
+ latest_datetime = max(latest_task_results_datetime.values())
482
+ latest_model_name = max(
483
+ latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
484
+ )
485
+ last_results_file = [
486
+ f for f in results_files if latest_datetime.replace(":", "-") in f
487
+ ][0]
488
+ last_results_file_path = hf_hub_url(
489
+ repo_id=repo_id, filename=last_results_file, repo_type="dataset"
490
+ )
491
+ latest_results_file = load_dataset(
492
+ "json", data_files=last_results_file_path, split="train"
493
+ )
494
+ results_dict = latest_results_file["results"][0]
495
+ new_dictionary = {"all": results_dict}
496
+ new_dictionary.update(results_dict)
497
+ results_string = json.dumps(new_dictionary, indent=4)
498
+
499
+ dataset_summary = (
500
+ "Dataset automatically created during the evaluation run of model "
501
+ )
502
+ if self.general_config_tracker.model_source == "hf":
503
+ dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
504
+ else:
505
+ dataset_summary += f"{self.general_config_tracker.model_name}\n"
506
+ dataset_summary += (
507
+ f"The dataset is composed of {len(card_metadata) - 1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
508
+ f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
509
+ 'configuration, the split being named using the timestamp of the run. The "train" split is always pointing to the latest results.\n\n'
510
+ 'An additional configuration "results" stores all the aggregated results of the run.\n\n'
511
+ "To load the details from a run, you can for instance do the following:\n"
512
+ )
513
+ if self.general_config_tracker.model_source == "hf":
514
+ dataset_summary += (
515
+ "```python\nfrom datasets import load_dataset\n"
516
+ f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
517
+ )
518
+ dataset_summary += (
519
+ "## Latest results\n\n"
520
+ f"These are the [latest results from run {latest_datetime}]({last_results_file_path.replace('/resolve/', '/blob/')}) "
521
+ "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
522
+ 'You can find each in the results and the "latest" split for each eval):\n\n'
523
+ f"```python\n{results_string}\n```"
524
+ )
525
+ card_data = DatasetCardData(
526
+ dataset_summary=dataset_summary,
527
+ repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
528
+ pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
529
+ leaderboard_url=self.leaderboard_url,
530
+ point_of_contact=self.point_of_contact,
531
+ )
532
+ card_metadata.to_dataset_card_data(card_data)
533
+ card = DatasetCard.from_template(
534
+ card_data,
535
+ pretty_name=card_data.pretty_name,
536
+ )
537
+ card.push_to_hub(repo_id, repo_type="dataset")
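The `save_results_samples` method above writes one JSON record per line to `samples_<task>_<date>.jsonl` under the sanitized model directory. Below is a minimal sketch of reading such a file back with the `datasets` library, mirroring the `load_dataset("json", ...)` call used in `recreate_metadata_card`; the file path and the printed fields are illustrative placeholders, not part of this diff.

```python
# Minimal sketch: load a per-sample JSONL file written by save_results_samples.
# The path below is a placeholder; substitute your own output_path, sanitized
# model directory, task name, and timestamp.
from datasets import load_dataset

samples = load_dataset(
    "json",
    data_files="output/org__model/samples_gsm8k_2024-01-01T00-00-00.000000.jsonl",
    split="train",
)

# Each record carries the sanitized fields set above
# (arguments, resps, filtered_resps, target, ...).
print(samples.column_names)
print(samples[0]["target"])
```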
lm-evaluation-harness/lm_eval/loggers/utils.py ADDED
@@ -0,0 +1,149 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ import subprocess
5
+ from importlib.metadata import version
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional, Tuple, Union
8
+
9
+ import numpy as np
10
+ from torch.utils.collect_env import get_pretty_env_info
11
+ from transformers import __version__ as trans_version
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
18
+ """Remove the ',none' substring from the input_string if it exists at the end.
19
+
20
+ Args:
21
+ input_string (str): The input string from which to remove the ',none' substring.
22
+
23
+ Returns:
24
+ Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
25
+ and a boolean indicating whether the modification was made (True) or not (False).
26
+ """
27
+ # Define the pattern to match ',none' at the end of the string
28
+ pattern = re.compile(r",none$")
29
+
30
+ # Use sub() to replace ',none' with an empty string
31
+ result = re.sub(pattern, "", input_string)
32
+
33
+ # check if the input_string changed
34
+ removed = result != input_string
35
+
36
+ return result, removed
37
+
38
+
39
+ def _handle_non_serializable(o: Any) -> Union[int, str, list]:
40
+ """Handle non-serializable objects by converting them to serializable types.
41
+
42
+ Args:
43
+ o (Any): The object to be handled.
44
+
45
+ Returns:
46
+ Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
47
+ it will be converted to int. If the object is of type set, it will be converted
48
+ to a list. Otherwise, it will be converted to str.
49
+ """
50
+ if isinstance(o, np.int64) or isinstance(o, np.int32):
51
+ return int(o)
52
+ elif isinstance(o, set):
53
+ return list(o)
54
+ else:
55
+ return str(o)
56
+
57
+
58
+ def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
59
+ try:
60
+ git_folder = Path(repo_path, ".git")
61
+ if git_folder.is_file():
62
+ git_folder = Path(
63
+ git_folder.parent,
64
+ git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
65
+ )
66
+ if Path(git_folder, "HEAD").exists():
67
+ head_name = (
68
+ Path(git_folder, "HEAD")
69
+ .read_text(encoding="utf-8")
70
+ .split("\n")[0]
71
+ .split(" ")[-1]
72
+ )
73
+ head_ref = Path(git_folder, head_name)
74
+ git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
75
+ else:
76
+ git_hash = None
77
+ except Exception as err:
78
+ logger.debug(
79
+ f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
80
+ )
81
+ return None
82
+ return git_hash
83
+
84
+
85
+ def get_git_commit_hash():
86
+ """
87
+ Gets the git commit hash of your current repo (if it exists).
88
+ Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
89
+ """
90
+ try:
91
+ git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
92
+ git_hash = git_hash.decode()
93
+ except (subprocess.CalledProcessError, FileNotFoundError):
94
+ # FileNotFoundError occurs when git not installed on system
95
+ git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
96
+ return git_hash
97
+
98
+
99
+ def add_env_info(storage: Dict[str, Any]):
100
+ try:
101
+ pretty_env_info = get_pretty_env_info()
102
+ except Exception as err:
103
+ pretty_env_info = str(err)
104
+ try:
105
+ lm_eval_version = version("lm_eval")
106
+ except Exception as err:
107
+ lm_eval_version = str(err)
108
+ transformers_version = trans_version
109
+ upper_dir_commit = get_commit_from_path(
110
+ Path(os.getcwd(), "..")
111
+ ) # git hash of upper repo if exists
112
+ added_info = {
113
+ "pretty_env_info": pretty_env_info,
114
+ "transformers_version": transformers_version,
115
+ "lm_eval_version": lm_eval_version,
116
+ "upper_git_hash": upper_dir_commit, # in case this repo is submodule
117
+ }
118
+ storage.update(added_info)
119
+
120
+
121
+ def add_tokenizer_info(storage: Dict[str, Any], lm):
122
+ if getattr(lm, "tokenizer", False):
123
+ try:
124
+ tokenizer_info = {
125
+ "tokenizer_pad_token": [
126
+ lm.tokenizer.pad_token,
127
+ str(lm.tokenizer.pad_token_id),
128
+ ],
129
+ "tokenizer_eos_token": [
130
+ lm.tokenizer.eos_token,
131
+ str(lm.tokenizer.eos_token_id),
132
+ ],
133
+ "tokenizer_bos_token": [
134
+ lm.tokenizer.bos_token,
135
+ str(lm.tokenizer.bos_token_id),
136
+ ],
137
+ "eot_token_id": getattr(lm, "eot_token_id", None),
138
+ "max_length": getattr(lm, "max_length", None),
139
+ }
140
+ storage.update(tokenizer_info)
141
+ except Exception as err:
142
+ logger.debug(
143
+ f"Logging detailed tokenizer info failed with {err}, skipping..."
144
+ )
145
+ # seems gguf and textsynth do not have tokenizer
146
+ else:
147
+ logger.debug(
148
+ "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
149
+ )
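The two helpers at the top of this module are small enough that their behavior can be shown directly. This is a quick illustrative check, assuming the module is importable as `lm_eval.loggers.utils` (the same import path used by `wandb_logger.py` below).

```python
# Illustrative check of the helpers defined above.
import numpy as np

from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern

print(remove_none_pattern("acc,none"))      # -> ("acc", True)
print(remove_none_pattern("acc,strict"))    # -> ("acc,strict", False)

print(_handle_non_serializable(np.int64(7)))   # -> 7 (plain int)
print(_handle_non_serializable({"a", "b"}))    # -> ["a", "b"] (set becomes list)
print(_handle_non_serializable(object()))      # -> string representation
```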
lm-evaluation-harness/lm_eval/loggers/wandb_logger.py ADDED
@@ -0,0 +1,358 @@
1
+ import copy
2
+ import json
3
+ import logging
4
+ from typing import Any, Dict, List, Literal, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from packaging.version import Version
9
+
10
+ from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern
11
+
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def get_wandb_printer() -> Literal["Printer"]:
17
+ """Returns a wandb printer instance for pretty stdout."""
18
+ from wandb.sdk.lib.printer import new_printer
19
+
20
+ printer = new_printer()
21
+ return printer
22
+
23
+
24
+ class WandbLogger:
25
+ def __init__(self, init_args=None, config_args=None) -> None:
26
+ """Attaches to wandb logger if already initialized. Otherwise, passes init_args to wandb.init() and config_args to wandb.config.update()
27
+
28
+ Args:
29
+ init_args Optional[Dict]: Arguments for init configuration.
30
+ config_args Optional[Dict]: Arguments for config
31
+
32
+ Parse and log the results returned from evaluator.simple_evaluate() with:
33
+ wandb_logger.post_init(results)
34
+ wandb_logger.log_eval_result()
35
+ wandb_logger.log_eval_samples(results["samples"])
36
+ """
37
+ try:
38
+ import wandb
39
+
40
+ assert Version(wandb.__version__) >= Version("0.13.6")
41
+ if Version(wandb.__version__) < Version("0.13.6"):
42
+ wandb.require("report-editing:v0")
43
+ except Exception as e:
44
+ logger.warning(
45
+ "To use the wandb reporting functionality please install wandb>=0.13.6.\n"
46
+ "To install the latest version of wandb run `pip install wandb --upgrade`\n"
47
+ f"{e}"
48
+ )
49
+
50
+ self.wandb_args: Dict[str, Any] = init_args or {}
51
+ self.wandb_config_args: Dict[str, Any] = config_args or {}
52
+
53
+ # pop the step key from the args to save for all logging calls
54
+ self.step = self.wandb_args.pop("step", None)
55
+
56
+ # initialize a W&B run
57
+ if wandb.run is None:
58
+ self.run = wandb.init(**self.wandb_args)
59
+ if self.wandb_config_args:
60
+ self.run.config.update(self.wandb_config_args)
61
+ else:
62
+ self.run = wandb.run
63
+
64
+ self.printer = get_wandb_printer()
65
+
66
+ def post_init(self, results: Dict[str, Any]) -> None:
67
+ self.results: Dict[str, Any] = copy.deepcopy(results)
68
+ self.task_names: List[str] = list(results.get("results", {}).keys())
69
+ self.group_names: List[str] = list(results.get("groups", {}).keys())
70
+
71
+ def _get_config(self) -> Dict[str, Any]:
72
+ """Get configuration parameters."""
73
+ self.task_configs = self.results.get("configs", {})
74
+ cli_configs = self.results.get("config", {})
75
+ configs = {
76
+ "task_configs": self.task_configs,
77
+ "cli_configs": cli_configs,
78
+ }
79
+
80
+ return configs
81
+
82
+ def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
83
+ """Sanitize the results dictionary."""
84
+ _results = copy.deepcopy(self.results.get("results", dict()))
85
+
86
+ # Remove None from the metric string name
87
+ tmp_results = copy.deepcopy(_results)
88
+ for task_name in self.task_names:
89
+ task_result = tmp_results.get(task_name, dict())
90
+ for metric_name, metric_value in task_result.items():
91
+ _metric_name, removed = remove_none_pattern(metric_name)
92
+ if removed:
93
+ _results[task_name][_metric_name] = metric_value
94
+ _results[task_name].pop(metric_name)
95
+
96
+ # remove string valued keys from the results dict
97
+ wandb_summary = {}
98
+ for task in self.task_names:
99
+ task_result = _results.get(task, dict())
100
+ for metric_name, metric_value in task_result.items():
101
+ if isinstance(metric_value, str):
102
+ wandb_summary[f"{task}/{metric_name}"] = metric_value
103
+
104
+ for summary_metric, summary_value in wandb_summary.items():
105
+ _task, _summary_metric = summary_metric.split("/")
106
+ _results[_task].pop(_summary_metric)
107
+
108
+ tmp_results = copy.deepcopy(_results)
109
+ for task_name, task_results in tmp_results.items():
110
+ for metric_name, metric_value in task_results.items():
111
+ _results[f"{task_name}/{metric_name}"] = metric_value
112
+ _results[task_name].pop(metric_name)
113
+ for task in self.task_names:
114
+ _results.pop(task)
115
+
116
+ return wandb_summary, _results
117
+
118
+ def _log_results_as_table(self) -> None:
119
+ """Generate and log evaluation results as a table to W&B."""
120
+ columns = [
121
+ "Version",
122
+ "Filter",
123
+ "num_fewshot",
124
+ "Metric",
125
+ "Value",
126
+ "Stderr",
127
+ ]
128
+
129
+ def make_table(columns: List[str], key: str = "results"):
130
+ import wandb
131
+
132
+ table = wandb.Table(columns=columns)
133
+ results = copy.deepcopy(self.results)
134
+
135
+ for k, dic in results.get(key).items():
136
+ if k in self.group_names and not key == "groups":
137
+ continue
138
+ version = results.get("versions").get(k)
139
+ if version == "N/A":
140
+ version = None
141
+ n = results.get("n-shot").get(k)
142
+
143
+ for (mf), v in dic.items():
144
+ m, _, f = mf.partition(",")
145
+ if m.endswith("_stderr"):
146
+ continue
147
+ if m == "alias":
148
+ continue
149
+
150
+ if m + "_stderr" + "," + f in dic:
151
+ se = dic[m + "_stderr" + "," + f]
152
+ if se != "N/A":
153
+ se = "%.4f" % se
154
+ table.add_data(*[k, version, f, n, m, str(v), str(se)])
155
+ else:
156
+ table.add_data(*[k, version, f, n, m, str(v), ""])
157
+
158
+ return table
159
+
160
+ # log the complete eval result to W&B Table
161
+ table = make_table(["Tasks"] + columns, "results")
162
+ self.run.log({"evaluation/eval_results": table}, step=self.step)
163
+
164
+ if "groups" in self.results.keys():
165
+ table = make_table(["Groups"] + columns, "groups")
166
+ self.run.log({"evaluation/group_eval_results": table}, step=self.step)
167
+
168
+ def _log_results_as_artifact(self) -> None:
169
+ """Log results as JSON artifact to W&B."""
170
+ import wandb
171
+
172
+ dumped = json.dumps(
173
+ self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False
174
+ )
175
+ artifact = wandb.Artifact("results", type="eval_results")
176
+ with artifact.new_file("results.json", mode="w", encoding="utf-8") as f:
177
+ f.write(dumped)
178
+ self.run.log_artifact(artifact)
179
+
180
+ def log_eval_result(self) -> None:
181
+ """Log evaluation results to W&B."""
182
+ # Log configs to wandb
183
+ configs = self._get_config()
184
+ self.run.config.update(configs, allow_val_change=self.step is not None)
185
+
186
+ wandb_summary, self.wandb_results = self._sanitize_results_dict()
187
+ # update wandb.run.summary with items that were removed
188
+ self.run.summary.update(wandb_summary)
189
+ # Log the evaluation metrics to wandb
190
+ self.run.log(self.wandb_results, step=self.step)
191
+ # Log the evaluation metrics as W&B Table
192
+ self._log_results_as_table()
193
+ # Log the results dict as json to W&B Artifacts
194
+ self._log_results_as_artifact()
195
+
196
+ def _generate_dataset(
197
+ self, data: List[Dict[str, Any]], config: Dict[str, Any]
198
+ ) -> pd.DataFrame:
199
+ """Generate a dataset from evaluation data.
200
+
201
+ Args:
202
+ data (List[Dict[str, Any]]): The data to generate a dataset for.
203
+ config (Dict[str, Any]): The configuration of the task.
204
+
205
+ Returns:
206
+ pd.DataFrame: A dataframe that is ready to be uploaded to W&B.
207
+ """
208
+ ids = [x["doc_id"] for x in data]
209
+ labels = [x["target"] for x in data]
210
+ instance = [""] * len(ids)
211
+ resps = [""] * len(ids)
212
+ filtered_resps = [""] * len(ids)
213
+ model_outputs = {}
214
+
215
+ metrics_list = config["metric_list"]
216
+ metrics = {}
217
+ for metric in metrics_list:
218
+ metric = metric.get("metric")
219
+ if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]:
220
+ metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data]
221
+ if metric in ["byte_perplexity", "bits_per_byte"]:
222
+ metrics[f"{metric}_bytes"] = [x[metric][1] for x in data]
223
+ else:
224
+ metrics[f"{metric}_words"] = [x[metric][1] for x in data]
225
+ else:
226
+ metrics[metric] = [x[metric] for x in data]
227
+
228
+ if config["output_type"] == "loglikelihood":
229
+ instance = [x["arguments"][0][0] for x in data]
230
+ labels = [x["arguments"][0][1] for x in data]
231
+ resps = [
232
+ f"log probability of continuation is {x['resps'][0][0][0]} "
233
+ + "\n\n"
234
+ + "continuation will {} generated with greedy sampling".format(
235
+ "not be" if not x["resps"][0][0][1] else "be"
236
+ )
237
+ for x in data
238
+ ]
239
+ filtered_resps = [
240
+ f"log probability of continuation is {x['filtered_resps'][0][0]} "
241
+ + "\n\n"
242
+ + "continuation will {} generated with greedy sampling".format(
243
+ "not be" if not x["filtered_resps"][0][1] else "be"
244
+ )
245
+ for x in data
246
+ ]
247
+ elif config["output_type"] == "multiple_choice":
248
+ instance = [x["arguments"][0][0] for x in data]
249
+ choices = [
250
+ "\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])])
251
+ for x in data
252
+ ]
253
+ resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data]
254
+ filtered_resps = [
255
+ np.argmax([n[0] for n in x["filtered_resps"]]) for x in data
256
+ ]
257
+ elif config["output_type"] == "loglikelihood_rolling":
258
+ instance = [x["arguments"][0][0] for x in data]
259
+ resps = [x["resps"][0][0] for x in data]
260
+ filtered_resps = [x["filtered_resps"][0] for x in data]
261
+ elif config["output_type"] == "generate_until":
262
+ instance = [x["arguments"][0][0] for x in data]
263
+ resps = [x["resps"][0][0] for x in data]
264
+ filtered_resps = [x["filtered_resps"][0] for x in data]
265
+
266
+ model_outputs["raw_predictions"] = resps
267
+ model_outputs["filtered_predictions"] = filtered_resps
268
+
269
+ df_data = {
270
+ "id": ids,
271
+ "data": instance,
272
+ }
273
+ if config["output_type"] == "multiple_choice":
274
+ df_data["choices"] = choices
275
+
276
+ tmp_data = {
277
+ "input_len": [len(x) for x in instance],
278
+ "labels": labels,
279
+ "output_type": config["output_type"],
280
+ }
281
+ df_data.update(tmp_data)
282
+ df_data.update(model_outputs)
283
+ df_data.update(metrics)
284
+
285
+ return pd.DataFrame(df_data)
286
+
287
+ def _log_samples_as_artifact(
288
+ self, data: List[Dict[str, Any]], task_name: str
289
+ ) -> None:
290
+ import wandb
291
+
292
+ # log the samples as an artifact
293
+ dumped = json.dumps(
294
+ data,
295
+ indent=2,
296
+ default=_handle_non_serializable,
297
+ ensure_ascii=False,
298
+ )
299
+ artifact = wandb.Artifact(f"{task_name}", type="samples_by_task")
300
+ with artifact.new_file(
301
+ f"{task_name}_eval_samples.json", mode="w", encoding="utf-8"
302
+ ) as f:
303
+ f.write(dumped)
304
+ self.run.log_artifact(artifact)
305
+ # artifact.wait()
306
+
307
+ def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None:
308
+ """Log evaluation samples to W&B.
309
+
310
+ Args:
311
+ samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task.
312
+ """
313
+ task_names: List[str] = [
314
+ x for x in self.task_names if x not in self.group_names
315
+ ]
316
+
317
+ ungrouped_tasks = []
318
+ tasks_by_groups = {}
319
+
320
+ for task_name in task_names:
321
+ group_names = self.task_configs[task_name].get("group", None)
322
+ if group_names:
323
+ if isinstance(group_names, str):
324
+ group_names = [group_names]
325
+
326
+ for group_name in group_names:
327
+ if not tasks_by_groups.get(group_name):
328
+ tasks_by_groups[group_name] = [task_name]
329
+ else:
330
+ tasks_by_groups[group_name].append(task_name)
331
+ else:
332
+ ungrouped_tasks.append(task_name)
333
+
334
+ for task_name in ungrouped_tasks:
335
+ eval_preds = samples[task_name]
336
+
337
+ # log the samples as a W&B Table
338
+ df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
339
+ self.run.log({f"{task_name}_eval_results": df}, step=self.step)
340
+
341
+ # log the samples as a json file as W&B Artifact
342
+ self._log_samples_as_artifact(eval_preds, task_name)
343
+
344
+ for group, grouped_tasks in tasks_by_groups.items():
345
+ grouped_df = pd.DataFrame()
346
+ for task_name in grouped_tasks:
347
+ eval_preds = samples[task_name]
348
+ df = self._generate_dataset(
349
+ eval_preds, self.task_configs.get(task_name)
350
+ )
351
+ df["group"] = group
352
+ df["task"] = task_name
353
+ grouped_df = pd.concat([grouped_df, df], ignore_index=True)
354
+
355
+ # log the samples as a json file as W&B Artifact
356
+ self._log_samples_as_artifact(eval_preds, task_name)
357
+
358
+ self.run.log({f"{group}_eval_results": grouped_df}, step=self.step)
lm-evaluation-harness/lm_eval/models/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ from . import (
2
+ anthropic_llms,
3
+ api_models,
4
+ dummy,
5
+ gguf,
6
+ hf_audiolm,
7
+ hf_steered,
8
+ hf_vlms,
9
+ huggingface,
10
+ ibm_watsonx_ai,
11
+ mamba_lm,
12
+ nemo_lm,
13
+ neuron_optimum,
14
+ openai_completions,
15
+ optimum_ipex,
16
+ optimum_lm,
17
+ sglang_causallms,
18
+ sglang_generate_API,
19
+ textsynth,
20
+ vllm_causallms,
21
+ vllm_vlms,
22
+ )
23
+
24
+
25
+ # TODO: implement __all__
26
+
27
+
28
+ try:
29
+ # enable hf hub transfer if available
30
+ import hf_transfer # type: ignore # noqa
31
+ import huggingface_hub.constants # type: ignore
32
+
33
+ huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
34
+ except ImportError:
35
+ pass
lm-evaluation-harness/lm_eval/models/anthropic_llms.py ADDED
@@ -0,0 +1,382 @@
1
+ import logging
2
+ import os
3
+ from functools import cached_property
4
+ from typing import Any, Dict, List, Tuple, Union
5
+
6
+ from tqdm import tqdm
7
+
8
+ from lm_eval.api.model import LM
9
+ from lm_eval.api.registry import register_model
10
+ from lm_eval.models.openai_completions import LocalCompletionsAPI
11
+ from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions
12
+
13
+
14
+ eval_logger = logging.getLogger(__name__)
15
+
16
+
17
+ def anthropic_completion(
18
+ client, #: anthropic.Anthropic,
19
+ model: str,
20
+ prompt: str,
21
+ max_tokens_to_sample: int,
22
+ temperature: float,
23
+ stop: List[str],
24
+ **kwargs: Any,
25
+ ) -> str:
26
+ """Wrapper function around the Anthropic completion API client with exponential back-off
27
+ in case of RateLimitError.
28
+
29
+ params:
30
+ client: anthropic.Anthropic
31
+ Anthropic API client
32
+ model: str
33
+ Anthropic model e.g. 'claude-instant-v1', 'claude-2'
34
+ prompt: str
35
+ Prompt to feed to the model
36
+ max_tokens_to_sample: int
37
+ Maximum number of tokens to sample from the model
38
+ temperature: float
39
+ Sampling temperature
40
+ stop: List[str]
41
+ List of stop sequences
42
+ kwargs: Any
43
+ Additional model_args to pass to the API client
44
+ """
45
+
46
+ try:
47
+ import anthropic
48
+ except ModuleNotFoundError as exception:
49
+ raise type(exception)(
50
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
51
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
52
+ )
53
+
54
+ def _exception_callback(e: Exception, sleep_time: float) -> None:
55
+ eval_logger.warning(
56
+ f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
57
+ )
58
+
59
+ @retry_on_specific_exceptions(
60
+ on_exceptions=[anthropic.RateLimitError],
61
+ max_retries=None, # retry forever, consider changing
62
+ on_exception_callback=_exception_callback,
63
+ )
64
+ def completion():
65
+ response = client.completions.create(
66
+ prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
67
+ model=model,
68
+ # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
69
+ # (e.g. gsm8k's ":") may truncate a lot of the input.
70
+ stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
71
+ max_tokens_to_sample=max_tokens_to_sample,
72
+ temperature=temperature,
73
+ **kwargs,
74
+ )
75
+ return response.completion
76
+
77
+ return completion()
78
+
79
+
80
+ def anthropic_chat(
81
+ client, #: anthropic.Anthropic,
82
+ model: str,
83
+ prompt: str,
84
+ max_tokens: int,
85
+ temperature: float,
86
+ stop: List[str],
87
+ **kwargs: Any,
88
+ ) -> str:
89
+ """Wrapper function around the Anthropic completion API client with exponential back-off
90
+ in case of RateLimitError.
91
+
92
+ params:
93
+ client: anthropic.Anthropic
94
+ Anthropic API client
95
+ model: str
96
+ Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
97
+ prompt: str
98
+ Prompt to feed to the model
99
+ max_tokens: int
100
+ Maximum number of tokens to sample from the model
101
+ temperature: float
102
+ Sampling temperature
103
+ stop: List[str]
104
+ List of stop sequences
105
+ kwargs: Any
106
+ Additional model_args to pass to the API client
107
+ """
108
+
109
+ try:
110
+ import anthropic
111
+ except ModuleNotFoundError as exception:
112
+ raise type(exception)(
113
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
114
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
115
+ )
116
+
117
+ def _exception_callback(e: Exception, sleep_time: float) -> None:
118
+ eval_logger.warning(
119
+ f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
120
+ )
121
+
122
+ @retry_on_specific_exceptions(
123
+ on_exceptions=[
124
+ anthropic.RateLimitError,
125
+ anthropic.APIConnectionError,
126
+ anthropic.APIStatusError,
127
+ ],
128
+ max_retries=None, # retry forever, consider changing
129
+ on_exception_callback=_exception_callback,
130
+ )
131
+ def messages():
132
+ response = client.messages.create(
133
+ model=model,
134
+ max_tokens=max_tokens,
135
+ temperature=temperature,
136
+ messages=[{"role": "user", "content": f"{prompt}"}],
137
+ **kwargs,
138
+ )
139
+ return response.content[0].text
140
+
141
+ return messages()
142
+
143
+
144
+ @register_model("anthropic-completions")
145
+ class AnthropicLM(LM):
146
+ REQ_CHUNK_SIZE = 20 # TODO: not used
147
+
148
+ def __init__(
149
+ self,
150
+ batch_size: int = 1,
151
+ model: str = "claude-2.0",
152
+ max_tokens_to_sample: int = 256,
153
+ temperature: float = 0, # defaults to 1
154
+ **kwargs, # top_p, top_k, etc.
155
+ ) -> None:
156
+ """Anthropic API wrapper.
157
+
158
+ :param model: str
159
+ Anthropic model e.g. 'claude-instant-v1', 'claude-2'
160
+ :param max_tokens_to_sample: int
161
+ Maximum number of tokens to sample from the model
162
+ :param temperature: float
163
+ Sampling temperature
164
+ :param kwargs: Any
165
+ Additional model_args to pass to the API client
166
+ """
167
+ super().__init__()
168
+
169
+ try:
170
+ import anthropic
171
+ except ModuleNotFoundError as exception:
172
+ raise type(exception)(
173
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
174
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
175
+ )
176
+
177
+ self.model = model
178
+ # defaults to os.environ.get("ANTHROPIC_API_KEY")
179
+ self.client = anthropic.Anthropic()
180
+ self.temperature = temperature
181
+ self.max_tokens_to_sample = max_tokens_to_sample
182
+ self.tokenizer = self.client.get_tokenizer()
183
+ self.kwargs = kwargs
184
+
185
+ @property
186
+ def eot_token_id(self):
187
+ # Not sure but anthropic.HUMAN_PROMPT ?
188
+ raise NotImplementedError("No idea about anthropic tokenization.")
189
+
190
+ @property
191
+ def max_length(self) -> int:
192
+ return 2048
193
+
194
+ @property
195
+ def max_gen_toks(self) -> int:
196
+ return self.max_tokens_to_sample
197
+
198
+ @property
199
+ def batch_size(self):
200
+ # Isn't used because we override _loglikelihood_tokens
201
+ raise NotImplementedError("No support for logits.")
202
+
203
+ @property
204
+ def device(self):
205
+ # Isn't used because we override _loglikelihood_tokens
206
+ raise NotImplementedError("No support for logits.")
207
+
208
+ def tok_encode(self, string: str) -> List[int]:
209
+ return self.tokenizer.encode(string).ids
210
+
211
+ def tok_decode(self, tokens: List[int]) -> str:
212
+ return self.tokenizer.decode(tokens)
213
+
214
+ def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
215
+ raise NotImplementedError("No support for logits.")
216
+
217
+ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
218
+ try:
219
+ import anthropic
220
+ except ModuleNotFoundError as exception:
221
+ raise type(exception)(
222
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
223
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
224
+ )
225
+
226
+ if not requests:
227
+ return []
228
+
229
+ _requests: List[Tuple[str, dict]] = [req.args for req in requests]
230
+
231
+ res = []
232
+ for request in tqdm(_requests, disable=disable_tqdm):
233
+ try:
234
+ inp = request[0]
235
+ request_args = request[1]
236
+ # generation_kwargs
237
+ until = request_args.get("until")
238
+ max_gen_toks = request_args.get("max_gen_toks", self.max_length)
239
+ temperature = request_args.get("temperature", self.temperature)
240
+ response = anthropic_completion(
241
+ client=self.client,
242
+ model=self.model,
243
+ prompt=inp,
244
+ max_tokens_to_sample=max_gen_toks,
245
+ temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
246
+ stop=until, # type: ignore
247
+ **self.kwargs,
248
+ )
249
+ res.append(response)
250
+
251
+ self.cache_hook.add_partial("generate_until", request, response)
252
+ except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
253
+ eval_logger.critical(f"Server unreachable: {e.__cause__}")
254
+ break
255
+ except anthropic.APIStatusError as e: # type: ignore # noqa: F821
256
+ eval_logger.critical(f"API error {e.status_code}: {e.message}")
257
+ break
258
+
259
+ return res
260
+
261
+ def _model_call(self, inps):
262
+ # Isn't used because we override _loglikelihood_tokens
263
+ raise NotImplementedError()
264
+
265
+ def _model_generate(self, context, max_length, eos_token_id):
266
+ # Isn't used because we override generate_until
267
+ raise NotImplementedError()
268
+
269
+ def loglikelihood(self, requests, disable_tqdm: bool = False):
270
+ raise NotImplementedError("No support for logits.")
271
+
272
+ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
273
+ raise NotImplementedError("No support for logits.")
274
+
275
+
276
+ @register_model("anthropic-chat", "anthropic-chat-completions")
277
+ class AnthropicChat(LocalCompletionsAPI):
278
+ def __init__(
279
+ self,
280
+ base_url="https://api.anthropic.com/v1/messages",
281
+ tokenizer_backend=None,
282
+ **kwargs,
283
+ ):
284
+ super().__init__(
285
+ base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
286
+ )
287
+ eval_logger.warning(
288
+ "Chat completions does not support batching. Defaulting to batch size 1."
289
+ )
290
+ self._batch_size = 1
291
+ self.anthropic_version = "2023-06-01"
292
+ eval_logger.warning(
293
+ f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
294
+ )
295
+
296
+ @cached_property
297
+ def api_key(self):
298
+ """Override this property to return the API key for the API request."""
299
+ key = os.environ.get("ANTHROPIC_API_KEY", None)
300
+ if key is None:
301
+ raise ValueError(
302
+ "API key not found. Please set the ANTHROPIC_API_KEY environment variable."
303
+ )
304
+ return key
305
+
306
+ @cached_property
307
+ def header(self):
308
+ return {
309
+ "x-api-key": f"{self.api_key}",
310
+ "anthropic-version": self.anthropic_version,
311
+ }
312
+
313
+ def _create_payload(
314
+ self,
315
+ messages: List[Dict],
316
+ generate=True,
317
+ gen_kwargs: dict = None,
318
+ eos="\n\nHuman:",
319
+ **kwargs,
320
+ ) -> dict:
321
+ system = (
322
+ messages[0].get("content") if messages[0].get("role") == "system" else None
323
+ )
324
+ if system:
325
+ messages = messages[1:]
326
+
327
+ cleaned_messages = []
328
+ for msg in messages:
329
+ cleaned_msg = {
330
+ "role": msg["role"],
331
+ "content": [
332
+ {"type": msg["type"], msg["type"]: msg["content"]},
333
+ ],
334
+ }
335
+ cleaned_messages.append(cleaned_msg)
336
+
337
+ gen_kwargs.pop("do_sample", False)
338
+ max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
339
+ temperature = gen_kwargs.pop("temperature", 0)
340
+ stop = handle_stop_sequences(gen_kwargs.pop("until", ["\n\nHuman:"]), eos=eos)
341
+ if not isinstance(stop, list):
342
+ stop = [stop]
343
+
344
+ # Filter out empty or whitespace-only stop sequences for Anthropic API
345
+ stop = [s for s in stop if s and s.strip()]
346
+
347
+ out = {
348
+ "messages": cleaned_messages,
349
+ "model": self.model,
350
+ "max_tokens": max_tokens,
351
+ "temperature": temperature,
352
+ "stop_sequences": stop,
353
+ **gen_kwargs,
354
+ }
355
+ if system:
356
+ out["system"] = system
357
+ return out
358
+
359
+ def parse_generations(
360
+ self, outputs: Union[Dict, List[Dict]], **kwargs
361
+ ) -> List[str]:
362
+ res = []
363
+ if not isinstance(outputs, list):
364
+ outputs = [outputs]
365
+ for out in outputs:
366
+ for choices in out["content"]:
367
+ res.append(choices["text"])
368
+ return res
369
+
370
+ def tok_encode(
371
+ self,
372
+ string: str,
373
+ left_truncate_len=None,
374
+ add_special_tokens=None,
375
+ **kwargs,
376
+ ) -> List[str]:
377
+ return [string]
378
+
379
+ def loglikelihood(self, requests, **kwargs):
380
+ raise NotImplementedError(
381
+ "Anthropic Chat Completions API does not support the return of loglikelihood"
382
+ )
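Both classes above are registered by name, so they can be selected like any other backend once `ANTHROPIC_API_KEY` is set. A hedged sketch using `simple_evaluate`; the model id, task, and keyword arguments are illustrative assumptions rather than part of this diff.

```python
# Hedged sketch: run the chat-based Anthropic backend registered above.
# Requires ANTHROPIC_API_KEY in the environment; model id and task are placeholders.
import lm_eval

results = lm_eval.simple_evaluate(
    model="anthropic-chat-completions",
    model_args="model=claude-3-haiku-20240307",
    tasks=["gsm8k"],
    apply_chat_template=True,  # the messages endpoint expects chat-formatted requests
)
print(results["results"]["gsm8k"])
```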
lm-evaluation-harness/lm_eval/models/api_models.py ADDED
@@ -0,0 +1,810 @@
1
+ import abc
2
+ import asyncio
3
+ import copy
4
+ import itertools
5
+ import json
6
+ import logging
7
+ from functools import cached_property
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Any,
11
+ Awaitable,
12
+ Callable,
13
+ Dict,
14
+ Iterable,
15
+ List,
16
+ Literal,
17
+ NamedTuple,
18
+ Optional,
19
+ Tuple,
20
+ Union,
21
+ )
22
+
23
+
24
+ try:
25
+ import requests
26
+ from aiohttp import ClientSession, ClientTimeout, TCPConnector
27
+ from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
28
+ from tqdm import tqdm
29
+ from tqdm.asyncio import tqdm_asyncio
30
+ except ModuleNotFoundError:
31
+ pass
32
+
33
+
34
+ import base64
35
+ from importlib.util import find_spec
36
+ from io import BytesIO
37
+
38
+ from lm_eval import utils
39
+ from lm_eval.api.instance import Instance
40
+ from lm_eval.api.model import TemplateLM
41
+ from lm_eval.models.utils import Collator, chunks, configure_pad_token
42
+
43
+
44
+ if TYPE_CHECKING:
45
+ from PIL import Image
46
+
47
+
48
+ eval_logger = logging.getLogger(__name__)
49
+
50
+ LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
51
+
52
+
53
+ # utility class to keep track of json encoded chats
54
+ class JsonChatStr(NamedTuple):
55
+ prompt: str
56
+
57
+ def encode(self, encoding):
58
+ return self.prompt.encode(encoding)
59
+
60
+
61
+ def create_image_prompt(
62
+ imgs: list["Image.Image"], chat: dict, fmt: str = "PNG"
63
+ ) -> dict:
64
+ """
65
+
66
+ Parameters
67
+ ----------
68
+ img : list[PIL.Image.Image]
69
+ The list of images to encode to base64
70
+ chat : dict
71
+ fmt : str, optional
72
+ Any format Pillow understands (e.g. "PNG", "JPEG").
73
+ Defaults to "PNG".
74
+
75
+ Returns
76
+ -------
77
+ dict
78
+ """
79
+ images = []
80
+ for img in imgs:
81
+ buf = BytesIO()
82
+ img.save(buf, format=fmt)
83
+ img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
84
+ img_dict = {
85
+ "type": "image_url",
86
+ "image_url": {"url": f"data:image/png;base64,{img_b64}", "detail": "auto"},
87
+ }
88
+ images.append(img_dict)
89
+
90
+ # chat is in format of list[dict["role": "user"/"system", "content": str, "type": "text"],...]
91
+ # with images, we need "content" to be a list of dicts with "type" and "text"/"image_url"
92
+ # currently we do not support few-shots so only one user message
93
+ # text content also has <image> placeholders, which apparently is not necessary for API class (confirm)
94
+
95
+ if isinstance(chat[-1]["content"], list):
96
+ chat[-1]["content"] = images + chat[-1]["content"]
97
+ else:
98
+ text_content = {"type": "text", "text": chat[-1]["content"]}
99
+ chat[-1]["content"] = images + [text_content]
100
+ chat[-1].pop("type")
101
+ return chat
102
+
103
+
104
+ class TemplateAPI(TemplateLM):
105
+ MULTIMODAL = True
106
+
107
+ def __init__(
108
+ self,
109
+ model: str = None,
110
+ pretrained: str = None, # `model` takes precedence over `pretrained` when passed.
111
+ base_url: str = None,
112
+ tokenizer: Optional[str] = None,
113
+ # Loglikelihood tasks require a tokenizer to calculate context lengths,
114
+ # however the requests can be sent as a string if the API doesn't support token inputs.
115
+ # use tokenized_requests=False
116
+ tokenizer_backend: Optional[
117
+ Literal["tiktoken", "huggingface", "None", "none"]
118
+ ] = "huggingface",
119
+ truncate: bool = False,
120
+ # number of concurrent requests. More useful if not batching
121
+ num_concurrent: int = 1,
122
+ max_retries: int = 3,
123
+ max_gen_toks: int = 256,
124
+ batch_size: Union[str, int] = 1,
125
+ seed: int = 1234,
126
+ max_length: Optional[int] = 2048,
127
+ add_bos_token: bool = False,
128
+ custom_prefix_token_id: int = None,
129
+ # send the requests as tokens or strings
130
+ tokenized_requests: bool = True,
131
+ trust_remote_code: bool = False,
132
+ revision: Optional[str] = "main",
133
+ use_fast_tokenizer: bool = True,
134
+ verify_certificate: bool = True,
135
+ eos_string: str = None,
136
+ # timeout in seconds
137
+ timeout: int = 300,
138
+ header: Optional[Dict[str, str]] = None,
139
+ max_images: int = 1,
140
+ **kwargs,
141
+ ) -> None:
142
+ super().__init__()
143
+ missing_packages = [
144
+ pkg
145
+ for pkg in ["aiohttp", "tqdm", "tenacity", "requests"]
146
+ if find_spec(pkg) is None
147
+ ]
148
+ if missing_packages:
149
+ raise ModuleNotFoundError(
150
+ f"Attempted to use an API model, but the required packages {missing_packages} are not installed. "
151
+ 'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`'
152
+ )
153
+ self.model = model or pretrained
154
+ self.base_url = base_url
155
+ self.tokenizer = tokenizer
156
+ self._header = header
157
+ if not isinstance(batch_size, int) and "auto" in batch_size:
158
+ eval_logger.warning(
159
+ "Automatic batch size is not supported for API models. Defaulting to batch size 1."
160
+ )
161
+ elif int(batch_size) > 1:
162
+ eval_logger.warning(
163
+ "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths."
164
+ )
165
+ self._batch_size = int(batch_size) if batch_size != "auto" else 1
166
+ self._truncate = truncate
167
+ self._max_gen_toks = int(max_gen_toks)
168
+ self._seed = int(seed)
169
+ # max_length - 1 as we always have 1 token for generation
170
+ eval_logger.info(f"Using max length {max_length} - 1")
171
+ self.max_length = max_length - 1
172
+ if int(num_concurrent) <= 1:
173
+ eval_logger.info(
174
+ "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
175
+ )
176
+ self._concurrent = int(num_concurrent)
177
+ self.tokenizer_backend = (
178
+ None if tokenizer_backend in ("None", "none") else tokenizer_backend
179
+ )
180
+ self.add_bos_token = add_bos_token
181
+ self.custom_prefix_token_id = custom_prefix_token_id
182
+ self.tokenized_requests = tokenized_requests
183
+ self.max_retries = int(max_retries)
184
+ self.verify_certificate = verify_certificate
185
+ self._eos_string = eos_string
186
+ self.timeout = int(timeout)
187
+ self.max_images = int(max_images)
188
+
189
+ eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
190
+ if self.tokenizer_backend is None:
191
+ self.tokenizer = None
192
+ self.tokenized_requests = False
193
+ else:
194
+ if self.tokenizer is None:
195
+ if self.tokenizer_backend == "huggingface":
196
+ import transformers
197
+
198
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
199
+ self.tokenizer if self.tokenizer else self.model,
200
+ trust_remote_code=trust_remote_code,
201
+ revision=revision,
202
+ use_fast=use_fast_tokenizer,
203
+ )
204
+ # Not used as the API will handle padding but to mirror the behavior of the HFLM
205
+ self.tokenizer = configure_pad_token(self.tokenizer)
206
+ elif self.tokenizer_backend == "tiktoken":
207
+ try:
208
+ import tiktoken
209
+
210
+ self.tokenizer = tiktoken.encoding_for_model(self.model)
211
+ except ModuleNotFoundError as e:
212
+ raise ModuleNotFoundError(
213
+ "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
214
+ "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
215
+ ) from e
216
+ if "openai" not in self.base_url:
217
+ eval_logger.warning(
218
+ f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
219
+ "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
220
+ )
221
+ else:
222
+ import transformers
223
+
224
+ assert isinstance(tokenizer, str), "tokenizer must be a string"
225
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
226
+ tokenizer,
227
+ trust_remote_code=trust_remote_code,
228
+ revision=revision,
229
+ use_fast=use_fast_tokenizer,
230
+ )
231
+
232
+ @abc.abstractmethod
233
+ def _create_payload(
234
+ self,
235
+ messages: Union[List[List[int]], List[dict], List[str], str],
236
+ *,
237
+ generate: bool = True,
238
+ gen_kwargs: Optional[dict] = None,
239
+ seed: int = 1234,
240
+ eos: str = None,
241
+ **kwargs,
242
+ ) -> dict:
243
+ """This method is responsible for creating the json payload that will be sent to the API."""
244
+ raise NotImplementedError
245
+
246
+ def create_message(
247
+ self,
248
+ messages: Union[List[List[int]], List[str], List[JsonChatStr]],
249
+ generate=False,
250
+ ) -> Union[List[List[int]], List[dict], List[str], str]:
251
+ """Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
252
+ if isinstance(messages[0], JsonChatStr):
253
+ # for chat completions we need to decode the json string to list[dict,...]
254
+ assert self._batch_size == 1, (
255
+ "non-tokenized chat requests are only supported with batch_size=1"
256
+ )
257
+ # list[dict["role":..., "content":...],...]
258
+ return json.loads(messages[0].prompt)
259
+
260
+ if not self.tokenized_requests:
261
+ # if messages are tokenized:
262
+ if isinstance(messages[0][0], int):
263
+ # assuming decoding is lossless. However, this is only for loglikelihood requests
264
+ # as we need to compute the context length. For generations, we don't need to tokenize.
265
+ messages = self.decode_batch(messages)
266
+ if self._batch_size <= 1:
267
+ # if batch is 1 return str
268
+ return messages[0]
269
+ else:
270
+ # list[str,...]
271
+ return messages
272
+
273
+ # list[list[int], ...]
274
+ return messages
275
+
276
+ @staticmethod
277
+ @abc.abstractmethod
278
+ def parse_logprobs(
279
+ outputs: Union[Any, List[Any]],
280
+ tokens: List[List[int]] = None,
281
+ ctxlen: List[int] = None,
282
+ **kwargs,
283
+ ) -> List[Tuple[float, bool]]:
284
+ """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
285
+ raise NotImplementedError
286
+
287
+ @staticmethod
288
+ @abc.abstractmethod
289
+ def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
290
+ """Method used to parse the generations from the (batched) API response. This method should return a list of str"""
291
+ raise NotImplementedError
292
+
293
+ @cached_property
294
+ def api_key(self) -> str:
295
+ """Override this property to return the API key for the API request."""
296
+ return ""
297
+
298
+ @cached_property
299
+ def header(self) -> dict:
300
+ """Override this property to return the headers for the API request."""
301
+ return self._header or {"Authorization": f"Bearer {self.api_key}"}
302
+
303
+ @property
304
+ def tokenizer_name(self) -> str:
305
+ """Must be defined for LM subclasses which implement Chat Templating.
306
+ Should return the name of the tokenizer or chat template used.
307
+ Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
308
+ """
309
+ return ""
310
+
311
+ def apply_chat_template(
312
+ self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
313
+ ) -> Union[str, JsonChatStr]:
314
+ """Applies a chat template to a list of chat history between user and model."""
315
+ if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
316
+ return self.tokenizer.apply_chat_template(
317
+ chat_history,
318
+ tokenize=False,
319
+ add_generation_prompt=add_generation_prompt,
320
+ continue_final_message=not add_generation_prompt,
321
+ )
322
+ else:
323
+ # bit of a hack. We'll load back before sending to the API
324
+ return JsonChatStr(
325
+ json.dumps(
326
+ [{**item, "type": "text"} for item in chat_history],
327
+ ensure_ascii=False,
328
+ )
329
+ )
330
+
331
+ @cached_property
332
+ def eot_token_id(self) -> Optional[int]:
333
+ if self.tokenizer is None:
334
+ return None
335
+ else:
336
+ if self.tokenizer_backend == "huggingface":
337
+ return self.tokenizer.eos_token_id
338
+ elif self.tokenizer_backend == "tiktoken":
339
+ return self.tokenizer.eot_token
340
+
341
+ @cached_property
342
+ def eos_string(self) -> Optional[str]:
343
+ if self._eos_string:
344
+ return self._eos_string
345
+ elif self.tokenizer is not None:
346
+ if self.tokenizer_backend == "huggingface":
347
+ return self.tokenizer.eos_token
348
+ elif self.tokenizer_backend == "tiktoken":
349
+ return self.tokenizer.decode([self.tokenizer.eot_token])
350
+ else:
351
+ eval_logger.warning(
352
+ "Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
353
+ )
354
+ return None
355
+
356
+ @cached_property
357
+ def prefix_token_id(self) -> Optional[int]:
358
+ if self.tokenizer is None:
359
+ return None
360
+ else:
361
+ if self.custom_prefix_token_id is not None:
362
+ return self.custom_prefix_token_id
363
+ if self.tokenizer_backend == "huggingface":
364
+ if self.tokenizer.bos_token_id is not None:
365
+ return self.tokenizer.bos_token_id
366
+ return self.tokenizer.eos_token_id
367
+ else:
368
+ return self.tokenizer.eot_token
369
+
+     def tok_encode(
+         self,
+         string: str,
+         left_truncate_len: int = None,
+         add_special_tokens: bool = False,
+         truncation: bool = False,
+         **kwargs,
+     ) -> Union[List[List[int]], List[int], List[str]]:
+         if self.tokenizer_backend is None:
+             return [string]
+         elif self.tokenizer_backend == "huggingface":
+             # by default for CausalLM - false or self.add_bos_token is set
+             if not add_special_tokens:
+                 add_special_tokens = False or self.add_bos_token
+             encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+                 string,
+                 add_special_tokens=add_special_tokens,
+                 truncation=truncation,
+                 return_attention_mask=False,
+             ).input_ids
+
+             # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+             if left_truncate_len:
+                 if not isinstance(string, str):
+                     encoding = [enc[-left_truncate_len:] for enc in encoding]
+                 else:
+                     encoding = encoding[-left_truncate_len:]
+
+             return encoding
+
+         else:
+             try:
+                 encoding = self.tokenizer.encode(string)
+             except Exception:
+                 encoding = self.tokenizer.encode_batch(string)
+             return encoding
+
+     def decode_batch(self, tokens: List[List[int]]) -> List[str]:
+         if self.tokenizer_backend == "huggingface":
+             return self.tokenizer.batch_decode(tokens)
+         elif self.tokenizer_backend == "tiktoken":
+             return self.tokenizer.decode_batch(tokens)
+
+     def model_call(
+         self,
+         messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+         *,
+         generate: bool = True,
+         gen_kwargs: Optional[Dict] = None,
+         **kwargs,
+     ) -> Optional[dict]:
+         # !!! Copy: shared dict for each request, need new object !!!
+         gen_kwargs = copy.deepcopy(gen_kwargs)
+         try:
+             response = requests.post(
+                 self.base_url,
+                 json=self._create_payload(
+                     self.create_message(messages),
+                     generate=generate,
+                     gen_kwargs=gen_kwargs,
+                     seed=self._seed,
+                     eos=self.eos_string,
+                     **kwargs,
+                 ),
+                 headers=self.header,
+                 verify=self.verify_certificate,
+             )
+             if not response.ok:
+                 eval_logger.warning(
+                     f"API request failed with error message: {response.text}. Retrying..."
+                 )
+                 response.raise_for_status()
+             return response.json()
+         except RetryError:
+             eval_logger.error(
+                 "API request failed after multiple retries. Please check the API status."
+             )
+             return None
+
+     async def amodel_call(
+         self,
+         session: ClientSession,
+         sem: asyncio.Semaphore,
+         messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+         *,
+         generate: bool = True,
+         cache_keys: list = None,
+         ctxlens: Optional[List[int]] = None,
+         gen_kwargs: Optional[Dict] = None,
+         **kwargs,
+     ) -> Union[List[str], List[Tuple[float, bool]], None]:
+         # !!! Copy: shared dict for each request, need new object !!!
+         gen_kwargs = copy.deepcopy(gen_kwargs)
+         payload = self._create_payload(
+             self.create_message(messages),
+             generate=generate,
+             gen_kwargs=gen_kwargs,
+             seed=self._seed,
+             **kwargs,
+         )
+         cache_method = "generate_until" if generate else "loglikelihood"
+         acquired = await sem.acquire()
+         try:
+             async with session.post(
+                 self.base_url,
+                 json=payload,
+                 headers=self.header,
+             ) as response:
+                 if not response.ok:
+                     error_text = await response.text()
+                     eval_logger.warning(
+                         f"API request failed! Status code: {response.status}, "
+                         f"Response text: {error_text}. Retrying..."
+                     )
+                     # raising exception will retry the request
+                     response.raise_for_status()
+                 outputs = await response.json()
+                 answers = (
+                     self.parse_generations(
+                         outputs=outputs,
+                     )
+                     if generate
+                     else self.parse_logprobs(
+                         outputs=outputs,
+                         tokens=messages,
+                         ctxlens=ctxlens,
+                     )
+                 )
+                 if cache_keys:
+                     for res, cache in zip(answers, cache_keys):
+                         self.cache_hook.add_partial(cache_method, cache, res)
+                 return answers
+         # If the retries also fail
+         except BaseException as e:
+             eval_logger.error(f"Exception:{repr(e)}, {outputs}, retrying.")
+             raise e
+         finally:
+             if acquired:
+                 sem.release()
+
+     def batch_loglikelihood_requests(
+         self, chunks: Iterable[List[LogLikelihoodInputs]]
+     ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
+         inputs = []
+         ctxlens = []
+         cache_keys = []
+         for chunk in chunks:
+             for cache_key, context_enc, continuation_enc in chunk:
+                 # max_length - 1 as we always have 1 token for generation
+                 inp = (context_enc + continuation_enc)[-self.max_length :]
+                 if len(inp) < len(context_enc + continuation_enc):
+                     eval_logger.warning(
+                         f"Context length ({len(context_enc)}) + continuation length ({len(continuation_enc)}) > max_length ({self.max_length}). Left truncating context."
+                     )
+                 ctxlen = len(context_enc) - max(
+                     0, len(context_enc) + len(continuation_enc) - self.max_length
+                 )
+
+                 inputs.append(inp)
+                 ctxlens.append(ctxlen)
+                 cache_keys.append(cache_key)
+         return inputs, ctxlens, cache_keys
+
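A quick worked example of the truncation arithmetic in `batch_loglikelihood_requests` above, with illustrative numbers: a 4-token context plus a 3-token continuation against `max_length=5` loses 2 context tokens, so `ctxlen` drops from 4 to 2.

```python
# Illustrative numbers only; mirrors the ctxlen arithmetic in batch_loglikelihood_requests.
max_length = 5
context_enc = [1, 2, 3, 4]      # 4 context tokens
continuation_enc = [5, 6, 7]    # 3 continuation tokens

inp = (context_enc + continuation_enc)[-max_length:]          # [3, 4, 5, 6, 7]
ctxlen = len(context_enc) - max(
    0, len(context_enc) + len(continuation_enc) - max_length
)                                                             # 4 - 2 = 2
assert inp == [3, 4, 5, 6, 7] and ctxlen == 2
```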
+     async def get_batched_requests(
+         self,
+         requests: list,
+         cache_keys: list,
+         *,
+         generate: bool = True,
+         ctxlens: List[int] = None,
+         **kwargs,
+     ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
+         ctxlens = ctxlens if ctxlens else [None] * len(requests)
+         conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate)
+         sem = asyncio.Semaphore(self._concurrent)
+         async with ClientSession(
+             connector=conn, timeout=ClientTimeout(total=self.timeout)
+         ) as session:
+             retry_: Callable[..., Awaitable[Any]] = retry(
+                 stop=stop_after_attempt(self.max_retries),
+                 wait=wait_exponential(multiplier=0.5, min=1, max=10),
+                 reraise=True,
+                 before_sleep=lambda retry_state: eval_logger.info(
+                     f"Retry attempt {retry_state.attempt_number}"
+                 ),
+             )(self.amodel_call)
+             # Create one task for each batch of requests
+             tasks = [
+                 asyncio.create_task(
+                     retry_(
+                         session=session,
+                         sem=sem,
+                         messages=message,
+                         cache_keys=cache_key,
+                         generate=generate,
+                         ctxlens=ctxlen,
+                         **kwargs,
+                     )
+                 )
+                 for message, cache_key, ctxlen in zip(
+                     chunks(requests, n=self._batch_size),
+                     chunks(cache_keys, n=self._batch_size),
+                     chunks(ctxlens, n=self._batch_size),
+                 )
+             ]
+
+             return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
+
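The method above combines three pieces: a tenacity retry wrapper around `amodel_call`, a semaphore that caps in-flight requests, and one shared `aiohttp` session whose tasks are gathered with a progress-style await. A self-contained sketch of that pattern, with a hypothetical `http://localhost:8000/echo` endpoint and toy payloads standing in for the harness's batches:

```python
# Standalone sketch of the concurrency pattern used above; the URL and payloads
# are hypothetical placeholders, not part of lm-evaluation-harness.
import asyncio

from aiohttp import ClientSession, TCPConnector
from tenacity import retry, stop_after_attempt, wait_exponential


async def call_api(session: ClientSession, sem: asyncio.Semaphore, payload: dict) -> dict:
    async with sem:  # limit the number of in-flight requests
        async with session.post("http://localhost:8000/echo", json=payload) as resp:
            resp.raise_for_status()  # raising lets tenacity retry this coroutine
            return await resp.json()


async def main(payloads: list) -> list:
    sem = asyncio.Semaphore(4)
    retrying_call = retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=0.5, min=1, max=10),
        reraise=True,
    )(call_api)
    async with ClientSession(connector=TCPConnector(limit=4)) as session:
        tasks = [asyncio.create_task(retrying_call(session, sem, p)) for p in payloads]
        return await asyncio.gather(*tasks)


# asyncio.run(main([{"i": i} for i in range(8)]))
```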
+     def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+         assert self.tokenizer is not None, (
+             "Tokenizer is required for loglikelihood tasks to compute context lengths."
+         )
+         res = []
+
+         def _collate(req: LogLikelihoodInputs):
+             """Defines the key for the sorted method"""
+             # the negative sign on len(toks) sorts descending - this has a few advantages:
+             # - time estimates will always be over not underestimates, which is more useful for planning
+             # - to know the size of a batch when going through the list, you know the first one is always the batch
+             #   padded context length. this is useful to simplify the batching logic and more importantly to make
+             #   automatic adaptive batches much much easier to implement
+             # - any OOMs will happen right away rather than near the end
+
+             toks = req[1] + req[2]
+             return -len(toks), tuple(toks)
+
+         re_ord = Collator(
+             requests,
+             sort_fn=_collate,
+             group_by=None,
+         )
+         # if concurrent then we'll batch in the async context
+         chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0)
+         if self._concurrent <= 1:
+             pbar = tqdm(desc="Requesting API", total=len(requests))
+             for chunk in chunked:
+                 inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk])
+
+                 outputs = retry(
+                     stop=stop_after_attempt(self.max_retries),
+                     wait=wait_exponential(multiplier=0.5, min=1, max=10),
+                     reraise=True,
+                 )(self.model_call)(messages=inputs, generate=False)
+                 if isinstance(outputs, dict):
+                     outputs = [outputs]
+                 for answer_, cache_key in zip(
+                     self.parse_logprobs(
+                         outputs=outputs, tokens=inputs, ctxlens=ctxlens
+                     ),
+                     cache_keys,
+                 ):
+                     if answer_ is not None:
+                         res.append(answer_)
+                         # cache requests that aren't from a loglikelihood_rolling request
+                         if cache_key is not None:
+                             self.cache_hook.add_partial(
+                                 "loglikelihood", cache_key, answer_
+                             )
+                         pbar.update(1)
+         else:
+             inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked)
+             res = itertools.chain.from_iterable(
+                 asyncio.run(
+                     self.get_batched_requests(
+                         inputs, cache_keys, generate=False, ctxlens=ctxlens
+                     )
+                 )
+             )
+
+         return re_ord.get_original(res)
+
+     def generate_until(
+         self, requests: List[Instance], disable_tqdm: bool = False
+     ) -> List[str]:
+         res = []
+
+         def _collate_gen(_requests):
+             # sort by the length of the non-tokenized contexts
+             return -len(_requests[0])
+
+         # Let the API deal with tokenization
+         if len(requests[0].args) > 2:
+             assert self.tokenizer is None, (
+                 "tokenizer is not supported for multimodal requests yet!"
+             )
+             eval_logger.info(
+                 f"Using max_images {self.max_images}. Set in the model args."
+             )
+             requests, all_gen_kwargs, auxiliary_args = zip(
+                 *(req.args for req in requests)
+             )
+             requests = tuple(
+                 JsonChatStr(
+                     json.dumps(
+                         create_image_prompt(
+                             y["visual"][: self.max_images], json.loads(x.prompt)
+                         )
+                     )
+                 )
+                 for x, y in zip(requests, auxiliary_args)
+             )
+         else:
+             requests, all_gen_kwargs = zip(*(req.args for req in requests))
+         if self.tokenized_requests:
+             encodings_list = self.tok_encode(
+                 requests, add_special_tokens=self.add_bos_token
+             )
+         else:
+             encodings_list = [None] * len(requests)
+         requests = [
+             (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list)
+         ]
+
+         re_ord = Collator(
+             requests,
+             sort_fn=_collate_gen,
+             group_by="gen_kwargs",
+         )
+         chunked = re_ord.get_batched(
+             n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None
+         )
+         if not self.tokenized_requests:
+             eval_logger.info(
+                 "Tokenized requests are disabled. Context + generation length is not checked."
+             )
+         if self._concurrent <= 1:
+             pbar = tqdm(desc="Requesting API", total=len(requests))
+             for chunk in chunked:
+                 contexts, all_gen_kwargs, encodings_list = zip(*chunk)
+                 if self.tokenized_requests:
+                     max_gen_toks = all_gen_kwargs[0].get(
+                         "max_gen_toks", self._max_gen_toks
+                     )
+                     max_context_len = self.max_length - max_gen_toks
+
+                     encodings_list = [x[-max_context_len:] for x in encodings_list]
+
+                     if any(
+                         len(x) + max_gen_toks > self.max_length for x in encodings_list
+                     ):
+                         eval_logger.warning(
+                             f"Some contexts exceeded max_length ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated."
+                         )
+
+                 req = encodings_list if self.tokenized_requests else contexts
+                 outputs = retry(
+                     stop=stop_after_attempt(self.max_retries),
+                     wait=wait_exponential(multiplier=0.5, min=1, max=10),
+                     reraise=True,
+                 )(self.model_call)(
+                     messages=req,
+                     generate=True,
+                     gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
+                 )
+                 for generated_text, context in zip(
+                     self.parse_generations(
+                         outputs=outputs,
+                         contexts=contexts,
+                     ),
+                     contexts,
+                 ):
+                     if generated_text is not None:
+                         res.append(generated_text)
+
+                         # partial caching
+                         if context is not None:
+                             self.cache_hook.add_partial(
+                                 "generate_until",
+                                 (context, all_gen_kwargs[0]),
+                                 generated_text,
+                             )
+                         pbar.update(1)
+         else:
+             for chunk in chunked:
+                 contexts, all_gen_kwargs, encodings_list = zip(*chunk)
+                 if self.tokenized_requests:
+                     max_gen_toks = all_gen_kwargs[0].get(
+                         "max_gen_toks", self._max_gen_toks
+                     )
+                     max_context_len = self.max_length - max_gen_toks
+
+                     encodings_list = [x[-max_context_len:] for x in encodings_list]
+
+                     if any(
+                         len(x) + max_gen_toks > self.max_length for x in encodings_list
+                     ):
+                         eval_logger.warning(
+                             f"Some contexts exceeded max_length ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated."
+                         )
+
+                 req = encodings_list if self.tokenized_requests else contexts
+                 results = itertools.chain.from_iterable(
+                     asyncio.run(
+                         self.get_batched_requests(
+                             req,
+                             cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts],
+                             generate=True,
+                             gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
+                         )
+                     )
+                 )
+                 res.extend(results)
+
+         return re_ord.get_original(res)
+
+     def loglikelihood_rolling(
+         self, requests: List[Instance], disable_tqdm: bool = False
+     ) -> List[float]:
+         loglikelihoods = []
+
+         for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
+             rolling_token_windows = list(
+                 map(
+                     utils.make_disjoint_window,
+                     utils.get_rolling_token_windows(
+                         token_list=self.tok_encode(string),
+                         prefix_token=self.prefix_token_id,
+                         # max_seq_len - (1 for context)
+                         max_seq_len=self.max_length - 1,
+                         context_len=1,
+                     ),
+                 )
+             )
+
+             # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+             rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+             string_nll = self._loglikelihood_tokens(
+                 rolling_token_windows,
+                 disable_tqdm=True,
+             )
+
+             # discard is_greedy
+             string_nll = [x[0] for x in string_nll]
+
+             string_nll = sum(string_nll)
+             loglikelihoods.append(string_nll)
+
+             # cache this loglikelihood_rolling request
+             self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+         return loglikelihoods
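`loglikelihood_rolling` above scores a full string by tokenizing it, splitting the tokens into windows via `utils.get_rolling_token_windows` / `utils.make_disjoint_window`, scoring each window with `_loglikelihood_tokens`, and summing the per-window log-likelihoods. The toy function below only illustrates the windowing idea; it is a simplified stand-in, not the harness's actual utilities:

```python
# Toy illustration of rolling-window decomposition; names and the exact
# windowing rule are simplified stand-ins, not lm_eval's utils functions.
from typing import List, Tuple


def toy_rolling_windows(tokens: List[int], prefix: int, max_len: int) -> List[Tuple[List[int], List[int]]]:
    """Yield (context, continuation) pairs that together cover `tokens` exactly once."""
    windows = []
    start = 0
    while start < len(tokens):
        chunk = tokens[start : start + max_len]
        # each window is conditioned on one token of context: the prefix token
        # for the first window, the last previously-scored token afterwards
        context = [prefix] if start == 0 else [tokens[start - 1]]
        windows.append((context, chunk))
        start += max_len
    return windows


print(toy_rolling_windows(list(range(10)), prefix=-1, max_len=4))
# [([-1], [0, 1, 2, 3]), ([3], [4, 5, 6, 7]), ([7], [8, 9])]
```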
lm-evaluation-harness/lm_eval/models/dummy.py ADDED
@@ -0,0 +1,41 @@
+ import random
+
+ from tqdm import tqdm
+
+ from lm_eval.api.model import LM
+ from lm_eval.api.registry import register_model
+
+
+ @register_model("dummy")
+ class DummyLM(LM):
+     def __init__(self) -> None:
+         super().__init__()
+
+     @classmethod
+     def create_from_arg_string(cls, arg_string, additional_config=None):
+         return cls()
+
+     def loglikelihood(self, requests, disable_tqdm: bool = False):
+         res = []
+
+         for _ in tqdm(requests, disable=disable_tqdm):
+             res.append((-random.random(), False))
+
+         return res
+
+     def generate_until(self, requests, disable_tqdm: bool = False):
+         res = []
+
+         for request in tqdm(requests, disable=disable_tqdm):
+             res.append("lol")
+             assert request.arguments[0].strip() != ""
+
+         return res
+
+     def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+         res = []
+
+         for _ in tqdm(requests, disable=disable_tqdm):
+             res.append(-random.random())
+
+         return res
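A minimal smoke-test sketch for the dummy model above. `SimpleNamespace` stands in for the harness's `Instance` objects and exposes only the `arguments` attribute that `generate_until` reads; it assumes `lm_eval` is importable:

```python
# Minimal smoke test; SimpleNamespace is a hypothetical stand-in for lm_eval's
# Instance objects, exposing only the `arguments` attribute read above.
from types import SimpleNamespace

from lm_eval.models.dummy import DummyLM

requests = [SimpleNamespace(arguments=("What is 2 + 2?", {}))]
lm = DummyLM()

print(lm.generate_until(requests))         # ["lol"]
print(lm.loglikelihood(requests))          # [(-<random float>, False)]
print(lm.loglikelihood_rolling(requests))  # [-<random float>]
```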
lm-evaluation-harness/lm_eval/models/gguf.py ADDED
@@ -0,0 +1,132 @@
+ import logging
+ import time
+
+ import requests
+ from requests.exceptions import RequestException
+ from tqdm import tqdm
+
+ from lm_eval.api.model import LM
+ from lm_eval.api.registry import register_model
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_result(logprobs, context_length):
+     is_greedy = True
+     offsets = logprobs["text_offset"]
+     tokens = logprobs["tokens"]
+     tokens_logprobs = logprobs["token_logprobs"]
+
+     idx = 0
+     while offsets[idx] < context_length:
+         idx += 1
+     continuation_logprobs = sum(tokens_logprobs[idx:-1])
+     for i in range(idx, len(tokens)):
+         token = tokens[i]
+         top_tokens = logprobs["top_logprobs"][i]
+         top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
+         if top_token != token:
+             is_greedy = False
+             break
+
+     return continuation_logprobs, is_greedy
+
+
+ @register_model("gguf", "ggml")
+ class GGUFLM(LM):
+     def __init__(self, base_url=None, max_length=2048, **kwargs):
+         super().__init__()
+         self.base_url = base_url
+         assert self.base_url, "must pass `base_url` to use GGUF LM!"
+         self.logprobs = 10
+         self.temperature = 0.0
+         self.max_length = max_length
+
+     def gguf_completion(
+         self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs
+     ):
+         for _ in range(retries):
+             try:
+                 prompt = context
+                 request = {
+                     "prompt": prompt,
+                     "logprobs": self.logprobs,
+                     "temperature": self.temperature,
+                 }
+                 if continuation:
+                     prompt += continuation
+                     request.update({"prompt": prompt, "max_tokens": 1, "echo": True})
+                 if stop is not None:
+                     request["stop"] = stop
+                 response = requests.post(
+                     f"{self.base_url}/v1/completions", json=request
+                 )
+                 response.raise_for_status()
+                 return response.json()
+             except RequestException as e:
+                 logger.error(f"RequestException: {e}")
+                 time.sleep(delay)  # wait before retrying
+         else:
+             raise RuntimeError(
+                 f"Failed to get a valid response after {retries} retries."
+             )
+
+     def loglikelihood(self, requests, disable_tqdm: bool = False):
+         if not requests:
+             return []
+         res = []
+         for context, continuation in tqdm(
+             [req.args for req in requests], disable=disable_tqdm
+         ):
+             response = self.gguf_completion(context=context, continuation=continuation)
+             if response and "choices" in response and response["choices"]:
+                 choice = response["choices"][0]
+                 logprobs = choice.get("logprobs")
+                 if (
+                     logprobs
+                     and "token_logprobs" in logprobs
+                     and logprobs["token_logprobs"]
+                 ):
+                     logprob, is_greedy = get_result(logprobs, len(context))
+                     res.append((logprob, is_greedy))
+                 else:
+                     logger.warning(
+                         "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list."
+                     )
+             else:
+                 logger.error(
+                     f"Invalid response for loglikelihood. Response: {response}"
+                 )
+                 assert False
+         return res
+
+     def generate_until(self, requests, disable_tqdm: bool = False):
+         if not requests:
+             return []
+
+         res = []
+         for request in tqdm([req.args for req in requests], disable=disable_tqdm):
+             inp = request[0]
+             request_args = request[1]
+             until = request_args.get("until", ["</s>"])
+             response = self.gguf_completion(context=inp, stop=until)
+             if response and "choices" in response and response["choices"]:
+                 choice = response["choices"][0]
+                 if "text" in choice:
+                     generated_text = choice["text"].strip()
+                     res.append(generated_text)
+                 else:
+                     logger.error(
+                         f"Invalid response for greedy_until. Response: {response}"
+                     )
+                     res.append(None)  # Add default value in case of error
+             else:
+                 logger.error(f"Invalid response for greedy_until. Response: {response}")
+                 res.append(None)  # Add default value in case of error
+         return res
+
+     def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+         raise NotImplementedError(
+             "loglikelihood_rolling not yet supported for GGUF models"
+         )
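For reference, a sketch of the two request shapes `gguf_completion` above sends to an OpenAI-style `/v1/completions` endpoint. The base URL is a hypothetical local llama.cpp-style server and the prompts are toy values; the payload fields mirror the code above:

```python
# Hypothetical base_url; the payloads mirror what gguf_completion builds.
import requests

base_url = "http://localhost:8080"  # assumption: a local llama.cpp-style server

# loglikelihood-style call: echo the prompt + continuation back with logprobs
ll_payload = {
    "prompt": "The capital of France is Paris",
    "logprobs": 10,
    "temperature": 0.0,
    "max_tokens": 1,
    "echo": True,
}

# generation-style call: free-running completion with stop sequences
gen_payload = {
    "prompt": "Q: What is 2 + 2?\nA:",
    "logprobs": 10,
    "temperature": 0.0,
    "stop": ["</s>"],
}

for payload in (ll_payload, gen_payload):
    resp = requests.post(f"{base_url}/v1/completions", json=payload)
    resp.raise_for_status()
    print(resp.json()["choices"][0])
```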