Merge pull request #29 from naist-nlp/bertscore

de9uch1 · web-flow · commit 661eb61d1fe1 · 2024-12-19T10:44:41.000+09:00
Implement BERTScore
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -16,6 +16,14 @@ jobs:
       matrix:
         platform: ["ubuntu-latest", "windows-latest"]
         python-version: ["3.10", "3.11"]
+        pytest_marker:
+          - null
+          - "metrics_bertscore"
+          - "metrics_bleurt"
+          - "metrics_xcometlite"
+          - "metrics_metricx24"
+          - "metrics_metricx23"
+          - "metrics_metricx23qe"
     runs-on: ${{ matrix.platform }}
     steps:
     - uses: actions/checkout@v4
@@ -30,111 +38,4 @@ jobs:
     - name: Test with pytest
       run: |
         uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
-        uv run pytest
-
-  metrics_bleurt:
-    strategy:
-      matrix:
-        platform: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.10", "3.11"]
-    runs-on: ${{ matrix.platform }}
-    steps:
-    - uses: actions/checkout@v4
-    - name: Install uv
-      uses: astral-sh/setup-uv@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Set up Python ${{ matrix.python-version }}
-      run: uv python install
-    - name: Install the project
-      run: uv sync --all-extras --dev
-    - name: Test with pytest
-      run: |
-        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
-        uv run pytest -m "metrics_bleurt"
-
-  metrics_xcometlite:
-    strategy:
-      matrix:
-        platform: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.10", "3.11"]
-
-    runs-on: ${{ matrix.platform }}
-
-    steps:
-    - uses: actions/checkout@v4
-    - name: Install uv
-      uses: astral-sh/setup-uv@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Set up Python ${{ matrix.python-version }}
-      run: uv python install
-    - name: Install the project
-      run: uv sync --all-extras --dev
-    - name: Test with pytest
-      run: |
-        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
-        uv run pytest -m "metrics_xcometlite"
-
-  metrics_metricx24:
-    strategy:
-      matrix:
-        platform: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.10", "3.11"]
-    runs-on: ${{ matrix.platform }}
-    steps:
-    - uses: actions/checkout@v4
-    - name: Install uv
-      uses: astral-sh/setup-uv@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Set up Python ${{ matrix.python-version }}
-      run: uv python install
-    - name: Install the project
-      run: uv sync --all-extras --dev
-    - name: Test with pytest
-      run: |
-        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
-        uv run pytest -m "metrics_metricx24"
-
-  metrics_metricx23:
-    strategy:
-      matrix:
-        platform: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.10", "3.11"]
-    runs-on: ${{ matrix.platform }}
-    steps:
-    - uses: actions/checkout@v4
-    - name: Install uv
-      uses: astral-sh/setup-uv@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Set up Python ${{ matrix.python-version }}
-      run: uv python install
-    - name: Install the project
-      run: uv sync --all-extras --dev
-    - name: Test with pytest
-      run: |
-        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
-        uv run pytest -m "metrics_metricx23"
-
-  metrics_metricx23qe:
-    strategy:
-      matrix:
-        platform: ["ubuntu-latest", "windows-latest"]
-        python-version: ["3.10", "3.11"]
-    runs-on: ${{ matrix.platform }}
-    steps:
-    - uses: actions/checkout@v4
-    - name: Install uv
-      uses: astral-sh/setup-uv@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Set up Python ${{ matrix.python-version }}
-      run: uv python install
-    - name: Install the project
-      run: uv sync --all-extras --dev
-    - name: Test with pytest
-      run: |
-        uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_TOKEN }}
-        uv run pytest -m "metrics_metricx23qe"
+        uv run pytest ${{ matrix.pytest_marker && format('-m {0}', matrix.pytest_marker) || '' }}
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -14,7 +14,8 @@ build:
     - asdf plugin add uv
     - asdf install uv latest
     - asdf global uv latest
-    - uv sync --extra docs --frozen
+    - uv python install 3.10
+    - uv sync --all-extras --all-groups
     - uv run sphinx-apidoc --remove-old -d1 -Tfe -o docs/source ./ "$READTHEDOCS_REPOSITORY_PATH/**/*_test.py" "$READTHEDOCS_REPOSITORY_PATH/**/conftest.py"
     - uv run -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html
 
diff --git a/README.md b/README.md
@@ -15,7 +15,8 @@
 <b>
       <a href="https://aclanthology.org/2024.emnlp-demo.37">Paper</a> |
       <a href="https://mbrs.readthedocs.io">Reference docs</a> |
-      <a href="https://github.com/naist-nlp/mbrs#citation">Citation</a>
+      <a href="https://github.com/naist-nlp/mbrs#citation">Citation</a> |
+      <a href="https://github.com/naist-nlp/mbrs/releases">Release notes</a>
 </b>
 </p>
 
@@ -35,6 +36,13 @@ cd mbrs/
 pip install ./
 ```
 
+For uv users:
+``` bash
+git clone https://github.com/naist-nlp/mbrs.git
+cd mbrs/
+uv sync
+```
+
 ## Quick start
 
 mbrs provides two interfaces: command-line interface (CLI) and Python
@@ -155,6 +163,7 @@ Currently, the following metrics are supported:
     to [\@lucadiliello](https://github.com/lucadiliello/bleurt-pytorch))
 -   MetricX ([Juraska et al., 2023](https://aclanthology.org/2023.wmt-1.63);
     [Juraska et al., 2024](https://aclanthology.org/2024.wmt-1.35)): `metricx`
+-   BERTScore ([Zhang et al., 2020](https://openreview.net/forum?id=SkeHuCVFDr)): `bertscore`
 
 ### Decoders
 
diff --git a/docs/list_metrics.rst b/docs/list_metrics.rst
@@ -51,3 +51,7 @@ Supported metrics are listed below.
      - :code:`metricx`
      - :doc:`MetricMetricX <./source/mbrs.metrics.metricx>`
      - `(Juraska et al., 2023) <https://aclanthology.org/2023.wmt-1.63>`_ `(Juraska et al., 2024) <https://aclanthology.org/2024.wmt-1.35>`_
+   * - BERTScore
+     - :code:`bertscore`
+     - :doc:`MetricBERTScore <./source/mbrs.metrics.bertscore>`
+     - `(Zhang et al., 2020) <https://openreview.net/forum?id=SkeHuCVFDr>`_
diff --git a/mbrs/metrics/__init__.py b/mbrs/metrics/__init__.py
@@ -14,6 +14,7 @@
 
 register, get_metric = registry.setup("metric")
 
+from .bertscore import MetricBERTScore
 from .bleu import MetricBLEU
 from .bleurt import MetricBLEURT
 from .chrf import MetricChrF
@@ -29,6 +30,7 @@
     "MetricAggregatable",
     "MetricCacheable",
     "MetricReferenceless",
+    "MetricBERTScore",
     "MetricBLEU",
     "MetricChrF",
     "MetricCOMET",
diff --git a/mbrs/metrics/bertscore.py b/mbrs/metrics/bertscore.py
@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+import enum
+import itertools
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import transformers
+from bert_score import BERTScorer
+from simple_parsing.helpers.fields import choice
+from torch import Tensor
+
+from mbrs import timer
+
+from . import Metric, register
+
+transformers.logging.set_verbosity_error()
+
+
+class BERTScoreScoreType(int, enum.Enum):
+    precision = 0
+    recall = 1
+    f1 = 2
+
+
+@register("bertscore")
+class MetricBERTScore(Metric):
+    """BERTScore metric class."""
+
+    scorer: BERTScorer
+
+    @dataclass
+    class Config(Metric.Config):
+        """BERTScore metric configuration.
+
+        - score_type (BERTScoreScoreType): The output score type, i.e.,
+            precision, recall, or f1.
+        - model_type (str): Contextual embedding model specification. Defaults to the
+            suggested model for the target language; at least one of `model_type` or
+            `lang` must be specified.
+        - num_layers (int): The layer of representation to use. Defaults to the number
+            of layers tuned on WMT16 correlation data.
+        - idf (bool): A boolean specifying whether to use idf or not. (This should be
+            True even if `idf_sents` is given.)
+        - idf_sents (list[str]): List of sentences used to compute the idf weights.
+        - batch_size (int): BERTScore processing batch size.
+        - nthreads (int): Number of threads.
+        - lang (str): Language of the sentences; at least one of `model_type` or
+            `lang` must be specified. `lang` needs to be specified when
+            `rescale_with_baseline` is True.
+        - rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline.
+        - baseline_path (str): Customized baseline file.
+        - use_fast_tokenizer (bool): `use_fast` parameter passed to HF tokenizer.
+        - fp16 (bool): Use float16 for the forward computation.
+        - bf16 (bool): Use bfloat16 for the forward computation.
+        - cpu (bool): Use CPU for the forward computation.
+        """
+
+        score_type: BERTScoreScoreType = choice(
+            BERTScoreScoreType, default=BERTScoreScoreType.f1
+        )
+        model_type: Optional[str] = None
+        num_layers: Optional[int] = None
+        batch_size: int = 64
+        nthreads: int = 4
+        all_layers: bool = False
+        idf: bool = False
+        idf_sents: Optional[list[str]] = None
+        lang: Optional[str] = None
+        rescale_with_baseline: bool = False
+        baseline_path: Optional[str] = None
+        use_fast_tokenizer: bool = False
+        fp16: bool = False
+        bf16: bool = False
+        cpu: bool = False
+
+    def __init__(self, cfg: MetricBERTScore.Config):
+        self.cfg = cfg
+        self.scorer = BERTScorer(
+            model_type=cfg.model_type,
+            num_layers=cfg.num_layers,
+            batch_size=cfg.batch_size,
+            nthreads=cfg.nthreads,
+            all_layers=cfg.all_layers,
+            idf=cfg.idf,
+            idf_sents=cfg.idf_sents,
+            device="cpu" if cfg.cpu else None,
+            lang=cfg.lang,
+            rescale_with_baseline=cfg.rescale_with_baseline,
+            baseline_path=cfg.baseline_path,
+            use_fast_tokenizer=cfg.use_fast_tokenizer,
+        )
+        self.scorer._model.eval()
+        for param in self.scorer._model.parameters():
+            param.requires_grad = False
+
+        if not cfg.cpu and torch.cuda.is_available():
+            if cfg.fp16:
+                self.scorer._model = self.scorer._model.half()
+            elif cfg.bf16:
+                self.scorer._model = self.scorer._model.bfloat16()
+            self.scorer._model = self.scorer._model.cuda()
+
+    @property
+    def device(self) -> torch.device:
+        """Returns the device of the model."""
+        return self.scorer._model.device
+
+    def _choose_output_score(self, triplet: tuple[Tensor, Tensor, Tensor]) -> Tensor:
+        """Choose the output score from the triplet of precision, recall, and f1 scores.
+
+        Args:
+            triplet (tuple[Tensor, Tensor, Tensor]): A triplet of precision, recall, and f1 scores.
+
+        Returns:
+            Tensor: Output score.
+        """
+        return triplet[self.cfg.score_type]
+
+    def score(self, hypothesis: str, reference: str, *_, **__) -> float:
+        """Calculate the score of the given hypothesis.
+
+        Args:
+            hypothesis (str): A hypothesis.
+            reference (str): A reference.
+
+        Returns:
+            float: The score of the given hypothesis.
+        """
+        return self._choose_output_score(
+            self.scorer.score(
+                [hypothesis],
+                [reference],
+                batch_size=self.cfg.batch_size,
+            )
+        ).item()
+
+    def scores(self, hypotheses: list[str], references: list[str], *_, **__) -> Tensor:
+        """Calculate the scores of the given hypotheses.
+
+        Args:
+            hypotheses (list[str]): N hypotheses.
+            references (list[str]): N references.
+
+        Returns:
+            Tensor: The N scores of the given hypotheses.
+        """
+
+        with timer.measure("score") as t:
+            t.set_delta_ncalls(len(hypotheses))
+            return self._choose_output_score(
+                self.scorer.score(
+                    hypotheses,
+                    references,
+                    batch_size=self.cfg.batch_size,
+                )
+            ).view(len(hypotheses))
+
+    def pairwise_scores(
+        self, hypotheses: list[str], references: list[str], *_, **__
+    ) -> Tensor:
+        """Calculate the pairwise scores.
+
+        Args:
+            hypotheses (list[str]): Hypotheses.
+            references (list[str]): References.
+
+        Returns:
+            Tensor: Score matrix of shape `(H, R)`, where `H` is the number
+              of hypotheses and `R` is the number of references.
+        """
+        hyps, refs = tuple(zip(*itertools.product(hypotheses, references)))
+        with timer.measure("score") as t:
+            t.set_delta_ncalls(len(hypotheses) * len(references))
+            return self._choose_output_score(
+                self.scorer.score(hyps, refs, batch_size=self.cfg.batch_size)
+            ).view(len(hypotheses), len(references))
+
+    def corpus_score(
+        self, hypotheses: list[str], references: list[str], *_, **__
+    ) -> float:
+        """Calculate the corpus-level score.
+
+        Args:
+            hypotheses (list[str]): Hypotheses.
+            references (list[str]): References.
+
+        Returns:
+            float: The corpus score.
+        """
+        return self.scores(hypotheses, references).mean().item()
diff --git a/mbrs/metrics/bertscore_test.py b/mbrs/metrics/bertscore_test.py
diff --git a/pyproject.toml b/pyproject.toml