diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
index c47a68cf9ca..5af9020e3d4 100644
--- a/docs/source/metrics.rst
+++ b/docs/source/metrics.rst
@@ -325,13 +325,14 @@ Complete list of metrics
     precision.Precision
     PSNR
     recall.Recall
-    Rouge
-    rouge.RougeL
-    rouge.RougeN
     RootMeanSquaredError
     RunningAverage
     SSIM
     TopKCategoricalAccuracy
+    Bleu
+    Rouge
+    RougeL
+    RougeN
 
 Helpers for customizing metrics
 -------------------------------
diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py
index 47765f677eb..d743faf4f62 100644
--- a/ignite/metrics/__init__.py
+++ b/ignite/metrics/__init__.py
@@ -11,11 +11,12 @@
 from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, MetricUsage
 from ignite.metrics.metrics_lambda import MetricsLambda
 from ignite.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix
+from ignite.metrics.nlp.bleu import Bleu
+from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN
 from ignite.metrics.precision import Precision
 from ignite.metrics.psnr import PSNR
 from ignite.metrics.recall import Recall
 from ignite.metrics.root_mean_squared_error import RootMeanSquaredError
-from ignite.metrics.rouge import Rouge, RougeL, RougeN
 from ignite.metrics.running_average import RunningAverage
 from ignite.metrics.ssim import SSIM
 from ignite.metrics.top_k_categorical_accuracy import TopKCategoricalAccuracy
@@ -43,11 +44,12 @@
     "PSNR",
     "Recall",
     "RootMeanSquaredError",
-    "Rouge",
-    "RougeN",
-    "RougeL",
     "RunningAverage",
     "VariableAccumulation",
     "Frequency",
     "SSIM",
+    "Bleu",
+    "Rouge",
+    "RougeN",
+    "RougeL",
 ]
diff --git a/ignite/metrics/nlp/__init__.py b/ignite/metrics/nlp/__init__.py
new file mode 100644
index 00000000000..506f0bab51e
--- /dev/null
+++ b/ignite/metrics/nlp/__init__.py
@@ -0,0 +1,9 @@
+from ignite.metrics.nlp.bleu import Bleu
+from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN
+
+__all__ = [
+    "Bleu",
+    "Rouge",
+    "RougeN",
+    "RougeL",
+]
diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
new file mode 100644
index 00000000000..90be793d9fd
--- /dev/null
+++ b/ignite/metrics/nlp/bleu.py
@@ -0,0 +1,191 @@
+import math
+from collections import Counter
+from typing import Any, Callable, Sequence, Tuple, Union
+
+import torch
+
+from ignite.exceptions import NotComputableError
+from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce
+from ignite.metrics.nlp.utils import modified_precision
+
+__all__ = ["Bleu"]
+
+
+def _closest_ref_length(references: Sequence[Sequence[Any]], hyp_len: int) -> int:
+    ref_lens = (len(reference) for reference in references)
+    closest_ref_len = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))
+    return closest_ref_len
+
+
+class _Smoother:
+    """
+    Smoothing helper
+    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
+    """
+
+    def __init__(self, method: str):
+        valid = ["no_smooth", "smooth1", "nltk_smooth2", "smooth2"]
+        if method not in valid:
+            raise ValueError(f"Smooth is not valid (expected: {valid}, got: {method})")
+        self.smooth = method
+
+    def __call__(self, numerators: Counter, denominators: Counter) -> Sequence[float]:
+        method = getattr(self, self.smooth)
+        return method(numerators, denominators)
+
+    @staticmethod
+    def smooth1(numerators: Counter, denominators: Counter) -> Sequence[float]:
+        epsilon = 0.1
+        denominators_ = [max(1, d) for d in denominators.values()]
+        return [n / d if n != 0 else epsilon / d for n, d in zip(numerators.values(), denominators_)]
+
+    @staticmethod
+    def nltk_smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]:
+        denominators_ = [max(1, d) for d in denominators.values()]
+        return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators_)]
+
+    @staticmethod
+    def smooth2(numerators: Counter, denominators: Counter) -> Sequence[float]:
+        return [(n + 1) / (d + 1) for n, d in zip(numerators.values(), denominators.values())]
+
+    @staticmethod
+    def no_smooth(numerators: Counter, denominators: Counter) -> Sequence[float]:
+        denominators_ = [max(1, d) for d in denominators.values()]
+        return [n / d for n, d in zip(numerators.values(), denominators_)]
+
+
+class Bleu(Metric):
+    r"""Calculates the `BLEU score <https://en.wikipedia.org/wiki/BLEU>`_.
+
+    .. math::
+       \text{BLEU} = b_{p} \cdot \exp \left( \sum_{n=1}^{N} w_{n} \: \log p_{n} \right)
+
+    where :math:`N` is the order of n-grams, :math:`b_{p}` is a sentence brevity penalty, :math:`w_{n}` are
+    positive weights summing to one and :math:`p_{n}` are modified n-gram precisions.
+
+    More details can be found in `Papineni et al. 2002`__.
+
+    __ https://www.aclweb.org/anthology/P02-1040.pdf
+
+    In addition, a review of smoothing techniques can be found in `Chen et al. 2014`__.
+
+    __ http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
+
+    Remark:
+
+        This implementation is inspired by NLTK.
+
+    Args:
+        ngram: order of n-grams.
+        smooth: enable smoothing. Valid are ``no_smooth``, ``smooth1``, ``nltk_smooth2`` or ``smooth2``.
+            Default: ``no_smooth``.
+        output_transform: a callable that is used to transform the
+            :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the
+            form expected by the metric. This can be useful if, for example, you have a multi-output model and
+            you want to compute the metric with respect to one of the outputs.
+            By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``.
+        device: specifies which device updates are accumulated on. Setting the
+            metric's device to be the same as your ``update`` arguments ensures the ``update`` method is
+            non-blocking. By default, CPU.
+
+    Example:
+
+    .. code-block:: python
+
+        from ignite.metrics.nlp import Bleu
+
+        m = Bleu(ngram=4, smooth="smooth1")
+
+        y_pred = "the the the the the the the"
+        y = ["the cat is on the mat", "there is a cat on the mat"]
+
+        m.update((y_pred.split(), [_y.split() for _y in y]))
+
+        print(m.compute())
+
+    .. versionadded:: 0.5.0
+    """
+
+    def __init__(
+        self,
+        ngram: int = 4,
+        smooth: str = "no_smooth",
+        output_transform: Callable = lambda x: x,
+        device: Union[str, torch.device] = torch.device("cpu"),
+    ):
+        if ngram <= 0:
+            raise ValueError(f"ngram order must be greater than zero (got: {ngram})")
+        self.ngrams_order = ngram
+        self.weights = [1 / self.ngrams_order] * self.ngrams_order
+        self.smoother = _Smoother(method=smooth)
+        super(Bleu, self).__init__(output_transform=output_transform, device=device)
+
+    def _corpus_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequence[Sequence[Any]]) -> float:
+        p_numerators: Counter = Counter()
+        p_denominators: Counter = Counter()
+
+        if len(references) != len(candidates):
+            raise ValueError(
+                f"nb of candidates should be equal to nb of reference lists ({len(candidates)} != "
+                f"{len(references)})"
+            )
+
+        # Iterate through each hypothesis and their corresponding references.
+        for refs, hyp in zip(references, candidates):
+            # For each order of ngram, calculate the numerator and
+            # denominator for the corpus-level modified precision.
+            for i in range(1, self.ngrams_order + 1):
+                numerator, denominator = modified_precision(refs, hyp, i)
+                p_numerators[i] += numerator
+                p_denominators[i] += denominator
+
+        # Return 0 if there are no matching n-grams.
+        # We only need to check p_numerators[1] == 0, since if there are
+        # no matching unigrams, there cannot be any higher order matches.
+        if p_numerators[1] == 0:
+            return 0
+
+        # Without smoothing, return 0 if any n-gram order has no matching n-grams.
+        if self.smoother.smooth == "no_smooth" and min(p_numerators.values()) == 0:
+            return 0
+
+        # Calculate the hypothesis lengths
+        hyp_lengths = [len(hyp) for hyp in candidates]
+
+        # Calculate the closest reference lengths.
+        ref_lengths = [_closest_ref_length(refs, hyp_len) for refs, hyp_len in zip(references, hyp_lengths)]
+
+        # Sum of hypothesis and references lengths
+        hyp_len = sum(hyp_lengths)
+        ref_len = sum(ref_lengths)
+
+        # Calculate corpus-level brevity penalty.
+        if hyp_len < ref_len:
+            bp = math.exp(1 - ref_len / hyp_len) if hyp_len > 0 else 0.0
+        else:
+            bp = 1.0
+
+        # Smoothing
+        p_n = self.smoother(p_numerators, p_denominators)
+
+        # Compute the geometric mean
+        s = [w_i * math.log(p_i) for w_i, p_i in zip(self.weights, p_n)]
+        gm = bp * math.exp(math.fsum(s))
+        return gm
+
+    @reinit__is_reduced
+    def reset(self) -> None:
+        self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device)
+        self._num_sentences = 0
+
+    @reinit__is_reduced
+    def update(self, output: Tuple[Sequence[Any], Sequence[Sequence[Any]]]) -> None:
+        y_pred, y = output
+        self._sum_of_bleu += self._corpus_bleu(references=[y], candidates=[y_pred])
+        self._num_sentences += 1
+
+    @sync_all_reduce("_sum_of_bleu", "_num_sentences")
+    def compute(self) -> torch.Tensor:
+        if self._num_sentences == 0:
+            raise NotComputableError("Bleu must have at least one example before it can be computed.")
+        return self._sum_of_bleu / self._num_sentences
diff --git a/ignite/metrics/rouge.py b/ignite/metrics/nlp/rouge.py
similarity index 91%
rename from ignite/metrics/rouge.py
rename to ignite/metrics/nlp/rouge.py
index 476d97bd807..d0dede92335 100644
--- a/ignite/metrics/rouge.py
+++ b/ignite/metrics/nlp/rouge.py
@@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod
-from collections import Counter, namedtuple
+from collections import namedtuple
 from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union
 
 import torch
@@ -9,53 +9,9 @@
 # These decorators helps with distributed settings
 from ignite.metrics.metric import reinit__is_reduced, sync_all_reduce
+from ignite.metrics.nlp.utils import lcs, ngrams
 
-
-def ngrams(sequence: Sequence[Any], n: int) -> Counter:
-    """
-    Generate the ngrams from a sequence of items
-
-    Args:
-        sequence: sequence of items
-        n: ngram order
-
-    Returns:
-        A counter of ngram objects
-
-    .. versionadded:: 0.5.0
-    """
-    return Counter([tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)])
-
-
-def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
-    """
-    Compute the length of the longest common subsequence in two sequence of items
-    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
-
-    Args:
-        seq_a: first sequence of items
-        seq_b: second sequence of items
-
-    Returns:
-        The length of the longest common subsequence
-
-    .. versionadded:: 0.5.0
-    """
-    m = len(seq_a)
-    n = len(seq_b)
-
-    dp = [[0] * (n + 1) for _ in range(m + 1)]
-
-    for i in range(m + 1):
-        for j in range(n + 1):
-            if i == 0 or j == 0:
-                dp[i][j] = 0
-            elif seq_a[i - 1] == seq_b[j - 1]:
-                dp[i][j] = dp[i - 1][j - 1] + 1
-            else:
-                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
-
-    return dp[m][n]
+
+__all__ = ["Rouge", "RougeN", "RougeL"]
 
 
 class Score(namedtuple("Score", ["match", "candidate", "reference"])):
@@ -286,7 +242,7 @@ def __init__(
         super(RougeN, self).__init__(multiref=multiref, alpha=alpha, output_transform=output_transform, device=device)
         self._ngram = ngram
         if self._ngram < 1:
-            raise ValueError(f"ngram order must be greater than one (got : {self._ngram})")
+            raise ValueError(f"ngram order must be greater than zero (got : {self._ngram})")
 
     def _compute_score(self, candidate: Sequence[Any], reference: Sequence[Any]) -> Score:
         return compute_ngram_scores(candidate=candidate, reference=reference, n=self._ngram)
diff --git a/ignite/metrics/nlp/utils.py b/ignite/metrics/nlp/utils.py
new file mode 100644
index 00000000000..90be7fbff1e
--- /dev/null
+++ b/ignite/metrics/nlp/utils.py
@@ -0,0 +1,89 @@
+from collections import Counter
+from typing import Any, Sequence, Tuple
+
+__all__ = ["ngrams", "lcs", "modified_precision"]
+
+
+def ngrams(sequence: Sequence[Any], n: int) -> Counter:
+    """
+    Generate the ngrams from a sequence of items
+
+    Args:
+        sequence: sequence of items
+        n: n-gram order
+
+    Returns:
+        A counter of ngram objects
+
+    .. versionadded:: 0.5.0
+    """
+    return Counter([tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)])
+
+
+def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
+    """
+    Compute the length of the longest common subsequence in two sequences of items
+    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
+
+    Args:
+        seq_a: first sequence of items
+        seq_b: second sequence of items
+
+    Returns:
+        The length of the longest common subsequence
+
+    .. versionadded:: 0.5.0
+    """
+    m = len(seq_a)
+    n = len(seq_b)
+
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0 or j == 0:
+                dp[i][j] = 0
+            elif seq_a[i - 1] == seq_b[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+    return dp[m][n]
+
+
+def modified_precision(references: Sequence[Sequence[Any]], candidate: Any, n: int) -> Tuple[int, int]:
+    r"""
+    Compute the modified precision
+
+    .. math::
+       p_{n} = \frac{m_{n}}{l_{n}}
+
+    where :math:`m_{n}` is the number of matched n-grams between translation T and its reference R, and
+    :math:`l_{n}` is the total number of n-grams in the translation T.
+
+    More details can be found in `Papineni et al. 2002`__.
+
+    __ https://www.aclweb.org/anthology/P02-1040.pdf
+
+    Args:
+        references: list of references R
+        candidate: translation T
+        n: n-gram order
+
+    Returns:
+        The clipped counts of matched n-grams and the total counts of n-grams in the candidate
+
+    .. versionadded:: 0.5.0
+    """
+    # ngrams of the candidate
+    counts = ngrams(candidate, n)
+
+    # union of ngrams of references
+    max_counts: Counter = Counter()
+    for reference in references:
+        max_counts |= ngrams(reference, n)
+
+    # clipped count of the candidate and references
+    clipped_counts = counts & max_counts
+
+    return sum(clipped_counts.values()), sum(counts.values())
diff --git a/tests/ignite/metrics/nlp/__init__.py b/tests/ignite/metrics/nlp/__init__.py
new file mode 100644
index 00000000000..e12bafd8508
--- /dev/null
+++ b/tests/ignite/metrics/nlp/__init__.py
@@ -0,0 +1,61 @@
+__all__ = ["CorpusForTest"]
+
+
+class CorpusForTest:
+    def __init__(self, lower_split=False):
+        def preproc(text):
+            if lower_split:
+                return text.lower().split()
+            else:
+                return text
+
+        # BLEU Paper examples
+        self.cand_1 = preproc("the the the the the the the")
+        self.ref_1a = preproc("The cat is on the mat")
+        self.ref_1b = preproc("There is a cat on the mat")
+
+        self.cand_2a = preproc(
+            "It is a guide to action which ensures that the military always obeys the commands of the party"
+        )
+        self.cand_2b = preproc("It is to insure the troops forever hearing the activity guidebook that " "party direct")
+        self.ref_2a = preproc(
+            "It is a guide to action that ensures that the military will forever heed " "Party commands"
+        )
+        self.ref_2b = preproc(
+            "It is the guiding principle which guarantees the military forces always being under the command of "
+            "the Party"
+        )
+        self.ref_2c = preproc("It is the practical guide for the army always to heed the directions of the party")
+
+        self.cand_3 = preproc("of the")
+
+        self.references_1 = [self.ref_1a, self.ref_1b]
+        self.references_2 = [self.ref_2a, self.ref_2b, self.ref_2c]
+
+        self.sample_1 = ([self.cand_1], [self.references_1])
+        self.sample_2 = ([self.cand_3], [self.references_2])
+        self.sample_3 = ([self.cand_2a], [self.references_2])
+        self.sample_4 = ([self.cand_2b], [self.references_2])
+        self.sample_5 = ([self.cand_2a, self.cand_2b], [self.references_2, self.references_2])
+
+        self.references_3 = [self.ref_2a, self.ref_2b]
+        self.references_4 = [self.ref_2b, self.ref_2c]
+        self.references_5 = [self.ref_2a, self.ref_2c]
+
+        self.chunks = [
+            (self.cand_1, self.references_1),
+            (self.cand_2a, self.references_2),
+            (self.cand_2b, self.references_2),
+            (self.cand_1, [self.ref_1a]),
+            (self.cand_2a, self.references_3),
+            (self.cand_2b, self.references_3),
+            (self.cand_1, [self.ref_1b]),
+            (self.cand_2a, self.references_4),
+            (self.cand_2b, self.references_4),
+            (self.cand_1, self.references_1),
+            (self.cand_2a, self.references_5),
+            (self.cand_2b, self.references_5),
+            (self.cand_1, [self.ref_1a]),
+            (self.cand_2a, [self.ref_2a]),
+            (self.cand_2b, [self.ref_2c]),
+        ]
diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
new file mode 100644
index 00000000000..c98143cf98d
--- /dev/null
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -0,0 +1,244 @@
+import os
+import warnings
+
+import pytest
+import torch
+from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
+
+import ignite.distributed as idist
+from ignite.exceptions import NotComputableError
+from ignite.metrics.nlp import Bleu
+
+from . import CorpusForTest
+
+corpus = CorpusForTest(lower_split=True)
+
+
+def test_wrong_inputs():
+
+    with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
+        Bleu(ngram=0)
+
+    with pytest.raises(ValueError, match=r"Smooth is not valid"):
+        Bleu(smooth="fake")
+
+    with pytest.raises(ValueError, match=r"nb of candidates should be equal to nb of reference lists"):
+        Bleu()._corpus_bleu(references=[[0], [0]], candidates=[[0]])
+
+    with pytest.raises(NotComputableError):
+        Bleu().compute()
+
+
+@pytest.mark.parametrize(
+    "candidate, references",
+    [
+        ([["a"], ["a"]]),
+        ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]),
+        corpus.sample_1,
+        corpus.sample_2,
+        corpus.sample_3,
+        corpus.sample_4,
+    ],
+)
+def test_corpus_bleu(candidate, references):
+    print(candidate, references)
+    for i in range(1, 8):
+        weights = tuple([1 / i] * i)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            reference = corpus_bleu(references, candidate, weights=weights)
+        bleu = Bleu(ngram=i)
+        assert pytest.approx(reference) == bleu._corpus_bleu(references, candidate)
+        bleu.update((candidate[0], references[0]))
+        assert pytest.approx(reference) == bleu.compute()
+
+
+@pytest.mark.parametrize(
+    "candidate, references",
+    [
+        ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]),
+        corpus.sample_1,
+        corpus.sample_2,
+        corpus.sample_3,
+        corpus.sample_4,
+    ],
+)
+def test_corpus_bleu_smooth1(candidate, references):
+    for i in range(1, 8):
+        weights = tuple([1 / i] * i)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            reference = corpus_bleu(
+                references, candidate, weights=weights, smoothing_function=SmoothingFunction().method1
+            )
+        bleu = Bleu(ngram=i, smooth="smooth1")
+        assert reference == bleu._corpus_bleu(references, candidate)
+        bleu.update((candidate[0], references[0]))
+        assert reference == bleu.compute()
+
+
+@pytest.mark.parametrize(
+    "candidate, references",
+    [
+        ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]),
+        corpus.sample_1,
+        corpus.sample_2,
+        corpus.sample_3,
+        corpus.sample_4,
+    ],
+)
+def test_corpus_bleu_nltk_smooth2(candidate, references):
+    for i in range(1, 8):
+        weights = tuple([1 / i] * i)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            reference = corpus_bleu(
+                references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2
+            )
+        bleu = Bleu(ngram=i, smooth="nltk_smooth2")
+        assert reference == bleu._corpus_bleu(references, candidate)
+        bleu.update((candidate[0], references[0]))
+        assert reference == bleu.compute()
+
+
+@pytest.mark.parametrize(
+    "candidate, references",
+    [
+        ([["a", "a", "a", "b", "c"]], [[["a", "b", "c"], ["a", "a", "d"]]]),
+        corpus.sample_1,
+        corpus.sample_2,
+        corpus.sample_3,
+        corpus.sample_4,
+    ],
+)
+def test_corpus_bleu_smooth2(candidate, references):
+    for i in range(1, 3):
+        weights = tuple([1 / i] * i)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            reference = corpus_bleu(
+                references, candidate, weights=weights, smoothing_function=SmoothingFunction().method2
+            )
+        bleu = Bleu(ngram=i, smooth="smooth2")
+        assert reference == bleu._corpus_bleu(references, candidate)
+        bleu.update((candidate[0], references[0]))
+        assert reference == bleu.compute()
+
+
+def test_bleu():
+    bleu = Bleu(ngram=4, smooth="smooth2")
+    bleu.update((corpus.cand_1, corpus.references_1))
+    bleu.update((corpus.cand_2a, corpus.references_2))
+    bleu.update((corpus.cand_2b, corpus.references_2))
+    bleu.update((corpus.cand_3, corpus.references_2))
+    value = bleu._corpus_bleu([corpus.references_1], [corpus.cand_1])
+    value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2a])
+    value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_2b])
+    value += bleu._corpus_bleu([corpus.references_2], [corpus.cand_3])
+    assert bleu.compute() == value / 4
+
+
+def _test_distrib_integration(device):
+
+    from ignite.engine import Engine
+
+    rank = idist.get_rank()
+
+    size = len(corpus.chunks)
+
+    data = []
+    for c in corpus.chunks:
+        data += idist.get_world_size() * [c]
+
+    def update(_, i):
+        return data[i + size * rank]
+
+    def _test(metric_device):
+        engine = Engine(update)
+        m = Bleu(ngram=4, smooth="smooth2")
+        m.attach(engine, "bleu")
+
+        engine.run(data=list(range(size)), max_epochs=1)
+
+        assert "bleu" in engine.state.metrics
+
+        ref_bleu = 0
+        for candidate, references in data:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                ref_bleu += corpus_bleu(
+                    [references],
+                    [candidate],
+                    weights=[0.25, 0.25, 0.25, 0.25],
+                    smoothing_function=SmoothingFunction().method2,
+                )
+
+        assert pytest.approx(engine.state.metrics["bleu"]) == ref_bleu / len(data)
+
+    _test("cpu")
+
+    if device.type != "xla":
+        _test(idist.device())
+
+
+@pytest.mark.distributed
+@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
+def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
+    device = torch.device(f"cuda:{local_rank}")
+    _test_distrib_integration(device)
+
+
+@pytest.mark.distributed
+@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
+def test_distrib_cpu(distributed_context_single_node_gloo):
+    device = torch.device("cpu")
+    _test_distrib_integration(device)
+
+
+@pytest.mark.distributed
+@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
+@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
+def test_distrib_hvd(gloo_hvd_executor):
+
+    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
+    nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
+
+    gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True)
+
+
+@pytest.mark.multinode_distributed
+@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
+@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
+def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
+    device = torch.device("cpu")
+    _test_distrib_integration(device)
+
+
+@pytest.mark.multinode_distributed
+@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
+@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
+def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
+    device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}")
+    _test_distrib_integration(device)
+
+
+@pytest.mark.tpu
+@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
+@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
+def test_distrib_single_device_xla():
+    device = idist.device()
+    _test_distrib_integration(device)
+
+
+def _test_distrib_xla_nprocs(index):
+    device = idist.device()
+    _test_distrib_integration(device)
+
+
+@pytest.mark.tpu
+@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars")
+@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
+def test_distrib_xla_nprocs(xmp_executor):
+    n = int(os.environ["NUM_TPU_WORKERS"])
+    xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n)
diff --git a/tests/ignite/metrics/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py
similarity index 75%
rename from tests/ignite/metrics/test_rouge.py
rename to tests/ignite/metrics/nlp/test_rouge.py
index b8c5bc6e6f3..40aafae189c 100644
--- a/tests/ignite/metrics/test_rouge.py
+++ b/tests/ignite/metrics/nlp/test_rouge.py
@@ -7,36 +7,14 @@
 import ignite.distributed as idist
 from ignite.exceptions import NotComputableError
-from ignite.metrics import Rouge
-from ignite.metrics.rouge import RougeL, RougeN, compute_ngram_scores, lcs, ngrams
+from ignite.metrics.nlp import Rouge
+from ignite.metrics.nlp.rouge import RougeL, RougeN, compute_ngram_scores
+
+from . import CorpusForTest
 
 nltk.download("punkt")
 
-
-@pytest.mark.parametrize(
-    "sequence, n, expected_keys, expected_values",
-    [
-        ([], 1, [], []),
-        ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]),
-        ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],),
-        ([0, 1, 2], 3, [(0, 1, 2)], [1]),
-        ([0, 0, 0], 1, [(0,)], [3]),
-        ([0, 0, 0], 2, [(0, 0)], [2]),
-        ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]),
-    ],
-)
-def test_ngrams(sequence, n, expected_keys, expected_values):
-    ngrams_counter = ngrams(sequence=sequence, n=n)
-    assert list(ngrams_counter.values()) == expected_values
-    assert list(ngrams_counter.keys()) == expected_keys
-
-
-@pytest.mark.parametrize(
-    "seq_a, seq_b, expected",
-    [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),],
-)
-def test_lcs(seq_a, seq_b, expected):
-    assert lcs(seq_a, seq_b) == expected
+corpus = CorpusForTest()
 
 
 @pytest.mark.parametrize(
@@ -61,7 +39,7 @@ def test_compute_ngram_scores(candidate, reference, n, expected_precision, expec
 
 
 def test_wrong_inputs():
-    with pytest.raises(ValueError, match=r"ngram order must be greater than one"):
+    with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         RougeN(ngram=0)
 
     with pytest.raises(ValueError, match=r"alpha must be in interval \[0, 1\]"):
@@ -106,31 +84,8 @@ def test_rouge_n_alpha(ngram, candidate, reference, expected):
         assert results[f"Rouge-{ngram}-F"] == F
 
 
-# BLEU Paper examples
-CAND_1 = "the the the the the the the"
-REF_1A = "The cat is on the mat"
-REF_1B = "There is a cat on the mat"
-
-CAND_2A = "It is a guide to action which ensures that the military always obeys the " "commands of the party"
-CAND_2B = "It is to insure the troops forever hearing the activity guidebook that " "party direct"
-REF_2A = "It is a guide to action that ensures that the military will forever heed " "Party commands"
-REF_2B = (
-    "It is the guiding principle which guarantees the military forces always being under the " "command of the Party"
-)
-REF_2C = "It is the practical guide for the army always to heed the directions of the " "party"
-
-CAND_3 = "of the"
-
-
 @pytest.mark.parametrize(
-    "candidates, references",
-    [
-        ([CAND_1], [[REF_1A, REF_1B]]),
-        ([CAND_3], [[REF_2A, REF_2B, REF_2C]]),
-        ([CAND_2A], [[REF_2A, REF_2B, REF_2C]]),
-        ([CAND_2B], [[REF_2A, REF_2B, REF_2C]]),
-        ([CAND_2A, CAND_2B], [[REF_2A, REF_2B, REF_2C], [REF_2A, REF_2B, REF_2C]]),
-    ],
+    "candidates, references", [corpus.sample_1, corpus.sample_2, corpus.sample_3, corpus.sample_4, corpus.sample_5,],
 )
 def test_rouge_metrics(candidates, references):
     for multiref in ["average", "best"]:
@@ -171,28 +126,10 @@ def _test_distrib_integration(device):
 
     rank = idist.get_rank()
 
-    chunks = [
-        (CAND_1, [REF_1A, REF_1B]),
-        (CAND_2A, [REF_2A, REF_2B, REF_2C]),
-        (CAND_2B, [REF_2A, REF_2B, REF_2C]),
-        (CAND_1, [REF_1A]),
-        (CAND_2A, [REF_2A, REF_2B]),
-        (CAND_2B, [REF_2A, REF_2B]),
-        (CAND_1, [REF_1B]),
-        (CAND_2A, [REF_2B, REF_2C]),
-        (CAND_2B, [REF_2B, REF_2C]),
-        (CAND_1, [REF_1A, REF_1B]),
-        (CAND_2A, [REF_2A, REF_2C]),
-        (CAND_2B, [REF_2A, REF_2C]),
-        (CAND_1, [REF_1A]),
-        (CAND_2A, [REF_2A]),
-        (CAND_2B, [REF_2C]),
-    ]
-
-    size = len(chunks)
+    size = len(corpus.chunks)
 
     data = []
-    for c in chunks:
+    for c in corpus.chunks:
         data += idist.get_world_size() * [c]
 
     def update(_, i):
diff --git a/tests/ignite/metrics/nlp/test_utils.py b/tests/ignite/metrics/nlp/test_utils.py
new file mode 100644
index 00000000000..8cf267a68bd
--- /dev/null
+++ b/tests/ignite/metrics/nlp/test_utils.py
@@ -0,0 +1,57 @@
+import pytest
+
+from ignite.metrics.nlp.utils import lcs, modified_precision, ngrams
+
+
+@pytest.mark.parametrize(
+    "sequence, n, expected_keys, expected_values",
+    [
+        ([], 1, [], []),
+        ([0, 1, 2], 1, [(0,), (1,), (2,)], [1, 1, 1]),
+        ([0, 1, 2], 2, [(0, 1,), (1, 2,),], [1, 1],),
+        ([0, 1, 2], 3, [(0, 1, 2)], [1]),
+        ([0, 0, 0], 1, [(0,)], [3]),
+        ([0, 0, 0], 2, [(0, 0)], [2]),
+        ("abcde", 4, [("a", "b", "c", "d"), ("b", "c", "d", "e")], [1, 1]),
+    ],
+)
+def test_ngrams(sequence, n, expected_keys, expected_values):
+    ngrams_counter = ngrams(sequence=sequence, n=n)
+    assert list(ngrams_counter.values()) == expected_values
+    assert list(ngrams_counter.keys()) == expected_keys
+
+
+@pytest.mark.parametrize(
+    "seq_a, seq_b, expected",
+    [([], [], 0), ([0, 1, 2], [0, 1, 2], 3), ([0, 1, 2], [0, 3, 2], 2), ("academy", "abracadabra", 4),],
+)
+def test_lcs(seq_a, seq_b, expected):
+    assert lcs(seq_a, seq_b) == expected
+
+
+def test_modified_precision_empty():
+    for k in range(1, 5):
+        n, d = modified_precision([[]], [], k)
+        assert n == 0 and d == 0
+        n, d = modified_precision([[]], [0], k)
+        assert n == 0 and d == (k == 1)
+        n, d = modified_precision([[0]], [], k)
+        assert n == 0 and d == 0
+        n, d = modified_precision([[]], list(range(k)), k)
+        assert n == 0 and d == 1
+        n, d = modified_precision([list(range(k))], [], k)
+        assert n == 0 and d == 0
+
+
+@pytest.mark.parametrize(
+    "references, candidate, expected",
+    [
+        ([[0, 0, 0], [1, 2]], [1, 2, 3, 4], ((2, 4), (1, 3), (0, 2))),
+        ([[0, 1, 2], [0, 0, 3]], [0, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))),
+        ([[0, 1, 2], [3, 0, 3]], [3, 0, 0, 1, 2], ((4, 5), (3, 4), (1, 3))),
+    ],
+)
+def test_modified_precision(references, candidate, expected):
+    for n, (e_n, e_d) in enumerate(expected, start=1):
+        n, d = modified_precision(references, candidate, n)
+        assert n == e_n and d == e_d
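
For review context, a minimal usage sketch of the new metric (not part of the diff): it shows how the Bleu class added above could be attached to an ignite Engine, mirroring how the existing Rouge metrics are used. The dummy process_function, its hard-coded sentences, and the "bleu" metric name are assumptions made only for illustration; only the Bleu constructor arguments, the (y_pred, y) update format, and attach() come from the code in this PR.

# Illustrative sketch, assuming the Bleu metric from this PR is importable
# as ignite.metrics.nlp.Bleu.
from ignite.engine import Engine
from ignite.metrics.nlp import Bleu


def process_function(engine, batch):
    # Stand-in evaluation step: a real one would run a model and return
    # (tokenized candidate, list of tokenized references) per sentence.
    y_pred = "the cat sat on the mat".split()
    y = ["the cat is on the mat".split(), "there is a cat on the mat".split()]
    return y_pred, y


evaluator = Engine(process_function)

# Bleu.update expects (y_pred, y): one tokenized candidate and a list of
# tokenized references, as in the class docstring above.
Bleu(ngram=4, smooth="smooth1").attach(evaluator, "bleu")

state = evaluator.run([0], max_epochs=1)
print(state.metrics["bleu"])

Note that, per the implementation above, update() is called once per sentence and compute() returns the average of the per-sentence BLEU values, which can differ from nltk's corpus_bleu computed over the whole corpus at once.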