Showing 13 changed files with 0 additions and 6984 deletions.
commit_suggester.py (deleted, mode 100644 → 0)
1 | -# Copyright 2020-present Tae Hwan Jung | ||
2 | -# | ||
3 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | -# you may not use this file except in compliance with the License. | ||
5 | -# You may obtain a copy of the License at | ||
6 | -# | ||
7 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | -# | ||
9 | -# Unless required by applicable law or agreed to in writing, software | ||
10 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | -# See the License for the specific language governing permissions and | ||
13 | -# limitations under the License. | ||
14 | - | ||
15 | -import torch | ||
16 | -import argparse | ||
17 | -import subprocess | ||
18 | -from transformers import AutoTokenizer | ||
19 | - | ||
20 | -from preprocess import diff_parse, truncate | ||
21 | -from train import BartForConditionalGeneration | ||
22 | - | ||
23 | -def get_length(chunks): | ||
24 | - cnt = 0 | ||
25 | - for chunk in chunks: | ||
26 | - cnt += len(chunk) | ||
27 | - return cnt | ||
28 | - | ||
29 | -def suggester(chunks, model, tokenizer, device): | ||
30 | - max_source_length = get_length(chunks) | ||
31 | - | ||
32 | - input_ids, attention_masks, patch_ids = zip(*chunks) | ||
33 | - input_ids = torch.LongTensor( | ||
34 | - [truncate(input_ids, max_source_length, value=0)] | ||
35 | - ).to(device) | ||
36 | - attention_masks = torch.LongTensor( | ||
37 | - [truncate(attention_masks, max_source_length, value=1)] | ||
38 | - ).to(device) | ||
39 | - patch_ids = torch.LongTensor( | ||
40 | - [truncate(patch_ids, max_source_length, value=0)] | ||
41 | - ).to(device) | ||
42 | - | ||
43 | - summaries = model.generate( | ||
44 | - input_ids=input_ids, patch_ids=patch_ids, attention_mask=attention_masks | ||
45 | - ) | ||
46 | - return tokenizer.batch_decode( | ||
47 | - summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False | ||
48 | - ) | ||
49 | - | ||
50 | - | ||
51 | -def main(args): | ||
52 | - device = torch.device( | ||
53 | - "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" | ||
54 | - ) | ||
55 | - model = BartForConditionalGeneration.from_pretrained(args.output_dir).to(device) | ||
56 | - | ||
57 | - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) | ||
58 | - | ||
59 | - if args.unittest: | ||
60 | - with open("test.source", "r") as f: | ||
61 | - chunks = diff_parse(f.read(), tokenizer) | ||
62 | - else: | ||
63 | - proc = subprocess.Popen(["git", "diff", "--cached"], stdout=subprocess.PIPE) | ||
64 | - staged_files = proc.stdout.readlines() | ||
65 | - staged_files = [f.decode("utf-8") for f in staged_files] | ||
66 | - staged_files = [f.strip() for f in staged_files] | ||
67 | - chunks = "\n".join(staged_files) | ||
68 | - | ||
69 | - chunks = diff_parse(chunks, tokenizer) | ||
70 | - if not chunks: | ||
71 | - print('There is no file in staged state.') | ||
72 | - return | ||
73 | - | ||
74 | - commit_message = suggester( | ||
75 | - chunks, | ||
76 | - model=model, | ||
77 | - tokenizer=tokenizer, | ||
78 | - device=device, | ||
79 | - ) | ||
80 | - print(commit_message) | ||
81 | - | ||
82 | - | ||
83 | -if __name__ == "__main__": | ||
84 | - parser = argparse.ArgumentParser(description="Code to collect commits on github") | ||
85 | - parser.add_argument( | ||
86 | - "--no_cuda", action="store_true", help="Whether not to use CUDA when available" | ||
87 | - ) | ||
88 | - parser.add_argument( | ||
89 | - "--unittest", action="store_true", help="Unittest with an one batch git diff" | ||
90 | - ) | ||
91 | - parser.add_argument( | ||
92 | - "--output_dir", | ||
93 | - type=str, | ||
94 | - required=True, | ||
95 | - help="The output directory where the model predictions and checkpoints will be written.", | ||
96 | - ) | ||
97 | - parser.add_argument( | ||
98 | - "--tokenizer_name", | ||
99 | - default="sshleifer/distilbart-xsum-6-6", | ||
100 | - type=str, | ||
101 | - help="Pretrained tokenizer name or path if not the same as model_name", | ||
102 | - ) | ||
103 | - args = parser.parse_args() | ||
104 | - | ||
105 | - main(args) |
preprocess/__init__.py (deleted, mode 100644 → 0)
1 | -# Copyright 2020-present Tae Hwan Jung | ||
2 | -# | ||
3 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | -# you may not use this file except in compliance with the License. | ||
5 | -# You may obtain a copy of the License at | ||
6 | -# | ||
7 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | -# | ||
9 | -# Unless required by applicable law or agreed to in writing, software | ||
10 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | -# See the License for the specific language governing permissions and | ||
13 | -# limitations under the License. | ||
14 | - | ||
15 | -from .gitcommit import diff_parse, truncate | ||
16 | - | ||
17 | -__all__ = [ | ||
18 | - "diff_parse", | ||
19 | - "truncate", | ||
20 | -] |
preprocess/gitcommit.py (deleted, mode 100644 → 0)
1 | -# Copyright 2020-present Tae Hwan Jung | ||
2 | -# | ||
3 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | -# you may not use this file except in compliance with the License. | ||
5 | -# You may obtain a copy of the License at | ||
6 | -# | ||
7 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | -# | ||
9 | -# Unless required by applicable law or agreed to in writing, software | ||
10 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | -# See the License for the specific language governing permissions and | ||
13 | -# limitations under the License. | ||
14 | - | ||
15 | -import os | ||
16 | -import re | ||
17 | -import enum | ||
18 | -import random | ||
19 | -import logging | ||
20 | -import tempfile | ||
21 | -import argparse | ||
22 | -import numpy as np | ||
23 | -from tqdm import * | ||
24 | -import whatthepatch | ||
25 | -from git import Repo | ||
26 | -from functools import partial | ||
27 | -from multiprocessing.pool import Pool | ||
28 | -from transformers import AutoTokenizer | ||
29 | - | ||
30 | -from matorage import * | ||
31 | - | ||
32 | -logger = logging.getLogger(__name__) # pylint: disable=invalid-name | ||
33 | -logging.basicConfig( | ||
34 | - format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", | ||
35 | - datefmt="%m/%d/%Y %H:%M:%S", | ||
36 | - level=logging.INFO, | ||
37 | -) | ||
38 | - | ||
39 | - | ||
40 | -class PATCH(enum.Enum): | ||
41 | - PLUS = 1 | ||
42 | - MINUS = 2 | ||
43 | - | ||
44 | - | ||
45 | -def truncate(tuple, max_length, value=0): | ||
46 | - ls = [] | ||
47 | - for t in tuple: | ||
48 | - if isinstance(t, int): | ||
49 | - t = [t] | ||
50 | - ls.extend(t) | ||
51 | - ls = ls[: max_length - 1] | ||
52 | - ls.insert(0, value) | ||
53 | - if len(ls) < max_length: | ||
54 | - ls.extend([0] * (max_length - len(ls))) | ||
55 | - assert len(ls) == max_length | ||
56 | - return ls | ||
57 | - | ||
58 | - | ||
59 | -def encode_line(tokenizer, line, patch): | ||
60 | - line = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", line).strip() | ||
61 | - tokens = tokenizer.tokenize(line) | ||
62 | - tokens = tokenizer.convert_tokens_to_ids(tokens) | ||
63 | - return (tokens, [1] * len(tokens), len(tokens) * [patch.value]) | ||
64 | - | ||
65 | - | ||
66 | -def diff_parse(diff, tokenizer): | ||
67 | - chunks = [] | ||
68 | - for diff in whatthepatch.parse_patch(diff): | ||
69 | - if diff.header.old_path != diff.header.new_path: | ||
70 | - chunks.append(encode_line(tokenizer, diff.header.old_path, PATCH.MINUS)) | ||
71 | - chunks.append(encode_line(tokenizer, diff.header.new_path, PATCH.PLUS)) | ||
72 | - if not diff.changes: | ||
73 | - continue | ||
74 | - for change in diff.changes: | ||
75 | - if change.old == None and change.new != None: | ||
76 | - chunks.append(encode_line(tokenizer, change.line, PATCH.PLUS)) | ||
77 | - elif change.old != None and change.new == None: | ||
78 | - chunks.append(encode_line(tokenizer, change.line, PATCH.MINUS)) | ||
79 | - return chunks | ||
80 | - | ||
81 | - | ||
82 | -def sha_parse(sha, tokenizer, max_length=1024): | ||
83 | - | ||
84 | - chunks = diff_parse(diff=repo.git.show(sha), tokenizer=tokenizer) | ||
85 | - if not chunks: | ||
86 | - return None | ||
87 | - | ||
88 | - input_ids, attention_masks, patch_ids = zip(*chunks) | ||
89 | - input_ids = truncate(input_ids, max_length, value=0) | ||
90 | - attention_masks = truncate(attention_masks, max_length, value=1) | ||
91 | - patch_ids = truncate(patch_ids, max_length, value=0) | ||
92 | - | ||
93 | - return (input_ids, attention_masks, patch_ids) | ||
94 | - | ||
95 | - | ||
96 | -def message_parse(msg, tokenizer, max_length=56): | ||
97 | - msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg) | ||
98 | - | ||
99 | - msg = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", msg).strip() | ||
100 | - msg = tokenizer.tokenize(msg) | ||
101 | - msg = tokenizer.convert_tokens_to_ids(msg) | ||
102 | - msg = truncate(msg, max_length, value=0) | ||
103 | - | ||
104 | - return msg | ||
105 | - | ||
106 | - | ||
107 | -def jobs(sha_msgs, args, data_config, train=True): | ||
108 | - | ||
109 | - input_ids, attention_masks, patch_ids, targets = [], [], [], [] | ||
110 | - data_saver = DataSaver(config=data_config) | ||
111 | - | ||
112 | - for sha_msg in sha_msgs: | ||
113 | - sha, msg = sha_msg | ||
114 | - | ||
115 | - source = sha_parse( | ||
116 | - sha, tokenizer=args.tokenizer, max_length=args.max_source_length | ||
117 | - ) | ||
118 | - if not source: | ||
119 | - continue | ||
120 | - input_id, attention_mask, patch_id = source | ||
121 | - target = message_parse( | ||
122 | - msg, | ||
123 | - tokenizer=args.tokenizer, | ||
124 | - max_length=( | ||
125 | - args.max_target_length if train else args.val_max_target_length | ||
126 | - ), | ||
127 | - ) | ||
128 | - | ||
129 | - input_ids.append(input_id) | ||
130 | - attention_masks.append(attention_mask) | ||
131 | - patch_ids.append(patch_id) | ||
132 | - targets.append(target) | ||
133 | - | ||
134 | - data_saver( | ||
135 | - { | ||
136 | - "input_ids": np.asarray(input_ids), | ||
137 | - "attention_masks": np.asarray(attention_masks), | ||
138 | - "patch_ids": np.asarray(patch_ids), | ||
139 | - "targets": np.asarray(targets), | ||
140 | - } | ||
141 | - ) | ||
142 | - data_saver.disconnect() | ||
143 | - | ||
144 | - | ||
145 | -def start(chunked_sha_msgs, train=True): | ||
146 | - | ||
147 | - logger.info(f"Start %s pre-processing" % ("training" if train else "evaluation")) | ||
148 | - | ||
149 | - max_target_length = args.max_target_length if train else args.val_max_target_length | ||
150 | - | ||
151 | - data_config = DataConfig( | ||
152 | - endpoint=args.endpoint, | ||
153 | - access_key=os.environ["access_key"], | ||
154 | - secret_key=os.environ["secret_key"], | ||
155 | - region=args.region, | ||
156 | - dataset_name="commit-autosuggestions", | ||
157 | - additional={ | ||
158 | - "mode": ("training" if train else "evaluation"), | ||
159 | - "max_source_length": args.max_source_length, | ||
160 | - "max_target_length": max_target_length, | ||
161 | - "url": args.url, | ||
162 | - }, | ||
163 | - attributes=[ | ||
164 | - ("input_ids", "int32", (args.max_source_length,)), | ||
165 | - ("attention_masks", "int32", (args.max_source_length,)), | ||
166 | - ("patch_ids", "int32", (args.max_source_length,)), | ||
167 | - ("targets", "int32", (max_target_length,)), | ||
168 | - ], | ||
169 | - ) | ||
170 | - | ||
171 | - func = partial(jobs, args=args, data_config=data_config, train=train) | ||
172 | - with Pool(processes=args.num_workers) as pool: | ||
173 | - with tqdm(total=len(chunked_sha_msgs)) as pbar: | ||
174 | - for i, _ in tqdm(enumerate(pool.imap_unordered(func, chunked_sha_msgs))): | ||
175 | - pbar.update() | ||
176 | - | ||
177 | - | ||
178 | -def main(args): | ||
179 | - if "access_key" not in os.environ or "secret_key" not in os.environ: | ||
180 | - raise OSError("access_key or secret_key are not found.") | ||
181 | - | ||
182 | - sha_msgs = [(c.hexsha, c.summary) for c in repo.iter_commits()] | ||
183 | - random.shuffle(sha_msgs) | ||
184 | - chunked_sha_msgs = [ | ||
185 | - sha_msgs[x : x + args.matorage_batch] | ||
186 | - for x in range(0, len(sha_msgs), args.matorage_batch) | ||
187 | - ] | ||
188 | - | ||
189 | - barrier = int(len(chunked_sha_msgs) * (1 - args.p_val)) | ||
190 | - if args.do_train: | ||
191 | - start(chunked_sha_msgs[:barrier], train=True) | ||
192 | - if args.do_predict: | ||
193 | - start(chunked_sha_msgs[barrier:], train=False) | ||
194 | - | ||
195 | - | ||
196 | -if __name__ == "__main__": | ||
197 | - parser = argparse.ArgumentParser(description="Code to collect commits on github") | ||
198 | - parser.add_argument("--url", type=str, required=True, help="github url") | ||
199 | - parser.add_argument( | ||
200 | - "--endpoint", | ||
201 | - type=str, | ||
202 | - required=True, | ||
203 | - help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", | ||
204 | - ) | ||
205 | - parser.add_argument( | ||
206 | - "--region", | ||
207 | - type=str, | ||
208 | - default=None, | ||
209 | - help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", | ||
210 | - ) | ||
211 | - parser.add_argument( | ||
212 | - "--tokenizer_name", | ||
213 | - default="sshleifer/distilbart-xsum-6-6", | ||
214 | - type=str, | ||
215 | - help="Pretrained tokenizer name or path if not the same as model_name", | ||
216 | - ) | ||
217 | - parser.add_argument( | ||
218 | - "--matorage_batch", | ||
219 | - default=1024, | ||
220 | - type=int, | ||
221 | - help="The smallest batch size stored atomically in matorage.", | ||
222 | - ) | ||
223 | - parser.add_argument( | ||
224 | - "--num_workers", default=4, type=int, help="number of process", | ||
225 | - ) | ||
226 | - parser.add_argument( | ||
227 | - "--max_source_length", | ||
228 | - default=1024, | ||
229 | - type=int, | ||
230 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
231 | - "than this will be truncated, sequences shorter will be padded.", | ||
232 | - ) | ||
233 | - parser.add_argument( | ||
234 | - "--max_target_length", | ||
235 | - default=56, | ||
236 | - type=int, | ||
237 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
238 | - "than this will be truncated, sequences shorter will be padded.", | ||
239 | - ) | ||
240 | - parser.add_argument( | ||
241 | - "--val_max_target_length", | ||
242 | - default=142, # these defaults are optimized for CNNDM. For xsum, see README.md. | ||
243 | - type=int, | ||
244 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
245 | - "than this will be truncated, sequences shorter will be padded.", | ||
246 | - ) | ||
247 | - parser.add_argument( | ||
248 | - "--p_val", type=float, default=0.25, help="percent of validation dataset" | ||
249 | - ) | ||
250 | - parser.add_argument("--do_train", action="store_true", default=False) | ||
251 | - parser.add_argument("--do_predict", action="store_true", default=False) | ||
252 | - args = parser.parse_args() | ||
253 | - | ||
254 | - args.local_path = args.url.split("/")[-1] | ||
255 | - logger.info(f"master branch of {args.url} will be downloaded to {args.local_path}") | ||
256 | - repo = ( | ||
257 | - Repo(args.local_path) | ||
258 | - if os.path.exists(args.local_path) | ||
259 | - else Repo.clone_from(args.url, to_path=args.local_path, branch="master") | ||
260 | - ) | ||
261 | - args.tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) | ||
262 | - | ||
263 | - main(args) |
test.source (deleted, mode 100644 → 0)
1 | -commit b5a5268dabb2a4dea1c3c543a1ddff501b87a447 | ||
2 | -Author: jbrockmendel <jbrockmendel@gmail.com> | ||
3 | -Date: Tue Sep 8 18:33:41 2020 -0700 | ||
4 | - | ||
5 | - STY: De-privatize imported names (#36235) | ||
6 | - | ||
7 | -diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx | ||
8 | -index 931ad8326..f8bcbcfb1 100644 | ||
9 | ---- a/pandas/_libs/interval.pyx | ||
10 | -+++ b/pandas/_libs/interval.pyx | ||
11 | -@@ -46,7 +46,7 @@ from pandas._libs.tslibs.util cimport ( | ||
12 | - is_timedelta64_object, | ||
13 | - ) | ||
14 | - | ||
15 | --_VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) | ||
16 | -+VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) | ||
17 | - | ||
18 | - | ||
19 | - cdef class IntervalMixin: | ||
20 | -@@ -318,7 +318,7 @@ cdef class Interval(IntervalMixin): | ||
21 | - self._validate_endpoint(left) | ||
22 | - self._validate_endpoint(right) | ||
23 | - | ||
24 | -- if closed not in _VALID_CLOSED: | ||
25 | -+ if closed not in VALID_CLOSED: | ||
26 | - raise ValueError(f"invalid option for 'closed': {closed}") | ||
27 | - if not left <= right: | ||
28 | - raise ValueError("left side of interval must be <= right side") | ||
29 | -diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py | ||
30 | -index 4a33e0e84..c89f5554d 100644 | ||
31 | ---- a/pandas/core/arrays/_arrow_utils.py | ||
32 | -+++ b/pandas/core/arrays/_arrow_utils.py | ||
33 | -@@ -4,7 +4,7 @@ import json | ||
34 | - import numpy as np | ||
35 | - import pyarrow | ||
36 | - | ||
37 | --from pandas.core.arrays.interval import _VALID_CLOSED | ||
38 | -+from pandas.core.arrays.interval import VALID_CLOSED | ||
39 | - | ||
40 | - _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") | ||
41 | - | ||
42 | -@@ -83,7 +83,7 @@ if _pyarrow_version_ge_015: | ||
43 | - def __init__(self, subtype, closed): | ||
44 | - # attributes need to be set first before calling | ||
45 | - # super init (as that calls serialize) | ||
46 | -- assert closed in _VALID_CLOSED | ||
47 | -+ assert closed in VALID_CLOSED | ||
48 | - self._closed = closed | ||
49 | - if not isinstance(subtype, pyarrow.DataType): | ||
50 | - subtype = pyarrow.type_for_alias(str(subtype)) | ||
51 | -diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py | ||
52 | -index d76e0fd62..1dbd3cfc6 100644 | ||
53 | ---- a/pandas/core/arrays/interval.py | ||
54 | -+++ b/pandas/core/arrays/interval.py | ||
55 | -@@ -5,7 +5,12 @@ import numpy as np | ||
56 | - | ||
57 | - from pandas._config import get_option | ||
58 | - | ||
59 | --from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds | ||
60 | -+from pandas._libs.interval import ( | ||
61 | -+ VALID_CLOSED, | ||
62 | -+ Interval, | ||
63 | -+ IntervalMixin, | ||
64 | -+ intervals_to_interval_bounds, | ||
65 | -+) | ||
66 | - from pandas.compat.numpy import function as nv | ||
67 | - from pandas.util._decorators import Appender | ||
68 | - | ||
69 | -@@ -42,7 +47,6 @@ from pandas.core.construction import array | ||
70 | - from pandas.core.indexers import check_array_indexer | ||
71 | - from pandas.core.indexes.base import ensure_index | ||
72 | - | ||
73 | --_VALID_CLOSED = {"left", "right", "both", "neither"} | ||
74 | - _interval_shared_docs = {} | ||
75 | - | ||
76 | - _shared_docs_kwargs = dict( | ||
77 | -@@ -475,7 +479,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): | ||
78 | - * left and right have the same missing values | ||
79 | - * left is always below right | ||
80 | - """ | ||
81 | -- if self.closed not in _VALID_CLOSED: | ||
82 | -+ if self.closed not in VALID_CLOSED: | ||
83 | - msg = f"invalid option for 'closed': {self.closed}" | ||
84 | - raise ValueError(msg) | ||
85 | - if len(self.left) != len(self.right): | ||
86 | -@@ -1012,7 +1016,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): | ||
87 | - ) | ||
88 | - ) | ||
89 | - def set_closed(self, closed): | ||
90 | -- if closed not in _VALID_CLOSED: | ||
91 | -+ if closed not in VALID_CLOSED: | ||
92 | - msg = f"invalid option for 'closed': {closed}" | ||
93 | - raise ValueError(msg) | ||
94 | - | ||
95 | -diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py | ||
96 | -index e928db499..e9ff4b7d4 100644 | ||
97 | ---- a/pandas/core/arrays/sparse/__init__.py | ||
98 | -+++ b/pandas/core/arrays/sparse/__init__.py | ||
99 | -@@ -5,6 +5,6 @@ from pandas.core.arrays.sparse.array import ( | ||
100 | - BlockIndex, | ||
101 | - IntIndex, | ||
102 | - SparseArray, | ||
103 | -- _make_index, | ||
104 | -+ make_sparse_index, | ||
105 | - ) | ||
106 | - from pandas.core.arrays.sparse.dtype import SparseDtype | ||
107 | -diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py | ||
108 | -index 47c960dc9..853f7bb0b 100644 | ||
109 | ---- a/pandas/core/arrays/sparse/array.py | ||
110 | -+++ b/pandas/core/arrays/sparse/array.py | ||
111 | -@@ -1556,7 +1556,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy | ||
112 | - else: | ||
113 | - indices = mask.nonzero()[0].astype(np.int32) | ||
114 | - | ||
115 | -- index = _make_index(length, indices, kind) | ||
116 | -+ index = make_sparse_index(length, indices, kind) | ||
117 | - sparsified_values = arr[mask] | ||
118 | - if dtype is not None: | ||
119 | - sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) | ||
120 | -@@ -1564,7 +1564,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy | ||
121 | - return sparsified_values, index, fill_value | ||
122 | - | ||
123 | - | ||
124 | --def _make_index(length, indices, kind): | ||
125 | -+def make_sparse_index(length, indices, kind): | ||
126 | - | ||
127 | - if kind == "block" or isinstance(kind, BlockIndex): | ||
128 | - locs, lens = splib.get_blocks(indices) | ||
129 | -diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py | ||
130 | -index 0cdc0f530..77a378369 100644 | ||
131 | ---- a/pandas/core/computation/engines.py | ||
132 | -+++ b/pandas/core/computation/engines.py | ||
133 | -@@ -130,7 +130,7 @@ class PythonEngine(AbstractEngine): | ||
134 | - pass | ||
135 | - | ||
136 | - | ||
137 | --_engines: Dict[str, Type[AbstractEngine]] = { | ||
138 | -+ENGINES: Dict[str, Type[AbstractEngine]] = { | ||
139 | - "numexpr": NumExprEngine, | ||
140 | - "python": PythonEngine, | ||
141 | - } | ||
142 | -diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py | ||
143 | -index f6a793514..630606b4d 100644 | ||
144 | ---- a/pandas/core/computation/eval.py | ||
145 | -+++ b/pandas/core/computation/eval.py | ||
146 | -@@ -9,8 +9,8 @@ import warnings | ||
147 | - from pandas._libs.lib import no_default | ||
148 | - from pandas.util._validators import validate_bool_kwarg | ||
149 | - | ||
150 | --from pandas.core.computation.engines import _engines | ||
151 | --from pandas.core.computation.expr import Expr, _parsers | ||
152 | -+from pandas.core.computation.engines import ENGINES | ||
153 | -+from pandas.core.computation.expr import PARSERS, Expr | ||
154 | - from pandas.core.computation.parsing import tokenize_string | ||
155 | - from pandas.core.computation.scope import ensure_scope | ||
156 | - | ||
157 | -@@ -43,8 +43,8 @@ def _check_engine(engine: Optional[str]) -> str: | ||
158 | - if engine is None: | ||
159 | - engine = "numexpr" if NUMEXPR_INSTALLED else "python" | ||
160 | - | ||
161 | -- if engine not in _engines: | ||
162 | -- valid_engines = list(_engines.keys()) | ||
163 | -+ if engine not in ENGINES: | ||
164 | -+ valid_engines = list(ENGINES.keys()) | ||
165 | - raise KeyError( | ||
166 | - f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" | ||
167 | - ) | ||
168 | -@@ -75,9 +75,9 @@ def _check_parser(parser: str): | ||
169 | - KeyError | ||
170 | - * If an invalid parser is passed | ||
171 | - """ | ||
172 | -- if parser not in _parsers: | ||
173 | -+ if parser not in PARSERS: | ||
174 | - raise KeyError( | ||
175 | -- f"Invalid parser '{parser}' passed, valid parsers are {_parsers.keys()}" | ||
176 | -+ f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" | ||
177 | - ) | ||
178 | - | ||
179 | - | ||
180 | -@@ -341,7 +341,7 @@ def eval( | ||
181 | - parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) | ||
182 | - | ||
183 | - # construct the engine and evaluate the parsed expression | ||
184 | -- eng = _engines[engine] | ||
185 | -+ eng = ENGINES[engine] | ||
186 | - eng_inst = eng(parsed_expr) | ||
187 | - ret = eng_inst.evaluate() | ||
188 | - | ||
189 | -diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py | ||
190 | -index 8cff6abc0..f5897277d 100644 | ||
191 | ---- a/pandas/core/computation/expr.py | ||
192 | -+++ b/pandas/core/computation/expr.py | ||
193 | -@@ -782,7 +782,7 @@ class Expr: | ||
194 | - self.env = env or Scope(level=level + 1) | ||
195 | - self.engine = engine | ||
196 | - self.parser = parser | ||
197 | -- self._visitor = _parsers[parser](self.env, self.engine, self.parser) | ||
198 | -+ self._visitor = PARSERS[parser](self.env, self.engine, self.parser) | ||
199 | - self.terms = self.parse() | ||
200 | - | ||
201 | - @property | ||
202 | -@@ -814,4 +814,4 @@ class Expr: | ||
203 | - return frozenset(term.name for term in com.flatten(self.terms)) | ||
204 | - | ||
205 | - | ||
206 | --_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} | ||
207 | -+PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} | ||
208 | -diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py | ||
209 | -index 0c23f1b4b..bfe20551c 100644 | ||
210 | ---- a/pandas/core/config_init.py | ||
211 | -+++ b/pandas/core/config_init.py | ||
212 | -@@ -314,9 +314,9 @@ pc_latex_multirow = """ | ||
213 | - | ||
214 | - | ||
215 | - def table_schema_cb(key): | ||
216 | -- from pandas.io.formats.printing import _enable_data_resource_formatter | ||
217 | -+ from pandas.io.formats.printing import enable_data_resource_formatter | ||
218 | - | ||
219 | -- _enable_data_resource_formatter(cf.get_option(key)) | ||
220 | -+ enable_data_resource_formatter(cf.get_option(key)) | ||
221 | - | ||
222 | - | ||
223 | - def is_terminal() -> bool: | ||
224 | -diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py | ||
225 | -index 72003eab2..e870187fc 100644 | ||
226 | ---- a/pandas/core/groupby/generic.py | ||
227 | -+++ b/pandas/core/groupby/generic.py | ||
228 | -@@ -70,9 +70,9 @@ from pandas.core.groupby.groupby import ( | ||
229 | - GroupBy, | ||
230 | - _agg_template, | ||
231 | - _apply_docs, | ||
232 | -- _group_selection_context, | ||
233 | - _transform_template, | ||
234 | - get_groupby, | ||
235 | -+ group_selection_context, | ||
236 | - ) | ||
237 | - from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba | ||
238 | - from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same | ||
239 | -@@ -230,7 +230,7 @@ class SeriesGroupBy(GroupBy[Series]): | ||
240 | - raise NotImplementedError( | ||
241 | - "Numba engine can only be used with a single function." | ||
242 | - ) | ||
243 | -- with _group_selection_context(self): | ||
244 | -+ with group_selection_context(self): | ||
245 | - data = self._selected_obj | ||
246 | - result, index = self._aggregate_with_numba( | ||
247 | - data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs | ||
248 | -@@ -685,7 +685,7 @@ class SeriesGroupBy(GroupBy[Series]): | ||
249 | - self, normalize=False, sort=True, ascending=False, bins=None, dropna=True | ||
250 | - ): | ||
251 | - | ||
252 | -- from pandas.core.reshape.merge import _get_join_indexers | ||
253 | -+ from pandas.core.reshape.merge import get_join_indexers | ||
254 | - from pandas.core.reshape.tile import cut | ||
255 | - | ||
256 | - if bins is not None and not np.iterable(bins): | ||
257 | -@@ -787,7 +787,7 @@ class SeriesGroupBy(GroupBy[Series]): | ||
258 | - | ||
259 | - right = [diff.cumsum() - 1, codes[-1]] | ||
260 | - | ||
261 | -- _, idx = _get_join_indexers(left, right, sort=False, how="left") | ||
262 | -+ _, idx = get_join_indexers(left, right, sort=False, how="left") | ||
263 | - out = np.where(idx != -1, out[idx], 0) | ||
264 | - | ||
265 | - if sort: | ||
266 | -@@ -942,7 +942,7 @@ class DataFrameGroupBy(GroupBy[DataFrame]): | ||
267 | - raise NotImplementedError( | ||
268 | - "Numba engine can only be used with a single function." | ||
269 | - ) | ||
270 | -- with _group_selection_context(self): | ||
271 | -+ with group_selection_context(self): | ||
272 | - data = self._selected_obj | ||
273 | - result, index = self._aggregate_with_numba( | ||
274 | - data, func, *args, engine_kwargs=engine_kwargs, **kwargs | ||
275 | -diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py | ||
276 | -index 6ef2e6703..1e3e56f4f 100644 | ||
277 | ---- a/pandas/core/groupby/groupby.py | ||
278 | -+++ b/pandas/core/groupby/groupby.py | ||
279 | -@@ -459,9 +459,9 @@ class GroupByPlot(PandasObject): | ||
280 | - | ||
281 | - | ||
282 | - @contextmanager | ||
283 | --def _group_selection_context(groupby: "_GroupBy"): | ||
284 | -+def group_selection_context(groupby: "_GroupBy"): | ||
285 | - """ | ||
286 | -- Set / reset the _group_selection_context. | ||
287 | -+ Set / reset the group_selection_context. | ||
288 | - """ | ||
289 | - groupby._set_group_selection() | ||
290 | - try: | ||
291 | -@@ -737,7 +737,7 @@ b 2""", | ||
292 | - def _make_wrapper(self, name: str) -> Callable: | ||
293 | - assert name in self._apply_allowlist | ||
294 | - | ||
295 | -- with _group_selection_context(self): | ||
296 | -+ with group_selection_context(self): | ||
297 | - # need to setup the selection | ||
298 | - # as are not passed directly but in the grouper | ||
299 | - f = getattr(self._obj_with_exclusions, name) | ||
300 | -@@ -868,7 +868,7 @@ b 2""", | ||
301 | - # fails on *some* columns, e.g. a numeric operation | ||
302 | - # on a string grouper column | ||
303 | - | ||
304 | -- with _group_selection_context(self): | ||
305 | -+ with group_selection_context(self): | ||
306 | - return self._python_apply_general(f, self._selected_obj) | ||
307 | - | ||
308 | - return result | ||
309 | -@@ -994,7 +994,7 @@ b 2""", | ||
310 | - alias: str, | ||
311 | - npfunc: Callable, | ||
312 | - ): | ||
313 | -- with _group_selection_context(self): | ||
314 | -+ with group_selection_context(self): | ||
315 | - # try a cython aggregation if we can | ||
316 | - try: | ||
317 | - return self._cython_agg_general( | ||
318 | -@@ -1499,7 +1499,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): | ||
319 | - ) | ||
320 | - else: | ||
321 | - func = lambda x: x.var(ddof=ddof) | ||
322 | -- with _group_selection_context(self): | ||
323 | -+ with group_selection_context(self): | ||
324 | - return self._python_agg_general(func) | ||
325 | - | ||
326 | - @Substitution(name="groupby") | ||
327 | -@@ -1658,7 +1658,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): | ||
328 | - | ||
329 | - @doc(DataFrame.describe) | ||
330 | - def describe(self, **kwargs): | ||
331 | -- with _group_selection_context(self): | ||
332 | -+ with group_selection_context(self): | ||
333 | - result = self.apply(lambda x: x.describe(**kwargs)) | ||
334 | - if self.axis == 1: | ||
335 | - return result.T | ||
336 | -@@ -1963,7 +1963,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): | ||
337 | - nth_values = list(set(n)) | ||
338 | - | ||
339 | - nth_array = np.array(nth_values, dtype=np.intp) | ||
340 | -- with _group_selection_context(self): | ||
341 | -+ with group_selection_context(self): | ||
342 | - | ||
343 | - mask_left = np.in1d(self._cumcount_array(), nth_array) | ||
344 | - mask_right = np.in1d( | ||
345 | -@@ -2226,7 +2226,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): | ||
346 | - 5 0 | ||
347 | - dtype: int64 | ||
348 | - """ | ||
349 | -- with _group_selection_context(self): | ||
350 | -+ with group_selection_context(self): | ||
351 | - index = self._selected_obj.index | ||
352 | - result = self._obj_1d_constructor(self.grouper.group_info[0], index) | ||
353 | - if not ascending: | ||
354 | -@@ -2287,7 +2287,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): | ||
355 | - 5 0 | ||
356 | - dtype: int64 | ||
357 | - """ | ||
358 | -- with _group_selection_context(self): | ||
359 | -+ with group_selection_context(self): | ||
360 | - index = self._selected_obj.index | ||
361 | - cumcounts = self._cumcount_array(ascending=ascending) | ||
362 | - return self._obj_1d_constructor(cumcounts, index) | ||
363 | -diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py | ||
364 | -index 526dae7e2..8014b16d0 100644 | ||
365 | ---- a/pandas/core/indexes/base.py | ||
366 | -+++ b/pandas/core/indexes/base.py | ||
367 | -@@ -3660,7 +3660,7 @@ class Index(IndexOpsMixin, PandasObject): | ||
368 | - return result | ||
369 | - | ||
370 | - def _join_non_unique(self, other, how="left", return_indexers=False): | ||
371 | -- from pandas.core.reshape.merge import _get_join_indexers | ||
372 | -+ from pandas.core.reshape.merge import get_join_indexers | ||
373 | - | ||
374 | - # We only get here if dtypes match | ||
375 | - assert self.dtype == other.dtype | ||
376 | -@@ -3668,7 +3668,7 @@ class Index(IndexOpsMixin, PandasObject): | ||
377 | - lvalues = self._get_engine_target() | ||
378 | - rvalues = other._get_engine_target() | ||
379 | - | ||
380 | -- left_idx, right_idx = _get_join_indexers( | ||
381 | -+ left_idx, right_idx = get_join_indexers( | ||
382 | - [lvalues], [rvalues], how=how, sort=True | ||
383 | - ) | ||
384 | - | ||
385 | -diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py | ||
386 | -index 3f72577c9..154f41bf0 100644 | ||
387 | ---- a/pandas/core/indexes/interval.py | ||
388 | -+++ b/pandas/core/indexes/interval.py | ||
389 | -@@ -59,7 +59,6 @@ from pandas.core.ops import get_op_result_name | ||
390 | - if TYPE_CHECKING: | ||
391 | - from pandas import CategoricalIndex # noqa:F401 | ||
392 | - | ||
393 | --_VALID_CLOSED = {"left", "right", "both", "neither"} | ||
394 | - _index_doc_kwargs = dict(ibase._index_doc_kwargs) | ||
395 | - | ||
396 | - _index_doc_kwargs.update( | ||
397 | -diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py | ||
398 | -index 030dec369..9f19ea9ae 100644 | ||
399 | ---- a/pandas/core/reshape/merge.py | ||
400 | -+++ b/pandas/core/reshape/merge.py | ||
401 | -@@ -859,7 +859,7 @@ class _MergeOperation: | ||
402 | - | ||
403 | - def _get_join_indexers(self): | ||
404 | - """ return the join indexers """ | ||
405 | -- return _get_join_indexers( | ||
406 | -+ return get_join_indexers( | ||
407 | - self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how | ||
408 | - ) | ||
409 | - | ||
410 | -@@ -1298,7 +1298,7 @@ class _MergeOperation: | ||
411 | - raise ValueError("Not a valid argument for validate") | ||
412 | - | ||
413 | - | ||
414 | --def _get_join_indexers( | ||
415 | -+def get_join_indexers( | ||
416 | - left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs | ||
417 | - ): | ||
418 | - """ | ||
419 | -diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py | ||
420 | -index edc6fbfff..0d2ca83f1 100644 | ||
421 | ---- a/pandas/io/formats/printing.py | ||
422 | -+++ b/pandas/io/formats/printing.py | ||
423 | -@@ -243,7 +243,7 @@ def pprint_thing_encoded( | ||
424 | - return value.encode(encoding, errors) | ||
425 | - | ||
426 | - | ||
427 | --def _enable_data_resource_formatter(enable: bool) -> None: | ||
428 | -+def enable_data_resource_formatter(enable: bool) -> None: | ||
429 | - if "IPython" not in sys.modules: | ||
430 | - # definitely not in IPython | ||
431 | - return | ||
432 | -diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py | ||
433 | -index a2f861d37..2d6e657de 100644 | ||
434 | ---- a/pandas/tests/arrays/sparse/test_libsparse.py | ||
435 | -+++ b/pandas/tests/arrays/sparse/test_libsparse.py | ||
436 | -@@ -8,7 +8,7 @@ import pandas.util._test_decorators as td | ||
437 | - | ||
438 | - from pandas import Series | ||
439 | - import pandas._testing as tm | ||
440 | --from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index | ||
441 | -+from pandas.core.arrays.sparse import BlockIndex, IntIndex, make_sparse_index | ||
442 | - | ||
443 | - TEST_LENGTH = 20 | ||
444 | - | ||
445 | -@@ -273,41 +273,43 @@ class TestSparseIndexIntersect: | ||
446 | - | ||
447 | - class TestSparseIndexCommon: | ||
448 | - def test_int_internal(self): | ||
449 | -- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") | ||
450 | -+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer") | ||
451 | - assert isinstance(idx, IntIndex) | ||
452 | - assert idx.npoints == 2 | ||
453 | - tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) | ||
454 | - | ||
455 | -- idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") | ||
456 | -+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer") | ||
457 | - assert isinstance(idx, IntIndex) | ||
458 | - assert idx.npoints == 0 | ||
459 | - tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) | ||
460 | - | ||
461 | -- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") | ||
462 | -+ idx = make_sparse_index( | ||
463 | -+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer" | ||
464 | -+ ) | ||
465 | - assert isinstance(idx, IntIndex) | ||
466 | - assert idx.npoints == 4 | ||
467 | - tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) | ||
468 | - | ||
469 | - def test_block_internal(self): | ||
470 | -- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") | ||
471 | -+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block") | ||
472 | - assert isinstance(idx, BlockIndex) | ||
473 | - assert idx.npoints == 2 | ||
474 | - tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) | ||
475 | - tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) | ||
476 | - | ||
477 | -- idx = _make_index(4, np.array([], dtype=np.int32), kind="block") | ||
478 | -+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block") | ||
479 | - assert isinstance(idx, BlockIndex) | ||
480 | - assert idx.npoints == 0 | ||
481 | - tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) | ||
482 | - tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) | ||
483 | - | ||
484 | -- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") | ||
485 | -+ idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") | ||
486 | - assert isinstance(idx, BlockIndex) | ||
487 | - assert idx.npoints == 4 | ||
488 | - tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) | ||
489 | - tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) | ||
490 | - | ||
491 | -- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") | ||
492 | -+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") | ||
493 | - assert isinstance(idx, BlockIndex) | ||
494 | - assert idx.npoints == 3 | ||
495 | - tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) | ||
496 | -@@ -315,7 +317,7 @@ class TestSparseIndexCommon: | ||
497 | - | ||
498 | - def test_lookup(self): | ||
499 | - for kind in ["integer", "block"]: | ||
500 | -- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) | ||
501 | -+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind) | ||
502 | - assert idx.lookup(-1) == -1 | ||
503 | - assert idx.lookup(0) == -1 | ||
504 | - assert idx.lookup(1) == -1 | ||
505 | -@@ -323,12 +325,14 @@ class TestSparseIndexCommon: | ||
506 | - assert idx.lookup(3) == 1 | ||
507 | - assert idx.lookup(4) == -1 | ||
508 | - | ||
509 | -- idx = _make_index(4, np.array([], dtype=np.int32), kind=kind) | ||
510 | -+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind) | ||
511 | - | ||
512 | - for i in range(-1, 5): | ||
513 | - assert idx.lookup(i) == -1 | ||
514 | - | ||
515 | -- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) | ||
516 | -+ idx = make_sparse_index( | ||
517 | -+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind | ||
518 | -+ ) | ||
519 | - assert idx.lookup(-1) == -1 | ||
520 | - assert idx.lookup(0) == 0 | ||
521 | - assert idx.lookup(1) == 1 | ||
522 | -@@ -336,7 +340,7 @@ class TestSparseIndexCommon: | ||
523 | - assert idx.lookup(3) == 3 | ||
524 | - assert idx.lookup(4) == -1 | ||
525 | - | ||
526 | -- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) | ||
527 | -+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) | ||
528 | - assert idx.lookup(-1) == -1 | ||
529 | - assert idx.lookup(0) == 0 | ||
530 | - assert idx.lookup(1) == -1 | ||
531 | -@@ -346,7 +350,7 @@ class TestSparseIndexCommon: | ||
532 | - | ||
533 | - def test_lookup_array(self): | ||
534 | - for kind in ["integer", "block"]: | ||
535 | -- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) | ||
536 | -+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind) | ||
537 | - | ||
538 | - res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) | ||
539 | - exp = np.array([-1, -1, 0], dtype=np.int32) | ||
540 | -@@ -356,11 +360,13 @@ class TestSparseIndexCommon: | ||
541 | - exp = np.array([-1, 0, -1, 1], dtype=np.int32) | ||
542 | - tm.assert_numpy_array_equal(res, exp) | ||
543 | - | ||
544 | -- idx = _make_index(4, np.array([], dtype=np.int32), kind=kind) | ||
545 | -+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind) | ||
546 | - res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32)) | ||
547 | - exp = np.array([-1, -1, -1, -1], dtype=np.int32) | ||
548 | - | ||
549 | -- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) | ||
550 | -+ idx = make_sparse_index( | ||
551 | -+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind | ||
552 | -+ ) | ||
553 | - res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) | ||
554 | - exp = np.array([-1, 0, 2], dtype=np.int32) | ||
555 | - tm.assert_numpy_array_equal(res, exp) | ||
556 | -@@ -369,7 +375,7 @@ class TestSparseIndexCommon: | ||
557 | - exp = np.array([-1, 2, 1, 3], dtype=np.int32) | ||
558 | - tm.assert_numpy_array_equal(res, exp) | ||
559 | - | ||
560 | -- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) | ||
561 | -+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) | ||
562 | - res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32)) | ||
563 | - exp = np.array([1, -1, 2, 0], dtype=np.int32) | ||
564 | - tm.assert_numpy_array_equal(res, exp) | ||
565 | -@@ -402,25 +408,25 @@ class TestSparseIndexCommon: | ||
566 | - | ||
567 | - class TestBlockIndex: | ||
568 | - def test_block_internal(self): | ||
569 | -- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") | ||
570 | -+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block") | ||
571 | - assert isinstance(idx, BlockIndex) | ||
572 | - assert idx.npoints == 2 | ||
573 | - tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) | ||
574 | - tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) | ||
575 | - | ||
576 | -- idx = _make_index(4, np.array([], dtype=np.int32), kind="block") | ||
577 | -+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block") | ||
578 | - assert isinstance(idx, BlockIndex) | ||
579 | - assert idx.npoints == 0 | ||
580 | - tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) | ||
581 | - tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) | ||
582 | - | ||
583 | -- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") | ||
584 | -+ idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") | ||
585 | - assert isinstance(idx, BlockIndex) | ||
586 | - assert idx.npoints == 4 | ||
587 | - tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) | ||
588 | - tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) | ||
589 | - | ||
590 | -- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") | ||
591 | -+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") | ||
592 | - assert isinstance(idx, BlockIndex) | ||
593 | - assert idx.npoints == 3 | ||
594 | - tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) | ||
595 | -@@ -428,7 +434,7 @@ class TestBlockIndex: | ||
596 | - | ||
597 | - def test_make_block_boundary(self): | ||
598 | - for i in [5, 10, 100, 101]: | ||
599 | -- idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") | ||
600 | -+ idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") | ||
601 | - | ||
602 | - exp = np.arange(0, i, 2, dtype=np.int32) | ||
603 | - tm.assert_numpy_array_equal(idx.blocs, exp) | ||
604 | -@@ -514,17 +520,19 @@ class TestIntIndex: | ||
605 | - IntIndex(length=5, indices=[1, 3, 3]) | ||
606 | - | ||
607 | - def test_int_internal(self): | ||
608 | -- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") | ||
609 | -+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer") | ||
610 | - assert isinstance(idx, IntIndex) | ||
611 | - assert idx.npoints == 2 | ||
612 | - tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) | ||
613 | - | ||
614 | -- idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") | ||
615 | -+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer") | ||
616 | - assert isinstance(idx, IntIndex) | ||
617 | - assert idx.npoints == 0 | ||
618 | - tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) | ||
619 | - | ||
620 | -- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") | ||
621 | -+ idx = make_sparse_index( | ||
622 | -+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer" | ||
623 | -+ ) | ||
624 | - assert isinstance(idx, IntIndex) | ||
625 | - assert idx.npoints == 4 | ||
626 | - tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) | ||
627 | -diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py | ||
628 | -index ead102f53..9fc3ed480 100644 | ||
629 | ---- a/pandas/tests/computation/test_compat.py | ||
630 | -+++ b/pandas/tests/computation/test_compat.py | ||
631 | -@@ -5,7 +5,7 @@ import pytest | ||
632 | - from pandas.compat._optional import VERSIONS | ||
633 | - | ||
634 | - import pandas as pd | ||
635 | --from pandas.core.computation.engines import _engines | ||
636 | -+from pandas.core.computation.engines import ENGINES | ||
637 | - import pandas.core.computation.expr as expr | ||
638 | - | ||
639 | - | ||
640 | -@@ -26,8 +26,8 @@ def test_compat(): | ||
641 | - pytest.skip("not testing numexpr version compat") | ||
642 | - | ||
643 | - | ||
644 | --@pytest.mark.parametrize("engine", _engines) | ||
645 | --@pytest.mark.parametrize("parser", expr._parsers) | ||
646 | -+@pytest.mark.parametrize("engine", ENGINES) | ||
647 | -+@pytest.mark.parametrize("parser", expr.PARSERS) | ||
648 | - def test_invalid_numexpr_version(engine, parser): | ||
649 | - def testit(): | ||
650 | - a, b = 1, 2 # noqa | ||
651 | -diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py | ||
652 | -index 72dc04e68..cca64a6bf 100644 | ||
653 | ---- a/pandas/tests/computation/test_eval.py | ||
654 | -+++ b/pandas/tests/computation/test_eval.py | ||
655 | -@@ -19,7 +19,7 @@ from pandas import DataFrame, Series, compat, date_range | ||
656 | - import pandas._testing as tm | ||
657 | - from pandas.core.computation import pytables | ||
658 | - from pandas.core.computation.check import NUMEXPR_VERSION | ||
659 | --from pandas.core.computation.engines import NumExprClobberingError, _engines | ||
660 | -+from pandas.core.computation.engines import ENGINES, NumExprClobberingError | ||
661 | - import pandas.core.computation.expr as expr | ||
662 | - from pandas.core.computation.expr import ( | ||
663 | - BaseExprVisitor, | ||
664 | -@@ -46,14 +46,14 @@ from pandas.core.computation.ops import ( | ||
665 | - f"installed->{NUMEXPR_INSTALLED}", | ||
666 | - ), | ||
667 | - ) | ||
668 | -- for engine in _engines | ||
669 | -+ for engine in ENGINES | ||
670 | - ) | ||
671 | - ) # noqa | ||
672 | - def engine(request): | ||
673 | - return request.param | ||
674 | - | ||
675 | - | ||
676 | --@pytest.fixture(params=expr._parsers) | ||
677 | -+@pytest.fixture(params=expr.PARSERS) | ||
678 | - def parser(request): | ||
679 | - return request.param | ||
680 | - | ||
681 | -@@ -77,7 +77,7 @@ def unary_fns_for_ne(): | ||
682 | - | ||
683 | - | ||
684 | - def engine_has_neg_frac(engine): | ||
685 | -- return _engines[engine].has_neg_frac | ||
686 | -+ return ENGINES[engine].has_neg_frac | ||
687 | - | ||
688 | - | ||
689 | - def _eval_single_bin(lhs, cmp1, rhs, engine): | ||
690 | -@@ -168,7 +168,7 @@ class TestEvalNumexprPandas: | ||
691 | - def setup_method(self, method): | ||
692 | - self.setup_ops() | ||
693 | - self.setup_data() | ||
694 | -- self.current_engines = (engine for engine in _engines if engine != self.engine) | ||
695 | -+ self.current_engines = (engine for engine in ENGINES if engine != self.engine) | ||
696 | - | ||
697 | - def teardown_method(self, method): | ||
698 | - del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses | ||
699 | -@@ -1921,7 +1921,7 @@ _parsers: Dict[str, Type[BaseExprVisitor]] = { | ||
700 | - } | ||
701 | - | ||
702 | - | ||
703 | --@pytest.mark.parametrize("engine", _engines) | ||
704 | -+@pytest.mark.parametrize("engine", ENGINES) | ||
705 | - @pytest.mark.parametrize("parser", _parsers) | ||
706 | - def test_disallowed_nodes(engine, parser): | ||
707 | - VisitorClass = _parsers[parser] | ||
...
\ No newline at end of file
train.py (deleted, mode 100644 → 0)
1 | -# Copyright 2020-present Tae Hwan Jung | ||
2 | -# | ||
3 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | -# you may not use this file except in compliance with the License. | ||
5 | -# You may obtain a copy of the License at | ||
6 | -# | ||
7 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | -# | ||
9 | -# Unless required by applicable law or agreed to in writing, software | ||
10 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | -# See the License for the specific language governing permissions and | ||
13 | -# limitations under the License. | ||
14 | - | ||
15 | -import os | ||
16 | -import argparse | ||
17 | -import pytorch_lightning as pl | ||
18 | -from train.finetune import main, SummarizationModule | ||
19 | - | ||
20 | -if __name__ == "__main__": | ||
21 | - parser = argparse.ArgumentParser() | ||
22 | - parser = pl.Trainer.add_argparse_args(parser) | ||
23 | - parser = SummarizationModule.add_model_specific_args(parser, os.getcwd()) | ||
24 | - | ||
25 | - args = parser.parse_args() | ||
26 | - | ||
27 | - main(args) | ||
\ No newline at end of file
train/__init__.py (deleted, mode 100644 → 0)
1 | -# Copyright 2020-present Tae Hwan Jung | ||
2 | -# | ||
3 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | -# you may not use this file except in compliance with the License. | ||
5 | -# You may obtain a copy of the License at | ||
6 | -# | ||
7 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | -# | ||
9 | -# Unless required by applicable law or agreed to in writing, software | ||
10 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | -# See the License for the specific language governing permissions and | ||
13 | -# limitations under the License. | ||
14 | - | ||
15 | -from train.modeling_bart import BartForConditionalGeneration | ||
16 | - | ||
17 | -__all__ = ["BartForConditionalGeneration"] |
train/callbacks.py (deleted, mode 100644 → 0)
1 | -import logging | ||
2 | -import os | ||
3 | -from pathlib import Path | ||
4 | - | ||
5 | -import numpy as np | ||
6 | -import pytorch_lightning as pl | ||
7 | -import torch | ||
8 | -from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint | ||
9 | -from pytorch_lightning.utilities import rank_zero_only | ||
10 | - | ||
11 | - | ||
12 | -def count_trainable_parameters(model): | ||
13 | - model_parameters = filter(lambda p: p.requires_grad, model.parameters()) | ||
14 | - params = sum([np.prod(p.size()) for p in model_parameters]) | ||
15 | - return params | ||
16 | - | ||
17 | - | ||
18 | -logger = logging.getLogger(__name__) | ||
19 | - | ||
20 | - | ||
21 | -class Seq2SeqLoggingCallback(pl.Callback): | ||
22 | - def on_batch_end(self, trainer, pl_module): | ||
23 | - lrs = { | ||
24 | - f"lr_group_{i}": param["lr"] | ||
25 | - for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups) | ||
26 | - } | ||
27 | - pl_module.logger.log_metrics(lrs) | ||
28 | - | ||
29 | - @rank_zero_only | ||
30 | - def _write_logs( | ||
31 | - self, | ||
32 | - trainer: pl.Trainer, | ||
33 | - pl_module: pl.LightningModule, | ||
34 | - type_path: str, | ||
35 | - save_generations=True, | ||
36 | - ) -> None: | ||
37 | - logger.info( | ||
38 | - f"***** {type_path} results at step {trainer.global_step:05d} *****" | ||
39 | - ) | ||
40 | - metrics = trainer.callback_metrics | ||
41 | - trainer.logger.log_metrics( | ||
42 | - { | ||
43 | - k: v | ||
44 | - for k, v in metrics.items() | ||
45 | - if k not in ["log", "progress_bar", "preds"] | ||
46 | - } | ||
47 | - ) | ||
48 | - # Log results | ||
49 | - od = Path(pl_module.hparams.output_dir) | ||
50 | - if type_path == "test": | ||
51 | - results_file = od / "test_results.txt" | ||
52 | - generations_file = od / "test_generations.txt" | ||
53 | - else: | ||
54 | - # this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json | ||
55 | - # If people want this it will be easy enough to add back. | ||
56 | - results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt" | ||
57 | - generations_file = ( | ||
58 | - od / f"{type_path}_generations/{trainer.global_step:05d}.txt" | ||
59 | - ) | ||
60 | - results_file.parent.mkdir(exist_ok=True) | ||
61 | - generations_file.parent.mkdir(exist_ok=True) | ||
62 | - with open(results_file, "a+") as writer: | ||
63 | - for key in sorted(metrics): | ||
64 | - if key in ["log", "progress_bar", "preds"]: | ||
65 | - continue | ||
66 | - val = metrics[key] | ||
67 | - if isinstance(val, torch.Tensor): | ||
68 | - val = val.item() | ||
69 | - msg = f"{key}: {val:.6f}\n" | ||
70 | - writer.write(msg) | ||
71 | - | ||
72 | - if not save_generations: | ||
73 | - return | ||
74 | - | ||
75 | - if "preds" in metrics: | ||
76 | - content = "\n".join(metrics["preds"]) | ||
77 | - generations_file.open("w+").write(content) | ||
78 | - | ||
79 | - @rank_zero_only | ||
80 | - def on_train_start(self, trainer, pl_module): | ||
81 | - try: | ||
82 | - npars = pl_module.model.model.num_parameters() | ||
83 | - except AttributeError: | ||
84 | - npars = pl_module.model.num_parameters() | ||
85 | - | ||
86 | - n_trainable_pars = count_trainable_parameters(pl_module) | ||
87 | - # mp stands for million parameters | ||
88 | - trainer.logger.log_metrics( | ||
89 | - {"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6} | ||
90 | - ) | ||
91 | - | ||
92 | - @rank_zero_only | ||
93 | - def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): | ||
94 | - return self._write_logs(trainer, pl_module, "test") | ||
95 | - | ||
96 | - | ||
97 | -def get_checkpoint_callback(output_dir, metric): | ||
98 | - """Saves the best model by validation ROUGE2 score.""" | ||
99 | - if metric == "rouge2": | ||
100 | - exp = "{val_avg_rouge2:.4f}-{step_count}" | ||
101 | - elif metric == "bleu": | ||
102 | - exp = "{val_avg_bleu:.4f}-{step_count}" | ||
103 | - else: | ||
104 | - raise NotImplementedError( | ||
105 | - f"seq2seq callbacks only support rouge2 and bleu, got {metric}, You can make your own by adding to this function." | ||
106 | - ) | ||
107 | - | ||
108 | - checkpoint_callback = ModelCheckpoint( | ||
109 | - filepath=os.path.join(output_dir, exp), | ||
110 | - monitor=f"val_{metric}", | ||
111 | - mode="max", | ||
112 | - save_top_k=1, | ||
113 | - period=0, # period=0 so a checkpoint can be saved every time validation runs, not just at the end of an epoch. | ||
114 | - ) | ||
115 | - return checkpoint_callback | ||
116 | - | ||
117 | - | ||
118 | -def get_early_stopping_callback(metric, patience): | ||
119 | - return EarlyStopping( | ||
120 | - monitor=f"val_{metric}", mode="max", patience=patience, verbose=True, | ||
121 | - ) |
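For reference, a minimal usage sketch of the three helpers above (Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback), assuming the pre-1.0 pytorch_lightning Trainer API this code targets (ModelCheckpoint(filepath=...), Trainer(checkpoint_callback=..., early_stop_callback=...)); the output directory, metric, and patience below are placeholders:

import pytorch_lightning as pl

from train.callbacks import (
    Seq2SeqLoggingCallback,
    get_checkpoint_callback,
    get_early_stopping_callback,
)

output_dir = "weight/"   # placeholder: where checkpoints are written
metric = "rouge2"        # get_checkpoint_callback also accepts "bleu"

trainer = pl.Trainer(
    callbacks=[Seq2SeqLoggingCallback()],                                 # logs learning rates, metrics, and test generations
    checkpoint_callback=get_checkpoint_callback(output_dir, metric),      # keeps the single best val_rouge2 checkpoint
    early_stop_callback=get_early_stopping_callback(metric, patience=3),  # stops after 3 validation checks without improvement
    max_epochs=3,
)
# trainer.fit(model)   # `model` would be the SummarizationModule defined in train/finetune.py below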
train/finetune.py
deleted
100644 → 0
1 | -import argparse | ||
2 | -import glob | ||
3 | -import logging | ||
4 | -import os | ||
5 | -import time | ||
6 | -from collections import defaultdict | ||
7 | -from pathlib import Path | ||
8 | -from typing import Dict, List, Tuple | ||
9 | - | ||
10 | -import numpy as np | ||
11 | -import pytorch_lightning as pl | ||
12 | -import torch | ||
13 | -from torch.utils.data import DataLoader | ||
14 | - | ||
15 | -from train.lightning_base import BaseTransformer, add_generic_args, generic_train | ||
16 | -from transformers import MBartTokenizer, T5ForConditionalGeneration | ||
17 | -from transformers.modeling_bart import shift_tokens_right | ||
18 | - | ||
19 | -from matorage import DataConfig | ||
20 | -from matorage.torch import Dataset | ||
21 | - | ||
22 | - | ||
23 | -try: | ||
24 | - from .callbacks import ( | ||
25 | - Seq2SeqLoggingCallback, | ||
26 | - get_checkpoint_callback, | ||
27 | - get_early_stopping_callback, | ||
28 | - ) | ||
29 | - from .utils import ( | ||
30 | - ROUGE_KEYS, | ||
31 | - LegacySeq2SeqDataset, | ||
32 | - Seq2SeqDataset, | ||
33 | - assert_all_frozen, | ||
34 | - calculate_bleu, | ||
35 | - calculate_rouge, | ||
36 | - flatten_list, | ||
37 | - freeze_params, | ||
38 | - get_git_info, | ||
39 | - label_smoothed_nll_loss, | ||
40 | - lmap, | ||
41 | - pickle_save, | ||
42 | - save_git_info, | ||
43 | - save_json, | ||
44 | - use_task_specific_params, | ||
45 | - ) | ||
46 | -except ImportError: | ||
47 | - from callbacks import ( | ||
48 | - Seq2SeqLoggingCallback, | ||
49 | - get_checkpoint_callback, | ||
50 | - get_early_stopping_callback, | ||
51 | - ) | ||
52 | - from utils import ( | ||
53 | - ROUGE_KEYS, | ||
54 | - LegacySeq2SeqDataset, | ||
55 | - Seq2SeqDataset, | ||
56 | - assert_all_frozen, | ||
57 | - calculate_bleu, | ||
58 | - calculate_rouge, | ||
59 | - flatten_list, | ||
60 | - freeze_params, | ||
61 | - get_git_info, | ||
62 | - label_smoothed_nll_loss, | ||
63 | - lmap, | ||
64 | - pickle_save, | ||
65 | - save_git_info, | ||
66 | - save_json, | ||
67 | - use_task_specific_params, | ||
68 | - ) | ||
69 | - | ||
70 | -logger = logging.getLogger(__name__) | ||
71 | - | ||
72 | - | ||
73 | -class SummarizationModule(BaseTransformer): | ||
74 | - mode = "summarization" | ||
75 | - loss_names = ["loss"] | ||
76 | - metric_names = ROUGE_KEYS | ||
77 | - default_val_metric = "rouge2" | ||
78 | - | ||
79 | - def __init__(self, hparams, **kwargs): | ||
80 | - super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs) | ||
81 | - use_task_specific_params(self.model, "summarization") | ||
82 | - save_git_info(self.hparams.output_dir) | ||
83 | - self.metrics_save_path = Path(self.output_dir) / "metrics.json" | ||
84 | - self.hparams_save_path = Path(self.output_dir) / "hparams.pkl" | ||
85 | - pickle_save(self.hparams, self.hparams_save_path) | ||
86 | - self.step_count = 0 | ||
87 | - self.metrics = defaultdict(list) | ||
88 | - | ||
89 | - self.target_lens = { | ||
90 | - "train": self.hparams.max_target_length, | ||
91 | - "val": self.hparams.val_max_target_length, | ||
92 | - "test": self.hparams.test_max_target_length, | ||
93 | - } | ||
94 | - assert ( | ||
95 | - self.target_lens["train"] <= self.target_lens["val"] | ||
96 | - ), f"target_lens: {self.target_lens}" | ||
97 | - assert ( | ||
98 | - self.target_lens["train"] <= self.target_lens["test"] | ||
99 | - ), f"target_lens: {self.target_lens}" | ||
100 | - | ||
101 | - if self.hparams.freeze_embeds: | ||
102 | - self.freeze_embeds() | ||
103 | - if self.hparams.freeze_encoder: | ||
104 | - freeze_params(self.model.get_encoder()) | ||
105 | - assert_all_frozen(self.model.get_encoder()) | ||
106 | - | ||
107 | - self.hparams.git_sha = get_git_info()["repo_sha"] | ||
108 | - self.num_workers = hparams.num_workers | ||
109 | - self.decoder_start_token_id = None # default to config | ||
110 | - if self.model.config.decoder_start_token_id is None and isinstance( | ||
111 | - self.tokenizer, MBartTokenizer | ||
112 | - ): | ||
113 | - self.decoder_start_token_id = self.tokenizer.lang_code_to_id[ | ||
114 | - hparams.tgt_lang | ||
115 | - ] | ||
116 | - self.model.config.decoder_start_token_id = self.decoder_start_token_id | ||
117 | - | ||
118 | - self.eval_beams = ( | ||
119 | - self.model.config.num_beams | ||
120 | - if self.hparams.eval_beams is None | ||
121 | - else self.hparams.eval_beams | ||
122 | - ) | ||
123 | - assert ( | ||
124 | - self.eval_beams >= 1 | ||
125 | - ), f"got self.eval_beams={self.eval_beams}. Need an integer > 1" | ||
126 | - self.val_metric = ( | ||
127 | - self.default_val_metric | ||
128 | - if self.hparams.val_metric is None | ||
129 | - else self.hparams.val_metric | ||
130 | - ) | ||
131 | - | ||
132 | - def freeze_embeds(self): | ||
133 | - """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5.""" | ||
134 | - try: | ||
135 | - freeze_params(self.model.model.shared) | ||
136 | - for d in [self.model.model.encoder, self.model.model.decoder]: | ||
137 | - freeze_params(d.embed_positions) | ||
138 | - freeze_params(d.embed_tokens) | ||
139 | - except AttributeError: | ||
140 | - freeze_params(self.model.shared) | ||
141 | - for d in [self.model.encoder, self.model.decoder]: | ||
142 | - freeze_params(d.embed_tokens) | ||
143 | - | ||
144 | - def forward(self, input_ids, patch_ids, **kwargs): | ||
145 | - return self.model(input_ids, patch_ids, **kwargs) | ||
146 | - | ||
147 | - def ids_to_clean_text(self, generated_ids: List[int]): | ||
148 | - gen_text = self.tokenizer.batch_decode( | ||
149 | - generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True | ||
150 | - ) | ||
151 | - return lmap(str.strip, gen_text) | ||
152 | - | ||
153 | - def _step(self, batch: dict) -> Tuple: | ||
154 | - pad_token_id = self.tokenizer.pad_token_id | ||
155 | - src_ids, src_mask, src_patch = batch[0].long(), batch[1].long(), batch[2].long() | ||
156 | - tgt_ids = batch[3].long() | ||
157 | - if isinstance(self.model, T5ForConditionalGeneration): | ||
158 | - decoder_input_ids = self.model._shift_right(tgt_ids) | ||
159 | - else: | ||
160 | - decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id) | ||
161 | - | ||
162 | - outputs = self( | ||
163 | - src_ids, | ||
164 | - src_patch, | ||
165 | - attention_mask=src_mask, | ||
166 | - decoder_input_ids=decoder_input_ids, | ||
167 | - use_cache=False, | ||
168 | - ) | ||
169 | - lm_logits = outputs[0] | ||
170 | - if self.hparams.label_smoothing == 0: | ||
171 | - # Same behavior as modeling_bart.py, besides ignoring pad_token_id | ||
172 | - loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id) | ||
173 | - | ||
174 | - assert lm_logits.shape[-1] == self.model.config.vocab_size | ||
175 | - loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1)) | ||
176 | - else: | ||
177 | - lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1) | ||
178 | - loss, nll_loss = label_smoothed_nll_loss( | ||
179 | - lprobs, tgt_ids, self.hparams.label_smoothing, ignore_index=pad_token_id | ||
180 | - ) | ||
181 | - return (loss,) | ||
182 | - | ||
183 | - @property | ||
184 | - def pad(self) -> int: | ||
185 | - return self.tokenizer.pad_token_id | ||
186 | - | ||
187 | - def training_step(self, batch, batch_idx) -> Dict: | ||
188 | - loss_tensors = self._step(batch) | ||
189 | - | ||
190 | - logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} | ||
191 | - # tokens per batch | ||
192 | - logs["tpb"] = ( | ||
193 | - batch[0].long().ne(self.pad).sum() + batch[3].long().ne(self.pad).sum() | ||
194 | - ) | ||
195 | - return {"loss": loss_tensors[0], "log": logs} | ||
196 | - | ||
197 | - def validation_step(self, batch, batch_idx) -> Dict: | ||
198 | - return self._generative_step(batch) | ||
199 | - | ||
200 | - def validation_epoch_end(self, outputs, prefix="val") -> Dict: | ||
201 | - self.step_count += 1 | ||
202 | - losses = { | ||
203 | - k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names | ||
204 | - } | ||
205 | - loss = losses["loss"] | ||
206 | - rouges = { | ||
207 | - k: np.array([x[k] for x in outputs]).mean() | ||
208 | - for k in self.metric_names + ["gen_time", "gen_len"] | ||
209 | - } | ||
210 | - rouge_tensor: torch.FloatTensor = torch.tensor(rouges[self.val_metric]).type_as( | ||
211 | - loss | ||
212 | - ) | ||
213 | - rouges.update({k: v.item() for k, v in losses.items()}) | ||
214 | - losses.update(rouges) | ||
215 | - metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()} | ||
216 | - metrics["step_count"] = self.step_count | ||
217 | - self.save_metrics(metrics, prefix) # writes to self.metrics_save_path | ||
218 | - preds = flatten_list([x["preds"] for x in outputs]) | ||
219 | - return { | ||
220 | - "log": metrics, | ||
221 | - "preds": preds, | ||
222 | - f"{prefix}_loss": loss, | ||
223 | - f"{prefix}_{self.val_metric}": rouge_tensor, | ||
224 | - } | ||
225 | - | ||
226 | - def save_metrics(self, latest_metrics, type_path) -> None: | ||
227 | - self.metrics[type_path].append(latest_metrics) | ||
228 | - save_json(self.metrics, self.metrics_save_path) | ||
229 | - | ||
230 | - def calc_generative_metrics(self, preds, target) -> Dict: | ||
231 | - return calculate_rouge(preds, target) | ||
232 | - | ||
233 | - def _generative_step(self, batch: dict) -> dict: | ||
234 | - t0 = time.time() | ||
235 | - generated_ids = self.model.generate( | ||
236 | - batch[0].long(), | ||
237 | - patch_ids=batch[2].long(), | ||
238 | - attention_mask=batch[1].long(), | ||
239 | - use_cache=True, | ||
240 | - decoder_start_token_id=self.decoder_start_token_id, | ||
241 | - ) | ||
242 | - gen_time = (time.time() - t0) / batch[0].shape[0] | ||
243 | - preds: List[str] = self.ids_to_clean_text(generated_ids) | ||
244 | - target: List[str] = self.ids_to_clean_text(batch[3]) | ||
245 | - loss_tensors = self._step(batch) | ||
246 | - base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} | ||
247 | - rouge: Dict = self.calc_generative_metrics(preds, target) | ||
248 | - summ_len = np.mean(lmap(len, generated_ids)) | ||
249 | - base_metrics.update( | ||
250 | - gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge | ||
251 | - ) | ||
252 | - return base_metrics | ||
253 | - | ||
254 | - def test_step(self, batch, batch_idx): | ||
255 | - return self._generative_step(batch) | ||
256 | - | ||
257 | - def test_epoch_end(self, outputs): | ||
258 | - return self.validation_epoch_end(outputs, prefix="test") | ||
259 | - | ||
260 | - def get_dataset(self, type_path) -> Seq2SeqDataset: | ||
261 | - max_target_length = self.target_lens[type_path] | ||
262 | - data_config = DataConfig( | ||
263 | - endpoint=self.hparams.endpoint, | ||
264 | - access_key=os.environ["access_key"], | ||
265 | - secret_key=os.environ["secret_key"], | ||
266 | - region=self.hparams.region, | ||
267 | - dataset_name="commit-autosuggestions", | ||
268 | - additional={ | ||
269 | - "mode": ("training" if type_path == "train" else "evaluation"), | ||
270 | - "max_source_length": self.hparams.max_source_length, | ||
271 | - "max_target_length": max_target_length, | ||
272 | - "url": self.hparams.url, | ||
273 | - }, | ||
274 | - attributes=[ | ||
275 | - ("input_ids", "int32", (self.hparams.max_source_length,)), | ||
276 | - ("attention_masks", "int32", (self.hparams.max_source_length,)), | ||
277 | - ("patch_ids", "int32", (self.hparams.max_source_length,)), | ||
278 | - ("targets", "int32", (max_target_length,)), | ||
279 | - ], | ||
280 | - ) | ||
281 | - return Dataset(config=data_config, clear=True) | ||
282 | - | ||
283 | - def get_dataloader( | ||
284 | - self, type_path: str, batch_size: int, shuffle: bool = False | ||
285 | - ) -> DataLoader: | ||
286 | - dataset = self.get_dataset(type_path) | ||
287 | - sampler = None | ||
288 | - | ||
289 | - dataloader = DataLoader( | ||
290 | - dataset, | ||
291 | - batch_size=batch_size, | ||
292 | - shuffle=shuffle, | ||
293 | - num_workers=self.num_workers, | ||
294 | - sampler=sampler, | ||
295 | - ) | ||
296 | - return dataloader | ||
297 | - | ||
298 | - def train_dataloader(self) -> DataLoader: | ||
299 | - dataloader = self.get_dataloader( | ||
300 | - "train", batch_size=self.hparams.train_batch_size, shuffle=True | ||
301 | - ) | ||
302 | - return dataloader | ||
303 | - | ||
304 | - def val_dataloader(self) -> DataLoader: | ||
305 | - return self.get_dataloader("val", batch_size=self.hparams.eval_batch_size) | ||
306 | - | ||
307 | - def test_dataloader(self) -> DataLoader: | ||
308 | - return self.get_dataloader("test", batch_size=self.hparams.eval_batch_size) | ||
309 | - | ||
310 | - @staticmethod | ||
311 | - def add_model_specific_args(parser, root_dir): | ||
312 | - BaseTransformer.add_model_specific_args(parser, root_dir) | ||
313 | - add_generic_args(parser, root_dir) | ||
314 | - parser.add_argument("--url", type=str, required=True, help="github url") | ||
315 | - parser.add_argument( | ||
316 | - "--endpoint", | ||
317 | - type=str, | ||
318 | - required=True, | ||
319 | - help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", | ||
320 | - ) | ||
321 | - parser.add_argument( | ||
322 | - "--region", | ||
323 | - type=str, | ||
324 | - default=None, | ||
325 | - help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", | ||
326 | - ) | ||
327 | - parser.add_argument( | ||
328 | - "--max_source_length", | ||
329 | - default=1024, | ||
330 | - type=int, | ||
331 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
332 | - "than this will be truncated, sequences shorter will be padded.", | ||
333 | - ) | ||
334 | - parser.add_argument( | ||
335 | - "--max_target_length", | ||
336 | - default=56, | ||
337 | - type=int, | ||
338 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
339 | - "than this will be truncated, sequences shorter will be padded.", | ||
340 | - ) | ||
341 | - parser.add_argument( | ||
342 | - "--val_max_target_length", | ||
343 | - default=142, # these defaults are optimized for CNNDM. For xsum, see README.md. | ||
344 | - type=int, | ||
345 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
346 | - "than this will be truncated, sequences shorter will be padded.", | ||
347 | - ) | ||
348 | - parser.add_argument( | ||
349 | - "--test_max_target_length", | ||
350 | - default=142, | ||
351 | - type=int, | ||
352 | - help="The maximum total input sequence length after tokenization. Sequences longer " | ||
353 | - "than this will be truncated, sequences shorter will be padded.", | ||
354 | - ) | ||
355 | - parser.add_argument("--freeze_encoder", action="store_true") | ||
356 | - parser.add_argument("--freeze_embeds", action="store_true") | ||
357 | - parser.add_argument("--sortish_sampler", action="store_true", default=False) | ||
358 | - parser.add_argument( | ||
359 | - "--logger_name", | ||
360 | - type=str, | ||
361 | - choices=["default", "wandb", "wandb_shared"], | ||
362 | - default="default", | ||
363 | - ) | ||
364 | - parser.add_argument( | ||
365 | - "--n_train", | ||
366 | - type=int, | ||
367 | - default=-1, | ||
368 | - required=False, | ||
369 | - help="# examples. -1 means use all.", | ||
370 | - ) | ||
371 | - parser.add_argument( | ||
372 | - "--n_val", | ||
373 | - type=int, | ||
374 | - default=500, | ||
375 | - required=False, | ||
376 | - help="# examples. -1 means use all.", | ||
377 | - ) | ||
378 | - parser.add_argument( | ||
379 | - "--n_test", | ||
380 | - type=int, | ||
381 | - default=-1, | ||
382 | - required=False, | ||
383 | - help="# examples. -1 means use all.", | ||
384 | - ) | ||
385 | - parser.add_argument( | ||
386 | - "--task", | ||
387 | - type=str, | ||
388 | - default="summarization", | ||
389 | - required=False, | ||
390 | - help="# examples. -1 means use all.", | ||
391 | - ) | ||
392 | - parser.add_argument( | ||
393 | - "--label_smoothing", type=float, default=0.0, required=False | ||
394 | - ) | ||
395 | - parser.add_argument("--src_lang", type=str, default="", required=False) | ||
396 | - parser.add_argument("--tgt_lang", type=str, default="", required=False) | ||
397 | - parser.add_argument("--eval_beams", type=int, default=None, required=False) | ||
398 | - parser.add_argument("--val_metric", type=str, default=None, required=False) | ||
399 | - parser.add_argument( | ||
400 | - "--early_stopping_patience", | ||
401 | - type=int, | ||
402 | - default=-1, | ||
403 | - required=False, | ||
404 | - help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs. So val_check_interval will effect it.", | ||
405 | - ) | ||
406 | - return parser | ||
407 | - | ||
408 | - | ||
409 | -class TranslationModule(SummarizationModule): | ||
410 | - mode = "translation" | ||
411 | - loss_names = ["loss"] | ||
412 | - metric_names = ["bleu"] | ||
413 | - default_val_metric = "bleu" | ||
414 | - | ||
415 | - def __init__(self, hparams, **kwargs): | ||
416 | - super().__init__(hparams, **kwargs) | ||
417 | - self.dataset_kwargs["src_lang"] = hparams.src_lang | ||
418 | - self.dataset_kwargs["tgt_lang"] = hparams.tgt_lang | ||
419 | - | ||
420 | - def calc_generative_metrics(self, preds, target) -> dict: | ||
421 | - return calculate_bleu(preds, target) | ||
422 | - | ||
423 | - | ||
424 | -def main(args, model=None) -> SummarizationModule: | ||
425 | - Path(args.output_dir).mkdir(exist_ok=True) | ||
426 | - if len(os.listdir(args.output_dir)) > 3 and args.do_train: | ||
427 | - raise ValueError( | ||
428 | - "Output directory ({}) already exists and is not empty.".format( | ||
429 | - args.output_dir | ||
430 | - ) | ||
431 | - ) | ||
432 | - if model is None: | ||
433 | - if args.task == "summarization": | ||
434 | - model: SummarizationModule = SummarizationModule(args) | ||
435 | - else: | ||
436 | - model: SummarizationModule = TranslationModule(args) | ||
437 | - | ||
438 | - logger = True | ||
439 | - es_callback = False | ||
440 | - trainer: pl.Trainer = generic_train( | ||
441 | - model, | ||
442 | - args, | ||
443 | - logging_callback=Seq2SeqLoggingCallback(), | ||
444 | - checkpoint_callback=get_checkpoint_callback(args.output_dir, model.val_metric), | ||
445 | - early_stopping_callback=es_callback, | ||
446 | - logger=logger, | ||
447 | - # TODO: early stopping callback seems messed up | ||
448 | - ) | ||
449 | - pickle_save(model.hparams, model.output_dir / "hparams.pkl") | ||
450 | - if not args.do_predict: | ||
451 | - return model | ||
452 | - | ||
453 | - model.hparams.test_checkpoint = "" | ||
454 | - checkpoints = list( | ||
455 | - sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)) | ||
456 | - ) | ||
457 | - if checkpoints: | ||
458 | - model.hparams.test_checkpoint = checkpoints[-1] | ||
459 | - trainer.resume_from_checkpoint = checkpoints[-1] | ||
460 | - trainer.logger.log_hyperparams(model.hparams) | ||
461 | - | ||
462 | - # test() without a model tests using the best checkpoint automatically | ||
463 | - trainer.test() | ||
464 | - return model | ||
\ No newline at end of file
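To show how the parser and main() defined above fit together, here is a minimal driver sketch. It assumes the generic flags (--output_dir, --do_train, ...) are registered by add_generic_args in train/lightning_base.py; every value shown (URL, endpoint, keys, paths) is a placeholder, and any additional flags required by BaseTransformer and add_generic_args (pretrained model name, batch sizes, ...) would be appended to the same list. The access_key/secret_key environment variables are read inside SummarizationModule.get_dataset().

import argparse
import os

from train.finetune import SummarizationModule, main

# matorage credentials consumed by get_dataset() (placeholder values)
os.environ.setdefault("access_key", "<storage-access-key>")
os.environ.setdefault("secret_key", "<storage-secret-key>")

parser = argparse.ArgumentParser()
parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())
args = parser.parse_args([
    "--url", "https://github.com/<owner>/<repo>",   # GitHub repository the commit data was collected from
    "--endpoint", "127.0.0.1:9000",                 # matorage / S3-compatible storage endpoint
    "--output_dir", "weight/",
    "--do_train",
    # ...plus any flags required by BaseTransformer / add_generic_args (model name, batch sizes, gpus, ...)
])

model = main(args)   # builds the module, trains it, and runs trainer.test() when --do_predict is set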
train/generation_utils.py
deleted
100644 → 0
1 | -# coding=utf-8 | ||
2 | -# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. | ||
3 | -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | ||
4 | -# | ||
5 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
6 | -# you may not use this file except in compliance with the License. | ||
7 | -# You may obtain a copy of the License at | ||
8 | -# | ||
9 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
10 | -# | ||
11 | -# Unless required by applicable law or agreed to in writing, software | ||
12 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
13 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
14 | -# See the License for the specific language governing permissions and | ||
15 | -# limitations under the License. | ||
16 | - | ||
17 | -from typing import Iterable, List, Optional, Tuple | ||
18 | - | ||
19 | -import torch | ||
20 | -from torch import Tensor | ||
21 | -from torch.nn import functional as F | ||
22 | - | ||
23 | -from transformers.file_utils import ModelOutput | ||
24 | -import logging | ||
25 | - | ||
26 | -logger = logging.getLogger(__name__) # pylint: disable=invalid-name | ||
27 | -logging.basicConfig( | ||
28 | - format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", | ||
29 | - datefmt="%m/%d/%Y %H:%M:%S", | ||
30 | - level=logging.INFO, | ||
31 | -) | ||
32 | - | ||
33 | - | ||
34 | -class GenerationMixin: | ||
35 | - """ | ||
36 | - A class containing all of the functions supporting generation, to be used as a mixin in | ||
37 | - :class:`~transformers.PreTrainedModel`. | ||
38 | - """ | ||
39 | - | ||
40 | - def prepare_inputs_for_generation(self, input_ids, **kwargs): | ||
41 | - """ | ||
42 | - Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the | ||
43 | - generate method. | ||
44 | - """ | ||
45 | - return {"input_ids": input_ids} | ||
46 | - | ||
47 | - def adjust_logits_during_generation(self, logits, **kwargs): | ||
48 | - """ | ||
49 | - Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in | ||
50 | - the generate method. | ||
51 | - """ | ||
52 | - return logits | ||
53 | - | ||
54 | - def enforce_repetition_penalty_( | ||
55 | - self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty | ||
56 | - ): | ||
57 | - """ | ||
58 | - Enforce the repetition penalty (from the `CTRL paper <https://arxiv.org/abs/1909.05858>`__). | ||
59 | - """ | ||
60 | - for i in range(batch_size * num_beams): | ||
61 | - for previous_token in set(prev_output_tokens[i].tolist()): | ||
62 | - # if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability | ||
63 | - if lprobs[i, previous_token] < 0: | ||
64 | - lprobs[i, previous_token] *= repetition_penalty | ||
65 | - else: | ||
66 | - lprobs[i, previous_token] /= repetition_penalty | ||
67 | - | ||
68 | - def postprocess_next_token_scores( | ||
69 | - self, | ||
70 | - scores, | ||
71 | - input_ids, | ||
72 | - no_repeat_ngram_size, | ||
73 | - bad_words_ids, | ||
74 | - cur_len, | ||
75 | - min_length, | ||
76 | - max_length, | ||
77 | - eos_token_id, | ||
78 | - repetition_penalty, | ||
79 | - batch_size, | ||
80 | - num_beams, | ||
81 | - ): | ||
82 | - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) | ||
83 | - if repetition_penalty != 1.0: | ||
84 | - self.enforce_repetition_penalty_( | ||
85 | - scores, batch_size, num_beams, input_ids, repetition_penalty, | ||
86 | - ) | ||
87 | - | ||
88 | - # set eos token prob to zero if min_length is not reached | ||
89 | - if eos_token_id is not None and cur_len < min_length: | ||
90 | - scores[:, eos_token_id] = -float("inf") | ||
91 | - | ||
92 | - if no_repeat_ngram_size > 0: | ||
93 | - # calculate a list of banned tokens to prevent repetitively generating the same ngrams | ||
94 | - num_batch_hypotheses = batch_size * num_beams | ||
95 | - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 | ||
96 | - banned_batch_tokens = calc_banned_ngram_tokens( | ||
97 | - input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len | ||
98 | - ) | ||
99 | - for i, banned_tokens in enumerate(banned_batch_tokens): | ||
100 | - scores[i, banned_tokens] = -float("inf") | ||
101 | - | ||
102 | - if bad_words_ids is not None: | ||
103 | - # Exclude EOS token (already processed) | ||
104 | - bad_words_ids = list( | ||
105 | - filter( | ||
106 | - lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids | ||
107 | - ) | ||
108 | - ) | ||
109 | - # calculate a list of banned tokens according to bad words | ||
110 | - banned_tokens = calc_banned_bad_words_ids(input_ids.tolist(), bad_words_ids) | ||
111 | - # Modify the scores in place by setting the banned tokens logits to `-inf` | ||
112 | - set_scores_to_inf_for_banned_tokens(scores, banned_tokens) | ||
113 | - | ||
114 | - return scores | ||
115 | - | ||
116 | - @torch.no_grad() | ||
117 | - def generate( | ||
118 | - self, | ||
119 | - input_ids: Optional[torch.LongTensor] = None, | ||
120 | - patch_ids: Optional[torch.LongTensor] = None, | ||
121 | - max_length: Optional[int] = None, | ||
122 | - min_length: Optional[int] = None, | ||
123 | - do_sample: Optional[bool] = None, | ||
124 | - early_stopping: Optional[bool] = None, | ||
125 | - num_beams: Optional[int] = None, | ||
126 | - temperature: Optional[float] = None, | ||
127 | - top_k: Optional[int] = None, | ||
128 | - top_p: Optional[float] = None, | ||
129 | - repetition_penalty: Optional[float] = None, | ||
130 | - bad_words_ids: Optional[Iterable[int]] = None, | ||
131 | - bos_token_id: Optional[int] = None, | ||
132 | - pad_token_id: Optional[int] = None, | ||
133 | - eos_token_id: Optional[int] = None, | ||
134 | - length_penalty: Optional[float] = None, | ||
135 | - no_repeat_ngram_size: Optional[int] = None, | ||
136 | - num_return_sequences: Optional[int] = None, | ||
137 | - attention_mask: Optional[torch.LongTensor] = None, | ||
138 | - decoder_start_token_id: Optional[int] = None, | ||
139 | - use_cache: Optional[bool] = None, | ||
140 | - **model_kwargs, | ||
141 | - ) -> torch.LongTensor: | ||
142 | - r""" | ||
143 | - Generates sequences for models with a language modeling head. The method currently supports greedy decoding, | ||
144 | - beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. | ||
145 | - | ||
146 | - Adapted in part from `Facebook's XLM beam search code | ||
147 | - <https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__. | ||
148 | - | ||
149 | - Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the | ||
150 | - attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values | ||
151 | - indicated are the default values of those config. | ||
152 | - | ||
153 | - Most of these parameters are explained in more detail in `this blog post | ||
154 | - <https://huggingface.co/blog/how-to-generate>`__. | ||
155 | - | ||
156 | - Parameters: | ||
157 | - | ||
158 | - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): | ||
159 | - The sequence used as a prompt for the generation. If :obj:`None` the method initializes | ||
160 | - it as an empty :obj:`torch.LongTensor` of shape :obj:`(1,)`. | ||
161 | - max_length (:obj:`int`, `optional`, defaults to 20): | ||
162 | - The maximum length of the sequence to be generated. | ||
163 | - min_length (:obj:`int`, `optional`, defaults to 10): | ||
164 | - The minimum length of the sequence to be generated. | ||
165 | - do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
166 | - Whether or not to use sampling ; use greedy decoding otherwise. | ||
167 | - early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
168 | - Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. | ||
169 | - num_beams (:obj:`int`, `optional`, defaults to 1): | ||
170 | - Number of beams for beam search. 1 means no beam search. | ||
171 | - temperature (:obj:`float`, `optional`, defaults to 1.0): | ||
172 | - The value used to module the next token probabilities. | ||
173 | - top_k (:obj:`int`, `optional`, defaults to 50): | ||
174 | - The number of highest probability vocabulary tokens to keep for top-k-filtering. | ||
175 | - top_p (:obj:`float`, `optional`, defaults to 1.0): | ||
176 | - If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or | ||
177 | - higher are kept for generation. | ||
178 | - repetition_penalty (:obj:`float`, `optional`, defaults to 1.0): | ||
179 | - The parameter for repetition penalty. 1.0 means no penalty. See `this paper | ||
180 | - <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details. | ||
181 | - pad_token_id (:obj:`int`, `optional`): | ||
182 | - The id of the `padding` token. | ||
183 | - bos_token_id (:obj:`int`, `optional`): | ||
184 | - The id of the `beginning-of-sequence` token. | ||
185 | - eos_token_id (:obj:`int`, `optional`): | ||
186 | - The id of the `end-of-sequence` token. | ||
187 | - length_penalty (:obj:`float`, `optional`, defaults to 1.0): | ||
188 | - Exponential penalty to the length. 1.0 means no penalty. | ||
189 | - | ||
190 | - Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in | ||
191 | - order to encourage the model to produce longer sequences. | ||
192 | - no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): | ||
193 | - If set to int > 0, all ngrams of that size can only occur once. | ||
194 | - bad_words_ids(:obj:`List[int]`, `optional`): | ||
195 | - List of token ids that are not allowed to be generated. In order to get the tokens of the words that | ||
196 | - should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`. | ||
197 | - num_return_sequences(:obj:`int`, `optional`, defaults to 1): | ||
198 | - The number of independently computed returned sequences for each element in the batch. | ||
199 | - attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): | ||
200 | - Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for | ||
201 | - tokens that are not masked, and 0 for masked tokens. | ||
202 | - | ||
203 | - If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token. | ||
204 | - | ||
205 | - `What are attention masks? <../glossary.html#attention-mask>`__ | ||
206 | - decoder_start_token_id (:obj:`int`, `optional`): | ||
207 | - If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token. | ||
208 | - use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`): | ||
209 | - Whether or not the model should use the past last key/values attentions (if applicable to the model) to | ||
210 | - speed up decoding. | ||
211 | - model_kwargs: | ||
212 | - Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. | ||
213 | - | ||
214 | - Return: | ||
215 | - | ||
216 | - :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: | ||
217 | - The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or | ||
218 | - shorter if all batches finished early due to the :obj:`eos_token_id`. | ||
219 | - | ||
220 | - Examples:: | ||
221 | - | ||
222 | - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer | ||
223 | - model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. | ||
224 | - outputs = model.generate(max_length=40) # do greedy decoding | ||
225 | - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) | ||
226 | - | ||
227 | - tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer | ||
228 | - model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. | ||
229 | - input_context = 'The dog' | ||
230 | - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context | ||
231 | - outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' | ||
232 | - for i in range(3): # 3 output sequences were generated | ||
233 | - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) | ||
234 | - | ||
235 | - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer | ||
236 | - model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. | ||
237 | - input_context = 'The dog' | ||
238 | - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context | ||
239 | - outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # generate 3 candidates using sampling | ||
240 | - for i in range(3): # 3 output sequences were generated | ||
241 | - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) | ||
242 | - | ||
243 | - tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer | ||
244 | - model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. | ||
245 | - input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl | ||
246 | - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context | ||
247 | - outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences | ||
248 | - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) | ||
249 | - | ||
250 | - tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer | ||
251 | - model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. | ||
252 | - input_context = 'My cute dog' # prompt to complete | ||
253 | - bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] | ||
254 | - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context | ||
255 | - outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated | ||
256 | - """ | ||
257 | - | ||
258 | - # We cannot generate if the model does not have a LM head | ||
259 | - if self.get_output_embeddings() is None: | ||
260 | - raise AttributeError( | ||
261 | - "You tried to generate sequences with a model that does not have a LM Head." | ||
262 | - "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" | ||
263 | - ) | ||
264 | - | ||
265 | - max_length = max_length if max_length is not None else self.config.max_length | ||
266 | - min_length = min_length if min_length is not None else self.config.min_length | ||
267 | - do_sample = do_sample if do_sample is not None else self.config.do_sample | ||
268 | - early_stopping = ( | ||
269 | - early_stopping if early_stopping is not None else self.config.early_stopping | ||
270 | - ) | ||
271 | - use_cache = use_cache if use_cache is not None else self.config.use_cache | ||
272 | - num_beams = num_beams if num_beams is not None else self.config.num_beams | ||
273 | - temperature = ( | ||
274 | - temperature if temperature is not None else self.config.temperature | ||
275 | - ) | ||
276 | - top_k = top_k if top_k is not None else self.config.top_k | ||
277 | - top_p = top_p if top_p is not None else self.config.top_p | ||
278 | - repetition_penalty = ( | ||
279 | - repetition_penalty | ||
280 | - if repetition_penalty is not None | ||
281 | - else self.config.repetition_penalty | ||
282 | - ) | ||
283 | - bos_token_id = ( | ||
284 | - bos_token_id if bos_token_id is not None else self.config.bos_token_id | ||
285 | - ) | ||
286 | - pad_token_id = ( | ||
287 | - pad_token_id if pad_token_id is not None else self.config.pad_token_id | ||
288 | - ) | ||
289 | - eos_token_id = ( | ||
290 | - eos_token_id if eos_token_id is not None else self.config.eos_token_id | ||
291 | - ) | ||
292 | - length_penalty = ( | ||
293 | - length_penalty if length_penalty is not None else self.config.length_penalty | ||
294 | - ) | ||
295 | - no_repeat_ngram_size = ( | ||
296 | - no_repeat_ngram_size | ||
297 | - if no_repeat_ngram_size is not None | ||
298 | - else self.config.no_repeat_ngram_size | ||
299 | - ) | ||
300 | - bad_words_ids = ( | ||
301 | - bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids | ||
302 | - ) | ||
303 | - num_return_sequences = ( | ||
304 | - num_return_sequences | ||
305 | - if num_return_sequences is not None | ||
306 | - else self.config.num_return_sequences | ||
307 | - ) | ||
308 | - decoder_start_token_id = ( | ||
309 | - decoder_start_token_id | ||
310 | - if decoder_start_token_id is not None | ||
311 | - else self.config.decoder_start_token_id | ||
312 | - ) | ||
313 | - | ||
314 | - if input_ids is not None: | ||
315 | - batch_size = input_ids.shape[0] # overridden by the input batch_size | ||
316 | - else: | ||
317 | - batch_size = 1 | ||
318 | - | ||
319 | - assert ( | ||
320 | - isinstance(max_length, int) and max_length > 0 | ||
321 | - ), "`max_length` should be a strictly positive integer." | ||
322 | - assert ( | ||
323 | - isinstance(min_length, int) and min_length >= 0 | ||
324 | - ), "`min_length` should be a positive integer." | ||
325 | - assert isinstance(do_sample, bool), "`do_sample` should be a boolean." | ||
326 | - assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." | ||
327 | - assert isinstance(use_cache, bool), "`use_cache` should be a boolean." | ||
328 | - assert ( | ||
329 | - isinstance(num_beams, int) and num_beams > 0 | ||
330 | - ), "`num_beams` should be a strictly positive integer." | ||
331 | - assert temperature > 0, "`temperature` should be strictly positive." | ||
332 | - assert ( | ||
333 | - isinstance(top_k, int) and top_k >= 0 | ||
334 | - ), "`top_k` should be a positive integer." | ||
335 | - assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." | ||
336 | - assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." | ||
337 | - assert input_ids is not None or ( | ||
338 | - isinstance(bos_token_id, int) and bos_token_id >= 0 | ||
339 | - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." | ||
340 | - assert pad_token_id is None or ( | ||
341 | - isinstance(pad_token_id, int) and (pad_token_id >= 0) | ||
342 | - ), "`pad_token_id` should be a positive integer." | ||
343 | - assert (eos_token_id is None) or ( | ||
344 | - isinstance(eos_token_id, int) and (eos_token_id >= 0) | ||
345 | - ), "`eos_token_id` should be a positive integer." | ||
346 | - assert length_penalty > 0, "`length_penalty` should be strictly positive." | ||
347 | - assert ( | ||
348 | - isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 | ||
349 | - ), "`no_repeat_ngram_size` should be a positive integer." | ||
350 | - assert ( | ||
351 | - isinstance(num_return_sequences, int) and num_return_sequences > 0 | ||
352 | - ), "`num_return_sequences` should be a strictly positive integer." | ||
353 | - assert ( | ||
354 | - bad_words_ids is None | ||
355 | - or isinstance(bad_words_ids, list) | ||
356 | - and isinstance(bad_words_ids[0], list) | ||
357 | - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" | ||
358 | - | ||
359 | - if input_ids is None: | ||
360 | - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( | ||
361 | - "you should either supply a context to complete as `input_ids` input " | ||
362 | - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." | ||
363 | - ) | ||
364 | - input_ids = torch.full( | ||
365 | - (batch_size, 1), | ||
366 | - bos_token_id, | ||
367 | - dtype=torch.long, | ||
368 | - device=next(self.parameters()).device, | ||
369 | - ) | ||
370 | - else: | ||
371 | - assert ( | ||
372 | - input_ids.dim() == 2 | ||
373 | - ), "Input prompt should be of shape (batch_size, sequence length)." | ||
374 | - | ||
375 | - # do not allow duplicated outputs when greedy decoding | ||
376 | - if do_sample is False: | ||
377 | - if num_beams == 1: | ||
378 | - # no_beam_search greedy generation conditions | ||
379 | - assert ( | ||
380 | - num_return_sequences == 1 | ||
381 | - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" | ||
382 | - | ||
383 | - else: | ||
384 | - # beam_search greedy generation conditions | ||
385 | - assert ( | ||
386 | - num_beams >= num_return_sequences | ||
387 | - ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" | ||
388 | - | ||
389 | - # create attention mask if necessary | ||
390 | - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 | ||
391 | - if ( | ||
392 | - (attention_mask is None) | ||
393 | - and (pad_token_id is not None) | ||
394 | - and (pad_token_id in input_ids) | ||
395 | - ): | ||
396 | - attention_mask = input_ids.ne(pad_token_id).long() | ||
397 | - elif attention_mask is None: | ||
398 | - attention_mask = input_ids.new_ones(input_ids.shape) | ||
399 | - | ||
400 | - # set pad_token_id to eos_token_id if not set. Important that this is done after | ||
401 | - # attention_mask is created | ||
402 | - if pad_token_id is None and eos_token_id is not None: | ||
403 | - logger.warning( | ||
404 | - "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format( | ||
405 | - eos_token_id | ||
406 | - ) | ||
407 | - ) | ||
408 | - pad_token_id = eos_token_id | ||
409 | - | ||
410 | - # current position and vocab size | ||
411 | - if hasattr(self.config, "vocab_size"): | ||
412 | - vocab_size = self.config.vocab_size | ||
413 | - elif ( | ||
414 | - self.config.is_encoder_decoder | ||
415 | - and hasattr(self.config, "decoder") | ||
416 | - and hasattr(self.config.decoder, "vocab_size") | ||
417 | - ): | ||
418 | - vocab_size = self.config.decoder.vocab_size | ||
419 | - | ||
420 | - # set effective batch size and effective batch multiplier according to do_sample | ||
421 | - if do_sample: | ||
422 | - effective_batch_size = batch_size * num_return_sequences | ||
423 | - effective_batch_mult = num_return_sequences | ||
424 | - else: | ||
425 | - effective_batch_size = batch_size | ||
426 | - effective_batch_mult = 1 | ||
427 | - | ||
428 | - if self.config.is_encoder_decoder: | ||
429 | - if decoder_start_token_id is None: | ||
430 | - # see if BOS token can be used for decoder_start_token_id | ||
431 | - if bos_token_id is not None: | ||
432 | - decoder_start_token_id = bos_token_id | ||
433 | - elif hasattr(self.config, "decoder") and hasattr( | ||
434 | - self.config.decoder, "bos_token_id" | ||
435 | - ): | ||
436 | - decoder_start_token_id = self.config.decoder.bos_token_id | ||
437 | - else: | ||
438 | - raise ValueError( | ||
439 | - "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" | ||
440 | - ) | ||
441 | - | ||
442 | - assert hasattr( | ||
443 | - self, "get_encoder" | ||
444 | - ), "{} should have a 'get_encoder' function defined".format(self) | ||
445 | - assert callable(self.get_encoder), "{} should be a method".format( | ||
446 | - self.get_encoder | ||
447 | - ) | ||
448 | - | ||
449 | - # get encoder and store encoder outputs | ||
450 | - encoder = self.get_encoder() | ||
451 | - encoder_outputs: ModelOutput = encoder( | ||
452 | - input_ids, patch_ids, attention_mask=attention_mask, return_dict=True | ||
453 | - ) | ||
454 | - | ||
455 | - # Expand input ids if num_beams > 1 or num_return_sequences > 1 | ||
456 | - if num_return_sequences > 1 or num_beams > 1: | ||
457 | - input_ids_len = input_ids.shape[-1] | ||
458 | - input_ids = input_ids.unsqueeze(1).expand( | ||
459 | - batch_size, effective_batch_mult * num_beams, input_ids_len | ||
460 | - ) | ||
461 | - patch_ids = patch_ids.unsqueeze(1).expand( | ||
462 | - batch_size, effective_batch_mult * num_beams, input_ids_len | ||
463 | - ) | ||
464 | - attention_mask = attention_mask.unsqueeze(1).expand( | ||
465 | - batch_size, effective_batch_mult * num_beams, input_ids_len | ||
466 | - ) | ||
467 | - | ||
468 | - input_ids = input_ids.contiguous().view( | ||
469 | - effective_batch_size * num_beams, input_ids_len | ||
470 | - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) | ||
471 | - patch_ids = patch_ids.contiguous().view( | ||
472 | - effective_batch_size * num_beams, input_ids_len | ||
473 | - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) | ||
474 | - attention_mask = attention_mask.contiguous().view( | ||
475 | - effective_batch_size * num_beams, input_ids_len | ||
476 | - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) | ||
477 | - | ||
478 | - if self.config.is_encoder_decoder: | ||
479 | - # create empty decoder_input_ids | ||
480 | - input_ids = torch.full( | ||
481 | - (effective_batch_size * num_beams, 1), | ||
482 | - decoder_start_token_id, | ||
483 | - dtype=torch.long, | ||
484 | - device=next(self.parameters()).device, | ||
485 | - ) | ||
486 | - cur_len = 1 | ||
487 | - | ||
488 | - assert ( | ||
489 | - batch_size == encoder_outputs.last_hidden_state.shape[0] | ||
490 | - ), f"expected encoder_outputs.last_hidden_state to have 1st dimension bs={batch_size}, got {encoder_outputs.last_hidden_state.shape[0]} " | ||
491 | - | ||
492 | - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) | ||
493 | - expanded_batch_idxs = ( | ||
494 | - torch.arange(batch_size) | ||
495 | - .view(-1, 1) | ||
496 | - .repeat(1, num_beams * effective_batch_mult) | ||
497 | - .view(-1) | ||
498 | - .to(input_ids.device) | ||
499 | - ) | ||
500 | - | ||
501 | - # expand encoder_outputs | ||
502 | - encoder_outputs[ | ||
503 | - "last_hidden_state" | ||
504 | - ] = encoder_outputs.last_hidden_state.index_select(0, expanded_batch_idxs) | ||
505 | - | ||
506 | - # save encoder_outputs in `model_kwargs` | ||
507 | - model_kwargs["encoder_outputs"] = encoder_outputs | ||
508 | - | ||
509 | - else: | ||
510 | - cur_len = input_ids.shape[-1] | ||
511 | - | ||
512 | - assert ( | ||
513 | - cur_len < max_length | ||
514 | - ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" | ||
515 | - | ||
516 | - if num_beams > 1: | ||
517 | - output = self._generate_beam_search( | ||
518 | - input_ids, | ||
519 | - cur_len=cur_len, | ||
520 | - max_length=max_length, | ||
521 | - min_length=min_length, | ||
522 | - do_sample=do_sample, | ||
523 | - early_stopping=early_stopping, | ||
524 | - temperature=temperature, | ||
525 | - top_k=top_k, | ||
526 | - top_p=top_p, | ||
527 | - repetition_penalty=repetition_penalty, | ||
528 | - no_repeat_ngram_size=no_repeat_ngram_size, | ||
529 | - bad_words_ids=bad_words_ids, | ||
530 | - pad_token_id=pad_token_id, | ||
531 | - eos_token_id=eos_token_id, | ||
532 | - batch_size=effective_batch_size, | ||
533 | - num_return_sequences=num_return_sequences, | ||
534 | - length_penalty=length_penalty, | ||
535 | - num_beams=num_beams, | ||
536 | - vocab_size=vocab_size, | ||
537 | - attention_mask=attention_mask, | ||
538 | - use_cache=use_cache, | ||
539 | - model_kwargs=model_kwargs, | ||
540 | - ) | ||
541 | - else: | ||
542 | - output = self._generate_no_beam_search( | ||
543 | - input_ids, | ||
544 | - cur_len=cur_len, | ||
545 | - max_length=max_length, | ||
546 | - min_length=min_length, | ||
547 | - do_sample=do_sample, | ||
548 | - temperature=temperature, | ||
549 | - top_k=top_k, | ||
550 | - top_p=top_p, | ||
551 | - repetition_penalty=repetition_penalty, | ||
552 | - no_repeat_ngram_size=no_repeat_ngram_size, | ||
553 | - bad_words_ids=bad_words_ids, | ||
554 | - pad_token_id=pad_token_id, | ||
555 | - eos_token_id=eos_token_id, | ||
556 | - batch_size=effective_batch_size, | ||
557 | - attention_mask=attention_mask, | ||
558 | - use_cache=use_cache, | ||
559 | - model_kwargs=model_kwargs, | ||
560 | - ) | ||
561 | - | ||
562 | - return output | ||
563 | - | ||
564 | - def _generate_no_beam_search( | ||
565 | - self, | ||
566 | - input_ids, | ||
567 | - cur_len, | ||
568 | - max_length, | ||
569 | - min_length, | ||
570 | - do_sample, | ||
571 | - temperature, | ||
572 | - top_k, | ||
573 | - top_p, | ||
574 | - repetition_penalty, | ||
575 | - no_repeat_ngram_size, | ||
576 | - bad_words_ids, | ||
577 | - pad_token_id, | ||
578 | - eos_token_id, | ||
579 | - batch_size, | ||
580 | - attention_mask, | ||
581 | - use_cache, | ||
582 | - model_kwargs, | ||
583 | - ): | ||
584 | - """Generate sequences for each example without beam search (num_beams == 1). | ||
585 | - All returned sequences are generated independently. | ||
586 | - """ | ||
587 | - # length of generated sentences / unfinished sentences | ||
588 | - unfinished_sents = input_ids.new(batch_size).fill_(1) | ||
589 | - sent_lengths = input_ids.new(batch_size).fill_(max_length) | ||
590 | - | ||
591 | - past = None | ||
592 | - while cur_len < max_length: | ||
593 | - model_inputs = self.prepare_inputs_for_generation( | ||
594 | - input_ids, | ||
595 | - past=past, | ||
596 | - attention_mask=attention_mask, | ||
597 | - use_cache=use_cache, | ||
598 | - **model_kwargs, | ||
599 | - ) | ||
600 | - | ||
601 | - outputs = self(**model_inputs, return_dict=True) | ||
602 | - next_token_logits = outputs.logits[:, -1, :] | ||
603 | - | ||
604 | - scores = self.postprocess_next_token_scores( | ||
605 | - scores=next_token_logits, | ||
606 | - input_ids=input_ids, | ||
607 | - no_repeat_ngram_size=no_repeat_ngram_size, | ||
608 | - bad_words_ids=bad_words_ids, | ||
609 | - cur_len=cur_len, | ||
610 | - min_length=min_length, | ||
611 | - max_length=max_length, | ||
612 | - eos_token_id=eos_token_id, | ||
613 | - repetition_penalty=repetition_penalty, | ||
614 | - batch_size=batch_size, | ||
615 | - num_beams=1, | ||
616 | - ) | ||
617 | - | ||
618 | - # if model has past, then set the past variable to speed up decoding | ||
619 | - if "past_key_values" in outputs: | ||
620 | - past = outputs.past_key_values | ||
621 | - elif "mems" in outputs: | ||
622 | - past = outputs.mems | ||
623 | - | ||
624 | - if do_sample: | ||
625 | - # Temperature (higher temperature => more likely to sample low probability tokens) | ||
626 | - if temperature != 1.0: | ||
627 | - scores = scores / temperature | ||
628 | - # Top-p/top-k filtering | ||
629 | - next_token_logscores = top_k_top_p_filtering( | ||
630 | - scores, top_k=top_k, top_p=top_p | ||
631 | - ) | ||
632 | - # Sample | ||
633 | - probs = F.softmax(next_token_logscores, dim=-1) | ||
634 | - next_token = torch.multinomial(probs, num_samples=1).squeeze(1) | ||
635 | - else: | ||
636 | - # Greedy decoding | ||
637 | - next_token = torch.argmax(next_token_logits, dim=-1) | ||
638 | - | ||
639 | - # update generations and finished sentences | ||
640 | - if eos_token_id is not None: | ||
641 | - # pad finished sentences if eos_token_id exist | ||
642 | - tokens_to_add = next_token * unfinished_sents + (pad_token_id) * ( | ||
643 | - 1 - unfinished_sents | ||
644 | - ) | ||
645 | - else: | ||
646 | - tokens_to_add = next_token | ||
647 | - | ||
648 | - # add token and increase length by one | ||
649 | - input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) | ||
650 | - cur_len = cur_len + 1 | ||
651 | - | ||
652 | - if eos_token_id is not None: | ||
653 | - eos_in_sents = tokens_to_add == eos_token_id | ||
654 | - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length | ||
655 | - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul( | ||
656 | - eos_in_sents.long() | ||
657 | - ).bool() | ||
658 | - sent_lengths.masked_fill_( | ||
659 | - is_sents_unfinished_and_token_to_add_is_eos, cur_len | ||
660 | - ) | ||
661 | - # unfinished_sents is set to zero if eos in sentence | ||
662 | - unfinished_sents.mul_((~eos_in_sents).long()) | ||
663 | - | ||
664 | - # stop when there is a </s> in each sentence, or if we exceed the maximum length | ||
665 | - if unfinished_sents.max() == 0: | ||
666 | - break | ||
667 | - | ||
668 | - # extend attention_mask for new generated input if only decoder | ||
669 | - if self.config.is_encoder_decoder is False: | ||
670 | - attention_mask = torch.cat( | ||
671 | - [ | ||
672 | - attention_mask, | ||
673 | - attention_mask.new_ones((attention_mask.shape[0], 1)), | ||
674 | - ], | ||
675 | - dim=-1, | ||
676 | - ) | ||
677 | - | ||
678 | - return input_ids | ||
679 | - | ||
680 | - def _generate_beam_search( | ||
681 | - self, | ||
682 | - input_ids, | ||
683 | - cur_len, | ||
684 | - max_length, | ||
685 | - min_length, | ||
686 | - do_sample, | ||
687 | - early_stopping, | ||
688 | - temperature, | ||
689 | - top_k, | ||
690 | - top_p, | ||
691 | - repetition_penalty, | ||
692 | - no_repeat_ngram_size, | ||
693 | - bad_words_ids, | ||
694 | - pad_token_id, | ||
695 | - eos_token_id, | ||
696 | - batch_size, | ||
697 | - num_return_sequences, | ||
698 | - length_penalty, | ||
699 | - num_beams, | ||
700 | - vocab_size, | ||
701 | - attention_mask, | ||
702 | - use_cache, | ||
703 | - model_kwargs, | ||
704 | - ): | ||
705 | - """Generate sequences for each example with beam search.""" | ||
706 | - | ||
707 | - # generated hypotheses | ||
708 | - generated_hyps = [ | ||
709 | - BeamHypotheses( | ||
710 | - num_beams, max_length, length_penalty, early_stopping=early_stopping | ||
711 | - ) | ||
712 | - for _ in range(batch_size) | ||
713 | - ] | ||
714 | - | ||
715 | - # scores for each sentence in the beam | ||
716 | - beam_scores = torch.zeros( | ||
717 | - (batch_size, num_beams), dtype=torch.float, device=input_ids.device | ||
718 | - ) | ||
719 | - | ||
720 | - # for greedy decoding, make sure only tokens of the first beam are considered, to avoid sampling the exact same tokens in every beam | ||
721 | - if do_sample is False: | ||
722 | - beam_scores[:, 1:] = -1e9 | ||
723 | - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) | ||
724 | - | ||
725 | - # cache compute states | ||
726 | - past = None | ||
727 | - | ||
728 | - # done sentences | ||
729 | - done = [False for _ in range(batch_size)] | ||
730 | - | ||
731 | - while cur_len < max_length: | ||
732 | - model_inputs = self.prepare_inputs_for_generation( | ||
733 | - input_ids, | ||
734 | - past=past, | ||
735 | - attention_mask=attention_mask, | ||
736 | - use_cache=use_cache, | ||
737 | - **model_kwargs, | ||
738 | - ) | ||
739 | - outputs = self( | ||
740 | - **model_inputs, return_dict=True | ||
741 | - ) # (batch_size * num_beams, cur_len, vocab_size) | ||
742 | - next_token_logits = outputs.logits[ | ||
743 | - :, -1, : | ||
744 | - ] # (batch_size * num_beams, vocab_size) | ||
745 | - | ||
746 | - # if model has past, then set the past variable to speed up decoding | ||
747 | - if "past_key_values" in outputs: | ||
748 | - past = outputs.past_key_values | ||
749 | - elif "mems" in outputs: | ||
750 | - past = outputs.mems | ||
751 | - | ||
752 | - if self.config.is_encoder_decoder and do_sample is False: | ||
753 | - # TODO (PVP) still a bit hacky here - there might be a better solution | ||
754 | - next_token_logits = self.adjust_logits_during_generation( | ||
755 | - next_token_logits, cur_len=cur_len, max_length=max_length | ||
756 | - ) | ||
757 | - | ||
758 | - scores = F.log_softmax( | ||
759 | - next_token_logits, dim=-1 | ||
760 | - ) # (batch_size * num_beams, vocab_size) | ||
761 | - | ||
762 | - scores = self.postprocess_next_token_scores( | ||
763 | - scores=scores, | ||
764 | - input_ids=input_ids, | ||
765 | - no_repeat_ngram_size=no_repeat_ngram_size, | ||
766 | - bad_words_ids=bad_words_ids, | ||
767 | - cur_len=cur_len, | ||
768 | - min_length=min_length, | ||
769 | - max_length=max_length, | ||
770 | - eos_token_id=eos_token_id, | ||
771 | - repetition_penalty=repetition_penalty, | ||
772 | - batch_size=batch_size, | ||
773 | - num_beams=num_beams, | ||
774 | - ) | ||
775 | - | ||
776 | - assert scores.shape == ( | ||
777 | - batch_size * num_beams, | ||
778 | - vocab_size, | ||
779 | - ), "Shapes of scores: {} != {}".format( | ||
780 | - scores.shape, (batch_size * num_beams, vocab_size) | ||
781 | - ) | ||
782 | - | ||
783 | - if do_sample: | ||
784 | - _scores = scores + beam_scores[:, None].expand_as( | ||
785 | - scores | ||
786 | - ) # (batch_size * num_beams, vocab_size) | ||
787 | - # Temperature | ||
788 | - if temperature != 1.0: | ||
789 | - _scores = _scores / temperature | ||
790 | - # Top-p/top-k filtering | ||
791 | - _scores = top_k_top_p_filtering( | ||
792 | - _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 | ||
793 | - ) # (batch_size * num_beams, vocab_size) | ||
794 | - # re-organize to group the beam together to sample from all beam_idxs | ||
795 | - _scores = _scores.contiguous().view( | ||
796 | - batch_size, num_beams * vocab_size | ||
797 | - ) # (batch_size, num_beams * vocab_size) | ||
798 | - | ||
799 | - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) | ||
800 | - probs = F.softmax(_scores, dim=-1) | ||
801 | - next_tokens = torch.multinomial( | ||
802 | - probs, num_samples=2 * num_beams | ||
803 | - ) # (batch_size, num_beams * 2) | ||
804 | - # Compute next scores | ||
805 | - next_scores = torch.gather( | ||
806 | - _scores, -1, next_tokens | ||
807 | - ) # (batch_size, num_beams * 2) | ||
808 | - # sort the sampled vector to make sure that the first num_beams samples are the best | ||
809 | - next_scores, next_scores_indices = torch.sort( | ||
810 | - next_scores, descending=True, dim=1 | ||
811 | - ) | ||
812 | - next_tokens = torch.gather( | ||
813 | - next_tokens, -1, next_scores_indices | ||
814 | - ) # (batch_size, num_beams * 2) | ||
815 | - | ||
816 | - else: | ||
817 | - next_scores = scores + beam_scores[:, None].expand_as( | ||
818 | - scores | ||
819 | - ) # (batch_size * num_beams, vocab_size) | ||
820 | - | ||
821 | - # re-organize to group the beam together (we keep the top hypotheses across beams) | ||
822 | - next_scores = next_scores.view( | ||
823 | - batch_size, num_beams * vocab_size | ||
824 | - ) # (batch_size, num_beams * vocab_size) | ||
825 | - | ||
826 | - next_scores, next_tokens = torch.topk( | ||
827 | - next_scores, 2 * num_beams, dim=1, largest=True, sorted=True | ||
828 | - ) | ||
829 | - | ||
830 | - assert ( | ||
831 | - next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) | ||
832 | - ) | ||
833 | - | ||
834 | - # next batch beam content | ||
835 | - next_batch_beam = [] | ||
836 | - | ||
837 | - # for each sentence | ||
838 | - for batch_idx in range(batch_size): | ||
839 | - | ||
840 | - # if we are done with this sentence, add a pad token | ||
841 | - if done[batch_idx]: | ||
842 | - assert ( | ||
843 | - len(generated_hyps[batch_idx]) >= num_beams | ||
844 | - ), "Batch can only be done if at least {} beams have been generated".format( | ||
845 | - num_beams | ||
846 | - ) | ||
847 | - assert ( | ||
848 | - eos_token_id is not None and pad_token_id is not None | ||
849 | - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" | ||
850 | - next_batch_beam.extend( | ||
851 | - [(0, pad_token_id, 0)] * num_beams | ||
852 | - ) # pad the batch | ||
853 | - continue | ||
854 | - | ||
855 | - # next sentence beam content, this will get added to next_batch_beam | ||
856 | - next_sent_beam = [] | ||
857 | - | ||
858 | - # next tokens for this sentence | ||
859 | - for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( | ||
860 | - zip(next_tokens[batch_idx], next_scores[batch_idx]) | ||
861 | - ): | ||
862 | - # get beam and token IDs | ||
863 | - beam_id = beam_token_id // vocab_size | ||
864 | - token_id = beam_token_id % vocab_size | ||
865 | - | ||
866 | - effective_beam_id = batch_idx * num_beams + beam_id | ||
867 | - # add to generated hypotheses if end of sentence | ||
868 | - if (eos_token_id is not None) and (token_id.item() == eos_token_id): | ||
869 | - # if beam_token does not belong to top num_beams tokens, it should not be added | ||
870 | - is_beam_token_worse_than_top_num_beams = ( | ||
871 | - beam_token_rank >= num_beams | ||
872 | - ) | ||
873 | - if is_beam_token_worse_than_top_num_beams: | ||
874 | - continue | ||
875 | - generated_hyps[batch_idx].add( | ||
876 | - input_ids[effective_beam_id].clone(), | ||
877 | - beam_token_score.item(), | ||
878 | - ) | ||
879 | - else: | ||
880 | - # add next predicted token since it is not eos_token | ||
881 | - next_sent_beam.append( | ||
882 | - (beam_token_score, token_id, effective_beam_id) | ||
883 | - ) | ||
884 | - | ||
885 | - # once the beam for next step is full, don't add more tokens to it. | ||
886 | - if len(next_sent_beam) == num_beams: | ||
887 | - break | ||
888 | - | ||
889 | - # Check if we are done so that we can save a pad step if all(done) | ||
890 | - done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( | ||
891 | - next_scores[batch_idx].max().item(), cur_len | ||
892 | - ) | ||
893 | - | ||
894 | - # update next beam content | ||
895 | - assert len(next_sent_beam) == num_beams, "Beam should always be full" | ||
896 | - next_batch_beam.extend(next_sent_beam) | ||
897 | - assert len(next_batch_beam) == num_beams * ( | ||
898 | - batch_idx + 1 | ||
899 | - ), "We should have added num_beams each step" | ||
900 | - | ||
901 | - # stop when we are done with each sentence | ||
902 | - if all(done): | ||
903 | - break | ||
904 | - | ||
905 | - # sanity check / prepare next batch | ||
906 | - assert len(next_batch_beam) == batch_size * num_beams | ||
907 | - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) | ||
908 | - beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) | ||
909 | - beam_idx = input_ids.new([x[2] for x in next_batch_beam]) | ||
910 | - | ||
911 | - # re-order batch and update current length | ||
912 | - input_ids = input_ids[beam_idx, :] | ||
913 | - input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) | ||
914 | - cur_len = cur_len + 1 | ||
915 | - | ||
916 | - # re-order internal states | ||
917 | - if past is not None: | ||
918 | - past = self._reorder_cache(past, beam_idx) | ||
919 | - | ||
920 | - # extend attention_mask for the newly generated token if the model is decoder-only | ||
921 | - if self.config.is_encoder_decoder is False: | ||
922 | - attention_mask = torch.cat( | ||
923 | - [ | ||
924 | - attention_mask, | ||
925 | - attention_mask.new_ones((attention_mask.shape[0], 1)), | ||
926 | - ], | ||
927 | - dim=-1, | ||
928 | - ) | ||
929 | - | ||
930 | - # finalize all open beam hypotheses and add to generated hypotheses | ||
931 | - for batch_idx in range(batch_size): | ||
932 | - if done[batch_idx]: | ||
933 | - continue | ||
934 | - | ||
935 | - # test that beam scores match previously calculated scores if not eos and batch_idx not done | ||
936 | - if eos_token_id is not None and all( | ||
937 | - (token_id % vocab_size).item() != eos_token_id | ||
938 | - for token_id in next_tokens[batch_idx] | ||
939 | - ): | ||
940 | - assert torch.all( | ||
941 | - next_scores[batch_idx, :num_beams] | ||
942 | - == beam_scores.view(batch_size, num_beams)[batch_idx] | ||
943 | - ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( | ||
944 | - next_scores[:, :num_beams][batch_idx], | ||
945 | - beam_scores.view(batch_size, num_beams)[batch_idx], | ||
946 | - ) | ||
947 | - | ||
948 | - # need to add best num_beams hypotheses to generated hyps | ||
949 | - for beam_id in range(num_beams): | ||
950 | - effective_beam_id = batch_idx * num_beams + beam_id | ||
951 | - final_score = beam_scores[effective_beam_id].item() | ||
952 | - final_tokens = input_ids[effective_beam_id] | ||
953 | - generated_hyps[batch_idx].add(final_tokens, final_score) | ||
954 | - | ||
955 | - # depending on whether greedy generation is wanted or not, define different output_batch_size and output_num_return_sequences_per_batch | ||
956 | - output_batch_size = ( | ||
957 | - batch_size if do_sample else batch_size * num_return_sequences | ||
958 | - ) | ||
959 | - output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences | ||
960 | - | ||
961 | - # select the best hypotheses | ||
962 | - sent_lengths = input_ids.new(output_batch_size) | ||
963 | - best = [] | ||
964 | - | ||
965 | - # retrieve best hypotheses | ||
966 | - for i, hypotheses in enumerate(generated_hyps): | ||
967 | - sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) | ||
968 | - for j in range(output_num_return_sequences_per_batch): | ||
969 | - effective_batch_idx = output_num_return_sequences_per_batch * i + j | ||
970 | - best_hyp = sorted_hyps.pop()[1] | ||
971 | - sent_lengths[effective_batch_idx] = len(best_hyp) | ||
972 | - best.append(best_hyp) | ||
973 | - | ||
974 | - # shorter hypotheses are padded up to the longest one | ||
975 | - if sent_lengths.min().item() != sent_lengths.max().item(): | ||
976 | - assert pad_token_id is not None, "`Pad_token_id` has to be defined" | ||
977 | - sent_max_len = min(sent_lengths.max().item() + 1, max_length) | ||
978 | - decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) | ||
979 | - | ||
980 | - # fill with hypothesis and eos_token_id if necessary | ||
981 | - for i, hypo in enumerate(best): | ||
982 | - decoded[i, : sent_lengths[i]] = hypo | ||
983 | - if sent_lengths[i] < max_length: | ||
984 | - decoded[i, sent_lengths[i]] = eos_token_id | ||
985 | - else: | ||
986 | - # none of the hypotheses have an eos_token | ||
987 | - assert all(len(hypo) == max_length for hypo in best) | ||
988 | - decoded = ( | ||
989 | - torch.stack(best).type(torch.long).to(next(self.parameters()).device) | ||
990 | - ) | ||
991 | - | ||
992 | - return decoded | ||
993 | - | ||
994 | - @staticmethod | ||
995 | - def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: | ||
996 | - return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) | ||
997 | - | ||
998 | - | ||
999 | -def calc_banned_ngram_tokens( | ||
1000 | - prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int | ||
1001 | -) -> List[List[int]]: | ||
1002 | - """Copied from fairseq for no_repeat_ngram in beam_search""" | ||
1003 | - if cur_len + 1 < no_repeat_ngram_size: | ||
1004 | - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet | ||
1005 | - return [[] for _ in range(num_hypos)] | ||
1006 | - generated_ngrams = [{} for _ in range(num_hypos)] | ||
1007 | - for idx in range(num_hypos): | ||
1008 | - gen_tokens = prev_input_ids[idx].tolist() | ||
1009 | - generated_ngram = generated_ngrams[idx] | ||
1010 | - for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): | ||
1011 | - prev_ngram_tuple = tuple(ngram[:-1]) | ||
1012 | - generated_ngram[prev_ngram_tuple] = generated_ngram.get( | ||
1013 | - prev_ngram_tuple, [] | ||
1014 | - ) + [ngram[-1]] | ||
1015 | - | ||
1016 | - def _get_generated_ngrams(hypo_idx): | ||
1017 | - # Before decoding the next token, prevent decoding of ngrams that have already appeared | ||
1018 | - start_idx = cur_len + 1 - no_repeat_ngram_size | ||
1019 | - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) | ||
1020 | - return generated_ngrams[hypo_idx].get(ngram_idx, []) | ||
1021 | - | ||
1022 | - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] | ||
1023 | - return banned_tokens | ||
1024 | - | ||
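A quick sketch of what `calc_banned_ngram_tokens` returns, assuming the helper above is in scope (toy token ids, purely illustrative):

    import torch

    # one hypothesis that already contains the bigram (7, 5); cur_len is the current sequence length
    prev = torch.tensor([[5, 7, 5, 7]])
    banned = calc_banned_ngram_tokens(prev, num_hypos=1, no_repeat_ngram_size=2, cur_len=4)
    print(banned)  # [[5]] -> emitting 5 right after the trailing 7 would repeat the bigram (7, 5)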
1025 | - | ||
1026 | -def calc_banned_bad_words_ids( | ||
1027 | - prev_input_ids: Iterable[int], bad_words_ids: Iterable[int] | ||
1028 | -) -> Iterable[int]: | ||
1029 | - banned_tokens = [] | ||
1030 | - | ||
1031 | - def _tokens_match(prev_tokens, tokens): | ||
1032 | - if len(tokens) == 0: | ||
1033 | - # if the bad word is a single token, always ban it | ||
1034 | - return True | ||
1035 | - if len(tokens) > len(prev_tokens): | ||
1036 | - # if bad word tokens are longer than prev tokens they can't be equal | ||
1037 | - return False | ||
1038 | - | ||
1039 | - if prev_tokens[-len(tokens) :] == tokens: | ||
1040 | - # if tokens match | ||
1041 | - return True | ||
1042 | - else: | ||
1043 | - return False | ||
1044 | - | ||
1045 | - for prev_input_ids_slice in prev_input_ids: | ||
1046 | - banned_tokens_slice = [] | ||
1047 | - | ||
1048 | - for banned_token_seq in bad_words_ids: | ||
1049 | - assert ( | ||
1050 | - len(banned_token_seq) > 0 | ||
1051 | - ), "Banned words token sequences {} cannot have an empty list".format( | ||
1052 | - bad_words_ids | ||
1053 | - ) | ||
1054 | - | ||
1055 | - if _tokens_match(prev_input_ids_slice, banned_token_seq[:-1]) is False: | ||
1056 | - # if tokens do not match continue | ||
1057 | - continue | ||
1058 | - | ||
1059 | - banned_tokens_slice.append(banned_token_seq[-1]) | ||
1060 | - | ||
1061 | - banned_tokens.append(banned_tokens_slice) | ||
1062 | - | ||
1063 | - return banned_tokens | ||
1064 | - | ||
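For illustration, a small sketch of `calc_banned_bad_words_ids` with plain Python lists (hypothetical token ids, assuming the helper above is in scope):

    # the generated prefix ends in (9, 4); bad word sequences are [9, 4, 11] and the single token [7]
    banned = calc_banned_bad_words_ids([[2, 9, 4]], [[9, 4, 11], [7]])
    print(banned)  # [[11, 7]] -> 11 would complete a banned sequence; 7 is a one-token bad word, so it is always banned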
1065 | - | ||
1066 | -def set_scores_to_inf_for_banned_tokens( | ||
1067 | - scores: torch.Tensor, banned_tokens: List[List[int]] | ||
1068 | -) -> None: | ||
1069 | - """Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be | ||
1070 | - a list of list of banned tokens to ban in the format [[batch index, vocabulary position],...] | ||
1071 | - Args: | ||
1072 | - scores: logits distribution of shape (batch size, vocabulary size) | ||
1073 | - banned_tokens: list of list of tokens to ban of length (batch_size) | ||
1074 | - """ | ||
1075 | - banned_mask_list = [] | ||
1076 | - for idx, batch_banned_tokens in enumerate(banned_tokens): | ||
1077 | - for token in batch_banned_tokens: | ||
1078 | - banned_mask_list.append([idx, token]) | ||
1079 | - if not banned_mask_list: | ||
1080 | - return | ||
1081 | - banned_mask = torch.LongTensor(banned_mask_list) | ||
1082 | - indices = torch.ones(len(banned_mask)) | ||
1083 | - # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. A conversion to dense tensor generates: | ||
1084 | - # [ 0 1 1 ] | ||
1085 | - # [ 0 0 0 ] | ||
1086 | - # [ 1 0 0 ] | ||
1087 | - | ||
1088 | - banned_mask = ( | ||
1089 | - torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) | ||
1090 | - .to(scores.device) | ||
1091 | - .to_dense() | ||
1092 | - .bool() | ||
1093 | - ) | ||
1094 | - scores.masked_fill_(banned_mask, -float("inf")) | ||
1095 | - | ||
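The helper above builds a sparse boolean mask before calling `masked_fill_`; its effect is equivalent to this simplified dense loop (an illustrative sketch, not the code path used above):

    import torch

    scores = torch.zeros(2, 5)
    banned_tokens = [[1, 3], []]  # ban tokens 1 and 3 in row 0, nothing in row 1
    for row, tokens in enumerate(banned_tokens):
        if tokens:
            scores[row, tokens] = -float("inf")
    # row 0 now has -inf at positions 1 and 3, so those tokens can never be chosen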
1096 | - | ||
1097 | -def top_k_top_p_filtering( | ||
1098 | - logits: Tensor, | ||
1099 | - top_k: int = 0, | ||
1100 | - top_p: float = 1.0, | ||
1101 | - filter_value: float = -float("Inf"), | ||
1102 | - min_tokens_to_keep: int = 1, | ||
1103 | -) -> Tensor: | ||
1104 | - """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering | ||
1105 | - Args: | ||
1106 | - logits: logits distribution shape (batch size, vocabulary size) | ||
1107 | - if top_k > 0: keep only top k tokens with highest probability (top-k filtering). | ||
1108 | - if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). | ||
1109 | - Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) | ||
1110 | - Make sure we keep at least min_tokens_to_keep per batch example in the output | ||
1111 | - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 | ||
1112 | - """ | ||
1113 | - if top_k > 0: | ||
1114 | - top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check | ||
1115 | - # Remove all tokens with a probability less than the last token of the top-k | ||
1116 | - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] | ||
1117 | - logits[indices_to_remove] = filter_value | ||
1118 | - | ||
1119 | - if top_p < 1.0: | ||
1120 | - sorted_logits, sorted_indices = torch.sort(logits, descending=True) | ||
1121 | - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) | ||
1122 | - | ||
1123 | - # Remove tokens with cumulative probability above the threshold (tokens with 0 are kept) | ||
1124 | - sorted_indices_to_remove = cumulative_probs > top_p | ||
1125 | - if min_tokens_to_keep > 1: | ||
1126 | - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) | ||
1127 | - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 | ||
1128 | - # Shift the indices to the right to keep also the first token above the threshold | ||
1129 | - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() | ||
1130 | - sorted_indices_to_remove[..., 0] = 0 | ||
1131 | - | ||
1132 | - # scatter sorted tensors to original indexing | ||
1133 | - indices_to_remove = sorted_indices_to_remove.scatter( | ||
1134 | - 1, sorted_indices, sorted_indices_to_remove | ||
1135 | - ) | ||
1136 | - logits[indices_to_remove] = filter_value | ||
1137 | - return logits | ||
1138 | - | ||
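A minimal demo of `top_k_top_p_filtering` with made-up logits (values chosen only to make the effect visible):

    import torch
    import torch.nn.functional as F

    logits = torch.tensor([[2.0, 1.0, 0.5, 0.1]])
    kept = top_k_top_p_filtering(logits.clone(), top_k=2)  # keep only the two highest logits
    print(kept)                                            # tensor([[2.0, 1.0, -inf, -inf]])
    probs = F.softmax(kept, dim=-1)                        # the filtered positions get probability 0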
1139 | - | ||
1140 | -class BeamHypotheses(object): | ||
1141 | - def __init__(self, num_beams, max_length, length_penalty, early_stopping): | ||
1142 | - """ | ||
1143 | - Initialize n-best list of hypotheses. | ||
1144 | - """ | ||
1145 | - self.max_length = max_length - 1 # ignoring bos_token | ||
1146 | - self.length_penalty = length_penalty | ||
1147 | - self.early_stopping = early_stopping | ||
1148 | - self.num_beams = num_beams | ||
1149 | - self.beams = [] | ||
1150 | - self.worst_score = 1e9 | ||
1151 | - | ||
1152 | - def __len__(self): | ||
1153 | - """ | ||
1154 | - Number of hypotheses in the list. | ||
1155 | - """ | ||
1156 | - return len(self.beams) | ||
1157 | - | ||
1158 | - def add(self, hyp, sum_logprobs): | ||
1159 | - """ | ||
1160 | - Add a new hypothesis to the list. | ||
1161 | - """ | ||
1162 | - score = sum_logprobs / len(hyp) ** self.length_penalty | ||
1163 | - if len(self) < self.num_beams or score > self.worst_score: | ||
1164 | - self.beams.append((score, hyp)) | ||
1165 | - if len(self) > self.num_beams: | ||
1166 | - sorted_scores = sorted( | ||
1167 | - [(s, idx) for idx, (s, _) in enumerate(self.beams)] | ||
1168 | - ) | ||
1169 | - del self.beams[sorted_scores[0][1]] | ||
1170 | - self.worst_score = sorted_scores[1][0] | ||
1171 | - else: | ||
1172 | - self.worst_score = min(score, self.worst_score) | ||
1173 | - | ||
1174 | - def is_done(self, best_sum_logprobs, cur_len): | ||
1175 | - """ | ||
1176 | - If there are enough hypotheses and none of the hypotheses being generated | ||
1177 | - can become better than the worst one in the heap, then we are done with this sentence. | ||
1178 | - """ | ||
1179 | - | ||
1180 | - if len(self) < self.num_beams: | ||
1181 | - return False | ||
1182 | - elif self.early_stopping: | ||
1183 | - return True | ||
1184 | - else: | ||
1185 | - cur_score = best_sum_logprobs / cur_len ** self.length_penalty | ||
1186 | - ret = self.worst_score >= cur_score | ||
1187 | - return ret |
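To make the length-penalty bookkeeping in `BeamHypotheses.add` concrete, a small sketch with hypothetical log-probabilities:

    import torch

    hyps = BeamHypotheses(num_beams=2, max_length=20, length_penalty=1.0, early_stopping=False)
    hyps.add(torch.tensor([0, 5, 7, 2]), sum_logprobs=-2.0)  # score = -2.0 / 4**1.0 = -0.50
    hyps.add(torch.tensor([0, 5, 2]), sum_logprobs=-1.8)     # score = -1.8 / 3**1.0 = -0.60
    # a length_penalty above 1.0 divides by a larger power of the length, which makes longer
    # hypotheses comparatively less negative, i.e. beam search then favours longer outputs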
train/lightning_base.py
deleted
100644 → 0
1 | -import argparse | ||
2 | -import logging | ||
3 | -import os | ||
4 | -from pathlib import Path | ||
5 | -from typing import Any, Dict | ||
6 | - | ||
7 | -import pytorch_lightning as pl | ||
8 | -from pytorch_lightning.utilities import rank_zero_info | ||
9 | - | ||
10 | -from transformers import ( | ||
11 | - AdamW, | ||
12 | - AutoConfig, | ||
13 | - AutoModel, | ||
14 | - AutoModelForPreTraining, | ||
15 | - AutoModelForQuestionAnswering, | ||
16 | - AutoModelForSeq2SeqLM, | ||
17 | - AutoModelForSequenceClassification, | ||
18 | - AutoModelForTokenClassification, | ||
19 | - AutoModelWithLMHead, | ||
20 | - AutoTokenizer, | ||
21 | - PretrainedConfig, | ||
22 | - PreTrainedTokenizer, | ||
23 | -) | ||
24 | -from train.modeling_bart import BartForConditionalGeneration | ||
25 | - | ||
26 | -from transformers.optimization import ( | ||
27 | - Adafactor, | ||
28 | - get_cosine_schedule_with_warmup, | ||
29 | - get_cosine_with_hard_restarts_schedule_with_warmup, | ||
30 | - get_linear_schedule_with_warmup, | ||
31 | - get_polynomial_decay_schedule_with_warmup, | ||
32 | -) | ||
33 | - | ||
34 | - | ||
35 | -logger = logging.getLogger(__name__) | ||
36 | - | ||
37 | - | ||
38 | -MODEL_MODES = { | ||
39 | - "base": AutoModel, | ||
40 | - "sequence-classification": AutoModelForSequenceClassification, | ||
41 | - "question-answering": AutoModelForQuestionAnswering, | ||
42 | - "pretraining": AutoModelForPreTraining, | ||
43 | - "token-classification": AutoModelForTokenClassification, | ||
44 | - "language-modeling": AutoModelWithLMHead, | ||
45 | - "summarization": BartForConditionalGeneration, | ||
46 | - "translation": AutoModelForSeq2SeqLM, | ||
47 | -} | ||
48 | - | ||
49 | - | ||
50 | -# update this and the import above to support new schedulers from transformers.optimization | ||
51 | -arg_to_scheduler = { | ||
52 | - "linear": get_linear_schedule_with_warmup, | ||
53 | - "cosine": get_cosine_schedule_with_warmup, | ||
54 | - "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup, | ||
55 | - "polynomial": get_polynomial_decay_schedule_with_warmup, | ||
56 | - # '': get_constant_schedule, # not supported for now | ||
57 | - # '': get_constant_schedule_with_warmup, # not supported for now | ||
58 | -} | ||
59 | -arg_to_scheduler_choices = sorted(arg_to_scheduler.keys()) | ||
60 | -arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}" | ||
61 | - | ||
62 | - | ||
63 | -class BaseTransformer(pl.LightningModule): | ||
64 | - def __init__( | ||
65 | - self, | ||
66 | - hparams: argparse.Namespace, | ||
67 | - num_labels=None, | ||
68 | - mode="base", | ||
69 | - config=None, | ||
70 | - tokenizer=None, | ||
71 | - model=None, | ||
72 | - **config_kwargs, | ||
73 | - ): | ||
74 | - """Initialize a model, tokenizer and config.""" | ||
75 | - super().__init__() | ||
76 | - # TODO: move to self.save_hyperparameters() | ||
77 | - # self.save_hyperparameters() | ||
78 | - # can also expand arguments into trainer signature for easier reading | ||
79 | - | ||
80 | - self.save_hyperparameters(hparams) | ||
81 | - self.step_count = 0 | ||
82 | - self.output_dir = Path(self.hparams.output_dir) | ||
83 | - cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None | ||
84 | - if config is None: | ||
85 | - self.config = AutoConfig.from_pretrained( | ||
86 | - self.hparams.config_name | ||
87 | - if self.hparams.config_name | ||
88 | - else self.hparams.model_name_or_path, | ||
89 | - **({"num_labels": num_labels} if num_labels is not None else {}), | ||
90 | - cache_dir=cache_dir, | ||
91 | - **config_kwargs, | ||
92 | - ) | ||
93 | - else: | ||
94 | - self.config: PretrainedConfig = config | ||
95 | - | ||
96 | - extra_model_params = ( | ||
97 | - "encoder_layerdrop", | ||
98 | - "decoder_layerdrop", | ||
99 | - "dropout", | ||
100 | - "attention_dropout", | ||
101 | - ) | ||
102 | - for p in extra_model_params: | ||
103 | - if getattr(self.hparams, p, None): | ||
104 | - assert hasattr( | ||
105 | - self.config, p | ||
106 | - ), f"model config doesn't have a `{p}` attribute" | ||
107 | - setattr(self.config, p, getattr(self.hparams, p)) | ||
108 | - | ||
109 | - if tokenizer is None: | ||
110 | - self.tokenizer = AutoTokenizer.from_pretrained( | ||
111 | - self.hparams.tokenizer_name | ||
112 | - if self.hparams.tokenizer_name | ||
113 | - else self.hparams.model_name_or_path, | ||
114 | - cache_dir=cache_dir, | ||
115 | - ) | ||
116 | - else: | ||
117 | - self.tokenizer: PreTrainedTokenizer = tokenizer | ||
118 | - self.model_type = MODEL_MODES[mode] | ||
119 | - if model is None: | ||
120 | - self.model = self.model_type.from_pretrained( | ||
121 | - self.hparams.model_name_or_path, | ||
122 | - from_tf=bool(".ckpt" in self.hparams.model_name_or_path), | ||
123 | - config=self.config, | ||
124 | - cache_dir=cache_dir, | ||
125 | - ) | ||
126 | - else: | ||
127 | - self.model = model | ||
128 | - self.model.resize_token_embeddings(len(tokenizer)) | ||
129 | - | ||
130 | - def load_hf_checkpoint(self, *args, **kwargs): | ||
131 | - self.model = self.model_type.from_pretrained(*args, **kwargs) | ||
132 | - | ||
133 | - def get_lr_scheduler(self): | ||
134 | - get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler] | ||
135 | - scheduler = get_schedule_func( | ||
136 | - self.opt, | ||
137 | - num_warmup_steps=self.hparams.warmup_steps, | ||
138 | - num_training_steps=self.total_steps, | ||
139 | - ) | ||
140 | - scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} | ||
141 | - return scheduler | ||
142 | - | ||
143 | - def configure_optimizers(self): | ||
144 | - """Prepare optimizer and schedule (linear warmup and decay)""" | ||
145 | - model = self.model | ||
146 | - no_decay = ["bias", "LayerNorm.weight"] | ||
147 | - optimizer_grouped_parameters = [ | ||
148 | - { | ||
149 | - "params": [ | ||
150 | - p | ||
151 | - for n, p in model.named_parameters() | ||
152 | - if not any(nd in n for nd in no_decay) | ||
153 | - ], | ||
154 | - "weight_decay": self.hparams.weight_decay, | ||
155 | - }, | ||
156 | - { | ||
157 | - "params": [ | ||
158 | - p | ||
159 | - for n, p in model.named_parameters() | ||
160 | - if any(nd in n for nd in no_decay) | ||
161 | - ], | ||
162 | - "weight_decay": 0.0, | ||
163 | - }, | ||
164 | - ] | ||
165 | - if self.hparams.adafactor: | ||
166 | - optimizer = Adafactor( | ||
167 | - optimizer_grouped_parameters, | ||
168 | - lr=self.hparams.learning_rate, | ||
169 | - scale_parameter=False, | ||
170 | - relative_step=False, | ||
171 | - ) | ||
172 | - | ||
173 | - else: | ||
174 | - optimizer = AdamW( | ||
175 | - optimizer_grouped_parameters, | ||
176 | - lr=self.hparams.learning_rate, | ||
177 | - eps=self.hparams.adam_epsilon, | ||
178 | - ) | ||
179 | - self.opt = optimizer | ||
180 | - | ||
181 | - scheduler = self.get_lr_scheduler() | ||
182 | - | ||
183 | - return [optimizer], [scheduler] | ||
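As a sketch of how the two parameter groups above are split, using hypothetical parameter names:

    no_decay = ["bias", "LayerNorm.weight"]
    names = ["encoder.fc1.weight", "encoder.fc1.bias", "encoder.LayerNorm.weight"]
    for n in names:
        group = "no weight decay" if any(nd in n for nd in no_decay) else "weight decay"
        print(f"{n} -> {group}")  # only the first name keeps weight decay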
184 | - | ||
185 | - def test_step(self, batch, batch_nb): | ||
186 | - return self.validation_step(batch, batch_nb) | ||
187 | - | ||
188 | - def test_epoch_end(self, outputs): | ||
189 | - return self.validation_end(outputs) | ||
190 | - | ||
191 | - @property | ||
192 | - def total_steps(self) -> int: | ||
193 | - """The number of total training steps that will be run. Used for lr scheduler purposes.""" | ||
194 | - num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores | ||
195 | - effective_batch_size = ( | ||
196 | - self.hparams.train_batch_size | ||
197 | - * self.hparams.accumulate_grad_batches | ||
198 | - * num_devices | ||
199 | - ) | ||
200 | - dataset_size = len(self.train_loader.dataset) | ||
201 | - return (dataset_size / effective_batch_size) * self.hparams.max_epochs | ||
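A worked example of the step count above, with hypothetical numbers:

    # dataset_size = 10_000, train_batch_size = 32, accumulate_grad_batches = 2, gpus = 1, max_epochs = 3
    # effective_batch_size = 32 * 2 * 1 = 64
    # total_steps = (10_000 / 64) * 3 = 468.75, i.e. roughly 469 optimizer steps for the lr scheduler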
202 | - | ||
203 | - def setup(self, mode): | ||
204 | - if mode == "fit": | ||
205 | - self.train_loader = self.get_dataloader( | ||
206 | - "train", self.hparams.train_batch_size, shuffle=True | ||
207 | - ) | ||
208 | - | ||
209 | - def get_dataloader(self, type_path, batch_size, shuffle=False): | ||
210 | - raise NotImplementedError("You must implement this for your task") | ||
211 | - | ||
212 | - def train_dataloader(self): | ||
213 | - return self.train_loader | ||
214 | - | ||
215 | - def val_dataloader(self): | ||
216 | - return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False) | ||
217 | - | ||
218 | - def test_dataloader(self): | ||
219 | - return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False) | ||
220 | - | ||
221 | - def _feature_file(self, mode): | ||
222 | - return os.path.join( | ||
223 | - self.hparams.data_dir, | ||
224 | - "cached_{}_{}_{}".format( | ||
225 | - mode, | ||
226 | - list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(), | ||
227 | - str(self.hparams.max_seq_length), | ||
228 | - ), | ||
229 | - ) | ||
230 | - | ||
231 | - @pl.utilities.rank_zero_only | ||
232 | - def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: | ||
233 | - save_path = self.output_dir.joinpath("best_tfmr") | ||
234 | - self.model.config.save_step = self.step_count | ||
235 | - self.model.save_pretrained(save_path) | ||
236 | - self.tokenizer.save_pretrained(save_path) | ||
237 | - | ||
238 | - @staticmethod | ||
239 | - def add_model_specific_args(parser, root_dir): | ||
240 | - parser.add_argument( | ||
241 | - "--model_name_or_path", | ||
242 | - default=None, | ||
243 | - type=str, | ||
244 | - required=True, | ||
245 | - help="Path to pretrained model or model identifier from huggingface.co/models", | ||
246 | - ) | ||
247 | - parser.add_argument( | ||
248 | - "--config_name", | ||
249 | - default="", | ||
250 | - type=str, | ||
251 | - help="Pretrained config name or path if not the same as model_name", | ||
252 | - ) | ||
253 | - parser.add_argument( | ||
254 | - "--tokenizer_name", | ||
255 | - default=None, | ||
256 | - type=str, | ||
257 | - help="Pretrained tokenizer name or path if not the same as model_name", | ||
258 | - ) | ||
259 | - parser.add_argument( | ||
260 | - "--cache_dir", | ||
261 | - default="", | ||
262 | - type=str, | ||
263 | - help="Where do you want to store the pre-trained models downloaded from s3", | ||
264 | - ) | ||
265 | - parser.add_argument( | ||
266 | - "--encoder_layerdrop", | ||
267 | - type=float, | ||
268 | - help="Encoder layer dropout probability (Optional). Goes into model.config", | ||
269 | - ) | ||
270 | - parser.add_argument( | ||
271 | - "--decoder_layerdrop", | ||
272 | - type=float, | ||
273 | - help="Decoder layer dropout probability (Optional). Goes into model.config", | ||
274 | - ) | ||
275 | - parser.add_argument( | ||
276 | - "--dropout", | ||
277 | - type=float, | ||
278 | - help="Dropout probability (Optional). Goes into model.config", | ||
279 | - ) | ||
280 | - parser.add_argument( | ||
281 | - "--attention_dropout", | ||
282 | - type=float, | ||
283 | - help="Attention dropout probability (Optional). Goes into model.config", | ||
284 | - ) | ||
285 | - parser.add_argument( | ||
286 | - "--learning_rate", | ||
287 | - default=5e-5, | ||
288 | - type=float, | ||
289 | - help="The initial learning rate for Adam.", | ||
290 | - ) | ||
291 | - parser.add_argument( | ||
292 | - "--lr_scheduler", | ||
293 | - default="linear", | ||
294 | - choices=arg_to_scheduler_choices, | ||
295 | - metavar=arg_to_scheduler_metavar, | ||
296 | - type=str, | ||
297 | - help="Learning rate scheduler", | ||
298 | - ) | ||
299 | - parser.add_argument( | ||
300 | - "--weight_decay", | ||
301 | - default=0.0, | ||
302 | - type=float, | ||
303 | - help="Weight decay if we apply some.", | ||
304 | - ) | ||
305 | - parser.add_argument( | ||
306 | - "--adam_epsilon", | ||
307 | - default=1e-8, | ||
308 | - type=float, | ||
309 | - help="Epsilon for Adam optimizer.", | ||
310 | - ) | ||
311 | - parser.add_argument( | ||
312 | - "--warmup_steps", | ||
313 | - default=0, | ||
314 | - type=int, | ||
315 | - help="Linear warmup over warmup_steps.", | ||
316 | - ) | ||
317 | - parser.add_argument( | ||
318 | - "--num_workers", default=4, type=int, help="kwarg passed to DataLoader" | ||
319 | - ) | ||
320 | - parser.add_argument( | ||
321 | - "--num_train_epochs", dest="max_epochs", default=3, type=int | ||
322 | - ) | ||
323 | - parser.add_argument("--train_batch_size", default=32, type=int) | ||
324 | - parser.add_argument("--eval_batch_size", default=32, type=int) | ||
325 | - parser.add_argument("--adafactor", action="store_true") | ||
326 | - | ||
327 | - | ||
328 | -class LoggingCallback(pl.Callback): | ||
329 | - def on_batch_end(self, trainer, pl_module): | ||
330 | - lr_scheduler = trainer.lr_schedulers[0]["scheduler"] | ||
331 | - lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_lr())} | ||
332 | - pl_module.logger.log_metrics(lrs) | ||
333 | - | ||
334 | - def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): | ||
335 | - rank_zero_info("***** Validation results *****") | ||
336 | - metrics = trainer.callback_metrics | ||
337 | - # Log results | ||
338 | - for key in sorted(metrics): | ||
339 | - if key not in ["log", "progress_bar"]: | ||
340 | - rank_zero_info("{} = {}\n".format(key, str(metrics[key]))) | ||
341 | - | ||
342 | - def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): | ||
343 | - rank_zero_info("***** Test results *****") | ||
344 | - metrics = trainer.callback_metrics | ||
345 | - # Log and save results to file | ||
346 | - output_test_results_file = os.path.join( | ||
347 | - pl_module.hparams.output_dir, "test_results.txt" | ||
348 | - ) | ||
349 | - with open(output_test_results_file, "w") as writer: | ||
350 | - for key in sorted(metrics): | ||
351 | - if key not in ["log", "progress_bar"]: | ||
352 | - rank_zero_info("{} = {}\n".format(key, str(metrics[key]))) | ||
353 | - writer.write("{} = {}\n".format(key, str(metrics[key]))) | ||
354 | - | ||
355 | - | ||
356 | -def add_generic_args(parser, root_dir) -> None: | ||
357 | - # TODO(SS): allow all pl args? parser = pl.Trainer.add_argparse_args(parser) | ||
358 | - parser.add_argument( | ||
359 | - "--output_dir", | ||
360 | - default=None, | ||
361 | - type=str, | ||
362 | - required=True, | ||
363 | - help="The output directory where the model predictions and checkpoints will be written.", | ||
364 | - ) | ||
365 | - parser.add_argument( | ||
366 | - "--fp16", | ||
367 | - action="store_true", | ||
368 | - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", | ||
369 | - ) | ||
370 | - | ||
371 | - parser.add_argument( | ||
372 | - "--fp16_opt_level", | ||
373 | - type=str, | ||
374 | - default="O2", | ||
375 | - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." | ||
376 | - "See details at https://nvidia.github.io/apex/amp.html", | ||
377 | - ) | ||
378 | - parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int) | ||
379 | - parser.add_argument( | ||
380 | - "--max_grad_norm", | ||
381 | - dest="gradient_clip_val", | ||
382 | - default=1.0, | ||
383 | - type=float, | ||
384 | - help="Max gradient norm", | ||
385 | - ) | ||
386 | - parser.add_argument( | ||
387 | - "--do_train", action="store_true", help="Whether to run training." | ||
388 | - ) | ||
389 | - parser.add_argument( | ||
390 | - "--do_predict", | ||
391 | - action="store_true", | ||
392 | - help="Whether to run predictions on the test set.", | ||
393 | - ) | ||
394 | - parser.add_argument( | ||
395 | - "--gradient_accumulation_steps", | ||
396 | - dest="accumulate_grad_batches", | ||
397 | - type=int, | ||
398 | - default=1, | ||
399 | - help="Number of updates steps to accumulate before performing a backward/update pass.", | ||
400 | - ) | ||
401 | - parser.add_argument( | ||
402 | - "--seed", type=int, default=42, help="random seed for initialization" | ||
403 | - ) | ||
404 | - | ||
405 | - | ||
406 | -def generic_train( | ||
407 | - model: BaseTransformer, | ||
408 | - args: argparse.Namespace, | ||
409 | - early_stopping_callback=False, | ||
410 | - logger=True, # can pass WandbLogger() here | ||
411 | - extra_callbacks=[], | ||
412 | - checkpoint_callback=None, | ||
413 | - logging_callback=None, | ||
414 | - **extra_train_kwargs, | ||
415 | -): | ||
416 | - pl.seed_everything(args.seed) | ||
417 | - | ||
418 | - # init model | ||
419 | - odir = Path(model.hparams.output_dir) | ||
420 | - odir.mkdir(exist_ok=True) | ||
421 | - | ||
422 | - # add custom checkpoints | ||
423 | - if checkpoint_callback is None: | ||
424 | - checkpoint_callback = pl.callbacks.ModelCheckpoint( | ||
425 | - filepath=args.output_dir, | ||
426 | - prefix="checkpoint", | ||
427 | - monitor="val_loss", | ||
428 | - mode="min", | ||
429 | - save_top_k=1, | ||
430 | - ) | ||
431 | - if logging_callback is None: | ||
432 | - logging_callback = LoggingCallback() | ||
433 | - | ||
434 | - train_params = {} | ||
435 | - | ||
436 | - # TODO: remove with PyTorch 1.6 since pl uses native amp | ||
437 | - if args.fp16: | ||
438 | - train_params["precision"] = 16 | ||
439 | - train_params["amp_level"] = args.fp16_opt_level | ||
440 | - | ||
441 | - if args.gpus > 1: | ||
442 | - train_params["distributed_backend"] = "ddp" | ||
443 | - | ||
444 | - trainer = pl.Trainer.from_argparse_args( | ||
445 | - args, | ||
446 | - weights_summary=None, | ||
447 | - callbacks=[logging_callback] + extra_callbacks, | ||
448 | - logger=logger, | ||
449 | - checkpoint_callback=checkpoint_callback, | ||
450 | - early_stop_callback=early_stopping_callback, | ||
451 | - **train_params, | ||
452 | - ) | ||
453 | - | ||
454 | - if args.do_train: | ||
455 | - trainer.fit(model) | ||
456 | - | ||
457 | - return trainer |
train/modeling_bart.py
deleted
100644 → 0
1 | -# coding=utf-8 | ||
2 | -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. | ||
3 | -# | ||
4 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | -# you may not use this file except in compliance with the License. | ||
6 | -# You may obtain a copy of the License at | ||
7 | -# | ||
8 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | -# | ||
10 | -# Unless required by applicable law or agreed to in writing, software | ||
11 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | -# See the License for the specific language governing permissions and | ||
14 | -# limitations under the License. | ||
15 | -"""PyTorch BART model, ported from the fairseq repo.""" | ||
16 | -import math | ||
17 | -import random | ||
18 | -import warnings | ||
19 | -from typing import Dict, List, Optional, Tuple | ||
20 | - | ||
21 | -import numpy as np | ||
22 | -import torch | ||
23 | -import torch.nn.functional as F | ||
24 | -from torch import Tensor, nn | ||
25 | -from torch.nn import CrossEntropyLoss | ||
26 | - | ||
27 | -from transformers.activations import ACT2FN | ||
28 | -from transformers.configuration_bart import BartConfig | ||
29 | -from transformers.file_utils import ( | ||
30 | - add_code_sample_docstrings, | ||
31 | - add_end_docstrings, | ||
32 | - add_start_docstrings, | ||
33 | - add_start_docstrings_to_callable, | ||
34 | - replace_return_docstrings, | ||
35 | -) | ||
36 | -from transformers.modeling_outputs import ( | ||
37 | - BaseModelOutput, | ||
38 | - BaseModelOutputWithPast, | ||
39 | - Seq2SeqLMOutput, | ||
40 | - Seq2SeqModelOutput, | ||
41 | - Seq2SeqQuestionAnsweringModelOutput, | ||
42 | - Seq2SeqSequenceClassifierOutput, | ||
43 | -) | ||
44 | -from train.modeling_utils import PreTrainedModel | ||
45 | -import logging | ||
46 | - | ||
47 | -logger = logging.getLogger(__name__) # pylint: disable=invalid-name | ||
48 | -logging.basicConfig( | ||
49 | - format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", | ||
50 | - datefmt="%m/%d/%Y %H:%M:%S", | ||
51 | - level=logging.INFO, | ||
52 | -) | ||
53 | - | ||
54 | -_CONFIG_FOR_DOC = "BartConfig" | ||
55 | -_TOKENIZER_FOR_DOC = "BartTokenizer" | ||
56 | - | ||
57 | - | ||
58 | -BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ | ||
59 | - "facebook/bart-base", | ||
60 | - "facebook/bart-large", | ||
61 | - "facebook/bart-large-mnli", | ||
62 | - "facebook/bart-large-cnn", | ||
63 | - "facebook/bart-large-xsum", | ||
64 | - "facebook/mbart-large-en-ro", | ||
65 | -] | ||
66 | -# This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart | ||
67 | - | ||
68 | - | ||
69 | -BART_START_DOCSTRING = r""" | ||
70 | - | ||
71 | - This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use it as a regular PyTorch Module and | ||
72 | - refer to the PyTorch documentation for all matters related to general usage and behavior. | ||
73 | - | ||
74 | - Parameters: | ||
75 | - config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model. | ||
76 | - Initializing with a config file does not load the weights associated with the model, only the configuration. | ||
77 | - Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. | ||
78 | - | ||
79 | -""" | ||
80 | -BART_GENERATION_EXAMPLE = r""" | ||
81 | - Summarization example:: | ||
82 | - | ||
83 | - from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig | ||
84 | - | ||
85 | - # see ``examples/summarization/bart/run_eval.py`` for a longer example | ||
86 | - model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') | ||
87 | - tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') | ||
88 | - | ||
89 | - ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." | ||
90 | - inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') | ||
91 | - | ||
92 | - # Generate Summary | ||
93 | - summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) | ||
94 | - print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) | ||
95 | - | ||
96 | -""" | ||
97 | - | ||
98 | -BART_INPUTS_DOCSTRING = r""" | ||
99 | - Args: | ||
100 | - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): | ||
101 | - Indices of input sequence tokens in the vocabulary. Use BartTokenizer.encode to produce them. | ||
102 | - Padding will be ignored by default should you provide it. | ||
103 | - Indices can be obtained using :class:`transformers.BartTokenizer.encode(text)`. | ||
104 | - attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): | ||
105 | - Mask to avoid performing attention on padding token indices in input_ids. | ||
106 | - Mask values selected in ``[0, 1]``: | ||
107 | - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. | ||
108 | - encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): | ||
109 | - Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) | ||
110 | - `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. | ||
111 | - Used in the cross-attention of the decoder. | ||
112 | - decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): | ||
113 | - Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper. | ||
114 | - decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): | ||
115 | - Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default. | ||
116 | - If you want to change padding behavior, you should read :func:`~transformers.modeling_bart._prepare_decoder_inputs` and modify. | ||
117 | - See diagram 1 in the paper for more info on the default strategy | ||
118 | - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): | ||
119 | - Contains pre-computed key and value hidden-states of the attention blocks. | ||
120 | - Can be used to speed up decoding. | ||
121 | - If ``past_key_values`` are used, the user can optionally input only the last | ||
122 | - ``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape | ||
123 | - :obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. | ||
124 | - use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): | ||
125 | - If `use_cache` is True, ``past_key_values`` are returned and can be used to speed up decoding (see | ||
126 | - ``past_key_values``). | ||
127 | - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): | ||
128 | - If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. | ||
129 | - output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): | ||
130 | - If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. | ||
131 | - return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): | ||
132 | - If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a | ||
133 | - plain tuple. | ||
134 | -""" | ||
135 | - | ||
136 | - | ||
137 | -def invert_mask(attention_mask): | ||
138 | - """Turns 1->0, 0->1, False->True, True-> False""" | ||
139 | - assert attention_mask.dim() == 2 | ||
140 | - return attention_mask.eq(0) | ||
141 | - | ||
142 | - | ||
143 | -def _prepare_bart_decoder_inputs( | ||
144 | - config, | ||
145 | - input_ids, | ||
146 | - decoder_input_ids=None, | ||
147 | - decoder_padding_mask=None, | ||
148 | - causal_mask_dtype=torch.float32, | ||
149 | -): | ||
150 | - """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if | ||
151 | - none are provided. This mimics the default behavior in fairseq. To override it pass in masks. | ||
152 | - Note: this is not called during generation | ||
153 | - """ | ||
154 | - pad_token_id = config.pad_token_id | ||
155 | - if decoder_input_ids is None: | ||
156 | - decoder_input_ids = shift_tokens_right(input_ids, pad_token_id) | ||
157 | - bsz, tgt_len = decoder_input_ids.size() | ||
158 | - if decoder_padding_mask is None: | ||
159 | - decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id) | ||
160 | - else: | ||
161 | - decoder_padding_mask = invert_mask(decoder_padding_mask) | ||
162 | - if decoder_padding_mask is not None and decoder_padding_mask.shape[1] > 1: | ||
163 | - # never mask leading token, even if it is pad | ||
164 | - decoder_padding_mask[:, 0] = decoder_padding_mask[:, 1] | ||
165 | - causal_mask = torch.triu(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len)), 1).to( | ||
166 | - dtype=causal_mask_dtype, device=decoder_input_ids.device | ||
167 | - ) | ||
168 | - return decoder_input_ids, decoder_padding_mask, causal_mask | ||
169 | - | ||
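The causal mask built at the end of `_prepare_bart_decoder_inputs` is the usual upper-triangular mask; an equivalent toy construction for tgt_len = 4 (for illustration, not the exact helper used above):

    import torch

    tgt_len = 4
    causal_mask = torch.triu(torch.full((tgt_len, tgt_len), float("-inf")), diagonal=1)
    # row i has 0.0 at columns 0..i and -inf above the diagonal, so position i can only attend
    # to positions <= i
    print(causal_mask)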
170 | - | ||
171 | -class PretrainedBartModel(PreTrainedModel): | ||
172 | - config_class = BartConfig | ||
173 | - base_model_prefix = "model" | ||
174 | - | ||
175 | - def _init_weights(self, module): | ||
176 | - std = self.config.init_std | ||
177 | - if isinstance(module, nn.Linear): | ||
178 | - module.weight.data.normal_(mean=0.0, std=std) | ||
179 | - if module.bias is not None: | ||
180 | - module.bias.data.zero_() | ||
181 | - elif isinstance(module, SinusoidalPositionalEmbedding): | ||
182 | - pass | ||
183 | - elif isinstance(module, nn.Embedding): | ||
184 | - module.weight.data.normal_(mean=0.0, std=std) | ||
185 | - if module.padding_idx is not None: | ||
186 | - module.weight.data[module.padding_idx].zero_() | ||
187 | - | ||
188 | - @property | ||
189 | - def dummy_inputs(self): | ||
190 | - pad_token = self.config.pad_token_id | ||
191 | - input_ids = torch.tensor( | ||
192 | - [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device | ||
193 | - ) | ||
194 | - dummy_inputs = { | ||
195 | - "attention_mask": input_ids.ne(pad_token), | ||
196 | - "input_ids": input_ids, | ||
197 | - } | ||
198 | - return dummy_inputs | ||
199 | - | ||
200 | - | ||
201 | -def _make_linear_from_emb(emb): | ||
202 | - vocab_size, emb_size = emb.weight.shape | ||
203 | - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) | ||
204 | - lin_layer.weight.data = emb.weight.data | ||
205 | - return lin_layer | ||
206 | - | ||
207 | - | ||
208 | -# Helper Functions, mostly for making masks | ||
209 | -def _check_shapes(shape_1, shape2): | ||
210 | - if shape_1 != shape2: | ||
211 | - raise AssertionError("shape mismatch: {} != {}".format(shape_1, shape2)) | ||
212 | - | ||
213 | - | ||
214 | -def shift_tokens_right(input_ids, pad_token_id): | ||
215 | - """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).""" | ||
216 | - prev_output_tokens = input_ids.clone() | ||
217 | - index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) | ||
218 | - prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze() | ||
219 | - prev_output_tokens[:, 1:] = input_ids[:, :-1] | ||
220 | - return prev_output_tokens | ||
221 | - | ||
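A small demo of `shift_tokens_right` with toy ids (assuming the BART convention of pad_token_id = 1 and eos = 2):

    import torch

    input_ids = torch.tensor([[0, 8, 12, 2, 1]])  # ends with eos (2) followed by one pad (1)
    print(shift_tokens_right(input_ids, pad_token_id=1))
    # tensor([[ 2,  0,  8, 12,  2]]) -> the final eos is wrapped around to position 0 and the rest
    # shifts one step to the right, which is the decoder input used for teacher forcing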
222 | - | ||
223 | -def make_padding_mask(input_ids, padding_idx=1): | ||
224 | - """True for pad tokens""" | ||
225 | - padding_mask = input_ids.eq(padding_idx) | ||
226 | - if not padding_mask.any(): | ||
227 | - padding_mask = None | ||
228 | - return padding_mask | ||
229 | - | ||
230 | - | ||
231 | -# Helper Modules | ||
232 | - | ||
233 | - | ||
234 | -class EncoderLayer(nn.Module): | ||
235 | - def __init__(self, config: BartConfig): | ||
236 | - super().__init__() | ||
237 | - self.embed_dim = config.d_model | ||
238 | - self.self_attn = Attention( | ||
239 | - self.embed_dim, | ||
240 | - config.encoder_attention_heads, | ||
241 | - dropout=config.attention_dropout, | ||
242 | - ) | ||
243 | - self.normalize_before = config.normalize_before | ||
244 | - self.self_attn_layer_norm = LayerNorm(self.embed_dim) | ||
245 | - self.dropout = config.dropout | ||
246 | - self.activation_fn = ACT2FN[config.activation_function] | ||
247 | - self.activation_dropout = config.activation_dropout | ||
248 | - self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) | ||
249 | - self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) | ||
250 | - self.final_layer_norm = LayerNorm(self.embed_dim) | ||
251 | - | ||
252 | - def forward(self, x, encoder_padding_mask, output_attentions=False): | ||
253 | - """ | ||
254 | - Args: | ||
255 | - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` | ||
256 | - encoder_padding_mask (ByteTensor): binary ByteTensor of shape | ||
257 | - `(batch, src_len)` where padding elements are indicated by ``1``. | ||
258 | - A position marked ``1`` is excluded from (masked out of) attention, | ||
259 | - while ``0`` means the position is included in attention. | ||
260 | - | ||
261 | - Returns: | ||
262 | - encoded output of shape `(seq_len, batch, embed_dim)` | ||
263 | - """ | ||
264 | - residual = x | ||
265 | - if self.normalize_before: | ||
266 | - x = self.self_attn_layer_norm(x) | ||
267 | - x, attn_weights = self.self_attn( | ||
268 | - query=x, | ||
269 | - key=x, | ||
270 | - key_padding_mask=encoder_padding_mask, | ||
271 | - output_attentions=output_attentions, | ||
272 | - ) | ||
273 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
274 | - x = residual + x | ||
275 | - if not self.normalize_before: | ||
276 | - x = self.self_attn_layer_norm(x) | ||
277 | - | ||
278 | - residual = x | ||
279 | - if self.normalize_before: | ||
280 | - x = self.final_layer_norm(x) | ||
281 | - x = self.activation_fn(self.fc1(x)) | ||
282 | - x = F.dropout(x, p=self.activation_dropout, training=self.training) | ||
283 | - x = self.fc2(x) | ||
284 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
285 | - x = residual + x | ||
286 | - if not self.normalize_before: | ||
287 | - x = self.final_layer_norm(x) | ||
288 | - return x, attn_weights | ||
289 | - | ||
290 | - | ||
291 | -class BartEncoder(nn.Module): | ||
292 | - """ | ||
293 | - Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer | ||
294 | - is a :class:`EncoderLayer`. | ||
295 | - | ||
296 | - Args: | ||
297 | - config: BartConfig | ||
298 | - """ | ||
299 | - | ||
300 | - def __init__(self, config: BartConfig, embed_tokens): | ||
301 | - super().__init__() | ||
302 | - | ||
303 | - self.dropout = config.dropout | ||
304 | - self.layerdrop = config.encoder_layerdrop | ||
305 | - | ||
306 | - embed_dim = embed_tokens.embedding_dim | ||
307 | - self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 | ||
308 | - self.padding_idx = embed_tokens.padding_idx | ||
309 | - self.max_source_positions = config.max_position_embeddings | ||
310 | - | ||
311 | - self.embed_tokens = embed_tokens | ||
312 | - if config.static_position_embeddings: | ||
313 | - self.embed_positions = SinusoidalPositionalEmbedding( | ||
314 | - config.max_position_embeddings, embed_dim, self.padding_idx | ||
315 | - ) | ||
316 | - else: | ||
317 | - self.embed_positions = LearnedPositionalEmbedding( | ||
318 | - config.max_position_embeddings, | ||
319 | - embed_dim, | ||
320 | - self.padding_idx, | ||
321 | - config.extra_pos_embeddings, | ||
322 | - ) | ||
323 | - self.embed_patches = nn.Embedding(3, config.d_model, padding_idx=0) | ||
324 | - self.layers = nn.ModuleList( | ||
325 | - [EncoderLayer(config) for _ in range(config.encoder_layers)] | ||
326 | - ) | ||
327 | - self.layernorm_embedding = ( | ||
328 | - LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() | ||
329 | - ) | ||
330 | - # mbart has one extra layer_norm | ||
331 | - self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None | ||
332 | - | ||
333 | - def forward( | ||
334 | - self, | ||
335 | - input_ids, | ||
336 | - patch_ids, | ||
337 | - attention_mask=None, | ||
338 | - output_attentions=False, | ||
339 | - output_hidden_states=False, | ||
340 | - return_dict=False, | ||
341 | - ): | ||
342 | - """ | ||
343 | - Args: | ||
344 | - input_ids (LongTensor): tokens in the source language of shape | ||
345 | - `(batch, src_len)` | ||
346 | - attention_mask (torch.LongTensor): indicating which indices are padding tokens. | ||
347 | - Returns: | ||
348 | - BaseModelOutput or Tuple comprised of: | ||
349 | - - **x** (Tensor): the last encoder layer's output of | ||
350 | - shape `(src_len, batch, embed_dim)` | ||
351 | - - **encoder_states** (tuple(torch.FloatTensor)): all intermediate | ||
352 | - hidden states of shape `(src_len, batch, embed_dim)`. | ||
353 | - Only populated if *output_hidden_states* is True. | ||
354 | - - **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer. | ||
355 | - During training might not be of length n_layers because of layer dropout. | ||
356 | - """ | ||
357 | - # check attention mask and invert | ||
358 | - if attention_mask is not None: | ||
359 | - attention_mask = invert_mask(attention_mask) | ||
360 | - | ||
361 | - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale | ||
362 | - embed_pos = self.embed_positions(input_ids) | ||
363 | - embed_patch = self.embed_patches(patch_ids) | ||
364 | - x = inputs_embeds + embed_pos + embed_patch | ||
365 | - x = self.layernorm_embedding(x) | ||
366 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
367 | - | ||
368 | - # B x T x C -> T x B x C | ||
369 | - x = x.transpose(0, 1) | ||
370 | - | ||
371 | - encoder_states = [] if output_hidden_states else None | ||
372 | - all_attentions = () if output_attentions else None | ||
373 | - for encoder_layer in self.layers: | ||
374 | - if output_hidden_states: | ||
375 | - encoder_states.append(x) | ||
376 | - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) | ||
377 | - dropout_probability = random.uniform(0, 1) | ||
378 | - if self.training and ( | ||
379 | - dropout_probability < self.layerdrop | ||
380 | - ): # skip the layer | ||
381 | - attn = None | ||
382 | - else: | ||
383 | - x, attn = encoder_layer( | ||
384 | - x, attention_mask, output_attentions=output_attentions | ||
385 | - ) | ||
386 | - | ||
387 | - if output_attentions: | ||
388 | - all_attentions = all_attentions + (attn,) | ||
389 | - | ||
390 | - if self.layer_norm: | ||
391 | - x = self.layer_norm(x) | ||
392 | - if output_hidden_states: | ||
393 | - encoder_states.append(x) | ||
394 | - # T x B x C -> B x T x C | ||
395 | - encoder_states = tuple( | ||
396 | - hidden_state.transpose(0, 1) for hidden_state in encoder_states | ||
397 | - ) | ||
398 | - | ||
399 | - # T x B x C -> B x T x C | ||
400 | - x = x.transpose(0, 1) | ||
401 | - | ||
402 | - if not return_dict: | ||
403 | - return tuple( | ||
404 | - v for v in [x, encoder_states, all_attentions] if v is not None | ||
405 | - ) | ||
406 | - return BaseModelOutput( | ||
407 | - last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions | ||
408 | - ) | ||
409 | - | ||
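The encoder above departs from vanilla BART by summing a third embedding table, embed_patches (three entries, padding index 0), with the token and positional embeddings before the usual layer norm and dropout. Below is a minimal sketch of that addition with toy sizes; it assumes the three patch ids distinguish padding from two kinds of diff lines (the exact semantics come from the preprocessing code, not shown here), and it omits the embed_scale factor for brevity.

import torch
import torch.nn as nn

d_model, vocab_size, pad_idx = 16, 100, 1
embed_tokens = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
embed_positions = nn.Embedding(32, d_model)      # stand-in for the learned positional table
embed_patches = nn.Embedding(3, d_model, padding_idx=0)

input_ids = torch.tensor([[5, 8, 9, pad_idx]])   # (batch, src_len)
patch_ids = torch.tensor([[1, 2, 2, 0]])         # hypothetical per-token patch labels
positions = torch.arange(input_ids.size(1)).unsqueeze(0)

# As in BartEncoder.forward: token + positional + patch embeddings are summed,
# then layer-normalized and dropped out before the encoder layers.
x = embed_tokens(input_ids) + embed_positions(positions) + embed_patches(patch_ids)
print(x.shape)  # torch.Size([1, 4, 16])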
410 | - | ||
411 | -class DecoderLayer(nn.Module): | ||
412 | - def __init__(self, config: BartConfig): | ||
413 | - super().__init__() | ||
414 | - self.embed_dim = config.d_model | ||
415 | - | ||
416 | - self.self_attn = Attention( | ||
417 | - embed_dim=self.embed_dim, | ||
418 | - num_heads=config.decoder_attention_heads, | ||
419 | - dropout=config.attention_dropout, | ||
420 | - ) | ||
421 | - self.dropout = config.dropout | ||
422 | - self.activation_fn = ACT2FN[config.activation_function] | ||
423 | - self.activation_dropout = config.activation_dropout | ||
424 | - self.normalize_before = config.normalize_before | ||
425 | - | ||
426 | - self.self_attn_layer_norm = LayerNorm(self.embed_dim) | ||
427 | - self.encoder_attn = Attention( | ||
428 | - self.embed_dim, | ||
429 | - config.decoder_attention_heads, | ||
430 | - dropout=config.attention_dropout, | ||
431 | - encoder_decoder_attention=True, | ||
432 | - ) | ||
433 | - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) | ||
434 | - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) | ||
435 | - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) | ||
436 | - self.final_layer_norm = LayerNorm(self.embed_dim) | ||
437 | - | ||
438 | - def forward( | ||
439 | - self, | ||
440 | - x, | ||
441 | - encoder_hidden_states, | ||
442 | - encoder_attn_mask=None, | ||
443 | - layer_state=None, | ||
444 | - causal_mask=None, | ||
445 | - decoder_padding_mask=None, | ||
446 | - output_attentions=False, | ||
447 | - ): | ||
448 | - residual = x | ||
449 | - | ||
450 | - if layer_state is None: | ||
451 | - layer_state = {} | ||
452 | - if self.normalize_before: | ||
453 | - x = self.self_attn_layer_norm(x) | ||
454 | - # Self Attention | ||
455 | - | ||
456 | - x, self_attn_weights = self.self_attn( | ||
457 | - query=x, | ||
458 | - key=x, | ||
459 | - layer_state=layer_state, # adds keys to layer state | ||
460 | - key_padding_mask=decoder_padding_mask, | ||
461 | - attn_mask=causal_mask, | ||
462 | - output_attentions=output_attentions, | ||
463 | - ) | ||
464 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
465 | - x = residual + x | ||
466 | - if not self.normalize_before: | ||
467 | - x = self.self_attn_layer_norm(x) | ||
468 | - | ||
469 | - # Cross attention | ||
470 | - residual = x | ||
471 | - assert self.encoder_attn.cache_key != self.self_attn.cache_key | ||
472 | - if self.normalize_before: | ||
473 | - x = self.encoder_attn_layer_norm(x) | ||
474 | - x, _ = self.encoder_attn( | ||
475 | - query=x, | ||
476 | - key=encoder_hidden_states, | ||
477 | - key_padding_mask=encoder_attn_mask, | ||
478 | - layer_state=layer_state, # mutates layer state | ||
479 | - ) | ||
480 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
481 | - x = residual + x | ||
482 | - if not self.normalize_before: | ||
483 | - x = self.encoder_attn_layer_norm(x) | ||
484 | - | ||
485 | - # Fully Connected | ||
486 | - residual = x | ||
487 | - if self.normalize_before: | ||
488 | - x = self.final_layer_norm(x) | ||
489 | - x = self.activation_fn(self.fc1(x)) | ||
490 | - x = F.dropout(x, p=self.activation_dropout, training=self.training) | ||
491 | - x = self.fc2(x) | ||
492 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
493 | - x = residual + x | ||
494 | - if not self.normalize_before: | ||
495 | - x = self.final_layer_norm(x) | ||
496 | - return ( | ||
497 | - x, | ||
498 | - self_attn_weights, | ||
499 | - layer_state, | ||
500 | - ) # just self_attn weights for now, following t5, layer_state = cache for decoding | ||
501 | - | ||
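Each sub-block of DecoderLayer (self-attention, cross-attention, feed-forward) follows the same residual pattern, with the layer norm applied before the sub-layer when config.normalize_before is set (pre-norm, as in mBART) and after it otherwise (post-norm, as in BART). A small sketch of that pattern, with dropout omitted for brevity:

def residual_block(x, sublayer, norm, normalize_before):
    # Pre-norm: normalize, run the sub-layer, then add the residual.
    # Post-norm: run the sub-layer, add the residual, then normalize.
    residual = x
    if normalize_before:
        x = norm(x)
    x = sublayer(x)
    x = residual + x
    if not normalize_before:
        x = norm(x)
    return x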
502 | - | ||
503 | -class BartDecoder(nn.Module): | ||
504 | - """ | ||
505 | - Transformer decoder consisting of *config.decoder_layers* layers. Each layer | ||
506 | - is a :class:`DecoderLayer`. | ||
507 | - Args: | ||
508 | - config: BartConfig | ||
509 | - embed_tokens (torch.nn.Embedding): output embedding | ||
510 | - """ | ||
511 | - | ||
512 | - def __init__(self, config: BartConfig, embed_tokens: nn.Embedding): | ||
513 | - super().__init__() | ||
514 | - self.dropout = config.dropout | ||
515 | - self.layerdrop = config.decoder_layerdrop | ||
516 | - self.padding_idx = embed_tokens.padding_idx | ||
517 | - self.max_target_positions = config.max_position_embeddings | ||
518 | - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 | ||
519 | - self.embed_tokens = embed_tokens | ||
520 | - if config.static_position_embeddings: | ||
521 | - self.embed_positions = SinusoidalPositionalEmbedding( | ||
522 | - config.max_position_embeddings, config.d_model, config.pad_token_id | ||
523 | - ) | ||
524 | - else: | ||
525 | - self.embed_positions = LearnedPositionalEmbedding( | ||
526 | - config.max_position_embeddings, | ||
527 | - config.d_model, | ||
528 | - self.padding_idx, | ||
529 | - config.extra_pos_embeddings, | ||
530 | - ) | ||
531 | - self.layers = nn.ModuleList( | ||
532 | - [DecoderLayer(config) for _ in range(config.decoder_layers)] | ||
533 | - ) # type: List[DecoderLayer] | ||
534 | - self.layernorm_embedding = ( | ||
535 | - LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() | ||
536 | - ) | ||
537 | - self.layer_norm = ( | ||
538 | - LayerNorm(config.d_model) if config.add_final_layer_norm else None | ||
539 | - ) | ||
540 | - | ||
541 | - def forward( | ||
542 | - self, | ||
543 | - input_ids, | ||
544 | - encoder_hidden_states, | ||
545 | - encoder_padding_mask, | ||
546 | - decoder_padding_mask, | ||
547 | - decoder_causal_mask, | ||
548 | - past_key_values=None, | ||
549 | - use_cache=False, | ||
550 | - output_attentions=False, | ||
551 | - output_hidden_states=False, | ||
552 | - return_dict=False, | ||
553 | - **unused, | ||
554 | - ): | ||
555 | - """ | ||
556 | - Includes several features from "Jointly Learning to Align and | ||
557 | - Translate with Transformer Models" (Garg et al., EMNLP 2019). | ||
558 | - | ||
559 | - Args: | ||
560 | - input_ids (LongTensor): previous decoder outputs of shape | ||
561 | - `(batch, tgt_len)`, for teacher forcing | ||
562 | - encoder_hidden_states: output from the encoder, used for | ||
563 | - encoder-side attention | ||
564 | - encoder_padding_mask: for ignoring pad tokens | ||
565 | - past_key_values (dict or None): dictionary used for storing state during generation | ||
566 | - | ||
567 | - Returns: | ||
568 | - BaseModelOutputWithPast or tuple: | ||
569 | - - the decoder's features of shape `(batch, tgt_len, embed_dim)` | ||
570 | - - the cache | ||
571 | - - hidden states | ||
572 | - - attentions | ||
573 | - """ | ||
574 | - if "decoder_cached_states" in unused: | ||
575 | - warnings.warn( | ||
576 | - "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.", | ||
577 | - FutureWarning, | ||
578 | - ) | ||
579 | - past_key_values = unused.pop("decoder_cached_states") | ||
580 | - if "decoder_past_key_values" in unused: | ||
581 | - warnings.warn( | ||
582 | - "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.", | ||
583 | - FutureWarning, | ||
584 | - ) | ||
585 | - past_key_values = unused.pop("decoder_past_key_values") | ||
586 | - | ||
587 | - # check attention mask and invert | ||
588 | - if encoder_padding_mask is not None: | ||
589 | - encoder_padding_mask = invert_mask(encoder_padding_mask) | ||
590 | - | ||
591 | - # embed positions | ||
592 | - positions = self.embed_positions(input_ids, use_cache=use_cache) | ||
593 | - | ||
594 | - if use_cache: | ||
595 | - input_ids = input_ids[:, -1:] | ||
596 | - positions = positions[:, -1:] # happens after we embed them | ||
597 | - # assert input_ids.ne(self.padding_idx).any() | ||
598 | - | ||
599 | - x = self.embed_tokens(input_ids) * self.embed_scale | ||
600 | - x += positions | ||
601 | - x = self.layernorm_embedding(x) | ||
602 | - x = F.dropout(x, p=self.dropout, training=self.training) | ||
603 | - | ||
604 | - # Convert from batch-first to the layers' internal format: (BS, seq_len, model_dim) -> (seq_len, BS, model_dim) | ||
605 | - x = x.transpose(0, 1) | ||
606 | - encoder_hidden_states = encoder_hidden_states.transpose(0, 1) | ||
607 | - | ||
608 | - # decoder layers | ||
609 | - all_hidden_states = () if output_hidden_states else None | ||
610 | - all_self_attns = () if output_attentions else None | ||
611 | - next_decoder_cache = [] | ||
612 | - for idx, decoder_layer in enumerate(self.layers): | ||
613 | - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) | ||
614 | - if output_hidden_states: | ||
615 | - all_hidden_states += (x,) | ||
616 | - dropout_probability = random.uniform(0, 1) | ||
617 | - if self.training and (dropout_probability < self.layerdrop): | ||
618 | - continue | ||
619 | - | ||
620 | - layer_state = past_key_values[idx] if past_key_values is not None else None | ||
621 | - | ||
622 | - x, layer_self_attn, layer_past = decoder_layer( | ||
623 | - x, | ||
624 | - encoder_hidden_states, | ||
625 | - encoder_attn_mask=encoder_padding_mask, | ||
626 | - decoder_padding_mask=decoder_padding_mask, | ||
627 | - layer_state=layer_state, | ||
628 | - causal_mask=decoder_causal_mask, | ||
629 | - output_attentions=output_attentions, | ||
630 | - ) | ||
631 | - | ||
632 | - if use_cache: | ||
633 | - next_decoder_cache.append(layer_past.copy()) | ||
634 | - | ||
635 | - if self.layer_norm and ( | ||
636 | - idx == len(self.layers) - 1 | ||
637 | - ): # if config.add_final_layer_norm (mBART) | ||
638 | - x = self.layer_norm(x) | ||
639 | - if output_attentions: | ||
640 | - all_self_attns += (layer_self_attn,) | ||
641 | - | ||
642 | - # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) | ||
643 | - if output_hidden_states: | ||
644 | - all_hidden_states = tuple( | ||
645 | - hidden_state.transpose(0, 1) for hidden_state in all_hidden_states | ||
646 | - ) | ||
647 | - x = x.transpose(0, 1) | ||
648 | - encoder_hidden_states = encoder_hidden_states.transpose(0, 1) | ||
649 | - | ||
650 | - next_cache = next_decoder_cache if use_cache else None | ||
651 | - | ||
652 | - if not return_dict: | ||
653 | - return tuple( | ||
654 | - v | ||
655 | - for v in [x, next_cache, all_hidden_states, all_self_attns] | ||
656 | - if v is not None | ||
657 | - ) | ||
658 | - return BaseModelOutputWithPast( | ||
659 | - last_hidden_state=x, | ||
660 | - past_key_values=next_cache, | ||
661 | - hidden_states=all_hidden_states, | ||
662 | - attentions=all_self_attns, | ||
663 | - ) | ||
664 | - | ||
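Both BartEncoder and BartDecoder above use LayerDrop (Fan et al., https://arxiv.org/abs/1909.11556): at training time each layer is skipped with probability layerdrop, while at inference every layer runs. A stripped-down sketch of that control flow, assuming layers is any list of callables:

import random

def run_with_layerdrop(x, layers, layerdrop, training):
    for layer in layers:
        # Skip the whole layer with probability `layerdrop`, but only while training.
        if training and random.uniform(0, 1) < layerdrop:
            continue
        x = layer(x)
    return x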
665 | - | ||
666 | -def _reorder_buffer(attn_cache, new_order): | ||
667 | - for k, input_buffer_k in attn_cache.items(): | ||
668 | - if input_buffer_k is not None: | ||
669 | - attn_cache[k] = input_buffer_k.index_select(0, new_order) | ||
670 | - return attn_cache | ||
671 | - | ||
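_reorder_buffer is what beam search relies on to keep the decoding cache aligned with the beams: whenever beams are reshuffled, every cached tensor is re-indexed along its batch dimension with the new beam order (see _reorder_cache in BartForConditionalGeneration further down). A small sketch with toy shapes, assuming the function as defined above:

import torch

attn_cache = {
    "prev_key": torch.arange(6.0).view(3, 1, 2, 1),    # (beams, heads, seq_len, head_dim)
    "prev_value": torch.arange(6.0).view(3, 1, 2, 1),
    "prev_key_padding_mask": None,                      # None entries are left untouched
}
new_order = torch.tensor([2, 0, 1])                     # beam 2 becomes the new beam 0, etc.
reordered = _reorder_buffer(attn_cache, new_order)
print(reordered["prev_key"].flatten())                  # tensor([4., 5., 0., 1., 2., 3.])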
672 | - | ||
673 | -class Attention(nn.Module): | ||
674 | - """Multi-headed attention from 'Attention Is All You Need' paper""" | ||
675 | - | ||
676 | - def __init__( | ||
677 | - self, | ||
678 | - embed_dim, | ||
679 | - num_heads, | ||
680 | - dropout=0.0, | ||
681 | - bias=True, | ||
682 | - encoder_decoder_attention=False, # otherwise self_attention | ||
683 | - ): | ||
684 | - super().__init__() | ||
685 | - self.embed_dim = embed_dim | ||
686 | - self.num_heads = num_heads | ||
687 | - self.dropout = dropout | ||
688 | - self.head_dim = embed_dim // num_heads | ||
689 | - assert ( | ||
690 | - self.head_dim * num_heads == self.embed_dim | ||
691 | - ), "embed_dim must be divisible by num_heads" | ||
692 | - self.scaling = self.head_dim ** -0.5 | ||
693 | - | ||
694 | - self.encoder_decoder_attention = encoder_decoder_attention | ||
695 | - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) | ||
696 | - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) | ||
697 | - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) | ||
698 | - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) | ||
699 | - self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self" | ||
700 | - | ||
701 | - def _shape(self, tensor, seq_len, bsz): | ||
702 | - return ( | ||
703 | - tensor.contiguous() | ||
704 | - .view(seq_len, bsz * self.num_heads, self.head_dim) | ||
705 | - .transpose(0, 1) | ||
706 | - ) | ||
707 | - | ||
708 | - def forward( | ||
709 | - self, | ||
710 | - query, | ||
711 | - key: Optional[Tensor], | ||
712 | - key_padding_mask: Optional[Tensor] = None, | ||
713 | - layer_state: Optional[Dict[str, Optional[Tensor]]] = None, | ||
714 | - attn_mask: Optional[Tensor] = None, | ||
715 | - output_attentions=False, | ||
716 | - ) -> Tuple[Tensor, Optional[Tensor]]: | ||
717 | - """Input shape: Time(SeqLen) x Batch x Channel""" | ||
718 | - static_kv: bool = self.encoder_decoder_attention | ||
719 | - tgt_len, bsz, embed_dim = query.size() | ||
720 | - assert embed_dim == self.embed_dim | ||
721 | - assert list(query.size()) == [tgt_len, bsz, embed_dim] | ||
722 | - # get here for encoder decoder cause of static_kv | ||
723 | - if layer_state is not None: # reuse k,v and encoder_padding_mask | ||
724 | - saved_state = layer_state.get(self.cache_key, {}) | ||
725 | - if "prev_key" in saved_state and static_kv: | ||
726 | - # previous time steps are cached - no need to recompute key and value if they are static | ||
727 | - key = None | ||
728 | - else: | ||
729 | - saved_state = None | ||
730 | - layer_state = {} | ||
731 | - | ||
732 | - q = self.q_proj(query) * self.scaling | ||
733 | - if static_kv: | ||
734 | - if key is None: | ||
735 | - k = v = None | ||
736 | - else: | ||
737 | - k = self.k_proj(key) | ||
738 | - v = self.v_proj(key) | ||
739 | - else: | ||
740 | - k = self.k_proj(query) | ||
741 | - v = self.v_proj(query) | ||
742 | - | ||
743 | - q = self._shape(q, tgt_len, bsz) | ||
744 | - if k is not None: | ||
745 | - k = self._shape(k, -1, bsz) | ||
746 | - if v is not None: | ||
747 | - v = self._shape(v, -1, bsz) | ||
748 | - | ||
749 | - if saved_state is not None: | ||
750 | - k, v, key_padding_mask = self._use_saved_state( | ||
751 | - k, v, saved_state, key_padding_mask, static_kv, bsz | ||
752 | - ) | ||
753 | - | ||
754 | - # Update cache | ||
755 | - layer_state[self.cache_key] = { | ||
756 | - "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim), | ||
757 | - "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim), | ||
758 | - "prev_key_padding_mask": key_padding_mask if not static_kv else None, | ||
759 | - } | ||
760 | - | ||
761 | - assert k is not None | ||
762 | - src_len = k.size(1) | ||
763 | - attn_weights = torch.bmm(q, k.transpose(1, 2)) | ||
764 | - assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len) | ||
765 | - | ||
766 | - if attn_mask is not None: | ||
767 | - attn_weights = ( | ||
768 | - attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask | ||
769 | - ) | ||
770 | - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) | ||
771 | - | ||
772 | - # This is part of a workaround to get around fork/join parallelism not supporting Optional types. | ||
773 | - if key_padding_mask is not None and key_padding_mask.dim() == 0: | ||
774 | - key_padding_mask = None | ||
775 | - assert key_padding_mask is None or key_padding_mask.size()[:2] == ( | ||
776 | - bsz, | ||
777 | - src_len, | ||
778 | - ) | ||
779 | - | ||
780 | - if key_padding_mask is not None: # don't attend to padding symbols | ||
781 | - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) | ||
782 | - reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2) | ||
783 | - attn_weights = attn_weights.masked_fill(reshaped, float("-inf")) | ||
784 | - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) | ||
785 | - attn_weights = F.softmax(attn_weights, dim=-1) | ||
786 | - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) | ||
787 | - | ||
788 | - assert v is not None | ||
789 | - attn_output = torch.bmm(attn_probs, v) | ||
790 | - assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim) | ||
791 | - attn_output = ( | ||
792 | - attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) | ||
793 | - ) | ||
794 | - attn_output = self.out_proj(attn_output) | ||
795 | - if output_attentions: | ||
796 | - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) | ||
797 | - else: | ||
798 | - attn_weights = None | ||
799 | - return attn_output, attn_weights | ||
800 | - | ||
801 | - def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz): | ||
802 | - # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) | ||
803 | - if "prev_key" in saved_state: | ||
804 | - _prev_key = saved_state["prev_key"] | ||
805 | - assert _prev_key is not None | ||
806 | - prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) | ||
807 | - if static_kv: | ||
808 | - k = prev_key | ||
809 | - else: | ||
810 | - assert k is not None | ||
811 | - k = torch.cat([prev_key, k], dim=1) | ||
812 | - if "prev_value" in saved_state: | ||
813 | - _prev_value = saved_state["prev_value"] | ||
814 | - assert _prev_value is not None | ||
815 | - prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) | ||
816 | - if static_kv: | ||
817 | - v = prev_value | ||
818 | - else: | ||
819 | - assert v is not None | ||
820 | - v = torch.cat([prev_value, v], dim=1) | ||
821 | - assert k is not None and v is not None | ||
822 | - prev_key_padding_mask: Optional[Tensor] = saved_state.get( | ||
823 | - "prev_key_padding_mask", None | ||
824 | - ) | ||
825 | - if prev_key_padding_mask is not None: | ||
826 | - if static_kv: | ||
827 | - new_key_padding_mask = prev_key_padding_mask | ||
828 | - else: | ||
829 | - new_key_padding_mask = torch.cat( | ||
830 | - [prev_key_padding_mask, key_padding_mask], dim=1 | ||
831 | - ) | ||
832 | - else: | ||
833 | - new_key_padding_mask = key_padding_mask | ||
834 | - return k, v, new_key_padding_mask | ||
835 | - | ||
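A minimal usage sketch of the Attention module above, assuming the class as defined here. Note the (seq_len, batch, embed_dim) input convention from the forward docstring; the sizes are illustrative only.

import torch

attn = Attention(embed_dim=16, num_heads=4)
query = torch.randn(7, 2, 16)                    # (tgt_len, bsz, embed_dim)
out, weights = attn(query=query, key=query, output_attentions=True)
print(out.shape)      # torch.Size([7, 2, 16])
print(weights.shape)  # torch.Size([2, 4, 7, 7]), i.e. (bsz, num_heads, tgt_len, src_len)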
836 | - | ||
837 | -class BartClassificationHead(nn.Module): | ||
838 | - """Head for sentence-level classification tasks.""" | ||
839 | - | ||
840 | - # This can trivially be shared with RobertaClassificationHead | ||
841 | - | ||
842 | - def __init__( | ||
843 | - self, input_dim, inner_dim, num_classes, pooler_dropout, | ||
844 | - ): | ||
845 | - super().__init__() | ||
846 | - self.dense = nn.Linear(input_dim, inner_dim) | ||
847 | - self.dropout = nn.Dropout(p=pooler_dropout) | ||
848 | - self.out_proj = nn.Linear(inner_dim, num_classes) | ||
849 | - | ||
850 | - def forward(self, x): | ||
851 | - x = self.dropout(x) | ||
852 | - x = self.dense(x) | ||
853 | - x = torch.tanh(x) | ||
854 | - x = self.dropout(x) | ||
855 | - x = self.out_proj(x) | ||
856 | - return x | ||
857 | - | ||
858 | - | ||
859 | -class LearnedPositionalEmbedding(nn.Embedding): | ||
860 | - """ | ||
861 | - This module learns positional embeddings up to a fixed maximum size. | ||
862 | - Padding ids are ignored by either offsetting based on padding_idx | ||
863 | - or by setting padding_idx to None and ensuring that the appropriate | ||
864 | - position ids are passed to the forward function. | ||
865 | - """ | ||
866 | - | ||
867 | - def __init__( | ||
868 | - self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset | ||
869 | - ): | ||
870 | - # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 | ||
871 | - # and adjust num_embeddings appropriately. Other models dont have this hack | ||
872 | - # and adjust num_embeddings appropriately. Other models don't have this hack. | ||
873 | - assert padding_idx is not None | ||
874 | - num_embeddings += offset | ||
875 | - super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) | ||
876 | - | ||
877 | - def forward(self, input_ids, use_cache=False): | ||
878 | - """Input is expected to be of size [bsz x seqlen].""" | ||
879 | - bsz, seq_len = input_ids.shape[:2] | ||
880 | - if use_cache: | ||
881 | - positions = input_ids.data.new(1, 1).fill_( | ||
882 | - seq_len - 1 | ||
883 | - ) # called before slicing | ||
884 | - else: | ||
885 | - # starts at 0, ends at seq_len - 1 | ||
886 | - positions = torch.arange( | ||
887 | - seq_len, dtype=torch.long, device=self.weight.device | ||
888 | - ) | ||
889 | - return super().forward(positions + self.offset) | ||
890 | - | ||
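A short sketch of the offset behaviour described in the comment above, assuming the class as defined here: with offset=2 (BART's extra_pos_embeddings value), position 0 reads row 2 of the table, so the first two rows are effectively reserved.

import torch

emb = LearnedPositionalEmbedding(num_embeddings=8, embedding_dim=4, padding_idx=1, offset=2)
input_ids = torch.zeros(1, 3, dtype=torch.long)   # only the (bsz, seq_len) shape matters here
positions = emb(input_ids)
print(emb.weight.shape)  # torch.Size([10, 4]); the table grew by the offset
print(positions.shape)   # torch.Size([3, 4]); one row per position, broadcast over the batch later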
891 | - | ||
892 | -def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True): | ||
893 | - if torch.cuda.is_available(): | ||
894 | - try: | ||
895 | - from apex.normalization import FusedLayerNorm | ||
896 | - | ||
897 | - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) | ||
898 | - except ImportError: | ||
899 | - pass | ||
900 | - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) | ||
901 | - | ||
902 | - | ||
903 | -def fill_with_neg_inf(t): | ||
904 | - """FP16-compatible function that fills the given tensor with -inf.""" | ||
905 | - return t.float().fill_(float("-inf")).type_as(t) | ||
906 | - | ||
907 | - | ||
908 | -# Public API | ||
909 | -def _get_shape(t): | ||
910 | - return getattr(t, "shape", None) | ||
911 | - | ||
912 | - | ||
913 | -@add_start_docstrings( | ||
914 | - "The bare BART Model outputting raw hidden-states without any specific head on top.", | ||
915 | - BART_START_DOCSTRING, | ||
916 | -) | ||
917 | -class BartModel(PretrainedBartModel): | ||
918 | - def __init__(self, config: BartConfig): | ||
919 | - super().__init__(config) | ||
920 | - | ||
921 | - padding_idx, vocab_size = config.pad_token_id, config.vocab_size | ||
922 | - self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) | ||
923 | - | ||
924 | - self.encoder = BartEncoder(config, self.shared) | ||
925 | - self.decoder = BartDecoder(config, self.shared) | ||
926 | - | ||
927 | - self.init_weights() | ||
928 | - | ||
929 | - @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) | ||
930 | - @add_code_sample_docstrings( | ||
931 | - tokenizer_class=_TOKENIZER_FOR_DOC, | ||
932 | - checkpoint="facebook/bart-large", | ||
933 | - output_type=BaseModelOutputWithPast, | ||
934 | - config_class=_CONFIG_FOR_DOC, | ||
935 | - ) | ||
936 | - def forward( | ||
937 | - self, | ||
938 | - input_ids, | ||
939 | - patch_ids=None, | ||
940 | - attention_mask=None, | ||
941 | - decoder_input_ids=None, | ||
942 | - encoder_outputs: Optional[Tuple] = None, | ||
943 | - decoder_attention_mask=None, | ||
944 | - past_key_values=None, | ||
945 | - use_cache=None, | ||
946 | - output_attentions=None, | ||
947 | - output_hidden_states=None, | ||
948 | - return_dict=None, | ||
949 | - **kwargs, | ||
950 | - ): | ||
951 | - if "decoder_past_key_values" in kwargs: | ||
952 | - warnings.warn( | ||
953 | - "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.", | ||
954 | - FutureWarning, | ||
955 | - ) | ||
956 | - past_key_values = kwargs.pop("decoder_past_key_values") | ||
957 | - | ||
958 | - if decoder_input_ids is None: | ||
959 | - use_cache = False | ||
960 | - | ||
961 | - output_attentions = ( | ||
962 | - output_attentions | ||
963 | - if output_attentions is not None | ||
964 | - else self.config.output_attentions | ||
965 | - ) | ||
966 | - output_hidden_states = ( | ||
967 | - output_hidden_states | ||
968 | - if output_hidden_states is not None | ||
969 | - else self.config.output_hidden_states | ||
970 | - ) | ||
971 | - use_cache = use_cache if use_cache is not None else self.config.use_cache | ||
972 | - return_dict = ( | ||
973 | - return_dict if return_dict is not None else self.config.use_return_dict | ||
974 | - ) | ||
975 | - | ||
976 | - # make masks if user doesn't supply | ||
977 | - if not use_cache: | ||
978 | - ( | ||
979 | - decoder_input_ids, | ||
980 | - decoder_padding_mask, | ||
981 | - causal_mask, | ||
982 | - ) = _prepare_bart_decoder_inputs( | ||
983 | - self.config, | ||
984 | - input_ids, | ||
985 | - decoder_input_ids=decoder_input_ids, | ||
986 | - decoder_padding_mask=decoder_attention_mask, | ||
987 | - causal_mask_dtype=self.shared.weight.dtype, | ||
988 | - ) | ||
989 | - else: | ||
990 | - decoder_padding_mask, causal_mask = None, None | ||
991 | - | ||
992 | - assert decoder_input_ids is not None | ||
993 | - | ||
994 | - if encoder_outputs is None: | ||
995 | - encoder_outputs = self.encoder( | ||
996 | - input_ids=input_ids, | ||
997 | - patch_ids=patch_ids, | ||
998 | - attention_mask=attention_mask, | ||
999 | - output_attentions=output_attentions, | ||
1000 | - output_hidden_states=output_hidden_states, | ||
1001 | - return_dict=return_dict, | ||
1002 | - ) | ||
1003 | - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True | ||
1004 | - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): | ||
1005 | - encoder_outputs = BaseModelOutput( | ||
1006 | - last_hidden_state=encoder_outputs[0], | ||
1007 | - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, | ||
1008 | - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, | ||
1009 | - ) | ||
1010 | - | ||
1011 | - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) | ||
1012 | - decoder_outputs = self.decoder( | ||
1013 | - decoder_input_ids, | ||
1014 | - encoder_outputs[0], | ||
1015 | - attention_mask, | ||
1016 | - decoder_padding_mask, | ||
1017 | - decoder_causal_mask=causal_mask, | ||
1018 | - past_key_values=past_key_values, | ||
1019 | - use_cache=use_cache, | ||
1020 | - output_attentions=output_attentions, | ||
1021 | - output_hidden_states=output_hidden_states, | ||
1022 | - return_dict=return_dict, | ||
1023 | - ) | ||
1024 | - | ||
1025 | - if not return_dict: | ||
1026 | - return decoder_outputs + encoder_outputs | ||
1027 | - | ||
1028 | - return Seq2SeqModelOutput( | ||
1029 | - last_hidden_state=decoder_outputs.last_hidden_state, | ||
1030 | - past_key_values=decoder_outputs.past_key_values, | ||
1031 | - decoder_hidden_states=decoder_outputs.hidden_states, | ||
1032 | - decoder_attentions=decoder_outputs.attentions, | ||
1033 | - encoder_last_hidden_state=encoder_outputs.last_hidden_state, | ||
1034 | - encoder_hidden_states=encoder_outputs.hidden_states, | ||
1035 | - encoder_attentions=encoder_outputs.attentions, | ||
1036 | - ) | ||
1037 | - | ||
1038 | - def get_input_embeddings(self): | ||
1039 | - return self.shared | ||
1040 | - | ||
1041 | - def set_input_embeddings(self, value): | ||
1042 | - self.shared = value | ||
1043 | - self.encoder.embed_tokens = self.shared | ||
1044 | - self.decoder.embed_tokens = self.shared | ||
1045 | - | ||
1046 | - def get_output_embeddings(self): | ||
1047 | - return _make_linear_from_emb(self.shared) # make it on the fly | ||
1048 | - | ||
1049 | - | ||
1050 | -@add_start_docstrings( | ||
1051 | - "The BART Model with a language modeling head. Can be used for summarization.", | ||
1052 | - BART_START_DOCSTRING, | ||
1053 | -) | ||
1054 | -class BartForConditionalGeneration(PretrainedBartModel): | ||
1055 | - base_model_prefix = "model" | ||
1056 | - authorized_missing_keys = [ | ||
1057 | - r"final_logits_bias", | ||
1058 | - r"encoder\.version", | ||
1059 | - r"decoder\.version", | ||
1060 | - ] | ||
1061 | - | ||
1062 | - def __init__(self, config: BartConfig): | ||
1063 | - super().__init__(config) | ||
1064 | - base_model = BartModel(config) | ||
1065 | - self.model = base_model | ||
1066 | - self.register_buffer( | ||
1067 | - "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)) | ||
1068 | - ) | ||
1069 | - | ||
1070 | - def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: | ||
1071 | - old_num_tokens = self.model.shared.num_embeddings | ||
1072 | - new_embeddings = super().resize_token_embeddings(new_num_tokens) | ||
1073 | - self.model.shared = new_embeddings | ||
1074 | - self._resize_final_logits_bias(new_num_tokens, old_num_tokens) | ||
1075 | - return new_embeddings | ||
1076 | - | ||
1077 | - def _resize_final_logits_bias( | ||
1078 | - self, new_num_tokens: int, old_num_tokens: int | ||
1079 | - ) -> None: | ||
1080 | - if new_num_tokens <= old_num_tokens: | ||
1081 | - new_bias = self.final_logits_bias[:, :new_num_tokens] | ||
1082 | - else: | ||
1083 | - extra_bias = torch.zeros( | ||
1084 | - (1, new_num_tokens - old_num_tokens), | ||
1085 | - device=self.final_logits_bias.device, | ||
1086 | - ) | ||
1087 | - new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) | ||
1088 | - self.register_buffer("final_logits_bias", new_bias) | ||
1089 | - | ||
1090 | - @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) | ||
1091 | - @replace_return_docstrings( | ||
1092 | - output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC | ||
1093 | - ) | ||
1094 | - @add_end_docstrings(BART_GENERATION_EXAMPLE) | ||
1095 | - def forward( | ||
1096 | - self, | ||
1097 | - input_ids, | ||
1098 | - patch_ids, | ||
1099 | - attention_mask=None, | ||
1100 | - encoder_outputs=None, | ||
1101 | - decoder_input_ids=None, | ||
1102 | - decoder_attention_mask=None, | ||
1103 | - past_key_values=None, | ||
1104 | - labels=None, | ||
1105 | - use_cache=None, | ||
1106 | - output_attentions=None, | ||
1107 | - output_hidden_states=None, | ||
1108 | - return_dict=None, | ||
1109 | - **unused, | ||
1110 | - ): | ||
1111 | - r""" | ||
1112 | - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): | ||
1113 | - Labels for computing the masked language modeling loss. | ||
1114 | - Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). | ||
1115 | - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens | ||
1116 | - with labels in ``[0, ..., config.vocab_size]``. | ||
1117 | - | ||
1118 | - Returns: | ||
1119 | - | ||
1120 | - Conditional generation example:: | ||
1121 | - | ||
1122 | - # Mask filling only works for bart-large | ||
1123 | - from transformers import BartTokenizer, BartForConditionalGeneration | ||
1124 | - tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') | ||
1125 | - TXT = "My friends are <mask> but they eat too many carbs." | ||
1126 | - | ||
1127 | - model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') | ||
1128 | - input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] | ||
1129 | - logits = model(input_ids).logits | ||
1130 | - | ||
1131 | - masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() | ||
1132 | - probs = logits[0, masked_index].softmax(dim=0) | ||
1133 | - values, predictions = probs.topk(5) | ||
1134 | - | ||
1135 | - tokenizer.decode(predictions).split() | ||
1136 | - # ['good', 'great', 'all', 'really', 'very'] | ||
1137 | - """ | ||
1138 | - if "lm_labels" in unused: | ||
1139 | - warnings.warn( | ||
1140 | - "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", | ||
1141 | - FutureWarning, | ||
1142 | - ) | ||
1143 | - labels = unused.pop("lm_labels") | ||
1144 | - if "decoder_cached_states" in unused: | ||
1145 | - warnings.warn( | ||
1146 | - "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.", | ||
1147 | - FutureWarning, | ||
1148 | - ) | ||
1149 | - past_key_values = unused.pop("decoder_cached_states") | ||
1150 | - if "decoder_past_key_values" in unused: | ||
1151 | - warnings.warn( | ||
1152 | - "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.", | ||
1153 | - FutureWarning, | ||
1154 | - ) | ||
1155 | - past_key_values = unused.pop("decoder_past_key_values") | ||
1156 | - return_dict = ( | ||
1157 | - return_dict if return_dict is not None else self.config.use_return_dict | ||
1158 | - ) | ||
1159 | - | ||
1160 | - if labels is not None: | ||
1161 | - use_cache = False | ||
1162 | - if decoder_input_ids is None: | ||
1163 | - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id) | ||
1164 | - | ||
1165 | - outputs = self.model( | ||
1166 | - input_ids, | ||
1167 | - patch_ids=patch_ids, | ||
1168 | - attention_mask=attention_mask, | ||
1169 | - decoder_input_ids=decoder_input_ids, | ||
1170 | - encoder_outputs=encoder_outputs, | ||
1171 | - decoder_attention_mask=decoder_attention_mask, | ||
1172 | - past_key_values=past_key_values, | ||
1173 | - use_cache=use_cache, | ||
1174 | - output_attentions=output_attentions, | ||
1175 | - output_hidden_states=output_hidden_states, | ||
1176 | - return_dict=return_dict, | ||
1177 | - ) | ||
1178 | - lm_logits = F.linear( | ||
1179 | - outputs[0], self.model.shared.weight, bias=self.final_logits_bias | ||
1180 | - ) | ||
1181 | - | ||
1182 | - masked_lm_loss = None | ||
1183 | - if labels is not None: | ||
1184 | - loss_fct = CrossEntropyLoss() | ||
1185 | - # TODO(SS): do we need to ignore pad tokens in labels? | ||
1186 | - masked_lm_loss = loss_fct( | ||
1187 | - lm_logits.view(-1, self.config.vocab_size), labels.view(-1) | ||
1188 | - ) | ||
1189 | - | ||
1190 | - if not return_dict: | ||
1191 | - output = (lm_logits,) + outputs[1:] | ||
1192 | - return ( | ||
1193 | - ((masked_lm_loss,) + output) if masked_lm_loss is not None else output | ||
1194 | - ) | ||
1195 | - | ||
1196 | - return Seq2SeqLMOutput( | ||
1197 | - loss=masked_lm_loss, | ||
1198 | - logits=lm_logits, | ||
1199 | - past_key_values=outputs.past_key_values, | ||
1200 | - decoder_hidden_states=outputs.decoder_hidden_states, | ||
1201 | - decoder_attentions=outputs.decoder_attentions, | ||
1202 | - encoder_last_hidden_state=outputs.encoder_last_hidden_state, | ||
1203 | - encoder_hidden_states=outputs.encoder_hidden_states, | ||
1204 | - encoder_attentions=outputs.encoder_attentions, | ||
1205 | - ) | ||
1206 | - | ||
1207 | - def prepare_inputs_for_generation( | ||
1208 | - self, | ||
1209 | - decoder_input_ids, | ||
1210 | - past, | ||
1211 | - attention_mask, | ||
1212 | - use_cache, | ||
1213 | - encoder_outputs, | ||
1214 | - **kwargs, | ||
1215 | - ): | ||
1216 | - return { | ||
1217 | - "input_ids": None, # encoder_outputs is defined. input_ids not needed | ||
1218 | - "patch_ids": None, # encoder_outputs is defined. patch_ids not needed | ||
1219 | - "encoder_outputs": encoder_outputs, | ||
1220 | - "past_key_values": past, | ||
1221 | - "decoder_input_ids": decoder_input_ids, | ||
1222 | - "attention_mask": attention_mask, | ||
1223 | - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) | ||
1224 | - } | ||
1225 | - | ||
1226 | - def adjust_logits_during_generation(self, logits, cur_len, max_length): | ||
1227 | - if cur_len == 1 and self.config.force_bos_token_to_be_generated: | ||
1228 | - self._force_token_ids_generation(logits, self.config.bos_token_id) | ||
1229 | - elif cur_len == max_length - 1 and self.config.eos_token_id is not None: | ||
1230 | - self._force_token_ids_generation(logits, self.config.eos_token_id) | ||
1231 | - return logits | ||
1232 | - | ||
1233 | - def _force_token_ids_generation(self, scores, token_id) -> None: | ||
1234 | - """Force token_id to be generated by setting the probability of all other tokens to 0 (logprob=-float("inf")).""" | ||
1235 | - scores[:, [x for x in range(self.config.vocab_size) if x != token_id]] = -float( | ||
1236 | - "inf" | ||
1237 | - ) | ||
1238 | - | ||
1239 | - @staticmethod | ||
1240 | - def _reorder_cache(past, beam_idx): | ||
1241 | - reordered_past = [] | ||
1242 | - for layer_past in past: | ||
1243 | - # get the correct batch idx from decoder layer's batch dim for cross and self-attn | ||
1244 | - layer_past_new = { | ||
1245 | - attn_key: _reorder_buffer(attn_cache, beam_idx) | ||
1246 | - for attn_key, attn_cache in layer_past.items() | ||
1247 | - } | ||
1248 | - reordered_past.append(layer_past_new) | ||
1249 | - return reordered_past | ||
1250 | - | ||
1251 | - def get_encoder(self): | ||
1252 | - return self.model.encoder | ||
1253 | - | ||
1254 | - def get_output_embeddings(self): | ||
1255 | - return _make_linear_from_emb(self.model.shared) # make it on the fly | ||
1256 | - | ||
1257 | - | ||
1258 | -@add_start_docstrings( | ||
1259 | - """Bart model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE tasks. """, | ||
1260 | - BART_START_DOCSTRING, | ||
1261 | -) | ||
1262 | -class BartForSequenceClassification(PretrainedBartModel): | ||
1263 | - def __init__(self, config: BartConfig, **kwargs): | ||
1264 | - super().__init__(config, **kwargs) | ||
1265 | - self.model = BartModel(config) | ||
1266 | - self.classification_head = BartClassificationHead( | ||
1267 | - config.d_model, config.d_model, config.num_labels, config.classif_dropout, | ||
1268 | - ) | ||
1269 | - self.model._init_weights(self.classification_head.dense) | ||
1270 | - self.model._init_weights(self.classification_head.out_proj) | ||
1271 | - | ||
1272 | - @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) | ||
1273 | - @add_code_sample_docstrings( | ||
1274 | - tokenizer_class=_TOKENIZER_FOR_DOC, | ||
1275 | - checkpoint="facebook/bart-large", | ||
1276 | - output_type=Seq2SeqSequenceClassifierOutput, | ||
1277 | - config_class=_CONFIG_FOR_DOC, | ||
1278 | - ) | ||
1279 | - def forward( | ||
1280 | - self, | ||
1281 | - input_ids, | ||
1282 | - attention_mask=None, | ||
1283 | - encoder_outputs=None, | ||
1284 | - decoder_input_ids=None, | ||
1285 | - decoder_attention_mask=None, | ||
1286 | - labels=None, | ||
1287 | - use_cache=None, | ||
1288 | - output_attentions=None, | ||
1289 | - output_hidden_states=None, | ||
1290 | - return_dict=None, | ||
1291 | - ): | ||
1292 | - r""" | ||
1293 | - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): | ||
1294 | - Labels for computing the sequence classification/regression loss. | ||
1295 | - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. | ||
1296 | - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). | ||
1297 | - """ | ||
1298 | - return_dict = ( | ||
1299 | - return_dict if return_dict is not None else self.config.use_return_dict | ||
1300 | - ) | ||
1301 | - if labels is not None: | ||
1302 | - use_cache = False | ||
1303 | - | ||
1304 | - outputs = self.model( | ||
1305 | - input_ids, | ||
1306 | - attention_mask=attention_mask, | ||
1307 | - decoder_input_ids=decoder_input_ids, | ||
1308 | - decoder_attention_mask=decoder_attention_mask, | ||
1309 | - encoder_outputs=encoder_outputs, | ||
1310 | - use_cache=use_cache, | ||
1311 | - output_attentions=output_attentions, | ||
1312 | - output_hidden_states=output_hidden_states, | ||
1313 | - return_dict=return_dict, | ||
1314 | - ) | ||
1315 | - x = outputs[0] # last hidden state | ||
1316 | - eos_mask = input_ids.eq(self.config.eos_token_id) | ||
1317 | - if len(torch.unique(eos_mask.sum(1))) > 1: | ||
1318 | - raise ValueError("All examples must have the same number of <eos> tokens.") | ||
1319 | - sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[ | ||
1320 | - :, -1, : | ||
1321 | - ] | ||
1322 | - logits = self.classification_head(sentence_representation) | ||
1323 | - | ||
1324 | - loss = None | ||
1325 | - if labels is not None: | ||
1326 | - loss_fct = CrossEntropyLoss() | ||
1327 | - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) | ||
1328 | - | ||
1329 | - if not return_dict: | ||
1330 | - output = (logits,) + outputs[1:] | ||
1331 | - return ((loss,) + output) if loss is not None else output | ||
1332 | - | ||
1333 | - return Seq2SeqSequenceClassifierOutput( | ||
1334 | - loss=loss, | ||
1335 | - logits=logits, | ||
1336 | - past_key_values=outputs.past_key_values, | ||
1337 | - decoder_hidden_states=outputs.decoder_hidden_states, | ||
1338 | - decoder_attentions=outputs.decoder_attentions, | ||
1339 | - encoder_last_hidden_state=outputs.encoder_last_hidden_state, | ||
1340 | - encoder_hidden_states=outputs.encoder_hidden_states, | ||
1341 | - encoder_attentions=outputs.encoder_attentions, | ||
1342 | - ) | ||
1343 | - | ||
1344 | - | ||
1345 | -@add_start_docstrings( | ||
1346 | - """BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of | ||
1347 | - the hidden-states output to compute `span start logits` and `span end logits`). """, | ||
1348 | - BART_START_DOCSTRING, | ||
1349 | -) | ||
1350 | -class BartForQuestionAnswering(PretrainedBartModel): | ||
1351 | - def __init__(self, config): | ||
1352 | - super().__init__(config) | ||
1353 | - | ||
1354 | - config.num_labels = 2 | ||
1355 | - self.num_labels = config.num_labels | ||
1356 | - | ||
1357 | - self.model = BartModel(config) | ||
1358 | - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) | ||
1359 | - | ||
1360 | - self.model._init_weights(self.qa_outputs) | ||
1361 | - | ||
1362 | - @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) | ||
1363 | - @add_code_sample_docstrings( | ||
1364 | - tokenizer_class=_TOKENIZER_FOR_DOC, | ||
1365 | - checkpoint="facebook/bart-large", | ||
1366 | - output_type=Seq2SeqQuestionAnsweringModelOutput, | ||
1367 | - config_class=_CONFIG_FOR_DOC, | ||
1368 | - ) | ||
1369 | - def forward( | ||
1370 | - self, | ||
1371 | - input_ids, | ||
1372 | - attention_mask=None, | ||
1373 | - encoder_outputs=None, | ||
1374 | - decoder_input_ids=None, | ||
1375 | - decoder_attention_mask=None, | ||
1376 | - start_positions=None, | ||
1377 | - end_positions=None, | ||
1378 | - use_cache=None, | ||
1379 | - output_attentions=None, | ||
1380 | - output_hidden_states=None, | ||
1381 | - return_dict=None, | ||
1382 | - ): | ||
1383 | - r""" | ||
1384 | - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): | ||
1385 | - Labels for position (index) of the start of the labelled span for computing the token classification loss. | ||
1386 | - Positions are clamped to the length of the sequence (`sequence_length`). | ||
1387 | - Positions outside of the sequence are not taken into account when computing the loss. | ||
1388 | - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): | ||
1389 | - Labels for position (index) of the end of the labelled span for computing the token classification loss. | ||
1390 | - Positions are clamped to the length of the sequence (`sequence_length`). | ||
1391 | - Positions outside of the sequence are not taken into account when computing the loss. | ||
1392 | - """ | ||
1393 | - return_dict = ( | ||
1394 | - return_dict if return_dict is not None else self.config.use_return_dict | ||
1395 | - ) | ||
1396 | - if start_positions is not None and end_positions is not None: | ||
1397 | - use_cache = False | ||
1398 | - | ||
1399 | - outputs = self.model( | ||
1400 | - input_ids, | ||
1401 | - attention_mask=attention_mask, | ||
1402 | - decoder_input_ids=decoder_input_ids, | ||
1403 | - decoder_attention_mask=decoder_attention_mask, | ||
1404 | - encoder_outputs=encoder_outputs, | ||
1405 | - use_cache=use_cache, | ||
1406 | - output_attentions=output_attentions, | ||
1407 | - output_hidden_states=output_hidden_states, | ||
1408 | - return_dict=return_dict, | ||
1409 | - ) | ||
1410 | - | ||
1411 | - sequence_output = outputs[0] | ||
1412 | - | ||
1413 | - logits = self.qa_outputs(sequence_output) | ||
1414 | - start_logits, end_logits = logits.split(1, dim=-1) | ||
1415 | - start_logits = start_logits.squeeze(-1) | ||
1416 | - end_logits = end_logits.squeeze(-1) | ||
1417 | - | ||
1418 | - total_loss = None | ||
1419 | - if start_positions is not None and end_positions is not None: | ||
1420 | - # If we are on multi-GPU, splitting adds an extra dimension; squeeze it away | ||
1421 | - if len(start_positions.size()) > 1: | ||
1422 | - start_positions = start_positions.squeeze(-1) | ||
1423 | - if len(end_positions.size()) > 1: | ||
1424 | - end_positions = end_positions.squeeze(-1) | ||
1425 | - # sometimes the start/end positions are outside our model inputs; we ignore these terms | ||
1426 | - ignored_index = start_logits.size(1) | ||
1427 | - start_positions.clamp_(0, ignored_index) | ||
1428 | - end_positions.clamp_(0, ignored_index) | ||
1429 | - | ||
1430 | - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) | ||
1431 | - start_loss = loss_fct(start_logits, start_positions) | ||
1432 | - end_loss = loss_fct(end_logits, end_positions) | ||
1433 | - total_loss = (start_loss + end_loss) / 2 | ||
1434 | - | ||
1435 | - if not return_dict: | ||
1436 | - output = (start_logits, end_logits,) + outputs[1:] | ||
1437 | - return ((total_loss,) + output) if total_loss is not None else output | ||
1438 | - | ||
1439 | - return Seq2SeqQuestionAnsweringModelOutput( | ||
1440 | - loss=total_loss, | ||
1441 | - start_logits=start_logits, | ||
1442 | - end_logits=end_logits, | ||
1443 | - past_key_values=outputs.past_key_values, | ||
1444 | - decoder_hidden_states=outputs.decoder_hidden_states, | ||
1445 | - decoder_attentions=outputs.decoder_attentions, | ||
1446 | - encoder_last_hidden_state=outputs.encoder_last_hidden_state, | ||
1447 | - encoder_hidden_states=outputs.encoder_hidden_states, | ||
1448 | - encoder_attentions=outputs.encoder_attentions, | ||
1449 | - ) | ||
1450 | - | ||
1451 | - | ||
1452 | -class SinusoidalPositionalEmbedding(nn.Embedding): | ||
1453 | - """This module produces sinusoidal positional embeddings of any length.""" | ||
1454 | - | ||
1455 | - def __init__(self, num_positions, embedding_dim, padding_idx=None): | ||
1456 | - super().__init__(num_positions, embedding_dim) | ||
1457 | - if embedding_dim % 2 != 0: | ||
1458 | - raise NotImplementedError( | ||
1459 | - f"odd embedding_dim {embedding_dim} not supported" | ||
1460 | - ) | ||
1461 | - self.weight = self._init_weight(self.weight) | ||
1462 | - | ||
1463 | - @staticmethod | ||
1464 | - def _init_weight(out: nn.Parameter): | ||
1465 | - """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. | ||
1466 | - The cos features are in the 2nd half of the vector. [dim // 2:] | ||
1467 | - """ | ||
1468 | - n_pos, dim = out.shape | ||
1469 | - position_enc = np.array( | ||
1470 | - [ | ||
1471 | - [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] | ||
1472 | - for pos in range(n_pos) | ||
1473 | - ] | ||
1474 | - ) | ||
1475 | - out[:, 0 : dim // 2] = torch.FloatTensor( | ||
1476 | - np.sin(position_enc[:, 0::2]) | ||
1477 | - ) # This line breaks for odd embedding_dim (guarded against above) | ||
1478 | - out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) | ||
1479 | - out.detach_() | ||
1480 | - out.requires_grad = False | ||
1481 | - return out | ||
1482 | - | ||
1483 | - @torch.no_grad() | ||
1484 | - def forward(self, input_ids, use_cache=False): | ||
1485 | - """Input is expected to be of size [bsz x seqlen].""" | ||
1486 | - bsz, seq_len = input_ids.shape[:2] | ||
1487 | - if use_cache: | ||
1488 | - positions = input_ids.data.new(1, 1).fill_( | ||
1489 | - seq_len - 1 | ||
1490 | - ) # called before slicing | ||
1491 | - else: | ||
1492 | - # starts at 0, ends at seq_len - 1 | ||
1493 | - positions = torch.arange( | ||
1494 | - seq_len, dtype=torch.long, device=self.weight.device | ||
1495 | - ) | ||
1496 | - return super().forward(positions) |
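A quick check of the layout described in _init_weight, assuming the class as defined above: the sin features fill the first half of each row and the cos features the second half (not interleaved), so the row for position 0 is all zeros followed by all ones.

import torch

emb = SinusoidalPositionalEmbedding(num_positions=6, embedding_dim=8)
w = emb.weight
print(torch.allclose(w[0, :4], torch.zeros(4)))  # True: sin(0) == 0 for every frequency
print(torch.allclose(w[0, 4:], torch.ones(4)))   # True: cos(0) == 1 for every frequency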
train/modeling_utils.py
deleted
100644 → 0
1 | -# coding=utf-8 | ||
2 | -# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. | ||
3 | -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | ||
4 | -# | ||
5 | -# Licensed under the Apache License, Version 2.0 (the "License"); | ||
6 | -# you may not use this file except in compliance with the License. | ||
7 | -# You may obtain a copy of the License at | ||
8 | -# | ||
9 | -# http://www.apache.org/licenses/LICENSE-2.0 | ||
10 | -# | ||
11 | -# Unless required by applicable law or agreed to in writing, software | ||
12 | -# distributed under the License is distributed on an "AS IS" BASIS, | ||
13 | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
14 | -# See the License for the specific language governing permissions and | ||
15 | -# limitations under the License. | ||
16 | - | ||
17 | -import inspect | ||
18 | -import os | ||
19 | -import re | ||
20 | -from dataclasses import dataclass | ||
21 | -from typing import Callable, Dict, List, Optional, Set, Tuple, Union | ||
22 | - | ||
23 | -import torch | ||
24 | -from torch import Tensor, device, dtype, nn | ||
25 | -from torch.nn import CrossEntropyLoss | ||
26 | -from torch.nn import functional as F | ||
27 | - | ||
28 | -from transformers.activations import get_activation | ||
29 | -from transformers.configuration_utils import PretrainedConfig | ||
30 | -from transformers.file_utils import ( | ||
31 | - DUMMY_INPUTS, | ||
32 | - TF2_WEIGHTS_NAME, | ||
33 | - TF_WEIGHTS_NAME, | ||
34 | - WEIGHTS_NAME, | ||
35 | - ModelOutput, | ||
36 | - cached_path, | ||
37 | - hf_bucket_url, | ||
38 | - is_remote_url, | ||
39 | - is_torch_tpu_available, | ||
40 | - replace_return_docstrings, | ||
41 | -) | ||
42 | -from train.generation_utils import GenerationMixin | ||
43 | -import logging | ||
44 | - | ||
45 | -logger = logging.getLogger(__name__) # pylint: disable=invalid-name | ||
46 | -logging.basicConfig( | ||
47 | - format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", | ||
48 | - datefmt="%m/%d/%Y %H:%M:%S", | ||
49 | - level=logging.INFO, | ||
50 | -) | ||
51 | - | ||
52 | - | ||
53 | -try: | ||
54 | - from torch.nn import Identity | ||
55 | -except ImportError: | ||
56 | - # Older PyTorch compatibility | ||
57 | - class Identity(nn.Module): | ||
58 | - r"""A placeholder identity operator that is argument-insensitive.""" | ||
59 | - | ||
60 | - def __init__(self, *args, **kwargs): | ||
61 | - super().__init__() | ||
62 | - | ||
63 | - def forward(self, input): | ||
64 | - return input | ||
65 | - | ||
66 | - | ||
67 | -def find_pruneable_heads_and_indices( | ||
68 | - heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] | ||
69 | -) -> Tuple[Set[int], torch.LongTensor]: | ||
70 | - """ | ||
71 | - Finds the heads and their indices taking :obj:`already_pruned_heads` into account. | ||
72 | - | ||
73 | - Args: | ||
74 | - heads (:obj:`List[int]`): List of the indices of heads to prune. | ||
75 | - n_heads (:obj:`int`): The number of heads in the model. | ||
76 | - head_size (:obj:`int`): The size of each head. | ||
77 | - already_pruned_heads (:obj:`Set[int]`): A set of already pruned heads. | ||
78 | - | ||
79 | - Returns: | ||
80 | - :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. | ||
81 | - """ | ||
82 | - mask = torch.ones(n_heads, head_size) | ||
83 | - heads = ( | ||
84 | - set(heads) - already_pruned_heads | ||
85 | - ) # Convert to set and remove already pruned heads | ||
86 | - for head in heads: | ||
87 | - # Compute how many pruned heads are before the head and move the index accordingly | ||
88 | - head = head - sum(1 if h < head else 0 for h in already_pruned_heads) | ||
89 | - mask[head] = 0 | ||
90 | - mask = mask.view(-1).contiguous().eq(1) | ||
91 | - index: torch.LongTensor = torch.arange(len(mask))[mask].long() | ||
92 | - return heads, index | ||
93 | - | ||
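A usage sketch of find_pruneable_heads_and_indices, assuming the function as defined above: heads that were already pruned are dropped from the request, the remaining head index is remapped into the current (smaller) head layout, and the returned index lists the flat positions that survive pruning.

import torch

heads, index = find_pruneable_heads_and_indices(
    heads=[0, 2], n_heads=3, head_size=3, already_pruned_heads={0}
)
print(heads)  # {2}  (head 0 was already pruned, so only head 2 is left to prune)
print(index)  # tensor([0, 1, 2, 6, 7, 8])  (flat positions of the two surviving heads)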
94 | - | ||
95 | -class ModuleUtilsMixin: | ||
96 | - """ | ||
97 | - A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin. | ||
98 | - """ | ||
99 | - | ||
100 | - def num_parameters(self, only_trainable: bool = False) -> int: | ||
101 | - """ | ||
102 | - Get the number of (optionally, trainable) parameters in the model. | ||
103 | - | ||
104 | - Args: | ||
105 | - only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
106 | - Whether or not to return only the number of trainable parameters | ||
107 | - | ||
108 | - Returns: | ||
109 | - :obj:`int`: The number of parameters. | ||
110 | - """ | ||
111 | - params = ( | ||
112 | - filter(lambda x: x.requires_grad, self.parameters()) | ||
113 | - if only_trainable | ||
114 | - else self.parameters() | ||
115 | - ) | ||
116 | - return sum(p.numel() for p in params) | ||
117 | - | ||
118 | - @staticmethod | ||
119 | - def _hook_rss_memory_pre_forward(module, *args, **kwargs): | ||
120 | - try: | ||
121 | - import psutil | ||
122 | - except (ImportError): | ||
123 | - raise ImportError( | ||
124 | - "You need to install psutil (pip install psutil) to use memory tracing." | ||
125 | - ) | ||
126 | - | ||
127 | - process = psutil.Process(os.getpid()) | ||
128 | - mem = process.memory_info() | ||
129 | - module.mem_rss_pre_forward = mem.rss | ||
130 | - return None | ||
131 | - | ||
132 | - @staticmethod | ||
133 | - def _hook_rss_memory_post_forward(module, *args, **kwargs): | ||
134 | - try: | ||
135 | - import psutil | ||
136 | - except (ImportError): | ||
137 | - raise ImportError( | ||
138 | - "You need to install psutil (pip install psutil) to use memory tracing." | ||
139 | - ) | ||
140 | - | ||
141 | - process = psutil.Process(os.getpid()) | ||
142 | - mem = process.memory_info() | ||
143 | - module.mem_rss_post_forward = mem.rss | ||
144 | - mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward | ||
145 | - module.mem_rss_diff = mem_rss_diff + ( | ||
146 | - module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0 | ||
147 | - ) | ||
148 | - return None | ||
149 | - | ||
150 | - def add_memory_hooks(self): | ||
151 | - """ | ||
152 | - Add a memory hook before and after each sub-module forward pass to record increase in memory consumption. | ||
153 | - | ||
154 | - Increase in memory consumption is stored in a :obj:`mem_rss_diff` attribute for each module and can be reset to | ||
155 | - zero with :obj:`model.reset_memory_hooks_state()`. | ||
156 | - """ | ||
157 | - for module in self.modules(): | ||
158 | - module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) | ||
159 | - module.register_forward_hook(self._hook_rss_memory_post_forward) | ||
160 | - self.reset_memory_hooks_state() | ||
161 | - | ||
162 | - def reset_memory_hooks_state(self): | ||
163 | - """ | ||
164 | - Reset the :obj:`mem_rss_diff` attribute of each module (see | ||
165 | - :func:`~transformers.modeling_utils.ModuleUtilsMixin.add_memory_hooks`). | ||
166 | - """ | ||
167 | - for module in self.modules(): | ||
168 | - module.mem_rss_diff = 0 | ||
169 | - module.mem_rss_post_forward = 0 | ||
170 | - module.mem_rss_pre_forward = 0 | ||
171 | - | ||
172 | - @property | ||
173 | - def device(self) -> device: | ||
174 | - """ | ||
175 | - :obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same | ||
176 | - device). | ||
177 | - """ | ||
178 | - try: | ||
179 | - return next(self.parameters()).device | ||
180 | - except StopIteration: | ||
181 | - # For nn.DataParallel compatibility in PyTorch 1.5 | ||
182 | - | ||
183 | - def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: | ||
184 | - tuples = [ | ||
185 | - (k, v) for k, v in module.__dict__.items() if torch.is_tensor(v) | ||
186 | - ] | ||
187 | - return tuples | ||
188 | - | ||
189 | - gen = self._named_members(get_members_fn=find_tensor_attributes) | ||
190 | - first_tuple = next(gen) | ||
191 | - return first_tuple[1].device | ||
192 | - | ||
193 | - @property | ||
194 | - def dtype(self) -> dtype: | ||
195 | - """ | ||
196 | - :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). | ||
197 | - """ | ||
198 | - try: | ||
199 | - return next(self.parameters()).dtype | ||
200 | - except StopIteration: | ||
201 | - # For nn.DataParallel compatibility in PyTorch 1.5 | ||
202 | - | ||
203 | - def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: | ||
204 | - tuples = [ | ||
205 | - (k, v) for k, v in module.__dict__.items() if torch.is_tensor(v) | ||
206 | - ] | ||
207 | - return tuples | ||
208 | - | ||
209 | - gen = self._named_members(get_members_fn=find_tensor_attributes) | ||
210 | - first_tuple = next(gen) | ||
211 | - return first_tuple[1].dtype | ||
212 | - | ||
213 | - def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: | ||
214 | - """ | ||
215 | - Invert an attention mask (e.g., switches 0. and 1.). | ||
216 | - | ||
217 | - Args: | ||
218 | - encoder_attention_mask (:obj:`torch.Tensor`): An attention mask. | ||
219 | - | ||
220 | - Returns: | ||
221 | - :obj:`torch.Tensor`: The inverted attention mask. | ||
222 | - """ | ||
223 | - if encoder_attention_mask.dim() == 3: | ||
224 | - encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] | ||
225 | - if encoder_attention_mask.dim() == 2: | ||
226 | - encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] | ||
227 | - # T5 has a mask that can compare sequence ids; we can simulate this here with this transposition | ||
228 | - # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow | ||
229 | - # /transformer/transformer_layers.py#L270 | ||
230 | - # encoder_extended_attention_mask = (encoder_extended_attention_mask == | ||
231 | - # encoder_extended_attention_mask.transpose(-1, -2)) | ||
232 | - encoder_extended_attention_mask = encoder_extended_attention_mask.to( | ||
233 | - dtype=self.dtype | ||
234 | - ) # fp16 compatibility | ||
235 | - | ||
236 | - if self.dtype == torch.float16: | ||
237 | - encoder_extended_attention_mask = ( | ||
238 | - 1.0 - encoder_extended_attention_mask | ||
239 | - ) * -1e4 | ||
240 | - elif self.dtype == torch.float32: | ||
241 | - encoder_extended_attention_mask = ( | ||
242 | - 1.0 - encoder_extended_attention_mask | ||
243 | - ) * -1e9 | ||
244 | - else: | ||
245 | - raise ValueError( | ||
246 | - "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( | ||
247 | - self.dtype | ||
248 | - ) | ||
249 | - ) | ||
250 | - | ||
251 | - return encoder_extended_attention_mask | ||
252 | - | ||
253 | - def get_extended_attention_mask( | ||
254 | - self, attention_mask: Tensor, input_shape: Tuple[int], device: device | ||
255 | - ) -> Tensor: | ||
256 | - """ | ||
257 | - Makes broadcastable attention and causal masks so that future and masked tokens are ignored. | ||
258 | - | ||
259 | - Arguments: | ||
260 | - attention_mask (:obj:`torch.Tensor`): | ||
261 | - Mask with ones indicating tokens to attend to, zeros for tokens to ignore. | ||
262 | - input_shape (:obj:`Tuple[int]`): | ||
263 | - The shape of the input to the model. | ||
264 | - device (:obj:`torch.device`): | ||
265 | - The device of the input to the model. | ||
266 | - | ||
267 | - Returns: | ||
268 | - :obj:`torch.Tensor`: The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`. | ||
269 | - """ | ||
270 | - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] | ||
271 | - # ourselves in which case we just need to make it broadcastable to all heads. | ||
272 | - if attention_mask.dim() == 3: | ||
273 | - extended_attention_mask = attention_mask[:, None, :, :] | ||
274 | - elif attention_mask.dim() == 2: | ||
275 | - # Provided a padding mask of dimensions [batch_size, seq_length] | ||
276 | - # - if the model is a decoder, apply a causal mask in addition to the padding mask | ||
277 | - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] | ||
278 | - if self.config.is_decoder: | ||
279 | - batch_size, seq_length = input_shape | ||
280 | - seq_ids = torch.arange(seq_length, device=device) | ||
281 | - causal_mask = ( | ||
282 | - seq_ids[None, None, :].repeat(batch_size, seq_length, 1) | ||
283 | - <= seq_ids[None, :, None] | ||
284 | - ) | ||
285 | - # causal and attention masks must have same type with pytorch version < 1.3 | ||
286 | - causal_mask = causal_mask.to(attention_mask.dtype) | ||
287 | - extended_attention_mask = ( | ||
288 | - causal_mask[:, None, :, :] * attention_mask[:, None, None, :] | ||
289 | - ) | ||
290 | - else: | ||
291 | - extended_attention_mask = attention_mask[:, None, None, :] | ||
292 | - else: | ||
293 | - raise ValueError( | ||
294 | - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( | ||
295 | - input_shape, attention_mask.shape | ||
296 | - ) | ||
297 | - ) | ||
298 | - | ||
299 | - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for | ||
300 | - # masked positions, this operation will create a tensor which is 0.0 for | ||
301 | - # positions we want to attend and -10000.0 for masked positions. | ||
302 | - # Since we are adding it to the raw scores before the softmax, this is | ||
303 | - # effectively the same as removing these entirely. | ||
304 | - extended_attention_mask = extended_attention_mask.to( | ||
305 | - dtype=self.dtype | ||
306 | - ) # fp16 compatibility | ||
307 | - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | ||
308 | - return extended_attention_mask | ||
309 | - | ||
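A small numeric sketch (illustration only, not part of the deleted file) of the encoder branch above: a 2D padding mask becomes a broadcastable additive mask whose masked positions are pushed to -10000 before the softmax:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])                   # [batch_size, seq_length]
extended = attention_mask[:, None, None, :].to(torch.float32)   # [batch, 1, 1, seq_length]
extended = (1.0 - extended) * -10000.0
print(extended)  # attended positions stay 0., the padded position becomes -10000.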
310 | - def get_head_mask( | ||
311 | - self, | ||
312 | - head_mask: Optional[Tensor], | ||
313 | - num_hidden_layers: int, | ||
314 | - is_attention_chunked: bool = False, | ||
315 | - ) -> Tensor: | ||
316 | - """ | ||
317 | - Prepare the head mask if needed. | ||
318 | - | ||
319 | - Args: | ||
320 | - head_mask (:obj:`torch.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`): | ||
321 | - The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). | ||
322 | - num_hidden_layers (:obj:`int`): | ||
323 | - The number of hidden layers in the model. | ||
324 | - is_attention_chunked (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
325 | - Whether or not the attention scores are computed by chunks. | ||
326 | - | ||
327 | - Returns: | ||
328 | - :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` | ||
329 | - or list with :obj:`[None]` for each layer. | ||
330 | - """ | ||
331 | - if head_mask is not None: | ||
332 | - head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) | ||
333 | - if is_attention_chunked is True: | ||
334 | - head_mask = head_mask.unsqueeze(-1) | ||
335 | - else: | ||
336 | - head_mask = [None] * num_hidden_layers | ||
337 | - | ||
338 | - return head_mask | ||
339 | - | ||
340 | - def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): | ||
341 | - """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" | ||
342 | - if head_mask.dim() == 1: | ||
343 | - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) | ||
344 | - head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) | ||
345 | - elif head_mask.dim() == 2: | ||
346 | - head_mask = ( | ||
347 | - head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) | ||
348 | - ) # We can specify head_mask for each layer | ||
349 | - assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" | ||
350 | - head_mask = head_mask.to( | ||
351 | - dtype=self.dtype | ||
352 | - ) # switch to float if needed + fp16 compatibility | ||
353 | - return head_mask | ||
354 | - | ||
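A short sketch (illustration only, sizes made up) of the broadcasting done by `_convert_head_mask_to_5d` above: a per-head 1D mask is expanded to one mask per layer that can multiply the attention scores:

import torch

num_hidden_layers, num_heads = 2, 4
head_mask = torch.tensor([1.0, 1.0, 0.0, 1.0])                  # [num_heads], 0.0 = discard head
head_mask = head_mask[None, None, :, None, None]                # [1, 1, num_heads, 1, 1]
head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
print(head_mask.shape)  # torch.Size([2, 1, 4, 1, 1])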
355 | - | ||
356 | -class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ||
357 | - r""" | ||
358 | - Base class for all models. | ||
359 | - | ||
360 | - :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods | ||
361 | - for loading, downloading and saving models as well as a few methods common to all models to: | ||
362 | - | ||
363 | - * resize the input embeddings, | ||
364 | - * prune heads in the self-attention heads. | ||
365 | - | ||
366 | - Class attributes (overridden by derived classes): | ||
367 | - - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of | ||
368 | - :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. | ||
369 | - - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a | ||
370 | - PyTorch model, taking as arguments: | ||
371 | - | ||
372 | - - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the | ||
373 | - TensorFlow checkpoint. | ||
374 | - - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated | ||
375 | - to the model. | ||
376 | - - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint. | ||
377 | - | ||
378 | - - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated with the base model in | ||
379 | - derived classes of the same architecture adding modules on top of the base model. | ||
380 | - - **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of regular expression patterns of tensor names to ignore | ||
381 | - when loading the model (to avoid unnecessary warnings). | ||
382 | - """ | ||
383 | - config_class = None | ||
384 | - base_model_prefix = "" | ||
385 | - authorized_missing_keys = None | ||
386 | - | ||
387 | - @property | ||
388 | - def dummy_inputs(self) -> Dict[str, torch.Tensor]: | ||
389 | - """ | ||
390 | - :obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network. | ||
391 | - """ | ||
392 | - return {"input_ids": torch.tensor(DUMMY_INPUTS)} | ||
393 | - | ||
394 | - def __init__(self, config: PretrainedConfig, *inputs, **kwargs): | ||
395 | - super().__init__() | ||
396 | - if not isinstance(config, PretrainedConfig): | ||
397 | - raise ValueError( | ||
398 | - "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " | ||
399 | - "To create a model from a pretrained model use " | ||
400 | - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( | ||
401 | - self.__class__.__name__, self.__class__.__name__ | ||
402 | - ) | ||
403 | - ) | ||
404 | - # Save config in model | ||
405 | - self.config = config | ||
406 | - | ||
407 | - @property | ||
408 | - def base_model(self) -> nn.Module: | ||
409 | - """ | ||
410 | - :obj:`torch.nn.Module`: The main body of the model. | ||
411 | - """ | ||
412 | - return getattr(self, self.base_model_prefix, self) | ||
413 | - | ||
414 | - def get_input_embeddings(self) -> nn.Module: | ||
415 | - """ | ||
416 | - Returns the model's input embeddings. | ||
417 | - | ||
418 | - Returns: | ||
419 | - :obj:`nn.Module`: A torch module mapping vocabulary to hidden states. | ||
420 | - """ | ||
421 | - base_model = getattr(self, self.base_model_prefix, self) | ||
422 | - if base_model is not self: | ||
423 | - return base_model.get_input_embeddings() | ||
424 | - else: | ||
425 | - raise NotImplementedError | ||
426 | - | ||
427 | - def set_input_embeddings(self, value: nn.Module): | ||
428 | - """ | ||
429 | - Set the model's input embeddings. | ||
430 | - | ||
431 | - Args: | ||
432 | - value (:obj:`nn.Module`): A module mapping vocabulary to hidden states. | ||
433 | - """ | ||
434 | - base_model = getattr(self, self.base_model_prefix, self) | ||
435 | - if base_model is not self: | ||
436 | - base_model.set_input_embeddings(value) | ||
437 | - else: | ||
438 | - raise NotImplementedError | ||
439 | - | ||
440 | - def get_output_embeddings(self) -> nn.Module: | ||
441 | - """ | ||
442 | - Returns the model's output embeddings. | ||
443 | - | ||
444 | - Returns: | ||
445 | - :obj:`nn.Module`: A torch module mapping hidden states to vocabulary. | ||
446 | - """ | ||
447 | - return None # Overwrite for models with output embeddings | ||
448 | - | ||
449 | - def tie_weights(self): | ||
450 | - """ | ||
451 | - Tie the weights between the input embeddings and the output embeddings. | ||
452 | - | ||
453 | - If the :obj:`torchscript` flag is set in the configuration, TorchScript can't handle parameter sharing, so | ||
454 | - we clone the weights instead. | ||
455 | - """ | ||
456 | - output_embeddings = self.get_output_embeddings() | ||
457 | - if output_embeddings is not None and self.config.tie_word_embeddings: | ||
458 | - self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) | ||
459 | - | ||
460 | - if self.config.is_encoder_decoder and self.config.tie_encoder_decoder: | ||
461 | - self._tie_encoder_decoder_weights( | ||
462 | - self.encoder, self.decoder, self.base_model_prefix | ||
463 | - ) | ||
464 | - | ||
465 | - @staticmethod | ||
466 | - def _tie_encoder_decoder_weights( | ||
467 | - encoder: nn.Module, decoder: nn.Module, base_model_prefix: str | ||
468 | - ): | ||
469 | - uninitialized_encoder_weights: List[str] = [] | ||
470 | - assert ( | ||
471 | - decoder.__class__ == encoder.__class__ | ||
472 | - ), f"{decoder.__class__} and {encoder.__class__} have to be equal." | ||
473 | - | ||
474 | - def tie_encoder_to_decoder_recursively( | ||
475 | - decoder_pointer: nn.Module, | ||
476 | - encoder_pointer: nn.Module, | ||
477 | - module_name: str, | ||
478 | - uninitialized_encoder_weights: List[str], | ||
479 | - depth=0, | ||
480 | - ): | ||
481 | - assert isinstance(decoder_pointer, nn.Module) and isinstance( | ||
482 | - encoder_pointer, nn.Module | ||
483 | - ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module" | ||
484 | - if hasattr(decoder_pointer, "weight"): | ||
485 | - assert hasattr(encoder_pointer, "weight") | ||
486 | - encoder_pointer.weight = decoder_pointer.weight | ||
487 | - if hasattr(decoder_pointer, "bias"): | ||
488 | - assert hasattr(encoder_pointer, "bias") | ||
489 | - encoder_pointer.bias = decoder_pointer.bias | ||
490 | - return | ||
491 | - | ||
492 | - encoder_modules = encoder_pointer._modules | ||
493 | - decoder_modules = decoder_pointer._modules | ||
494 | - if len(decoder_modules) > 0: | ||
495 | - assert ( | ||
496 | - len(encoder_modules) > 0 | ||
497 | - ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" | ||
498 | - | ||
499 | - all_encoder_weights = set( | ||
500 | - [ | ||
501 | - module_name + "/" + sub_name | ||
502 | - for sub_name in encoder_modules.keys() | ||
503 | - ] | ||
504 | - ) | ||
505 | - encoder_layer_pos = 0 | ||
506 | - for name, module in decoder_modules.items(): | ||
507 | - if name.isdigit(): | ||
508 | - encoder_name = str(int(name) + encoder_layer_pos) | ||
509 | - decoder_name = name | ||
510 | - if not isinstance( | ||
511 | - decoder_modules[decoder_name], | ||
512 | - type(encoder_modules[encoder_name]), | ||
513 | - ): | ||
514 | - # this can happen if the name corresponds to a position in a ModuleList of layers | ||
515 | - # in this case the decoder has added a cross-attention layer that the encoder does not have | ||
516 | - # thus skip this step and subtract one layer position from the encoder | ||
517 | - encoder_layer_pos -= 1 | ||
518 | - continue | ||
519 | - elif name not in encoder_modules: | ||
520 | - continue | ||
521 | - elif depth > 500: | ||
522 | - raise ValueError( | ||
523 | - "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model." | ||
524 | - ) | ||
525 | - else: | ||
526 | - decoder_name = encoder_name = name | ||
527 | - tie_encoder_to_decoder_recursively( | ||
528 | - decoder_modules[decoder_name], | ||
529 | - encoder_modules[encoder_name], | ||
530 | - module_name + "/" + name, | ||
531 | - uninitialized_encoder_weights, | ||
532 | - depth=depth + 1, | ||
533 | - ) | ||
534 | - all_encoder_weights.remove(module_name + "/" + encoder_name) | ||
535 | - | ||
536 | - uninitialized_encoder_weights += list(all_encoder_weights) | ||
537 | - | ||
538 | - # tie weights recursively | ||
539 | - tie_encoder_to_decoder_recursively( | ||
540 | - decoder, encoder, base_model_prefix, uninitialized_encoder_weights | ||
541 | - ) | ||
542 | - if len(uninitialized_encoder_weights) > 0: | ||
543 | - logger.warning( | ||
544 | - f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}" | ||
545 | - ) | ||
546 | - | ||
547 | - def _tie_or_clone_weights(self, output_embeddings, input_embeddings): | ||
548 | - """Tie or clone module weights depending of whether we are using TorchScript or not""" | ||
549 | - if self.config.torchscript: | ||
550 | - output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) | ||
551 | - else: | ||
552 | - output_embeddings.weight = input_embeddings.weight | ||
553 | - | ||
554 | - if getattr(output_embeddings, "bias", None) is not None: | ||
555 | - output_embeddings.bias.data = torch.nn.functional.pad( | ||
556 | - output_embeddings.bias.data, | ||
557 | - ( | ||
558 | - 0, | ||
559 | - output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0], | ||
560 | - ), | ||
561 | - "constant", | ||
562 | - 0, | ||
563 | - ) | ||
564 | - if hasattr(output_embeddings, "out_features") and hasattr( | ||
565 | - input_embeddings, "num_embeddings" | ||
566 | - ): | ||
567 | - output_embeddings.out_features = input_embeddings.num_embeddings | ||
568 | - | ||
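A tiny sketch (illustration only, module sizes made up) of the non-TorchScript branch above: tying assigns the same `nn.Parameter` object to both modules, so updating one updates both:

import torch.nn as nn

input_embeddings = nn.Embedding(10, 8)
output_embeddings = nn.Linear(8, 10, bias=False)
output_embeddings.weight = input_embeddings.weight  # tie: share one Parameter
assert output_embeddings.weight is input_embeddings.weight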
569 | - def resize_token_embeddings( | ||
570 | - self, new_num_tokens: Optional[int] = None | ||
571 | - ) -> torch.nn.Embedding: | ||
572 | - """ | ||
573 | - Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`. | ||
574 | - | ||
575 | - Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method. | ||
576 | - | ||
577 | - Arguments: | ||
578 | - new_num_tokens (:obj:`int`, `optional`): | ||
579 | - The number of new tokens in the embedding matrix. Increasing the size will add newly initialized | ||
580 | - vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`, | ||
581 | - just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model without doing | ||
582 | - anything. | ||
583 | - | ||
584 | - Return: | ||
585 | - :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. | ||
586 | - """ | ||
587 | - base_model = getattr( | ||
588 | - self, self.base_model_prefix, self | ||
589 | - ) # get the base model if needed | ||
590 | - model_embeds = base_model._resize_token_embeddings(new_num_tokens) | ||
591 | - if new_num_tokens is None: | ||
592 | - return model_embeds | ||
593 | - | ||
594 | - # Update base model and current model config | ||
595 | - self.config.vocab_size = new_num_tokens | ||
596 | - base_model.vocab_size = new_num_tokens | ||
597 | - | ||
598 | - # Tie weights again if needed | ||
599 | - self.tie_weights() | ||
600 | - | ||
601 | - return model_embeds | ||
602 | - | ||
603 | - def _resize_token_embeddings(self, new_num_tokens): | ||
604 | - old_embeddings = self.get_input_embeddings() | ||
605 | - new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) | ||
606 | - self.set_input_embeddings(new_embeddings) | ||
607 | - return self.get_input_embeddings() | ||
608 | - | ||
609 | - def _get_resized_embeddings( | ||
610 | - self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None | ||
611 | - ) -> torch.nn.Embedding: | ||
612 | - """ | ||
613 | - Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly | ||
614 | - initialized vectors at the end. Reducing the size will remove vectors from the end. | ||
615 | - | ||
616 | - Args: | ||
617 | - old_embeddings (:obj:`torch.nn.Embedding`): | ||
618 | - Old embeddings to be resized. | ||
619 | - new_num_tokens (:obj:`int`, `optional`): | ||
620 | - New number of tokens in the embedding matrix. | ||
621 | - | ||
622 | - Increasing the size will add newly initialized vectors at the end. Reducing the size will remove | ||
623 | - vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens | ||
624 | - :obj:`torch.nn.Embedding` module of the model without doing anything. | ||
625 | - | ||
626 | - Return: | ||
627 | - :obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if | ||
628 | - :obj:`new_num_tokens` is :obj:`None` | ||
629 | - """ | ||
630 | - if new_num_tokens is None: | ||
631 | - return old_embeddings | ||
632 | - | ||
633 | - old_num_tokens, old_embedding_dim = old_embeddings.weight.size() | ||
634 | - if old_num_tokens == new_num_tokens: | ||
635 | - return old_embeddings | ||
636 | - | ||
637 | - # Build new embeddings | ||
638 | - new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) | ||
639 | - new_embeddings.to(old_embeddings.weight.device) | ||
640 | - | ||
641 | - # initialize all new embeddings (in particular added tokens) | ||
642 | - self._init_weights(new_embeddings) | ||
643 | - | ||
644 | - # Copy token embeddings from the previous weights | ||
645 | - num_tokens_to_copy = min(old_num_tokens, new_num_tokens) | ||
646 | - new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[ | ||
647 | - :num_tokens_to_copy, : | ||
648 | - ] | ||
649 | - | ||
650 | - return new_embeddings | ||
651 | - | ||
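A rough sketch (illustration only, sizes made up) of the copy step in `_get_resized_embeddings` above: build a larger embedding matrix, then copy the overlapping rows from the old one:

import torch.nn as nn

old = nn.Embedding(10, 8)   # old vocabulary of 10 tokens, hidden size 8
new = nn.Embedding(12, 8)   # grow the vocabulary to 12 tokens
num_to_copy = min(old.num_embeddings, new.num_embeddings)
new.weight.data[:num_to_copy, :] = old.weight.data[:num_to_copy, :]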
652 | - def init_weights(self): | ||
653 | - """ | ||
654 | - Initializes and prunes weights if needed. | ||
655 | - """ | ||
656 | - # Initialize weights | ||
657 | - self.apply(self._init_weights) | ||
658 | - | ||
659 | - # Prune heads if needed | ||
660 | - if self.config.pruned_heads: | ||
661 | - self.prune_heads(self.config.pruned_heads) | ||
662 | - | ||
663 | - # Tie weights if needed | ||
664 | - self.tie_weights() | ||
665 | - | ||
666 | - def prune_heads(self, heads_to_prune: Dict[int, List[int]]): | ||
667 | - """ | ||
668 | - Prunes heads of the base model. | ||
669 | - | ||
670 | - Arguments: | ||
671 | - heads_to_prune (:obj:`Dict[int, List[int]]`): | ||
672 | - Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list | ||
673 | - of heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will | ||
674 | - prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. | ||
675 | - """ | ||
676 | - # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads | ||
677 | - for layer, heads in heads_to_prune.items(): | ||
678 | - union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) | ||
679 | - self.config.pruned_heads[layer] = list( | ||
680 | - union_heads | ||
681 | - ) # Unfortunately we have to store it as list for JSON | ||
682 | - | ||
683 | - self.base_model._prune_heads(heads_to_prune) | ||
684 | - | ||
685 | - def save_pretrained(self, save_directory): | ||
686 | - """ | ||
687 | - Save a model and its configuration file to a directory, so that it can be re-loaded using the | ||
688 | - :func:`~transformers.PreTrainedModel.from_pretrained` class method. | ||
689 | - | ||
690 | - Arguments: | ||
691 | - save_directory (:obj:`str`): | ||
692 | - Directory to which to save. Will be created if it doesn't exist. | ||
693 | - """ | ||
694 | - if os.path.isfile(save_directory): | ||
695 | - logger.error( | ||
696 | - "Provided path ({}) should be a directory, not a file".format( | ||
697 | - save_directory | ||
698 | - ) | ||
699 | - ) | ||
700 | - return | ||
701 | - os.makedirs(save_directory, exist_ok=True) | ||
702 | - | ||
703 | - # Only save the model itself if we are using distributed training | ||
704 | - model_to_save = self.module if hasattr(self, "module") else self | ||
705 | - | ||
706 | - # Attach architecture to the config | ||
707 | - model_to_save.config.architectures = [model_to_save.__class__.__name__] | ||
708 | - | ||
709 | - # If we save using the predefined names, we can load using `from_pretrained` | ||
710 | - output_model_file = os.path.join(save_directory, WEIGHTS_NAME) | ||
711 | - | ||
712 | - if getattr(self.config, "xla_device", False): | ||
713 | - import torch_xla.core.xla_model as xm | ||
714 | - | ||
715 | - if xm.is_master_ordinal(): | ||
716 | - # Save configuration file | ||
717 | - model_to_save.config.save_pretrained(save_directory) | ||
718 | - # xm.save takes care of saving only from master | ||
719 | - xm.save(model_to_save.state_dict(), output_model_file) | ||
720 | - else: | ||
721 | - model_to_save.config.save_pretrained(save_directory) | ||
722 | - torch.save(model_to_save.state_dict(), output_model_file) | ||
723 | - | ||
724 | - logger.info("Model weights saved in {}".format(output_model_file)) | ||
725 | - | ||
726 | - @classmethod | ||
727 | - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): | ||
728 | - r""" | ||
729 | - Instantiate a pretrained pytorch model from a pre-trained model configuration. | ||
730 | - | ||
731 | - The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). | ||
732 | - To train the model, you should first set it back in training mode with ``model.train()``. | ||
733 | - | ||
734 | - The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come | ||
735 | - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning | ||
736 | - task. | ||
737 | - | ||
738 | - The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those | ||
739 | - weights are discarded. | ||
740 | - | ||
741 | - Parameters: | ||
742 | - pretrained_model_name_or_path (:obj:`str`, `optional`): | ||
743 | - Can be either: | ||
744 | - | ||
745 | - - A string with the `shortcut name` of a pretrained model to load from cache or download, e.g., | ||
746 | - ``bert-base-uncased``. | ||
747 | - - A string with the `identifier name` of a pretrained model that was user-uploaded to our S3, e.g., | ||
748 | - ``dbmdz/bert-base-german-cased``. | ||
749 | - - A path to a `directory` containing model weights saved using | ||
750 | - :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. | ||
751 | - - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In | ||
752 | - this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided | ||
753 | - as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in | ||
754 | - a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. | ||
755 | - - :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword | ||
756 | - arguments ``config`` and ``state_dict``). | ||
757 | - model_args (sequence of positional arguments, `optional`): | ||
758 | - All remaining positional arguments will be passed to the underlying model's ``__init__`` method. | ||
759 | - config (:obj:`Union[PretrainedConfig, str]`, `optional`): | ||
760 | - Can be either: | ||
761 | - | ||
762 | - - an instance of a class derived from :class:`~transformers.PretrainedConfig`, | ||
763 | - - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`. | ||
764 | - | ||
765 | - Configuration for the model to use instead of an automatically loaded configuration. Configuration can | ||
766 | - be automatically loaded when: | ||
767 | - | ||
768 | - - The model is a model provided by the library (loaded with the `shortcut name` string of a | ||
769 | - pretrained model). | ||
770 | - - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded | ||
771 | - by suppling the save directory. | ||
772 | - - The model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a | ||
773 | - configuration JSON file named `config.json` is found in the directory. | ||
774 | - state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`): | ||
775 | - A state dictionary to use instead of a state dictionary loaded from saved weights file. | ||
776 | - | ||
777 | - This option can be used if you want to create a model from a pretrained configuration but load your own | ||
778 | - weights. In this case though, you should check if using | ||
779 | - :func:`~transformers.PreTrainedModel.save_pretrained` and | ||
780 | - :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. | ||
781 | - cache_dir (:obj:`str`, `optional`): | ||
782 | - Path to a directory in which a downloaded pretrained model configuration should be cached if the | ||
783 | - standard cache should not be used. | ||
784 | - from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
785 | - Load the model weights from a TensorFlow checkpoint save file (see docstring of | ||
786 | - ``pretrained_model_name_or_path`` argument). | ||
787 | - force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
788 | - Whether or not to force the (re-)download of the model weights and configuration files, overriding the | ||
789 | - cached versions if they exist. | ||
790 | - resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
791 | - Whether or not to delete incompletely received files. Will attempt to resume the download if such a | ||
792 | - file exists. | ||
793 | - proxies (:obj:`Dict[str, str]`, `optional`): | ||
794 | - A dictionary of proxy servers to use by protocol or endpoint, e.g., | ||
795 | - :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each | ||
796 | - request. | ||
797 | - output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
798 | - Whether or not to also return a dictionary containing missing keys, unexpected keys and error | ||
799 | - messages. | ||
800 | - local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
801 | - Whether or not to only look at local files (i.e., do not try to download the model). | ||
802 | - use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`): | ||
803 | - Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on | ||
804 | - our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB. | ||
805 | - kwargs (remaining dictionary of keyword arguments, `optional`): | ||
806 | - Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g., | ||
807 | - :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or | ||
808 | - automatically loaded: | ||
809 | - | ||
810 | - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the | ||
811 | - underlying model's ``__init__`` method (we assume all relevant updates to the configuration have | ||
812 | - already been done) | ||
813 | - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class | ||
814 | - initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of | ||
815 | - ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute | ||
816 | - with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration | ||
817 | - attribute will be passed to the underlying model's ``__init__`` function. | ||
818 | - | ||
819 | - Examples:: | ||
820 | - | ||
821 | - from transformers import BertConfig, BertModel | ||
822 | - # Download model and configuration from S3 and cache. | ||
823 | - model = BertModel.from_pretrained('bert-base-uncased') | ||
824 | - # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable). | ||
825 | - model = BertModel.from_pretrained('./test/saved_model/') | ||
826 | - # Update configuration during loading. | ||
827 | - model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True) | ||
828 | - assert model.config.output_attentions == True | ||
829 | - # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable). | ||
830 | - config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') | ||
831 | - model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) | ||
832 | - """ | ||
833 | - config = kwargs.pop("config", None) | ||
834 | - state_dict = kwargs.pop("state_dict", None) | ||
835 | - cache_dir = kwargs.pop("cache_dir", None) | ||
836 | - from_tf = kwargs.pop("from_tf", False) | ||
837 | - force_download = kwargs.pop("force_download", False) | ||
838 | - resume_download = kwargs.pop("resume_download", False) | ||
839 | - proxies = kwargs.pop("proxies", None) | ||
840 | - output_loading_info = kwargs.pop("output_loading_info", False) | ||
841 | - local_files_only = kwargs.pop("local_files_only", False) | ||
842 | - use_cdn = kwargs.pop("use_cdn", True) | ||
843 | - | ||
844 | - # Load config if we don't provide a configuration | ||
845 | - if not isinstance(config, PretrainedConfig): | ||
846 | - config_path = ( | ||
847 | - config if config is not None else pretrained_model_name_or_path | ||
848 | - ) | ||
849 | - config, model_kwargs = cls.config_class.from_pretrained( | ||
850 | - config_path, | ||
851 | - *model_args, | ||
852 | - cache_dir=cache_dir, | ||
853 | - return_unused_kwargs=True, | ||
854 | - force_download=force_download, | ||
855 | - resume_download=resume_download, | ||
856 | - proxies=proxies, | ||
857 | - local_files_only=local_files_only, | ||
858 | - **kwargs, | ||
859 | - ) | ||
860 | - else: | ||
861 | - model_kwargs = kwargs | ||
862 | - | ||
863 | - # Load model | ||
864 | - if pretrained_model_name_or_path is not None: | ||
865 | - if os.path.isdir(pretrained_model_name_or_path): | ||
866 | - if from_tf and os.path.isfile( | ||
867 | - os.path.join( | ||
868 | - pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index" | ||
869 | - ) | ||
870 | - ): | ||
871 | - # Load from a TF 1.0 checkpoint | ||
872 | - archive_file = os.path.join( | ||
873 | - pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index" | ||
874 | - ) | ||
875 | - elif from_tf and os.path.isfile( | ||
876 | - os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) | ||
877 | - ): | ||
878 | - # Load from a TF 2.0 checkpoint | ||
879 | - archive_file = os.path.join( | ||
880 | - pretrained_model_name_or_path, TF2_WEIGHTS_NAME | ||
881 | - ) | ||
882 | - elif os.path.isfile( | ||
883 | - os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) | ||
884 | - ): | ||
885 | - # Load from a PyTorch checkpoint | ||
886 | - archive_file = os.path.join( | ||
887 | - pretrained_model_name_or_path, WEIGHTS_NAME | ||
888 | - ) | ||
889 | - else: | ||
890 | - raise EnvironmentError( | ||
891 | - "Error no file named {} found in directory {} or `from_tf` set to False".format( | ||
892 | - [ | ||
893 | - WEIGHTS_NAME, | ||
894 | - TF2_WEIGHTS_NAME, | ||
895 | - TF_WEIGHTS_NAME + ".index", | ||
896 | - ], | ||
897 | - pretrained_model_name_or_path, | ||
898 | - ) | ||
899 | - ) | ||
900 | - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url( | ||
901 | - pretrained_model_name_or_path | ||
902 | - ): | ||
903 | - archive_file = pretrained_model_name_or_path | ||
904 | - elif os.path.isfile(pretrained_model_name_or_path + ".index"): | ||
905 | - assert ( | ||
906 | - from_tf | ||
907 | - ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( | ||
908 | - pretrained_model_name_or_path + ".index" | ||
909 | - ) | ||
910 | - archive_file = pretrained_model_name_or_path + ".index" | ||
911 | - else: | ||
912 | - archive_file = hf_bucket_url( | ||
913 | - pretrained_model_name_or_path, | ||
914 | - filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME), | ||
915 | - use_cdn=use_cdn, | ||
916 | - ) | ||
917 | - | ||
918 | - try: | ||
919 | - # Load from URL or cache if already cached | ||
920 | - resolved_archive_file = cached_path( | ||
921 | - archive_file, | ||
922 | - cache_dir=cache_dir, | ||
923 | - force_download=force_download, | ||
924 | - proxies=proxies, | ||
925 | - resume_download=resume_download, | ||
926 | - local_files_only=local_files_only, | ||
927 | - ) | ||
928 | - if resolved_archive_file is None: | ||
929 | - raise EnvironmentError | ||
930 | - except EnvironmentError: | ||
931 | - msg = ( | ||
932 | - f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" | ||
933 | - f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" | ||
934 | - f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n" | ||
935 | - ) | ||
936 | - raise EnvironmentError(msg) | ||
937 | - | ||
938 | - if resolved_archive_file == archive_file: | ||
939 | - logger.info("loading weights file {}".format(archive_file)) | ||
940 | - else: | ||
941 | - logger.info( | ||
942 | - "loading weights file {} from cache at {}".format( | ||
943 | - archive_file, resolved_archive_file | ||
944 | - ) | ||
945 | - ) | ||
946 | - else: | ||
947 | - resolved_archive_file = None | ||
948 | - | ||
949 | - # Instantiate model. | ||
950 | - model = cls(config, *model_args, **model_kwargs) | ||
951 | - | ||
952 | - if state_dict is None and not from_tf: | ||
953 | - try: | ||
954 | - state_dict = torch.load(resolved_archive_file, map_location="cpu") | ||
955 | - except Exception: | ||
956 | - raise OSError( | ||
957 | - "Unable to load weights from pytorch checkpoint file. " | ||
958 | - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " | ||
959 | - ) | ||
960 | - | ||
961 | - missing_keys = [] | ||
962 | - unexpected_keys = [] | ||
963 | - error_msgs = [] | ||
964 | - | ||
965 | - if from_tf: | ||
966 | - if resolved_archive_file.endswith(".index"): | ||
967 | - # Load from a TensorFlow 1.X checkpoint - provided by original authors | ||
968 | - model = cls.load_tf_weights( | ||
969 | - model, config, resolved_archive_file[:-6] | ||
970 | - ) # Remove the '.index' | ||
971 | - else: | ||
972 | - # Load from our TensorFlow 2.0 checkpoints | ||
973 | - try: | ||
974 | - from transformers import load_tf2_checkpoint_in_pytorch_model | ||
975 | - | ||
976 | - model = load_tf2_checkpoint_in_pytorch_model( | ||
977 | - model, resolved_archive_file, allow_missing_keys=True | ||
978 | - ) | ||
979 | - except ImportError: | ||
980 | - logger.error( | ||
981 | - "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " | ||
982 | - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." | ||
983 | - ) | ||
984 | - raise | ||
985 | - else: | ||
986 | - # Convert old format to new format if needed from a PyTorch state_dict | ||
987 | - old_keys = [] | ||
988 | - new_keys = [] | ||
989 | - for key in state_dict.keys(): | ||
990 | - new_key = None | ||
991 | - if "gamma" in key: | ||
992 | - new_key = key.replace("gamma", "weight") | ||
993 | - if "beta" in key: | ||
994 | - new_key = key.replace("beta", "bias") | ||
995 | - if new_key: | ||
996 | - old_keys.append(key) | ||
997 | - new_keys.append(new_key) | ||
998 | - for old_key, new_key in zip(old_keys, new_keys): | ||
999 | - state_dict[new_key] = state_dict.pop(old_key) | ||
1000 | - | ||
1001 | - # copy state_dict so _load_from_state_dict can modify it | ||
1002 | - metadata = getattr(state_dict, "_metadata", None) | ||
1003 | - state_dict = state_dict.copy() | ||
1004 | - if metadata is not None: | ||
1005 | - state_dict._metadata = metadata | ||
1006 | - | ||
1007 | - # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants | ||
1008 | - # so we need to apply the function recursively. | ||
1009 | - def load(module: nn.Module, prefix=""): | ||
1010 | - local_metadata = ( | ||
1011 | - {} if metadata is None else metadata.get(prefix[:-1], {}) | ||
1012 | - ) | ||
1013 | - module._load_from_state_dict( | ||
1014 | - state_dict, | ||
1015 | - prefix, | ||
1016 | - local_metadata, | ||
1017 | - True, | ||
1018 | - missing_keys, | ||
1019 | - unexpected_keys, | ||
1020 | - error_msgs, | ||
1021 | - ) | ||
1022 | - for name, child in module._modules.items(): | ||
1023 | - if child is not None: | ||
1024 | - load(child, prefix + name + ".") | ||
1025 | - | ||
1026 | - # Make sure we are able to load base models as well as derived models (with heads) | ||
1027 | - start_prefix = "" | ||
1028 | - model_to_load = model | ||
1029 | - has_prefix_module = any( | ||
1030 | - s.startswith(cls.base_model_prefix) for s in state_dict.keys() | ||
1031 | - ) | ||
1032 | - if not hasattr(model, cls.base_model_prefix) and has_prefix_module: | ||
1033 | - start_prefix = cls.base_model_prefix + "." | ||
1034 | - if hasattr(model, cls.base_model_prefix) and not has_prefix_module: | ||
1035 | - model_to_load = getattr(model, cls.base_model_prefix) | ||
1036 | - | ||
1037 | - load(model_to_load, prefix=start_prefix) | ||
1038 | - | ||
1039 | - if model.__class__.__name__ != model_to_load.__class__.__name__: | ||
1040 | - base_model_state_dict = model_to_load.state_dict().keys() | ||
1041 | - head_model_state_dict_without_base_prefix = [ | ||
1042 | - key.split(cls.base_model_prefix + ".")[-1] | ||
1043 | - for key in model.state_dict().keys() | ||
1044 | - ] | ||
1045 | - missing_keys.extend( | ||
1046 | - head_model_state_dict_without_base_prefix - base_model_state_dict | ||
1047 | - ) | ||
1048 | - | ||
1049 | - # Some models may have keys that are not in the state by design, removing them before needlessly warning | ||
1050 | - # the user. | ||
1051 | - if cls.authorized_missing_keys is not None: | ||
1052 | - for pat in cls.authorized_missing_keys: | ||
1053 | - missing_keys = [ | ||
1054 | - k for k in missing_keys if re.search(pat, k) is None | ||
1055 | - ] | ||
1056 | - | ||
1057 | - if len(unexpected_keys) > 0: | ||
1058 | - logger.warning( | ||
1059 | - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when " | ||
1060 | - f"initializing {model.__class__.__name__}: {unexpected_keys}\n" | ||
1061 | - f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task " | ||
1062 | - f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n" | ||
1063 | - f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect " | ||
1064 | - f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." | ||
1065 | - ) | ||
1066 | - else: | ||
1067 | - logger.info( | ||
1068 | - f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" | ||
1069 | - ) | ||
1070 | - if len(missing_keys) > 0: | ||
1071 | - logger.warning( | ||
1072 | - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " | ||
1073 | - f"and are newly initialized: {missing_keys}\n" | ||
1074 | - f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." | ||
1075 | - ) | ||
1076 | - else: | ||
1077 | - logger.info( | ||
1078 | - f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n" | ||
1079 | - f"If your task is similar to the task the model of the checkpoint was trained on, " | ||
1080 | - f"you can already use {model.__class__.__name__} for predictions without further training." | ||
1081 | - ) | ||
1082 | - if len(error_msgs) > 0: | ||
1083 | - raise RuntimeError( | ||
1084 | - "Error(s) in loading state_dict for {}:\n\t{}".format( | ||
1085 | - model.__class__.__name__, "\n\t".join(error_msgs) | ||
1086 | - ) | ||
1087 | - ) | ||
1088 | - # make sure token embedding weights are still tied if needed | ||
1089 | - model.tie_weights() | ||
1090 | - | ||
1091 | - # Set model in evaluation mode to deactivate DropOut modules by default | ||
1092 | - model.eval() | ||
1093 | - | ||
1094 | - if output_loading_info: | ||
1095 | - loading_info = { | ||
1096 | - "missing_keys": missing_keys, | ||
1097 | - "unexpected_keys": unexpected_keys, | ||
1098 | - "error_msgs": error_msgs, | ||
1099 | - } | ||
1100 | - return model, loading_info | ||
1101 | - | ||
1102 | - if ( | ||
1103 | - hasattr(config, "xla_device") | ||
1104 | - and config.xla_device | ||
1105 | - and is_torch_tpu_available() | ||
1106 | - ): | ||
1107 | - import torch_xla.core.xla_model as xm | ||
1108 | - | ||
1109 | - model = xm.send_cpu_data_to_device(model, xm.xla_device()) | ||
1110 | - model.to(xm.xla_device()) | ||
1111 | - | ||
1112 | - return model | ||
1113 | - | ||
1114 | - | ||
1115 | -class Conv1D(nn.Module): | ||
1116 | - """ | ||
1117 | - 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). | ||
1118 | - | ||
1119 | - Basically works like a linear layer but the weights are transposed. | ||
1120 | - | ||
1121 | - Args: | ||
1122 | - nf (:obj:`int`): The number of output features. | ||
1123 | - nx (:obj:`int`): The number of input features. | ||
1124 | - """ | ||
1125 | - | ||
1126 | - def __init__(self, nf, nx): | ||
1127 | - super().__init__() | ||
1128 | - self.nf = nf | ||
1129 | - w = torch.empty(nx, nf) | ||
1130 | - nn.init.normal_(w, std=0.02) | ||
1131 | - self.weight = nn.Parameter(w) | ||
1132 | - self.bias = nn.Parameter(torch.zeros(nf)) | ||
1133 | - | ||
1134 | - def forward(self, x): | ||
1135 | - size_out = x.size()[:-1] + (self.nf,) | ||
1136 | - x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) | ||
1137 | - x = x.view(*size_out) | ||
1138 | - return x | ||
1139 | - | ||
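A quick sketch (illustration only) checking the claim in the docstring above: `Conv1D` acts like a linear layer with a transposed weight matrix, i.e. `y = x @ W + b`. It assumes the `Conv1D` class defined above is in scope:

import torch

nf, nx = 6, 4
conv = Conv1D(nf, nx)
x = torch.randn(2, 3, nx)
assert torch.allclose(conv(x), x @ conv.weight + conv.bias, atol=1e-6)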
1140 | - | ||
1141 | -class PoolerStartLogits(nn.Module): | ||
1142 | - """ | ||
1143 | - Compute SQuAD start logits from sequence hidden states. | ||
1144 | - | ||
1145 | - Args: | ||
1146 | - config (:class:`~transformers.PretrainedConfig`): | ||
1147 | - The config used by the model, will be used to grab the :obj:`hidden_size` of the model. | ||
1148 | - """ | ||
1149 | - | ||
1150 | - def __init__(self, config: PretrainedConfig): | ||
1151 | - super().__init__() | ||
1152 | - self.dense = nn.Linear(config.hidden_size, 1) | ||
1153 | - | ||
1154 | - def forward( | ||
1155 | - self, | ||
1156 | - hidden_states: torch.FloatTensor, | ||
1157 | - p_mask: Optional[torch.FloatTensor] = None, | ||
1158 | - ) -> torch.FloatTensor: | ||
1159 | - """ | ||
1160 | - Args: | ||
1161 | - hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`): | ||
1162 | - The final hidden states of the model. | ||
1163 | - p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`): | ||
1164 | - Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). | ||
1165 | - 1.0 means token should be masked. | ||
1166 | - | ||
1167 | - Returns: | ||
1168 | - :obj:`torch.FloatTensor`: The start logits for SQuAD. | ||
1169 | - """ | ||
1170 | - x = self.dense(hidden_states).squeeze(-1) | ||
1171 | - | ||
1172 | - if p_mask is not None: | ||
1173 | - if next(self.parameters()).dtype == torch.float16: | ||
1174 | - x = x * (1 - p_mask) - 65500 * p_mask | ||
1175 | - else: | ||
1176 | - x = x * (1 - p_mask) - 1e30 * p_mask | ||
1177 | - | ||
1178 | - return x | ||
1179 | - | ||
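A numeric sketch (illustration only, not part of the deleted file) of the `p_mask` penalty used above: masked positions receive a huge negative logit so they contribute nothing after the softmax:

import torch

logits = torch.tensor([2.0, -1.0, 0.5])
p_mask = torch.tensor([0.0, 0.0, 1.0])   # 1.0 means the token should be masked
print(logits * (1 - p_mask) - 1e30 * p_mask)  # the masked position becomes -1e30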
1180 | - | ||
1181 | -class PoolerEndLogits(nn.Module): | ||
1182 | - """ | ||
1183 | - Compute SQuAD end logits from sequence hidden states. | ||
1184 | - | ||
1185 | - Args: | ||
1186 | - config (:class:`~transformers.PretrainedConfig`): | ||
1187 | - The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the | ||
1188 | - :obj:`layer_norm_eps` to use. | ||
1189 | - """ | ||
1190 | - | ||
1191 | - def __init__(self, config: PretrainedConfig): | ||
1192 | - super().__init__() | ||
1193 | - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) | ||
1194 | - self.activation = nn.Tanh() | ||
1195 | - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) | ||
1196 | - self.dense_1 = nn.Linear(config.hidden_size, 1) | ||
1197 | - | ||
1198 | - def forward( | ||
1199 | - self, | ||
1200 | - hidden_states: torch.FloatTensor, | ||
1201 | - start_states: Optional[torch.FloatTensor] = None, | ||
1202 | - start_positions: Optional[torch.LongTensor] = None, | ||
1203 | - p_mask: Optional[torch.FloatTensor] = None, | ||
1204 | - ) -> torch.FloatTensor: | ||
1205 | - """ | ||
1206 | - Args: | ||
1207 | - hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`): | ||
1208 | - The final hidden states of the model. | ||
1209 | - start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`): | ||
1210 | - The hidden states of the first tokens for the labeled span. | ||
1211 | - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1212 | - The position of the first token for the labeled span. | ||
1213 | - p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`): | ||
1214 | - Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). | ||
1215 | - 1.0 means token should be masked. | ||
1216 | - | ||
1217 | - .. note:: | ||
1218 | - | ||
1219 | - One of ``start_states`` or ``start_positions`` should not be :obj:`None`. If both are set, | ||
1220 | - ``start_positions`` overrides ``start_states``. | ||
1221 | - | ||
1222 | - Returns: | ||
1223 | - :obj:`torch.FloatTensor`: The end logits for SQuAD. | ||
1224 | - """ | ||
1225 | - assert ( | ||
1226 | - start_states is not None or start_positions is not None | ||
1227 | - ), "One of start_states, start_positions should be not None" | ||
1228 | - if start_positions is not None: | ||
1229 | - slen, hsz = hidden_states.shape[-2:] | ||
1230 | - start_positions = start_positions[:, None, None].expand( | ||
1231 | - -1, -1, hsz | ||
1232 | - ) # shape (bsz, 1, hsz) | ||
1233 | - start_states = hidden_states.gather( | ||
1234 | - -2, start_positions | ||
1235 | - ) # shape (bsz, 1, hsz) | ||
1236 | - start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) | ||
1237 | - | ||
1238 | - x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) | ||
1239 | - x = self.activation(x) | ||
1240 | - x = self.LayerNorm(x) | ||
1241 | - x = self.dense_1(x).squeeze(-1) | ||
1242 | - | ||
1243 | - if p_mask is not None: | ||
1244 | - if next(self.parameters()).dtype == torch.float16: | ||
1245 | - x = x * (1 - p_mask) - 65500 * p_mask | ||
1246 | - else: | ||
1247 | - x = x * (1 - p_mask) - 1e30 * p_mask | ||
1248 | - | ||
1249 | - return x | ||
1250 | - | ||
1251 | - | ||
1252 | -class PoolerAnswerClass(nn.Module): | ||
1253 | - """ | ||
1254 | - Compute SQuAD 2.0 answer class from classification and start tokens hidden states. | ||
1255 | - | ||
1256 | - Args: | ||
1257 | - config (:class:`~transformers.PretrainedConfig`): | ||
1258 | - The config used by the model, will be used to grab the :obj:`hidden_size` of the model. | ||
1259 | - """ | ||
1260 | - | ||
1261 | - def __init__(self, config): | ||
1262 | - super().__init__() | ||
1263 | - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) | ||
1264 | - self.activation = nn.Tanh() | ||
1265 | - self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) | ||
1266 | - | ||
1267 | - def forward( | ||
1268 | - self, | ||
1269 | - hidden_states: torch.FloatTensor, | ||
1270 | - start_states: Optional[torch.FloatTensor] = None, | ||
1271 | - start_positions: Optional[torch.LongTensor] = None, | ||
1272 | - cls_index: Optional[torch.LongTensor] = None, | ||
1273 | - ) -> torch.FloatTensor: | ||
1274 | - """ | ||
1275 | - Args: | ||
1276 | - hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`): | ||
1277 | - The final hidden states of the model. | ||
1278 | - start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`): | ||
1279 | - The hidden states of the first tokens for the labeled span. | ||
1280 | - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1281 | - The position of the first token for the labeled span. | ||
1282 | - cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1283 | - Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token. | ||
1284 | - | ||
1285 | - .. note:: | ||
1286 | - | ||
1287 | - One of ``start_states`` or ``start_positions`` should not be :obj:`None`. If both are set, | ||
1288 | - ``start_positions`` overrides ``start_states``. | ||
1289 | - | ||
1290 | - Returns: | ||
1291 | - :obj:`torch.FloatTensor`: The SQuAD 2.0 answer class. | ||
1292 | - """ | ||
1293 | - # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample. | ||
1294 | - hsz = hidden_states.shape[-1] | ||
1295 | - assert ( | ||
1296 | - start_states is not None or start_positions is not None | ||
1297 | - ), "One of start_states, start_positions should be not None" | ||
1298 | - if start_positions is not None: | ||
1299 | - start_positions = start_positions[:, None, None].expand( | ||
1300 | - -1, -1, hsz | ||
1301 | - ) # shape (bsz, 1, hsz) | ||
1302 | - start_states = hidden_states.gather(-2, start_positions).squeeze( | ||
1303 | - -2 | ||
1304 | - ) # shape (bsz, hsz) | ||
1305 | - | ||
1306 | - if cls_index is not None: | ||
1307 | - cls_index = cls_index[:, None, None].expand( | ||
1308 | - -1, -1, hsz | ||
1309 | - ) # shape (bsz, 1, hsz) | ||
1310 | - cls_token_state = hidden_states.gather(-2, cls_index).squeeze( | ||
1311 | - -2 | ||
1312 | - ) # shape (bsz, hsz) | ||
1313 | - else: | ||
1314 | - cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) | ||
1315 | - | ||
1316 | - x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) | ||
1317 | - x = self.activation(x) | ||
1318 | - x = self.dense_1(x).squeeze(-1) | ||
1319 | - | ||
1320 | - return x | ||
1321 | - | ||
1322 | - | ||
1323 | -@dataclass | ||
1324 | -class SquadHeadOutput(ModelOutput): | ||
1325 | - """ | ||
1326 | - Base class for outputs of question answering models using a :class:`~transformers.modeling_utils.SQuADHead`. | ||
1327 | - | ||
1328 | - Args: | ||
1329 | - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): | ||
1330 | - Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. | ||
1331 | - start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): | ||
1332 | - Log probabilities for the top config.start_n_top start token possibilities (beam-search). | ||
1333 | - start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): | ||
1334 | - Indices for the top config.start_n_top start token possibilities (beam-search). | ||
1335 | - end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): | ||
1336 | - Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). | ||
1337 | - end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): | ||
1338 | - Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). | ||
1339 | - cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): | ||
1340 | - Log probabilities for the ``is_impossible`` label of the answers. | ||
1341 | - | ||
1342 | - """ | ||
1343 | - | ||
1344 | - loss: Optional[torch.FloatTensor] = None | ||
1345 | - start_top_log_probs: Optional[torch.FloatTensor] = None | ||
1346 | - start_top_index: Optional[torch.LongTensor] = None | ||
1347 | - end_top_log_probs: Optional[torch.FloatTensor] = None | ||
1348 | - end_top_index: Optional[torch.LongTensor] = None | ||
1349 | - cls_logits: Optional[torch.FloatTensor] = None | ||
1350 | - | ||
1351 | - | ||
1352 | -class SQuADHead(nn.Module): | ||
1353 | - r""" | ||
1354 | - A SQuAD head inspired by XLNet. | ||
1355 | - | ||
1356 | - Args: | ||
1357 | - config (:class:`~transformers.PretrainedConfig`): | ||
1358 | - The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the | ||
1359 | - :obj:`layer_norm_eps` to use. | ||
1360 | - """ | ||
1361 | - | ||
1362 | - def __init__(self, config): | ||
1363 | - super().__init__() | ||
1364 | - self.start_n_top = config.start_n_top | ||
1365 | - self.end_n_top = config.end_n_top | ||
1366 | - | ||
1367 | - self.start_logits = PoolerStartLogits(config) | ||
1368 | - self.end_logits = PoolerEndLogits(config) | ||
1369 | - self.answer_class = PoolerAnswerClass(config) | ||
1370 | - | ||
1371 | - @replace_return_docstrings( | ||
1372 | - output_type=SquadHeadOutput, config_class=PretrainedConfig | ||
1373 | - ) | ||
1374 | - def forward( | ||
1375 | - self, | ||
1376 | - hidden_states: torch.FloatTensor, | ||
1377 | - start_positions: Optional[torch.LongTensor] = None, | ||
1378 | - end_positions: Optional[torch.LongTensor] = None, | ||
1379 | - cls_index: Optional[torch.LongTensor] = None, | ||
1380 | - is_impossible: Optional[torch.LongTensor] = None, | ||
1381 | - p_mask: Optional[torch.FloatTensor] = None, | ||
1382 | - return_dict: bool = False, | ||
1383 | - ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]: | ||
1384 | - """ | ||
1385 | - Args: | ||
1386 | - hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`): | ||
1387 | - Final hidden states of the model on the sequence tokens. | ||
1388 | - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1389 | - Positions of the first token for the labeled span. | ||
1390 | - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1391 | - Positions of the last token for the labeled span. | ||
1392 | - cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1393 | - Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token. | ||
1394 | - is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): | ||
1395 | - Whether the question has a possible answer in the paragraph or not. | ||
1396 | - p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`): | ||
1397 | - Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). | ||
1398 | - 1.0 means token should be masked. | ||
1399 | - return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`): | ||
1400 | - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. | ||
1401 | - | ||
1402 | - Returns: | ||
1403 | - """ | ||
1404 | - start_logits = self.start_logits(hidden_states, p_mask=p_mask) | ||
1405 | - | ||
1406 | - if start_positions is not None and end_positions is not None: | ||
1407 | - # If we are on multi-GPU, let's remove the dimension added by batch splitting | ||
1408 | - for x in (start_positions, end_positions, cls_index, is_impossible): | ||
1409 | - if x is not None and x.dim() > 1: | ||
1410 | - x.squeeze_(-1) | ||
1411 | - | ||
1412 | - # during training, compute the end logits based on the ground truth of the start position | ||
1413 | - end_logits = self.end_logits( | ||
1414 | - hidden_states, start_positions=start_positions, p_mask=p_mask | ||
1415 | - ) | ||
1416 | - | ||
1417 | - loss_fct = CrossEntropyLoss() | ||
1418 | - start_loss = loss_fct(start_logits, start_positions) | ||
1419 | - end_loss = loss_fct(end_logits, end_positions) | ||
1420 | - total_loss = (start_loss + end_loss) / 2 | ||
1421 | - | ||
1422 | - if cls_index is not None and is_impossible is not None: | ||
1423 | - # Predict answerability from the representation of CLS and START | ||
1424 | - cls_logits = self.answer_class( | ||
1425 | - hidden_states, start_positions=start_positions, cls_index=cls_index | ||
1426 | - ) | ||
1427 | - loss_fct_cls = nn.BCEWithLogitsLoss() | ||
1428 | - cls_loss = loss_fct_cls(cls_logits, is_impossible) | ||
1429 | - | ||
1430 | - # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss | ||
1431 | - total_loss += cls_loss * 0.5 | ||
1432 | - | ||
1433 | - return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,) | ||
1434 | - | ||
1435 | - else: | ||
1436 | - # during inference, compute the end logits based on beam search | ||
1437 | - bsz, slen, hsz = hidden_states.size() | ||
1438 | - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) | ||
1439 | - | ||
1440 | - start_top_log_probs, start_top_index = torch.topk( | ||
1441 | - start_log_probs, self.start_n_top, dim=-1 | ||
1442 | - ) # shape (bsz, start_n_top) | ||
1443 | - start_top_index_exp = start_top_index.unsqueeze(-1).expand( | ||
1444 | - -1, -1, hsz | ||
1445 | - ) # shape (bsz, start_n_top, hsz) | ||
1446 | - start_states = torch.gather( | ||
1447 | - hidden_states, -2, start_top_index_exp | ||
1448 | - ) # shape (bsz, start_n_top, hsz) | ||
1449 | - start_states = start_states.unsqueeze(1).expand( | ||
1450 | - -1, slen, -1, -1 | ||
1451 | - ) # shape (bsz, slen, start_n_top, hsz) | ||
1452 | - | ||
1453 | - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( | ||
1454 | - start_states | ||
1455 | - ) # shape (bsz, slen, start_n_top, hsz) | ||
1456 | - p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None | ||
1457 | - end_logits = self.end_logits( | ||
1458 | - hidden_states_expanded, start_states=start_states, p_mask=p_mask | ||
1459 | - ) | ||
1460 | - end_log_probs = F.softmax( | ||
1461 | - end_logits, dim=1 | ||
1462 | - ) # shape (bsz, slen, start_n_top) | ||
1463 | - | ||
1464 | - end_top_log_probs, end_top_index = torch.topk( | ||
1465 | - end_log_probs, self.end_n_top, dim=1 | ||
1466 | - ) # shape (bsz, end_n_top, start_n_top) | ||
1467 | - end_top_log_probs = end_top_log_probs.view( | ||
1468 | - -1, self.start_n_top * self.end_n_top | ||
1469 | - ) | ||
1470 | - end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) | ||
1471 | - | ||
1472 | - start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) | ||
1473 | - cls_logits = self.answer_class( | ||
1474 | - hidden_states, start_states=start_states, cls_index=cls_index | ||
1475 | - ) | ||
1476 | - | ||
1477 | - if not return_dict: | ||
1478 | - return ( | ||
1479 | - start_top_log_probs, | ||
1480 | - start_top_index, | ||
1481 | - end_top_log_probs, | ||
1482 | - end_top_index, | ||
1483 | - cls_logits, | ||
1484 | - ) | ||
1485 | - else: | ||
1486 | - return SquadHeadOutput( | ||
1487 | - start_top_log_probs=start_top_log_probs, | ||
1488 | - start_top_index=start_top_index, | ||
1489 | - end_top_log_probs=end_top_log_probs, | ||
1490 | - end_top_index=end_top_index, | ||
1491 | - cls_logits=cls_logits, | ||
1492 | - ) | ||
1493 | - | ||
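
A minimal inference sketch for the head above. It assumes the PoolerStartLogits/PoolerEndLogits/PoolerAnswerClass classes defined earlier in this file are in scope, and that a bare PretrainedConfig can carry the extra attributes; the sizes (768 hidden, 384 tokens, top-5 beams) are illustrative, not taken from this repository:

    import torch
    from transformers import PretrainedConfig

    # Illustrative config values; a real model's config would supply these.
    config = PretrainedConfig(hidden_size=768, layer_norm_eps=1e-12,
                              start_n_top=5, end_n_top=5)
    head = SQuADHead(config)

    # Without start/end positions the forward pass takes the beam-search branch.
    hidden_states = torch.randn(2, 384, 768)   # (batch_size, seq_len, hidden_size)
    out = head(hidden_states, return_dict=True)
    print(out.start_top_index.shape)   # torch.Size([2, 5])
    print(out.end_top_index.shape)     # torch.Size([2, 25]) = start_n_top * end_n_top
    print(out.cls_logits.shape)        # torch.Size([2])
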
1494 | - | ||
1495 | -class SequenceSummary(nn.Module): | ||
1496 | - r""" | ||
1497 | - Compute a single vector summary of a sequence hidden states. | ||
1498 | - | ||
1499 | - Args: | ||
1500 | - config (:class:`~transformers.PretrainedConfig`): | ||
1501 | - The config used by the model. Relevant arguments in the config class of the model are (refer to the | ||
1502 | - actual config class of your model for the default values it uses): | ||
1503 | - | ||
1504 | - - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are: | ||
1505 | - | ||
1506 | - - :obj:`"last"` -- Take the last token hidden state (like XLNet) | ||
1507 | - - :obj:`"first"` -- Take the first token hidden state (like Bert) | ||
1508 | - - :obj:`"mean"` -- Take the mean of all tokens hidden states | ||
1509 | - - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) | ||
1510 | - - :obj:`"attn"` -- Not implemented for now; would use multi-head attention | ||
1511 | - | ||
1512 | - - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction. | ||
1513 | - - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to | ||
1514 | - :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`). | ||
1515 | - - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the | ||
1516 | - output, another string or :obj:`None` will add no activation. | ||
1517 | - - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and | ||
1518 | - activation. | ||
1519 | - - **summary_last_dropout** (:obj:`float`) -- Optional dropout probability after the projection and | ||
1520 | - activation. | ||
1521 | - """ | ||
1522 | - | ||
1523 | - def __init__(self, config: PretrainedConfig): | ||
1524 | - super().__init__() | ||
1525 | - | ||
1526 | - self.summary_type = getattr(config, "summary_type", "last") | ||
1527 | - if self.summary_type == "attn": | ||
1528 | - # We should use a standard multi-head attention module with absolute positional embedding for that. | ||
1529 | - # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 | ||
1530 | - # We can probably just use the multi-head attention module of PyTorch >=1.1.0 | ||
1531 | - raise NotImplementedError | ||
1532 | - | ||
1533 | - self.summary = Identity() | ||
1534 | - if hasattr(config, "summary_use_proj") and config.summary_use_proj: | ||
1535 | - if ( | ||
1536 | - hasattr(config, "summary_proj_to_labels") | ||
1537 | - and config.summary_proj_to_labels | ||
1538 | - and config.num_labels > 0 | ||
1539 | - ): | ||
1540 | - num_classes = config.num_labels | ||
1541 | - else: | ||
1542 | - num_classes = config.hidden_size | ||
1543 | - self.summary = nn.Linear(config.hidden_size, num_classes) | ||
1544 | - | ||
1545 | - activation_string = getattr(config, "summary_activation", None) | ||
1546 | - self.activation: Callable = get_activation( | ||
1547 | - activation_string | ||
1548 | - ) if activation_string else Identity() | ||
1549 | - | ||
1550 | - self.first_dropout = Identity() | ||
1551 | - if ( | ||
1552 | - hasattr(config, "summary_first_dropout") | ||
1553 | - and config.summary_first_dropout > 0 | ||
1554 | - ): | ||
1555 | - self.first_dropout = nn.Dropout(config.summary_first_dropout) | ||
1556 | - | ||
1557 | - self.last_dropout = Identity() | ||
1558 | - if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: | ||
1559 | - self.last_dropout = nn.Dropout(config.summary_last_dropout) | ||
1560 | - | ||
1561 | - def forward( | ||
1562 | - self, | ||
1563 | - hidden_states: torch.FloatTensor, | ||
1564 | - cls_index: Optional[torch.LongTensor] = None, | ||
1565 | - ) -> torch.FloatTensor: | ||
1566 | - """ | ||
1567 | - Compute a single vector summary of a sequence hidden states. | ||
1568 | - | ||
1569 | - Args: | ||
1570 | - hidden_states (:obj:`torch.FloatTensor` of shape :obj:`[batch_size, seq_len, hidden_size]`): | ||
1571 | - The hidden states of the last layer. | ||
1572 | - cls_index (:obj:`torch.LongTensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`): | ||
1573 | - Used if :obj:`summary_type == "cls_index"`; if :obj:`None`, the last token of the sequence is used as the | ||
1574 | - classification token. | ||
1575 | - | ||
1576 | - Returns: | ||
1577 | - :obj:`torch.FloatTensor`: The summary of the sequence hidden states. | ||
1578 | - """ | ||
1579 | - if self.summary_type == "last": | ||
1580 | - output = hidden_states[:, -1] | ||
1581 | - elif self.summary_type == "first": | ||
1582 | - output = hidden_states[:, 0] | ||
1583 | - elif self.summary_type == "mean": | ||
1584 | - output = hidden_states.mean(dim=1) | ||
1585 | - elif self.summary_type == "cls_index": | ||
1586 | - if cls_index is None: | ||
1587 | - cls_index = torch.full_like( | ||
1588 | - hidden_states[..., :1, :], | ||
1589 | - hidden_states.shape[-2] - 1, | ||
1590 | - dtype=torch.long, | ||
1591 | - ) | ||
1592 | - else: | ||
1593 | - cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) | ||
1594 | - cls_index = cls_index.expand( | ||
1595 | - (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),) | ||
1596 | - ) | ||
1597 | - # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states | ||
1598 | - output = hidden_states.gather(-2, cls_index).squeeze( | ||
1599 | - -2 | ||
1600 | - ) # shape (bsz, XX, hidden_size) | ||
1601 | - elif self.summary_type == "attn": | ||
1602 | - raise NotImplementedError | ||
1603 | - | ||
1604 | - output = self.first_dropout(output) | ||
1605 | - output = self.summary(output) | ||
1606 | - output = self.activation(output) | ||
1607 | - output = self.last_dropout(output) | ||
1608 | - | ||
1609 | - return output | ||
1610 | - | ||
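
A small usage sketch for the summary module, again with an ad-hoc PretrainedConfig whose attribute values are purely illustrative:

    import torch
    from transformers import PretrainedConfig

    config = PretrainedConfig(hidden_size=768, summary_type="mean",
                              summary_use_proj=True, summary_proj_to_labels=False,
                              summary_activation="tanh")
    summary = SequenceSummary(config)

    hidden_states = torch.randn(4, 128, 768)   # (batch_size, seq_len, hidden_size)
    pooled = summary(hidden_states)            # mean over tokens -> Linear -> tanh
    print(pooled.shape)                        # torch.Size([4, 768])
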
1611 | - | ||
1612 | -def prune_linear_layer( | ||
1613 | - layer: torch.nn.Linear, index: torch.LongTensor, dim: int = 0 | ||
1614 | -) -> torch.nn.Linear: | ||
1615 | - """ | ||
1616 | - Prune a linear layer to keep only entries in index. | ||
1617 | - | ||
1618 | - Used to remove heads. | ||
1619 | - | ||
1620 | - Args: | ||
1621 | - layer (:obj:`torch.nn.Linear`): The layer to prune. | ||
1622 | - index (:obj:`torch.LongTensor`): The indices to keep in the layer. | ||
1623 | - dim (:obj:`int`, `optional`, defaults to 0): The dimension on which to keep the indices. | ||
1624 | - | ||
1625 | - Returns: | ||
1626 | - :obj:`torch.nn.Linear`: The pruned layer as a new layer with :obj:`requires_grad=True`. | ||
1627 | - """ | ||
1628 | - index = index.to(layer.weight.device) | ||
1629 | - W = layer.weight.index_select(dim, index).clone().detach() | ||
1630 | - if layer.bias is not None: | ||
1631 | - if dim == 1: | ||
1632 | - b = layer.bias.clone().detach() | ||
1633 | - else: | ||
1634 | - b = layer.bias[index].clone().detach() | ||
1635 | - new_size = list(layer.weight.size()) | ||
1636 | - new_size[dim] = len(index) | ||
1637 | - new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to( | ||
1638 | - layer.weight.device | ||
1639 | - ) | ||
1640 | - new_layer.weight.requires_grad = False | ||
1641 | - new_layer.weight.copy_(W.contiguous()) | ||
1642 | - new_layer.weight.requires_grad = True | ||
1643 | - if layer.bias is not None: | ||
1644 | - new_layer.bias.requires_grad = False | ||
1645 | - new_layer.bias.copy_(b.contiguous()) | ||
1646 | - new_layer.bias.requires_grad = True | ||
1647 | - return new_layer | ||
1648 | - | ||
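
For example (an illustrative call, not taken from this repository), keeping four of the six output units of a linear layer:

    import torch
    from torch import nn

    layer = nn.Linear(12, 6)
    index = torch.LongTensor([0, 2, 3, 5])
    pruned = prune_linear_layer(layer, index, dim=0)   # dim 0 = output features
    print(pruned.weight.shape)                         # torch.Size([4, 12])
    print(pruned.bias.shape)                           # torch.Size([4])
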
1649 | - | ||
1650 | -def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> Conv1D: | ||
1651 | - """ | ||
1652 | - Prune a Conv1D layer to keep only entries in index. A Conv1D works like a Linear layer (used e.g. in GPT-2) but the weights | ||
1653 | - are transposed. | ||
1654 | - | ||
1655 | - Used to remove heads. | ||
1656 | - | ||
1657 | - Args: | ||
1658 | - layer (:class:`~transformers.modeling_utils.Conv1D`): The layer to prune. | ||
1659 | - index (:obj:`torch.LongTensor`): The indices to keep in the layer. | ||
1660 | - dim (:obj:`int`, `optional`, defaults to 1): The dimension on which to keep the indices. | ||
1661 | - | ||
1662 | - Returns: | ||
1663 | - :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with :obj:`requires_grad=True`. | ||
1664 | - """ | ||
1665 | - index = index.to(layer.weight.device) | ||
1666 | - W = layer.weight.index_select(dim, index).clone().detach() | ||
1667 | - if dim == 0: | ||
1668 | - b = layer.bias.clone().detach() | ||
1669 | - else: | ||
1670 | - b = layer.bias[index].clone().detach() | ||
1671 | - new_size = list(layer.weight.size()) | ||
1672 | - new_size[dim] = len(index) | ||
1673 | - new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) | ||
1674 | - new_layer.weight.requires_grad = False | ||
1675 | - new_layer.weight.copy_(W.contiguous()) | ||
1676 | - new_layer.weight.requires_grad = True | ||
1677 | - new_layer.bias.requires_grad = False | ||
1678 | - new_layer.bias.copy_(b.contiguous()) | ||
1679 | - new_layer.bias.requires_grad = True | ||
1680 | - return new_layer | ||
1681 | - | ||
1682 | - | ||
1683 | -def prune_layer( | ||
1684 | - layer: Union[torch.nn.Linear, Conv1D], | ||
1685 | - index: torch.LongTensor, | ||
1686 | - dim: Optional[int] = None, | ||
1687 | -) -> Union[torch.nn.Linear, Conv1D]: | ||
1688 | - """ | ||
1689 | - Prune a Conv1D or linear layer to keep only entries in index. | ||
1690 | - | ||
1691 | - Used to remove heads. | ||
1692 | - | ||
1693 | - Args: | ||
1694 | - layer (:obj:`Union[torch.nn.Linear, Conv1D]`): The layer to prune. | ||
1695 | - index (:obj:`torch.LongTensor`): The indices to keep in the layer. | ||
1696 | - dim (:obj:`int`, `optional`): The dimension on which to keep the indices. | ||
1697 | - | ||
1698 | - Returns: | ||
1699 | - :obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: | ||
1700 | - The pruned layer as a new layer with :obj:`requires_grad=True`. | ||
1701 | - """ | ||
1702 | - if isinstance(layer, nn.Linear): | ||
1703 | - return prune_linear_layer(layer, index, dim=0 if dim is None else dim) | ||
1704 | - elif isinstance(layer, Conv1D): | ||
1705 | - return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) | ||
1706 | - else: | ||
1707 | - raise ValueError("Can't prune layer of class {}".format(layer.__class__)) | ||
1708 | - | ||
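
A quick sketch of the dispatch, assuming the Conv1D class defined earlier in this file; because Conv1D stores its weight transposed relative to nn.Linear, the default pruning dimension differs:

    import torch

    conv = Conv1D(8, 12)                                          # weight shape (12, 8) = (nx, nf)
    pruned_conv = prune_layer(conv, torch.LongTensor([1, 4, 6]))  # dispatches to prune_conv1d_layer, dim=1
    print(pruned_conv.weight.shape)                               # torch.Size([12, 3])
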
1709 | - | ||
1710 | -def apply_chunking_to_forward( | ||
1711 | - forward_fn: Callable[..., torch.Tensor], | ||
1712 | - chunk_size: int, | ||
1713 | - chunk_dim: int, | ||
1714 | - *input_tensors, | ||
1715 | -) -> torch.Tensor: | ||
1716 | - """ | ||
1717 | - This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the | ||
1718 | - dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory. | ||
1719 | - | ||
1720 | - If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as | ||
1721 | - directly applying :obj:`forward_fn` to :obj:`input_tensors`. | ||
1722 | - | ||
1723 | - Args: | ||
1724 | - forward_fn (:obj:`Callable[..., torch.Tensor]`): | ||
1725 | - The forward function of the model. | ||
1726 | - chunk_size (:obj:`int`): | ||
1727 | - The chunk size of a chunked tensor: :obj:`num_chunks = input_tensors[0].shape[chunk_dim] / chunk_size`. | ||
1728 | - chunk_dim (:obj:`int`): | ||
1729 | - The dimension over which the :obj:`input_tensors` should be chunked. | ||
1730 | - input_tensors (:obj:`Tuple[torch.Tensor]`): | ||
1731 | - The input tensors of ``forward_fn`` which will be chunked. | ||
1732 | - Returns: | ||
1733 | - :obj:`torch.Tensor`: A tensor with the same shape as :obj:`forward_fn` would have produced if applied directly. | ||
1734 | - | ||
1735 | - | ||
1736 | - Examples:: | ||
1737 | - | ||
1738 | - # rename the usual forward() fn to forward_chunk() | ||
1739 | - def forward_chunk(self, hidden_states): | ||
1740 | - hidden_states = self.decoder(hidden_states) | ||
1741 | - return hidden_states | ||
1742 | - | ||
1743 | - # implement a chunked forward function | ||
1744 | - def forward(self, hidden_states): | ||
1745 | - return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) | ||
1746 | - """ | ||
1747 | - | ||
1748 | - assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format( | ||
1749 | - input_tensors | ||
1750 | - ) | ||
1751 | - tensor_shape = input_tensors[0].shape | ||
1752 | - assert all( | ||
1753 | - input_tensor.shape == tensor_shape for input_tensor in input_tensors | ||
1754 | - ), "All input tensors have to be of the same shape" | ||
1755 | - | ||
1756 | - # inspect.signature exists since Python 3.5 and is a plain Python function -> no problem with backward compatibility | ||
1757 | - num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) | ||
1758 | - assert num_args_in_forward_chunk_fn == len( | ||
1759 | - input_tensors | ||
1760 | - ), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format( | ||
1761 | - num_args_in_forward_chunk_fn, len(input_tensors) | ||
1762 | - ) | ||
1763 | - | ||
1764 | - if chunk_size > 0: | ||
1765 | - assert ( | ||
1766 | - input_tensors[0].shape[chunk_dim] % chunk_size == 0 | ||
1767 | - ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format( | ||
1768 | - input_tensors[0].shape[chunk_dim], chunk_size | ||
1769 | - ) | ||
1770 | - | ||
1771 | - num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size | ||
1772 | - | ||
1773 | - # chunk input tensor into tuples | ||
1774 | - input_tensors_chunks = tuple( | ||
1775 | - input_tensor.chunk(num_chunks, dim=chunk_dim) | ||
1776 | - for input_tensor in input_tensors | ||
1777 | - ) | ||
1778 | - # apply forward fn to every tuple | ||
1779 | - output_chunks = tuple( | ||
1780 | - forward_fn(*input_tensors_chunk) | ||
1781 | - for input_tensors_chunk in zip(*input_tensors_chunks) | ||
1782 | - ) | ||
1783 | - # concatenate output at same dimension | ||
1784 | - return torch.cat(output_chunks, dim=chunk_dim) | ||
1785 | - | ||
1786 | - return forward_fn(*input_tensors) |
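
A self-contained sanity check (illustrative only): because a position-wise layer is independent across the sequence dimension, the chunked and unchunked results match:

    import torch
    from torch import nn

    dense = nn.Linear(16, 16)

    def forward_chunk(hidden_states):
        return dense(hidden_states)

    x = torch.randn(2, 8, 16)
    full = forward_chunk(x)
    chunked = apply_chunking_to_forward(forward_chunk, 4, 1, x)  # chunk_size=4 over dim 1
    print(torch.allclose(full, chunked, atol=1e-6))              # True
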
train/utils.py
deleted
100644 → 0
1 | -import itertools | ||
2 | -import json | ||
3 | -import linecache | ||
4 | -import os | ||
5 | -import pickle | ||
6 | -from logging import getLogger | ||
7 | -from pathlib import Path | ||
8 | -from typing import Callable, Dict, Iterable, List | ||
9 | - | ||
10 | -import git | ||
11 | -import numpy as np | ||
12 | -import torch | ||
13 | -from rouge_score import rouge_scorer, scoring | ||
14 | -from sacrebleu import corpus_bleu | ||
15 | -from torch import nn | ||
16 | -from torch.utils.data import Dataset, Sampler | ||
17 | - | ||
18 | -from transformers import BartTokenizer | ||
19 | - | ||
20 | - | ||
21 | -def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100): | ||
22 | - """From fairseq""" | ||
23 | - if target.dim() == lprobs.dim() - 1: | ||
24 | - target = target.unsqueeze(-1) | ||
25 | - nll_loss = -lprobs.gather(dim=-1, index=target) | ||
26 | - smooth_loss = -lprobs.sum(dim=-1, keepdim=True) | ||
27 | - if ignore_index is not None: | ||
28 | - pad_mask = target.eq(ignore_index) | ||
29 | - nll_loss.masked_fill_(pad_mask, 0.0) | ||
30 | - smooth_loss.masked_fill_(pad_mask, 0.0) | ||
31 | - else: | ||
32 | - nll_loss = nll_loss.squeeze(-1) | ||
33 | - smooth_loss = smooth_loss.squeeze(-1) | ||
34 | - | ||
35 | - nll_loss = nll_loss.sum() # mean()? Scared to break other math. | ||
36 | - smooth_loss = smooth_loss.sum() | ||
37 | - eps_i = epsilon / lprobs.size(-1) | ||
38 | - loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss | ||
39 | - return loss, nll_loss | ||
40 | - | ||
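
A toy check of the helper above (shapes are illustrative): with epsilon set to zero the smoothed loss collapses to the plain NLL term.

    import torch

    lprobs = torch.log_softmax(torch.randn(3, 5, 10), dim=-1)   # (bsz, seq_len, vocab)
    target = torch.randint(0, 10, (3, 5))

    loss, nll = label_smoothed_nll_loss(lprobs, target, epsilon=0.1)
    loss0, nll0 = label_smoothed_nll_loss(lprobs, target, epsilon=0.0)
    print(torch.allclose(loss0, nll0))   # True
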
41 | - | ||
42 | -def encode_line( | ||
43 | - tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt" | ||
44 | -): | ||
45 | - """Only used by LegacySeq2SeqDataset""" | ||
46 | - extra_kw = ( | ||
47 | - {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {} | ||
48 | - ) | ||
49 | - return tokenizer( | ||
50 | - [line], | ||
51 | - max_length=max_length, | ||
52 | - padding="max_length" if pad_to_max_length else None, | ||
53 | - truncation=True, | ||
54 | - return_tensors=return_tensors, | ||
55 | - **extra_kw, | ||
56 | - ) | ||
57 | - | ||
58 | - | ||
59 | -def lmap(f: Callable, x: Iterable) -> List: | ||
60 | - """list(map(f, x))""" | ||
61 | - return list(map(f, x)) | ||
62 | - | ||
63 | - | ||
64 | -def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict: | ||
65 | - """Uses sacrebleu's corpus_bleu implementation.""" | ||
66 | - return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)} | ||
67 | - | ||
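
A quick illustration (toy strings; an identical hypothesis and reference should score 100):

    print(calculate_bleu(["the cat sat on the mat"],
                         ["the cat sat on the mat"]))
    # expected: {'bleu': 100.0}
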
68 | - | ||
69 | -def trim_batch( | ||
70 | - input_ids, pad_token_id, attention_mask=None, | ||
71 | -): | ||
72 | - """Remove columns that are populated exclusively by pad_token_id""" | ||
73 | - keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) | ||
74 | - if attention_mask is None: | ||
75 | - return input_ids[:, keep_column_mask] | ||
76 | - else: | ||
77 | - return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) | ||
78 | - | ||
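
For instance (illustrative ids, with the pad id assumed to be 1 as for BART):

    import torch

    input_ids = torch.tensor([[5, 6, 1, 1],
                              [7, 1, 1, 1]])
    print(trim_batch(input_ids, pad_token_id=1))
    # tensor([[5, 6],
    #         [7, 1]])
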
79 | - | ||
80 | -class AbstractSeq2SeqDataset(Dataset): | ||
81 | - def __init__( | ||
82 | - self, | ||
83 | - tokenizer, | ||
84 | - data_dir, | ||
85 | - max_source_length, | ||
86 | - max_target_length, | ||
87 | - type_path="train", | ||
88 | - n_obs=None, | ||
89 | - src_lang=None, | ||
90 | - tgt_lang=None, | ||
91 | - prefix="", | ||
92 | - ): | ||
93 | - super().__init__() | ||
94 | - self.src_file = Path(data_dir).joinpath(type_path + ".source") | ||
95 | - self.tgt_file = Path(data_dir).joinpath(type_path + ".target") | ||
96 | - self.src_lens = self.get_char_lens(self.src_file) | ||
97 | - self.max_source_length = max_source_length | ||
98 | - self.max_target_length = max_target_length | ||
99 | - assert min(self.src_lens) > 0, f"found empty line in {self.src_file}" | ||
100 | - self.tokenizer = tokenizer | ||
101 | - self.prefix = prefix | ||
102 | - if n_obs is not None: | ||
103 | - self.src_lens = self.src_lens[:n_obs] | ||
104 | - self.pad_token_id = self.tokenizer.pad_token_id | ||
105 | - self.src_lang = src_lang | ||
106 | - self.tgt_lang = tgt_lang | ||
107 | - self.add_prefix_space = isinstance(self.tokenizer, BartTokenizer) | ||
108 | - | ||
109 | - def __len__(self): | ||
110 | - return len(self.src_lens) | ||
111 | - | ||
112 | - @staticmethod | ||
113 | - def get_char_lens(data_file): | ||
114 | - return [len(x) for x in Path(data_file).open().readlines()] | ||
115 | - | ||
116 | - def make_sortish_sampler(self, batch_size): | ||
117 | - return SortishSampler(self.src_lens, batch_size) | ||
118 | - | ||
119 | - def __getitem__(self, item): | ||
120 | - raise NotImplementedError("You must implement this") | ||
121 | - | ||
122 | - def collate_fn(self, batch): | ||
123 | - raise NotImplementedError("You must implement this") | ||
124 | - | ||
125 | - | ||
126 | -class LegacySeq2SeqDataset(AbstractSeq2SeqDataset): | ||
127 | - def __getitem__(self, index) -> Dict[str, torch.Tensor]: | ||
128 | - """Call tokenizer on src and tgt_lines""" | ||
129 | - index = index + 1 # linecache starts at 1 | ||
130 | - source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip( | ||
131 | - "\n" | ||
132 | - ) | ||
133 | - tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") | ||
134 | - assert source_line, f"empty source line for index {index}" | ||
135 | - assert tgt_line, f"empty tgt line for index {index}" | ||
136 | - source_inputs = encode_line(self.tokenizer, source_line, self.max_source_length) | ||
137 | - target_inputs = encode_line(self.tokenizer, tgt_line, self.max_target_length) | ||
138 | - | ||
139 | - source_ids = source_inputs["input_ids"].squeeze() | ||
140 | - target_ids = target_inputs["input_ids"].squeeze() | ||
141 | - src_mask = source_inputs["attention_mask"].squeeze() | ||
142 | - return { | ||
143 | - "input_ids": source_ids, | ||
144 | - "attention_mask": src_mask, | ||
145 | - "labels": target_ids, | ||
146 | - } | ||
147 | - | ||
148 | - def collate_fn(self, batch) -> Dict[str, torch.Tensor]: | ||
149 | - input_ids = torch.stack([x["input_ids"] for x in batch]) | ||
150 | - masks = torch.stack([x["attention_mask"] for x in batch]) | ||
151 | - target_ids = torch.stack([x["labels"] for x in batch]) | ||
152 | - pad_token_id = self.pad_token_id | ||
153 | - y = trim_batch(target_ids, pad_token_id) | ||
154 | - source_ids, source_mask = trim_batch( | ||
155 | - input_ids, pad_token_id, attention_mask=masks | ||
156 | - ) | ||
157 | - batch = { | ||
158 | - "input_ids": source_ids, | ||
159 | - "attention_mask": source_mask, | ||
160 | - "labels": y, | ||
161 | - } | ||
162 | - return batch | ||
163 | - | ||
164 | - | ||
165 | -class Seq2SeqDataset(AbstractSeq2SeqDataset): | ||
166 | - """A dataset that calls prepare_seq2seq_batch.""" | ||
167 | - | ||
168 | - def __getitem__(self, index) -> Dict[str, str]: | ||
169 | - index = index + 1 # linecache starts at 1 | ||
170 | - source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip( | ||
171 | - "\n" | ||
172 | - ) | ||
173 | - tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") | ||
174 | - assert source_line, f"empty source line for index {index}" | ||
175 | - assert tgt_line, f"empty tgt line for index {index}" | ||
176 | - return { | ||
177 | - "tgt_texts": tgt_line, | ||
178 | - "src_texts": source_line, | ||
179 | - } | ||
180 | - | ||
181 | - def collate_fn(self, batch) -> Dict[str, torch.Tensor]: | ||
182 | - """Call prepare_seq2seq_batch.""" | ||
183 | - batch_encoding = self.tokenizer.prepare_seq2seq_batch( | ||
184 | - [x["src_texts"] for x in batch], | ||
185 | - src_lang=self.src_lang, | ||
186 | - tgt_texts=[x["tgt_texts"] for x in batch], | ||
187 | - tgt_lang=self.tgt_lang, | ||
188 | - max_length=self.max_source_length, | ||
189 | - max_target_length=self.max_target_length, | ||
190 | - return_tensors="pt", | ||
191 | - add_prefix_space=self.add_prefix_space, | ||
192 | - ) | ||
193 | - return batch_encoding.data | ||
194 | - | ||
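
A rough usage sketch for the dataset classes above. The directory layout and checkpoint name are hypothetical: it assumes data_dir/train.source and data_dir/train.target each hold one example per line.

    import torch
    from torch.utils.data import DataLoader
    from transformers import BartTokenizer

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")  # example checkpoint
    dataset = LegacySeq2SeqDataset(tokenizer, "data_dir",
                                   max_source_length=512, max_target_length=56)
    loader = DataLoader(dataset, batch_size=8,
                        collate_fn=dataset.collate_fn,
                        sampler=dataset.make_sortish_sampler(8))
    batch = next(iter(loader))   # dict with input_ids, attention_mask, labels
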
195 | - | ||
196 | -class SortishSampler(Sampler): | ||
197 | - "Go through the text data by order of src length with a bit of randomness. From fastai repo." | ||
198 | - | ||
199 | - def __init__(self, data, batch_size): | ||
200 | - self.data, self.bs = data, batch_size | ||
201 | - | ||
202 | - def key(self, i): | ||
203 | - return self.data[i] | ||
204 | - | ||
205 | - def __len__(self) -> int: | ||
206 | - return len(self.data) | ||
207 | - | ||
208 | - def __iter__(self): | ||
209 | - idxs = np.random.permutation(len(self.data)) | ||
210 | - sz = self.bs * 50 | ||
211 | - ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)] | ||
212 | - sort_idx = np.concatenate( | ||
213 | - [sorted(s, key=self.key, reverse=True) for s in ck_idx] | ||
214 | - ) | ||
215 | - sz = self.bs | ||
216 | - ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)] | ||
217 | - max_ck = np.argmax( | ||
218 | - [self.key(ck[0]) for ck in ck_idx] | ||
219 | - ) # find the chunk with the largest key, | ||
220 | - ck_idx[0], ck_idx[max_ck] = ( | ||
221 | - ck_idx[max_ck], | ||
222 | - ck_idx[0], | ||
223 | - ) # then make sure it goes first. | ||
224 | - sort_idx = ( | ||
225 | - np.concatenate(np.random.permutation(ck_idx[1:])) | ||
226 | - if len(ck_idx) > 1 | ||
227 | - else np.array([], dtype=np.int64)  # np.int was removed from recent NumPy versions | ||
228 | - ) | ||
229 | - sort_idx = np.concatenate((ck_idx[0], sort_idx)) | ||
230 | - return iter(sort_idx) | ||
231 | - | ||
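
An illustration with toy lengths: indices come back grouped into batch_size * 50 mega-chunks, each sorted longest-first, with the chunk containing the longest example placed first:

    lengths = [3, 10, 1, 7, 4, 8]
    sampler = SortishSampler(lengths, batch_size=2)
    print(list(iter(sampler)))   # a permutation of 0..5, roughly ordered by decreasing length
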
232 | - | ||
233 | -logger = getLogger(__name__) | ||
234 | - | ||
235 | - | ||
236 | -def use_task_specific_params(model, task): | ||
237 | - """Update config with task-specific params (e.g. summarization).""" | ||
238 | - task_specific_params = model.config.task_specific_params | ||
239 | - | ||
240 | - if task_specific_params is not None: | ||
241 | - pars = task_specific_params.get(task, {}) | ||
242 | - logger.info(f"using task specific params for {task}: {pars}") | ||
243 | - model.config.update(pars) | ||
244 | - | ||
245 | - | ||
246 | -def pickle_load(path): | ||
247 | - """pickle.load(path)""" | ||
248 | - with open(path, "rb") as f: | ||
249 | - return pickle.load(f) | ||
250 | - | ||
251 | - | ||
252 | -def pickle_save(obj, path): | ||
253 | - """pickle.dump(obj, path)""" | ||
254 | - with open(path, "wb") as f: | ||
255 | - return pickle.dump(obj, f) | ||
256 | - | ||
257 | - | ||
258 | -def flatten_list(summary_ids: List[List]): | ||
259 | - return [x for x in itertools.chain.from_iterable(summary_ids)] | ||
260 | - | ||
261 | - | ||
262 | -def save_git_info(folder_path: str) -> None: | ||
263 | - """Save git information to output_dir/git_log.json""" | ||
264 | - repo_infos = get_git_info() | ||
265 | - save_json(repo_infos, os.path.join(folder_path, "git_log.json")) | ||
266 | - | ||
267 | - | ||
268 | -def save_json(content, path): | ||
269 | - with open(path, "w") as f: | ||
270 | - json.dump(content, f, indent=4) | ||
271 | - | ||
272 | - | ||
273 | -def load_json(path): | ||
274 | - with open(path) as f: | ||
275 | - return json.load(f) | ||
276 | - | ||
277 | - | ||
278 | -def get_git_info(): | ||
279 | - repo = git.Repo(search_parent_directories=True) | ||
280 | - repo_infos = { | ||
281 | - "repo_id": str(repo), | ||
282 | - "repo_sha": str(repo.head.object.hexsha), | ||
283 | - "repo_branch": str(repo.active_branch), | ||
284 | - } | ||
285 | - return repo_infos | ||
286 | - | ||
287 | - | ||
288 | -ROUGE_KEYS = ["rouge1", "rouge2", "rougeL"] | ||
289 | - | ||
290 | - | ||
291 | -def calculate_rouge( | ||
292 | - output_lns: List[str], reference_lns: List[str], use_stemmer=True | ||
293 | -) -> Dict: | ||
294 | - scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer) | ||
295 | - aggregator = scoring.BootstrapAggregator() | ||
296 | - | ||
297 | - for reference_ln, output_ln in zip(reference_lns, output_lns): | ||
298 | - scores = scorer.score(reference_ln, output_ln) | ||
299 | - aggregator.add_scores(scores) | ||
300 | - | ||
301 | - result = aggregator.aggregate() | ||
302 | - return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()} | ||
303 | - | ||
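
For example (illustrative strings; the rouge_score package must be installed):

    scores = calculate_rouge(["the cat sat on the mat"],
                             ["a cat sat on a mat"])
    print(scores)   # {'rouge1': ..., 'rouge2': ..., 'rougeL': ...} as percentages
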
304 | - | ||
305 | -# Utilities for freezing parameters and checking whether they are frozen | ||
306 | - | ||
307 | - | ||
308 | -def freeze_params(model: nn.Module): | ||
309 | - """Set requires_grad=False for each of model.parameters()""" | ||
310 | - for par in model.parameters(): | ||
311 | - par.requires_grad = False | ||
312 | - | ||
313 | - | ||
314 | -def grad_status(model: nn.Module) -> Iterable: | ||
315 | - return (par.requires_grad for par in model.parameters()) | ||
316 | - | ||
317 | - | ||
318 | -def any_requires_grad(model: nn.Module) -> bool: | ||
319 | - return any(grad_status(model)) | ||
320 | - | ||
321 | - | ||
322 | -def assert_all_frozen(model): | ||
323 | - model_grads: List[bool] = list(grad_status(model)) | ||
324 | - n_require_grad = sum(lmap(int, model_grads)) | ||
325 | - npars = len(model_grads) | ||
326 | - assert not any( | ||
327 | - model_grads | ||
328 | - ), f"{n_require_grad/npars:.1%} of {npars} weights require grad" | ||
329 | - | ||
330 | - | ||
331 | -def assert_not_all_frozen(model): | ||
332 | - model_grads: List[bool] = list(grad_status(model)) | ||
333 | - npars = len(model_grads) | ||
334 | - assert any(model_grads), f"none of {npars} weights require grad" |
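
A minimal check of the freezing helpers:

    import torch
    from torch import nn

    model = nn.Linear(4, 4)
    freeze_params(model)
    assert_all_frozen(model)          # passes: no parameter requires grad
    print(any_requires_grad(model))   # False
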