@@ -18,7 +18,7 @@
 import copy
 import math
 import warnings
-from typing import List, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -656,22 +656,11 @@ def forward(
         self,
         hidden_states,
         mask=None,
-        key_value_states=None,
         position_bias=None,
-        past_key_value=None,
         layer_head_mask=None,
-        query_length=None,
-        use_cache=False,
         output_attentions=False,
     ):
         batch_size, seq_length = hidden_states.shape[:2]
-        real_seq_length = seq_length
-
-        if past_key_value is not None:
-            assert (
-                len(past_key_value) == 2
-            ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
-            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
 
         def shape(states):
             """projection"""
@@ -681,37 +670,10 @@ def unshape(states):
             """reshape"""
             return states.contiguous().view(batch_size, -1, self.inner_dim)
 
-        def project(hidden_states, proj_layer, key_value_states, past_key_value):
-            """projects hidden states correctly to key/query states"""
-            if key_value_states is None:
-                # self-attn
-                # (batch_size, seq_length, n_heads, dim_per_head)
-                hidden_states = shape(proj_layer(hidden_states))
-            elif past_key_value is None:
-                # cross-attn
-                # (batch_size, seq_length, n_heads, dim_per_head)
-                hidden_states = shape(proj_layer(key_value_states))
-
-            if past_key_value is not None:
-                if key_value_states is None:
-                    # self-attn
-                    # (batch_size, seq_length, n_heads, dim_per_head)
-                    hidden_states = torch.cat([past_key_value.transpose(1, 2), hidden_states], dim=2)
-                else:
-                    # cross-attn
-                    hidden_states = past_key_value.transpose(1, 2)
-            return hidden_states
-
-        # get query states -> (batch_size, seq_length, n_heads, dim_per_head)
+        # get query/key/value states -> (batch_size, seq_length, n_heads, dim_per_head)
         query_states = shape(self.q(hidden_states))
-
-        # get key/value states
-        key_states = project(
-            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
-        )
-        value_states = project(
-            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
-        )
+        key_states = shape(self.k(hidden_states))
+        value_states = shape(self.v(hidden_states))
 
         # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
         query_states = _split_into_blocks(query_states, self.block_len, dim=1)
@@ -722,10 +684,8 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
         key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2)
         value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2)
 
-        # Compute scores
-        scores = torch.einsum(
-            "...qhd,...khd->...hqk", query_states, key_states
-        )  # (batch_size, num_block, n_heads, block_len, 3 * block_len)
+        # Compute scores -> (batch_size, num_block, n_heads, block_len, 3 * block_len)
+        scores = torch.einsum("...qhd,...khd->...hqk", query_states, key_states)
 
         if position_bias is None:
             # position_bias shape: # (1, 1, n_heads, block_len, 3 * block_len)
@@ -737,10 +697,6 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(self.block_len)
-            # if key and values are already calculated
-            # we want only the last query position bias
-            if past_key_value is not None:
-                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
 
            if mask is not None:
                # Replace masked positions with -10_000 (according to the original implementation)
@@ -762,8 +718,7 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
         attn_output = attn_output[:, :seq_length, :]
         attn_output = self.o(attn_output)
 
-        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
-        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
+        outputs = (attn_output,) + (position_bias,)
 
         if output_attentions:
             outputs = outputs + (attn_weights,)
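The hunks above simplify how query/key/value states are produced but leave the block-local attention pattern itself untouched: states are split into fixed-size blocks and every block attends to itself plus its left and right neighbours. A standalone sketch of that tensor bookkeeping follows (shapes, the zero-padding, and variable names are illustrative only, not the library's _split_into_blocks/_concatenate_3_blocks helpers):

import torch

batch_size, seq_length, n_heads, dim_per_head = 2, 16, 4, 8
block_len = 4
num_blocks = seq_length // block_len

# (batch_size, seq_length, n_heads, dim_per_head) -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
query_states = torch.randn(batch_size, seq_length, n_heads, dim_per_head)
key_states = torch.randn(batch_size, seq_length, n_heads, dim_per_head)
query_blocks = query_states.view(batch_size, num_blocks, block_len, n_heads, dim_per_head)
key_blocks = key_states.view(batch_size, num_blocks, block_len, n_heads, dim_per_head)

# Pad one empty block on each side, then stack (previous, current, next) along the
# in-block sequence dim so each block sees 3 * block_len key positions.
padded = torch.cat([torch.zeros_like(key_blocks[:, :1]), key_blocks, torch.zeros_like(key_blocks[:, :1])], dim=1)
key_3blocks = torch.cat([padded[:, :-2], padded[:, 1:-1], padded[:, 2:]], dim=2)

# Same einsum as in the diff -> (batch_size, num_blocks, n_heads, block_len, 3 * block_len)
scores = torch.einsum("...qhd,...khd->...hqk", query_blocks, key_3blocks)
print(scores.shape)  # torch.Size([2, 4, 4, 4, 12])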
@@ -797,9 +752,8 @@ def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = Fal
         self.pruned_heads = set()
         self.gradient_checkpointing = False
 
-        # Relativen attention bias & Layer norm for global attention
-        if self.has_relative_attention_bias:
-            self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        # Relative attention bias & Layer norm for global attention - global relative attention bias is always applied
+        self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
         self.global_input_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
 
     # Copied from transformers.models.t5.modeling_t5.T5Attention.prune_heads
@@ -879,7 +833,7 @@ def compute_bias(self, block_length: int):
         # (block_length, 3 * block_length)
         relative_position = memory_position - context_position
         relative_position_bucket = self._relative_position_bucket(
-            relative_position,  # (block_length, 3 * block_length)
+            relative_position,
             bidirectional=(not self.is_decoder),
             num_buckets=self.relative_attention_num_buckets,
             max_distance=self.relative_attention_max_distance,
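For context, compute_bias starts from a (block_length, 3 * block_length) grid of signed distances between the query positions of a block and the key positions of the surrounding three blocks, and only then buckets them. A rough sketch of that grid (the explicit broadcasting below is an assumption for clarity, not a copy of the library code):

import torch

block_length = 4
# Keys span the previous, current and next block; the queries sit in the middle block.
memory_position = torch.arange(3 * block_length, dtype=torch.long)
context_position = memory_position[block_length:-block_length]

# (block_length, 3 * block_length): signed distance from each query to each key position.
relative_position = memory_position[None, :] - context_position[:, None]
print(relative_position.shape)  # torch.Size([4, 12])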
@@ -915,22 +869,11 @@ def forward(
         self,
         hidden_states,
         mask=None,
-        key_value_states=None,
         position_bias=None,
-        past_key_value=None,
         layer_head_mask=None,
-        query_length=None,
-        use_cache=False,
         output_attentions=False,
     ):
         batch_size, seq_length = hidden_states.shape[:2]
-        real_seq_length = seq_length
-
-        if past_key_value is not None:
-            assert (
-                len(past_key_value) == 2
-            ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
-            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
 
         def shape(states):
             """projection"""
@@ -940,27 +883,6 @@ def unshape(states):
             """reshape"""
             return states.contiguous().view(batch_size, -1, self.inner_dim)
 
-        def project(hidden_states, proj_layer, key_value_states, past_key_value):
-            """projects hidden states correctly to key/query states"""
-            if key_value_states is None:
-                # self-attn
-                # (batch_size, seq_length, n_heads, dim_per_head)
-                hidden_states = shape(proj_layer(hidden_states))
-            elif past_key_value is None:
-                # cross-attn
-                # (batch_size, seq_length, n_heads, dim_per_head)
-                hidden_states = shape(proj_layer(key_value_states))
-
-            if past_key_value is not None:
-                if key_value_states is None:
-                    # self-attn
-                    # (batch_size, seq_length, n_heads, dim_per_head)
-                    hidden_states = torch.cat([past_key_value.transpose(1, 2), hidden_states], dim=2)
-                else:
-                    # cross-attn
-                    hidden_states = past_key_value.transpose(1, 2)
-            return hidden_states
-
         # Prepare components for transient-global attention
         # Obtain block_ids and global_segment_ids
         # global_seq_len := seq_len // self.global_block_size
@@ -974,20 +896,14 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
         global_inputs = _create_global_aggregates(hidden_states, block_ids, _global_seq_len)
         global_inputs = self.global_input_layer_norm(global_inputs)
 
-        # get query states -> (batch_size, seq_length, n_heads, dim_per_head)
+        # get query/key/value states -> (batch_size, seq_length, n_heads, dim_per_head)
         query_states = shape(self.q(hidden_states))
-
-        # get key/value states
-        key_states = project(
-            hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
-        )
-        value_states = project(
-            hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
-        )
+        key_states = shape(self.k(hidden_states))
+        value_states = shape(self.v(hidden_states))
 
         # Get global/side key/value states shape: (batch_size, global_seq_len, n_heads, dim_per_head)
-        side_key_states = project(global_inputs, self.k, None, None)
-        side_value_states = project(global_inputs, self.v, None, None)
+        side_key_states = shape(self.k(global_inputs))
+        side_value_states = shape(self.v(global_inputs))
 
         # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
         query_states = _split_into_blocks(query_states, self.block_len, dim=1)
@@ -1033,10 +949,6 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(self.block_len)
-            # if key and values are already calculated
-            # we want only the last query position bias
-            if past_key_value is not None:
-                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
 
            if local_attention_mask is not None:
                # (batch_size, 1, n_heads, block_len, 3 * block_len)
@@ -1065,8 +977,7 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
         attn_output = attn_output[:, :seq_length, :]
         attn_output = self.o(attn_output)
 
-        present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
-        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
+        outputs = (attn_output,) + (position_bias,)
 
         if output_attentions:
             outputs = outputs + (attn_weights,)
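The transient-global branch also projects the block-level summaries (global_inputs) with the same k/v layers, which the hunk above now writes directly as shape(self.k(global_inputs)). A minimal sketch of the aggregation idea behind _create_global_aggregates, assuming sum-pooling over fixed-size blocks (names and the pooling choice are illustrative, not the exact library implementation):

import torch

batch_size, seq_length, d_model = 2, 16, 8
global_block_size = 4
global_seq_len = seq_length // global_block_size

hidden_states = torch.randn(batch_size, seq_length, d_model)
block_ids = torch.arange(seq_length) // global_block_size  # token -> global block index

# One-hot block assignment, then sum the tokens of each block into a single "global"
# vector that later serves as an extra side key/value for every query.
one_hot = torch.nn.functional.one_hot(block_ids, num_classes=global_seq_len).to(hidden_states.dtype)
global_inputs = torch.einsum("bsd,sg->bgd", hidden_states, one_hot)
print(global_inputs.shape)  # torch.Size([2, 4, 8])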
@@ -1121,18 +1032,15 @@ def forward(
         attention_mask=None,
         position_bias=None,
         layer_head_mask=None,
-        past_key_value=None,
-        use_cache=False,
         output_attentions=False,
+        **kwargs: Any,  # to accept past_key_value and use_cache kwargs
     ):
         normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = self.LocalSelfAttention(
             normed_hidden_states,
             mask=attention_mask,
             position_bias=position_bias,
             layer_head_mask=layer_head_mask,
-            past_key_value=past_key_value,
-            use_cache=use_cache,
             output_attentions=output_attentions,
         )
         hidden_states = hidden_states + self.dropout(attention_output[0])
@@ -1157,18 +1065,15 @@ def forward(
         attention_mask=None,
         position_bias=None,
         layer_head_mask=None,
-        past_key_value=None,
-        use_cache=False,
         output_attentions=False,
+        **kwargs: Any,  # to accept past_key_value and use_cache kwargs
     ):
         normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = self.TransientGlobalSelfAttention(
             normed_hidden_states,
             mask=attention_mask,
             position_bias=position_bias,
             layer_head_mask=layer_head_mask,
-            past_key_value=past_key_value,
-            use_cache=use_cache,
             output_attentions=output_attentions,
         )
         hidden_states = hidden_states + self.dropout(attention_output[0])
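Both layer wrappers now absorb the arguments they no longer forward via **kwargs, so callers that still pass past_key_value or use_cache keep working unchanged. A minimal sketch of the pattern (hypothetical function, not the actual layer class):

from typing import Any


def forward(hidden_states, attention_mask=None, output_attentions=False, **kwargs: Any):
    # past_key_value and use_cache, if supplied, land in kwargs and are simply ignored.
    return hidden_states


forward("states", past_key_value=None, use_cache=False)  # accepted without a TypeError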
@@ -1402,10 +1307,8 @@ def _init_weights(self, module):
             module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
             if module.has_relative_attention_bias:
                 module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
-                if isinstance(module, LongT5TransientGlobalAttention):
-                    module.global_relative_attention_bias.weight.data.normal_(
-                        mean=0.0, std=factor * ((d_model) ** -0.5)
-                    )
+            if isinstance(module, LongT5TransientGlobalAttention):
+                module.global_relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
 
     # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._set_gradient_checkpointing with T5->LongT5
     def _set_gradient_checkpointing(self, module, value=False):
@@ -1644,17 +1547,19 @@ def custom_forward(*inputs):
             # We share the position biases between the layers - the first layer store them
             # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
             # (cross-attention position bias), (cross-attention weights)
-            position_bias = layer_outputs[2]
+            position_bias = layer_outputs[2] if self.is_decoder else layer_outputs[1]
             if self.is_decoder and encoder_hidden_states is not None:
                 encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
             # append next layer key value states
             if use_cache:
                 present_key_value_states = present_key_value_states + (present_key_value_state,)
 
             if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[3],)
                 if self.is_decoder:
+                    all_attentions = all_attentions + (layer_outputs[3],)
                     all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
+                else:
+                    all_attentions = all_attentions + (layer_outputs[2],)
 
             # Model Parallel: If it's the last layer for that device, put things on the next device
             if self.model_parallel:
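The final hunk adjusts the stack's tuple indexing because the simplified encoder-side layers no longer emit a present_key_value slot, while decoder layers still do (see the layer_outputs comment in the hunk). A small sketch of the resulting offsets, with illustrative tuple labels:

decoder_layer_outputs = ("hidden_states", "present_key_value", "position_bias", "attn_weights")
encoder_layer_outputs = ("hidden_states", "position_bias", "attn_weights")

for is_decoder, layer_outputs in ((True, decoder_layer_outputs), (False, encoder_layer_outputs)):
    position_bias = layer_outputs[2] if is_decoder else layer_outputs[1]
    attn_weights = layer_outputs[3] if is_decoder else layer_outputs[2]
    assert (position_bias, attn_weights) == ("position_bias", "attn_weights")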