# zombie_retformer.py
import math

import torch
from torch import nn
from torch.nn import functional as F

# Assumption: XPOS (extrapolatable rotary position embedding) is provided by a
# sibling module, as in the reference RetNet implementation
# (https://github.com/Jamie-Stirling/RetNet); it is not defined in this file.
from xpos_relative_position import XPOS

# Large negative fill value used to mask attention scores before the softmax.
NEG_INF = float("-inf")
class AttentionHead(nn.Module):
    def __init__(self, input_size, hidden_size, rope=False):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.sqrt_hidden_size = math.sqrt(hidden_size)
        self.linear_Q = nn.Linear(input_size, hidden_size)
        self.linear_K = nn.Linear(input_size, hidden_size)
        self.linear_V = nn.Linear(input_size, hidden_size)
        if rope:
            self.xpos = XPOS(hidden_size)
        else:
            self.xpos = nn.Identity()

    def forward(self, input, mask):
        """
        input has shape (batch_size, MAX_SEQ_LENGTH, input_size)
        """
        batch_size, seq_length, _ = input.shape
        # TODO: cache the projections during incremental decoding (e.g. via @cache).
        queries = self.linear_Q(input)
        keys = self.linear_K(input)
        values = self.linear_V(input)
        queries = self.xpos(queries)
        keys = self.xpos(keys)
        # Scaled dot-product attention with a causal mask.
        scores = (queries @ keys.transpose(1, 2)) / self.sqrt_hidden_size
        scores = scores.masked_fill(mask == 0, NEG_INF)
        probs = F.softmax(scores, dim=-1)
        output = probs @ values
        return output
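
# A hedged usage sketch (not part of the original file): a minimal shape check
# for AttentionHead. The sizes and the helper name are ad hoc; with rope=False
# the XPOS path is bypassed entirely.
def _demo_attention_head():
    head = AttentionHead(input_size=64, hidden_size=16, rope=False)
    x = torch.randn(2, 10, 64)
    mask = torch.ones(10, 10).tril()  # causal mask, matching ZombieRetformer.forward
    out = head(x, mask)
    assert out.shape == (2, 10, 16)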
class MultiHeadedAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, rope=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.heads = nn.ModuleList([
            AttentionHead(hidden_size, int(hidden_size / num_heads), rope=rope)
            for _ in range(num_heads)
        ])
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, input, mask):
        """
        input has shape (batch_size, MAX_SEQ_LENGTH, hidden_size)
        """
        return self.proj(torch.concat([head(input, mask) for head in self.heads], dim=-1))
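
# Hedged sketch (illustrative sizes only): each of the num_heads AttentionHeads
# returns hidden_size / num_heads features, so the concatenation restores
# hidden_size before the output projection.
def _demo_multi_headed_attention():
    mha = MultiHeadedAttention(hidden_size=64, num_heads=4, rope=False)
    x = torch.randn(2, 10, 64)
    mask = torch.ones(10, 10).tril()
    assert mha(x, mask).shape == (2, 10, 64)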
class SimpleRetention(nn.Module):
    def __init__(self, hidden_size, gamma, head_size=None, double_v_dim=False):
        """
        Simple retention mechanism based on the paper
        "Retentive Network: A Successor to Transformer for Large Language Models"
        (https://arxiv.org/pdf/2307.08621.pdf).
        """
        super(SimpleRetention, self).__init__()
        self.hidden_size = hidden_size
        if head_size is None:
            head_size = hidden_size
        self.head_size = head_size
        self.v_dim = head_size * 2 if double_v_dim else head_size
        self.gamma = gamma
        self.W_Q = nn.Parameter(torch.randn(hidden_size, head_size) / hidden_size)
        self.W_K = nn.Parameter(torch.randn(hidden_size, head_size) / hidden_size)
        self.W_V = nn.Parameter(torch.randn(hidden_size, self.v_dim) / hidden_size)
        self.xpos = XPOS(head_size)

    def forward(self, X):
        """
        Parallel (default) representation of the retention mechanism.
        X: (batch_size, sequence_length, hidden_size)
        """
        sequence_length = X.shape[1]
        D = self._get_D(sequence_length).to(self.W_Q.device)
        Q = X @ self.W_Q
        K = X @ self.W_K
        Q = self.xpos(Q)
        K = self.xpos(K, downscale=True)
        V = X @ self.W_V
        ret = (Q @ K.permute(0, 2, 1)) * D.unsqueeze(0)
        return ret @ V

    def forward_recurrent(self, x_n, s_n_1, n):
        """
        Recurrent representation of the retention mechanism.
        x_n: (batch_size, 1, hidden_size)
        s_n_1: (batch_size, head_size, v_dim)
        """
        Q = x_n @ self.W_Q
        K = x_n @ self.W_K
        Q = self.xpos(Q, n + 1)
        K = self.xpos(K, n + 1, downscale=True)
        V = x_n @ self.W_V
        # Q, K: (batch_size, 1, head_size); V: (batch_size, 1, v_dim)
        # State update: s_n = gamma * s_n_1 + K^T @ V
        s_n = self.gamma * s_n_1 + (K.transpose(-1, -2) @ V)
        return (Q @ s_n), s_n

    def forward_chunkwise(self, x_i, r_i_1, i):
        """
        Chunkwise representation of the retention mechanism.
        x_i: (batch_size, chunk_size, hidden_size)
        r_i_1: (batch_size, head_size, v_dim)
        """
        batch, chunk_size, _ = x_i.shape
        D = self._get_D(chunk_size).to(x_i.device)
        Q = x_i @ self.W_Q
        K = x_i @ self.W_K
        Q = self.xpos(Q, i * chunk_size)
        K = self.xpos(K, i * chunk_size, downscale=True)
        V = x_i @ self.W_V
        # Cross-chunk state: decay the previous state and fold in this chunk.
        r_i = (K.transpose(-1, -2) @ (V * D[-1].view(1, chunk_size, 1))) \
            + (self.gamma ** chunk_size) * r_i_1
        inner_chunk = ((Q @ K.transpose(-1, -2)) * D.unsqueeze(0)) @ V
        # e[:, j, :] = gamma ** (j + 1): per-position decay applied to the carried state.
        e = torch.zeros(batch, chunk_size, 1, device=x_i.device)
        for j in range(chunk_size):
            e[:, j, :] = self.gamma ** (j + 1)
        cross_chunk = (Q @ r_i_1) * e
        return inner_chunk + cross_chunk, r_i

    def _get_D(self, sequence_length):
        n = torch.arange(sequence_length).unsqueeze(1)
        m = torch.arange(sequence_length).unsqueeze(0)
        # D[n, m] = gamma ** (n - m) for n >= m, else 0. Entries where m is much
        # larger than n overflow to inf before masking, and inf * 0 = NaN.
        D = (self.gamma ** (n - m)) * (n >= m).float()
        D[D != D] = 0  # replace the NaNs produced above with 0
        return D
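
# Hedged sketch: the parallel and recurrent representations should produce
# (numerically close to) the same output; how close depends on the XPOS offset
# convention of the imported module, so this prints the gap rather than
# asserting on it. All sizes below are arbitrary.
def _demo_retention_equivalence():
    torch.manual_seed(0)
    ret = SimpleRetention(hidden_size=8, gamma=0.9)
    x = torch.randn(1, 5, 8)
    parallel = ret(x)
    s = torch.zeros(1, ret.head_size, ret.v_dim)  # initial recurrent state
    outputs = []
    for n in range(x.shape[1]):
        y, s = ret.forward_recurrent(x[:, n:n + 1, :], s, n)
        outputs.append(y)
    recurrent = torch.cat(outputs, dim=1)
    print("max |parallel - recurrent|:", (parallel - recurrent).abs().max().item())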
class MultiScaleRetention(nn.Module):
    def __init__(self, hidden_size, heads, double_v_dim=False):
        """
        Multi-scale retention mechanism based on the paper
        "Retentive Network: A Successor to Transformer for Large Language Models"
        (https://arxiv.org/pdf/2307.08621.pdf).
        """
        super(MultiScaleRetention, self).__init__()
        self.hidden_size = hidden_size
        self.v_dim = hidden_size * 2 if double_v_dim else hidden_size
        self.heads = heads
        assert hidden_size % heads == 0, "hidden_size must be divisible by heads"
        self.head_size = hidden_size // heads
        self.head_v_dim = self.head_size * 2 if double_v_dim else self.head_size
        # One decay rate per head, log-spaced between 1 - 1/32 and 1 - 1/512.
        self.gammas = (1 - torch.exp(torch.linspace(math.log(1/32), math.log(1/512), heads))).detach().cpu().tolist()
        self.swish = lambda x: x * torch.sigmoid(x)
        self.W_G = nn.Parameter(torch.randn(hidden_size, self.v_dim) / hidden_size)
        self.W_O = nn.Parameter(torch.randn(self.v_dim, hidden_size) / hidden_size)
        self.group_norm = nn.GroupNorm(heads, self.v_dim)
        self.retentions = nn.ModuleList([
            SimpleRetention(self.hidden_size, gamma, self.head_size, double_v_dim)
            for gamma in self.gammas
        ])

    def forward(self, X):
        """
        Parallel representation of the multi-scale retention mechanism.
        X: (batch_size, sequence_length, hidden_size)
        """
        # Apply each head's retention mechanism to the full input, then concatenate.
        Y = []
        for i in range(self.heads):
            Y.append(self.retentions[i](X))
        Y = torch.cat(Y, dim=2)
        Y_shape = Y.shape
        Y = self.group_norm(Y.reshape(-1, self.v_dim)).reshape(Y_shape)
        return (self.swish(X @ self.W_G) * Y) @ self.W_O

    def forward_recurrent(self, x_n, s_n_1s, n):
        """
        Recurrent representation of the multi-scale retention mechanism.
        x_n: (batch_size, 1, hidden_size)
        s_n_1s: list of heads tensors, each (batch_size, head_size, head_v_dim)
        """
        Y = []
        s_ns = []
        for i in range(self.heads):
            y, s_n = self.retentions[i].forward_recurrent(x_n, s_n_1s[i], n)
            Y.append(y)
            s_ns.append(s_n)
        Y = torch.cat(Y, dim=2)
        Y_shape = Y.shape
        Y = self.group_norm(Y.reshape(-1, self.v_dim)).reshape(Y_shape)
        return (self.swish(x_n @ self.W_G) * Y) @ self.W_O, s_ns

    def forward_chunkwise(self, x_i, r_i_1s, i):
        """
        Chunkwise representation of the multi-scale retention mechanism.
        x_i: (batch_size, chunk_size, hidden_size)
        r_i_1s: list of heads tensors, each (batch_size, head_size, head_v_dim)
        """
        batch, chunk_size, _ = x_i.shape
        Y = []
        r_is = []
        for j in range(self.heads):
            y, r_i = self.retentions[j].forward_chunkwise(x_i, r_i_1s[j], i)
            Y.append(y)
            r_is.append(r_i)
        Y = torch.cat(Y, dim=2)
        Y_shape = Y.shape
        Y = self.group_norm(Y.reshape(-1, self.v_dim)).reshape(Y_shape)
        return (self.swish(x_i @ self.W_G) * Y) @ self.W_O, r_is
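
# Hedged sketch of MultiScaleRetention (arbitrary sizes): heads * head_size
# must equal hidden_size, and the parallel forward keeps the input's shape.
def _demo_multi_scale_retention():
    msr = MultiScaleRetention(hidden_size=32, heads=4)
    x = torch.randn(2, 6, 32)
    assert msr(x).shape == (2, 6, 32)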
class hybrid_retention(nn.Module):
    """
    Hybrid layer: num_retn heads use multi-scale retention and num_attn heads
    use softmax attention with rotary (XPOS) position embeddings.
    """
    def __init__(self, hidden_size, heads, num_attn):
        super().__init__()
        self.hidden_size = hidden_size
        self.heads = heads
        self.num_attn = num_attn
        self.num_retn = heads - num_attn
        assert hidden_size % heads == 0, "hidden_size must be divisible by heads"
        self.head_size = hidden_size // heads
        self.retn_size = self.num_retn * self.head_size
        self.attn_size = self.num_attn * self.head_size
        self.retn_proj = nn.Linear(hidden_size, self.retn_size)
        self.attn_proj = nn.Linear(hidden_size, self.attn_size)
        self.retns = MultiScaleRetention(self.retn_size, self.num_retn)
        self.attns = MultiHeadedAttention(self.attn_size, self.num_attn, rope=True)
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, mask):
        # Run the retention and attention branches on their own projections,
        # then merge them back to hidden_size.
        retns = self.retns(self.retn_proj(x))
        attns = self.attns(self.attn_proj(x), mask)
        out = torch.concat([retns, attns], dim=-1)
        return self.proj(out)
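
# Hedged sketch of the hybrid layer: with heads=8 and num_attn=2, six heads
# (48 of the 64 dims here) go through retention and two through attention.
def _demo_hybrid_retention():
    layer = hybrid_retention(hidden_size=64, heads=8, num_attn=2)
    x = torch.randn(2, 10, 64)
    mask = torch.ones(10, 10).tril()
    assert layer(x, mask).shape == (2, 10, 64)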
class ZombieRetformer(nn.Module):
    def __init__(self, layers, hidden_dim, ffn_size, heads, vocab_size, dropout, num_attn):
        super().__init__()
        self.vocab_size = vocab_size
        self.layers = layers
        self.hidden_dim = hidden_dim
        self.ffn_size = ffn_size
        self.heads = heads
        self.num_attn = num_attn
        self.embed = nn.Embedding(vocab_size, hidden_dim)
        self.proj = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", None)
        self.attentions = nn.ModuleList([
            hybrid_retention(hidden_dim, heads, num_attn)
            for _ in range(layers)
        ])
        self.ffns = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, ffn_size),
                nn.GELU(),
                nn.Linear(ffn_size, hidden_dim)
            )
            for _ in range(layers)
        ])
        self.layer_norms_1 = nn.ModuleList([
            nn.LayerNorm(hidden_dim)
            for _ in range(layers)
        ])
        self.layer_norms_2 = nn.ModuleList([
            nn.LayerNorm(hidden_dim)
            for _ in range(layers)
        ])

    def forward(self, X):
        """
        X: (batch_size, sequence_length) integer token ids
        """
        X = self.embed(X)
        # Lazily (re)build the causal mask whenever the sequence length changes.
        if self.mask is None or self.mask.shape[0] != X.shape[1]:
            self.mask = torch.ones(X.shape[1], X.shape[1]).tril().to(X.device)
        for i in range(self.layers):
            # Pre-norm residual block: hybrid retention/attention, then FFN.
            Y = self.attentions[i](self.layer_norms_1[i](X), self.mask)
            Y = self.dropout(Y)
            Y = Y + X
            Z = self.ffns[i](self.layer_norms_2[i](Y))
            Z = self.dropout(Z)
            X = Z + Y
        X = self.dropout(X)
        X = self.proj(X)
        return X
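
if __name__ == "__main__":
    # Smoke test with illustrative (not tuned) hyperparameters: a tiny model
    # mapping token ids to next-token logits over a toy vocabulary.
    model = ZombieRetformer(
        layers=2, hidden_dim=64, ffn_size=128, heads=8,
        vocab_size=100, dropout=0.1, num_attn=2,
    )
    tokens = torch.randint(0, 100, (2, 16))
    logits = model(tokens)
    print(logits.shape)  # expected: torch.Size([2, 16, 100])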