import math

import torch
import torch.nn as nn


class MultiHeadAttention(nn.Module):
    """Standard multi-head scaled dot-product attention."""

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        # Separate projections for queries, keys, and values, plus the output projection.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        # query/key/value: (batch, seq_len, d_model)
        N = query.shape[0]

        Q = self.query(query)
        K = self.key(key)
        V = self.value(value)

        # Split the model dimension into heads: (batch, n_heads, seq_len, head_dim).
        Q = Q.view(N, -1, self.n_heads, self.head_dim).transpose(1, 2)
        K = K.view(N, -1, self.n_heads, self.head_dim).transpose(1, 2)
        V = V.view(N, -1, self.n_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention scores: (batch, n_heads, q_len, k_len).
        energy = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Mask out disallowed positions (mask == 0) before the softmax.
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))

        attention = torch.softmax(energy, dim=-1)

        # Weighted sum of values, then merge heads back to (batch, seq_len, d_model).
        out = torch.matmul(attention, V)
        out = out.transpose(1, 2).contiguous().view(N, -1, self.n_heads * self.head_dim)
        out = self.fc_out(out)
        return out
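

# Minimal usage sketch (illustrative, not part of the original module): the
# hyperparameters and tensor shapes below are assumptions chosen for a quick
# shape check of self-attention with a causal mask.
if __name__ == "__main__":
    batch, seq_len, d_model, n_heads = 2, 10, 512, 8
    mha = MultiHeadAttention(d_model, n_heads)
    x = torch.randn(batch, seq_len, d_model)

    # Causal mask broadcastable over (batch, n_heads, q_len, k_len);
    # positions where the mask is 0 are suppressed inside forward().
    causal_mask = torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len)

    out = mha(x, x, x, mask=causal_mask)
    print(out.shape)  # expected: torch.Size([2, 10, 512])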