graph TB
%% Source side: each token node links into the shared positional-encoding node p.
    A --> p
    long --> p
    time --> p
    ago --> p
    ... --> p
    p["Positional Encoding"] --> X
%% Encoder: X fans out along edges labelled W^Q, W^K, W^V to Q, K, V,
%% which all feed the self-attention node and then the connected layer.
    subgraph Encoder
        X["\(X\)"] --> |"\(W^Q\)"| Q["\(Q\)"]
        X --> |"\(W^K\)"| K["\(K\)"]
        X --> |"\(W^V\)"| V["\(V\)"]
        Q --> self["Self-Attention\((X) = \sigma(Q*K^T)*V\)"]
        K --> self
        V --> self
        self --> c["Connected Layer"]
    end
%% Encoder output Y; its label is declared here once and reused by id below.
    c --> Y["\(Y\)"]
%% Target side: the [start] token goes through its own positional encoding
%% and supplies the decoder query Q2 along the W^Q edge.
    start["[start]"] --> pd["Positional Encoding"]
    pd --> |"\(W^Q\)"| Q2["\(Q\)"]
%% Decoder: keys and values come from Y, the query from Q2.
    subgraph Decoder
        Y --> |"\(W^K\)"| K2["\(K\)"]
        Y --> |"\(W^V\)"| V2["\(V\)"]
        Q2 --> self2["Encoder/Decoder Attention\((Y,Q) = \sigma(Q*K^T)*V\)"]
        K2 --> self2
        V2 --> self2
        self2 --> c2["Connected Layer"]
    end
%% Final edge, labelled decode, leads to the emitted output node Z.
    c2 --> |decode| Z["Hace"]