您好，看了您关于 GAU 的代码，发现您的代码中并没有 `scale_offset` 的相关代码。
```python
def scale_offset(x):
    gamma = var(x.shape[-1:])
    beta = var(x.shape[-1:])
    return x * gamma + beta

def attn(x, v, s=128):
    z = dense(x, s)
    q, k = scale_offset(z), scale_offset(z)
    qk = tf.einsum('bns,bms->bnm', q, k)
    a = relu(qk + rel_pos_bias(q, k)) ** 2
    return tf.einsum('bnm,bme->bne', a, v)

def gated_attn_unit(x, d=768, e=1536):
    shortcut, x = x, norm(x)
    u, v = dense(x, e), dense(x, e)
    x = u * attn(x, v)
    return dense(x, d) + shortcut
```
您好，看了您关于 GAU 的代码，发现您的代码中并没有 `scale_offset` 的相关代码。
```python
def scale_offset(x):
    gamma = var(x.shape[-1:])
    beta = var(x.shape[-1:])
    return x * gamma + beta

def attn(x, v, s=128):
    z = dense(x, s)
    q, k = scale_offset(z), scale_offset(z)
    qk = tf.einsum('bns,bms->bnm', q, k)
    a = relu(qk + rel_pos_bias(q, k)) ** 2
    return tf.einsum('bnm,bme->bne', a, v)

def gated_attn_unit(x, d=768, e=1536):
    shortcut, x = x, norm(x)
    u, v = dense(x, e), dense(x, e)
    x = u * attn(x, v)
    return dense(x, d) + shortcut
```