Patch Mpt: !!better!!
# If already 4D, assume correct if attention_mask.dim() == 4: return attention_mask.to(dtype)
batch = attention_mask.size(0)
# Broadcast to query_len mask = mask.expand(batch, 1, query_length, key_length)
# If already 4D, assume correct if attention_mask.dim() == 4: return attention_mask.to(dtype)
batch = attention_mask.size(0)
# Broadcast to query_len mask = mask.expand(batch, 1, query_length, key_length)