moreh-dev · odb9402 · Oct 25, 2023 · Nov 29, 2023
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -43,6 +43,7 @@
         Transformer2DModel,
         UNet1DModel,
         UNet2DConditionModel,
+        EfficientUNet2DConditionModel,
         UNet2DModel,
         UNet3DConditionModel,
         VQModel,

diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
@@ -26,6 +26,7 @@
     from .unet_1d import UNet1DModel
     from .unet_2d import UNet2DModel
     from .unet_2d_condition import UNet2DConditionModel
+    from .unet_2d_condition_efficient import EfficientUNet2DConditionModel
     from .unet_3d_condition import UNet3DConditionModel
     from .vq_model import VQModel
 

diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
@@ -402,9 +402,9 @@ def __init__(self, num_heads, embed_dim, dtype=None):
         super().__init__()
         self.dtype = dtype
         self.positional_embedding = nn.Parameter(torch.randn(1, embed_dim) / embed_dim**0.5)
-        self.k_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
         self.num_heads = num_heads
         self.dim_per_head = embed_dim // self.num_heads
 
@@ -433,11 +433,13 @@ def shape(x):
 
         # (bs*n_heads, class_token_length, length+class_token_length):
         scale = 1 / math.sqrt(math.sqrt(self.dim_per_head))
-        weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)  # More stable with f16 than dividing afterwards
+        # Batched-matmul form of torch.einsum("bct,bcs->bts", q * scale, k * scale).
+        weight = torch.matmul((q * scale).transpose(1,2), k * scale)  # More stable with f16 than dividing afterwards
         weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
 
         # (bs*n_heads, dim_per_head, class_token_length)
-        a = torch.einsum("bts,bcs->bct", weight, v)
+        # Batched-matmul form of torch.einsum("bts,bcs->bct", weight, v).
+        a = torch.matmul(weight, v.transpose(1,2)).transpose(1,2)
 
         # (bs, length+1, width)
         a = a.reshape(bs, -1, 1).transpose(1, 2)

diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
@@ -215,6 +215,7 @@ def forward(
         self,
         hidden_states,
         encoder_hidden_states=None,
+        encoder_attention_mask=None,
         timestep=None,
         class_labels=None,
         cross_attention_kwargs=None,
@@ -265,6 +266,7 @@ def forward(
             hidden_states = block(
                 hidden_states,
                 encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
                 timestep=timestep,
                 cross_attention_kwargs=cross_attention_kwargs,
                 class_labels=class_labels,