
    rh                     B   d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	c mZ ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. d Z/d^dZ0dejb                  de2dejb                  fdZ3	 d_de	jh                  dejb                  dejb                  dejb                  deejb                     de5de5d e$e&   fd!Z6 G d" d#e	jh                        Z7 ed$       G d% d&e	jh                               Z8 G d' d(e	jh                        Z9 G d) d*e      Z: G d+ d,e	jh                        Z; G d- d.e	jh                        Z< G d/ d0e	jh                        Z= G d1 d2e	jh                        Z> G d3 d4e	jh                        Z? G d5 d6e	jh                        Z@ G d7 d8e	jh                        ZA G d9 d:e	jh                        ZB G d; d<e	jh                        ZC G d= d>e	jh                        ZD G d? d@e	j                        ZF G dA dBe	jh                        ZG G dC dDe	jh                        ZH G dE dFe	jh                        ZI G dG dHe	jh                        ZJ G dI dJe	jh                        ZK e'dKL       G dM dNe"             ZL G dO dP      ZMe' G dQ dRe"             ZN G dS dTe	jh                        ZOe' G dU dVeN             ZPe' G dW dXeNe             ZQ G dY dZeN      ZR G d[ d\eNe      ZSg d]ZTy)`    N)cached_property)CallableOptionalUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      y/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/emu3/modeling_emu3.pyrotate_halfr*   .   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''    c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer*   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r)   apply_rotary_pos_embr6   5   sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr+   hidden_statesn_repreturnc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r#   expandreshape)r7   r8   batchnum_key_value_headsslenhead_dims         r)   	repeat_kvrA   P   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr+   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr    r   r   )r"   dtype)ptrainingr   )rA   num_key_value_groupsr$   matmul	transposer#   nn
functionalsoftmaxfloat32torL   rH   rN   
contiguous)rB   rC   rD   rE   rF   rG   rH   rI   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r)   eager_attention_forwardr]   \   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r+   c                       e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   de
ej                     de
e   d	e
ej                     d
ee   de	ej                  ej                  f   fdZ xZS )Emu3Attention=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr@         Tbias)super__init__ra   rb   getattrhidden_sizenum_attention_headsr@   r>   rO   rG   attention_dropout	is_causalrR   Linearattention_biasq_projk_projv_projo_projselfra   rb   	__class__s      r)   rh   zEmu3Attention.__init__y   sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r+   r7   position_embeddingsrF   past_key_valuecache_positionrI   r9   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr   r   r    )r1   r0   ry   eager        )rH   rG   )r#   r@   rp   viewrQ   rq   rr   r6   updaterb   r]   ra   _attn_implementationr   rN   rl   rG   r<   rW   rs   )ru   r7   rw   rF   rx   ry   rI   input_shapehidden_shapequery_statesrX   rY   r0   r1   cache_kwargsattention_interfacer\   rZ   s                     r)   forwardzEmu3Attention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r+   NN)__name__
__module____qualname____doc__r   intrh   r$   Tensortupler   r	   
LongTensorr   r   r   __classcell__rv   s   @r)   r_   r_   v   s    G
z 
c 
8 +/59))||)) #5<<#=>)) !.	))
 !)) !!1!12)) +,)) 
u||U\\)	*))r+   r_   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Emu3RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z:
        Emu3RMSNorm is equivalent to T5LayerNorm
        N)rg   rh   rR   	Parameterr$   onesweightvariance_epsilon)ru   rj   epsrv   s      r)   rh   zEmu3RMSNorm.__init__   s1     	ll5::k#:; #r+   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr    r   T)keepdim)	rL   rV   r$   rU   powmeanrsqrtr   r   )ru   r7   input_dtypevariances       r)   r   zEmu3RMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r+   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   r   r#   r   ru   s    r)   
extra_reprzEmu3RMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr+   )ư>)r   r   r   rh   r   r   r   r   s   @r)   r   r      s    $;Jr+   r   c                   $     e Zd Z fdZd Z xZS )Emu3MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nre   )rg   rh   ra   rj   intermediate_sizerR   rn   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnru   ra   rv   s     r)   rh   zEmu3MLP.__init__   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r+   c                     | j                  | j                  | j                  |            | j                  |      z        }|S N)r   r   r   r   )ru   r&   r   s      r)   r   zEmu3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r+   r   r   r   rh   r   r   r   s   @r)   r   r      s    0r+   r   c                   d    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	eej                     d
eeej                  ej                  f      dee   deej                  eeej                  ej                  f      f   fdZ xZS )Emu3DecoderLayerra   rb   c                 h   t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        j                  |j                        | _        y )N)ra   rb   r   )rg   rh   rj   r_   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormrR   Dropoutrl   rH   rt   s      r)   rh   zEmu3DecoderLayer.__init__   s    !--&f	J6?*6+=+=6CVCVW(3F4F4FFL_L_(`%zz&":":;r+   r7   rF   r2   rx   	use_cachery   rw   rI   r9   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	| j                  |      z   }|}	| j                  |      }| j	                  |      }|	| j                  |      z   }|S )N)r7   rF   r2   rx   r   ry   rw    )r   r   rH   r   r   )ru   r7   rF   r2   rx   r   ry   rw   rI   residual_s              r)   r   zEmu3DecoderLayer.forward   s     !,,];)4>> 	
')%)) 3	
 	
q !4<<#>> 55mD/ 4<<#>>r+   )NNNFNN)r   r   r   r   r   rh   r$   r   r   r   r	   boolr   r   r   FloatTensorr   r   r   s   @r)   r   r      s    	<z 	<c 	< 2637*.$)59KO|| !. u//0	
 ! D> !!1!12 &eELL%,,,F&GH +, 
u  (51B1BEDUDU1U+V"WW	Xr+   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    ra   c                    t         |           t        j                  |j                  |j
                        | _        | j                  j                  j                  j                  d|j                  z  d|j                  z         y )Ng            ?)
rg   rh   rR   	Embeddingcodebook_size	embed_dim	embeddingr   datauniform_r   s     r)   rh   z!Emu3VQVAEVectorQuantizer.__init__  sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr+   hidden_statec                    |j                   \  }}}}}|j                  ddddd      j                         }|j                  d|      }t	        j
                  |dz  dd      }t	        j
                  | j                  j                  dz  d	      }	dt	        j                  || j                  j                  j                  dd            z  }
||	z   |
z
  }
t	        j                  |
d	      }|j                  ||||      }|S )
Nr   r   r      r    r   T)r"   r   r!   )r#   permuterW   r}   r$   sumr   r   rP   rQ   argmin)ru   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r)   r   z Emu3VQVAEVectorQuantizer.forward  s    8D8J8J5
Hh#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;T^^=R=R=\=\]^`a=bcc	$}4y@	$||I1=388XvW\]##r+   )
r   r   r   r   r   rh   r$   r   r   r   r   s   @r)   r   r     s&    e e
$ELL $r+   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r    r   kernel_sizestridepaddingrg   rh   rR   Conv2dconvru   in_channelsrv   s     r)   rh   z'Emu3VQVAEEncoderConvDownsample.__init__1  '    IIk;AaYZ[	r+   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r   r   r   constantr   )padmoderE   )Fr   r   ru   r7   s     r)   r   z&Emu3VQVAEEncoderConvDownsample.forward5  s+    mJVWX		-0r+   r   r   s   @r)   r   r   0  s    \r+   r   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   r   r   r   s     r)   rh   z%Emu3VQVAEEncoderConvUpsample.__init__=  r   r+   c                 X    t        j                  |dd      }| j                  |      }|S )N       @nearestscale_factorr   )r   interpolater   r   s     r)   r   z$Emu3VQVAEEncoderConvUpsample.forwardA  s(    m#IV		-0r+   r   r   s   @r)   r   r   <  s    \r+   r   c            	       \     e Zd Zdededee   dee   f fdZdej                  fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelr   r   c                 P   t         	|           t        |dd  |dd        D cg c]
  \  }}||z
   }}}d| _        |d d d   D ]%  }| xj                  |dz  |dz  z   |dz  fz  c_        ' | xj                  dz  c_        t	        j
                  ||||      | _        y c c}}w )Nr   r   r   r    )r    r   )r   )rg   rh   zipr   rR   Conv3dr   )
ru   r   r   r   r   
one_kernel
one_stridepadding_sizespad_sizerv   s
            r)   rh   zEmu3VQVAEConv3d.__init__H  s     	ORS^_`_aSbdjklkmdnOop5KZj0pp%dd+ 	JHLLX]X\98q=IIL	JII	
	 qs   B"r7   c                 h    t        j                  || j                        }| j                  |      }|S r   )r   r   r   r   r   s     r)   r   zEmu3VQVAEConv3d.forward^  s*    mT\\:		-0r+   )
r   r   r   r   r   rh   r$   r   r   r   r   s   @r)   r   r   G  sF    

 
 3Z	

 c

,U\\ r+   r   c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t         |           t        j                  |ddd      | _        t        j
                  ||ddd      | _        t        j
                  ||ddd      | _        y )N    r   Tnum_channels
num_groupsr   affiner   r   r   )rg   rh   rR   	GroupNorm
norm_layerr   conv_yconv_bru   r   r  rv   s      r)   rh   zEmu3VQVAESpatialNorm.__init__e  sn    
 	,,%	
 ii
 ii
r+   r7   quant_statesc                     t        j                  ||j                  dd  d      }| j                  |      }|| j	                  |      z  | j                  |      z   }|S )NrK   r   )sizer   )r   r   r#   r  r  r  )ru   r7   r  s      r)   r   zEmu3VQVAESpatialNorm.forward  sX    }}\8K8KBC8PW`a6%L(AADKKP\D]]r+   	r   r   r   r   rh   r$   r   r   r   r   s   @r)   r  r  d  s5    

 
8U\\  r+   r  c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalUpsampler   r   c                 J    t         |           t        ||dd      | _        y )Nr   r   r   r   r   r   r   r   rg   rh   r   r   ru   r   r   rv   s      r)   rh   z"Emu3VQVAETemporalUpsample.__init__  (    
 	#!	
	r+   r7   c                 P   |j                   \  }}}}}|j                  ddddd      j                         j                  |d|      }t	        j
                  |dd	      }|j                  ||||d      j                  ddddd      j                         }| j                  |      }|S )
Nr   r   r   r   r    r   r   r   r   )r#   r   rW   r}   r   r   r   )ru   r7   r   r   r   r   r   s          r)   r   z!Emu3VQVAETemporalUpsample.forward  s    8E8K8K5
Hh%--aAq!<GGINNz[]_ghm#IV%**:xPRS[[\]_`bcefhijuuw		-0r+   r  r   s   @r)   r  r    s*    

 
U\\ r+   r  c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalDownsampler   r   c                 J    t         |           t        ||dd      | _        y )N)r   r   r   )r    r   r   r  r  r  s      r)   rh   z$Emu3VQVAETemporalDownsample.__init__  r   r+   r7   c                 (    | j                  |      }|S r   )r   r   s     r)   r   z#Emu3VQVAETemporalDownsample.forward  s    		-0r+   r  r   s   @r)   r#  r#    s*    

 
U\\ r+   r#  c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockc                 p   t         |           || _        ||n|| _        t	        j
                  |      | _        t        ||dd      | _        t	        j
                  |      | _	        t        ||dd      | _
        | j                  | j                  k7  r t	        j                  ||ddd      | _        y y )Nr  r  r  r   r   r   )rg   rh   r   r  rR   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr  s      r)   rh   z%Emu3VQVAETemporalResnetBlock.__init__  s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r+   c                 L   |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S r   )	r*  r$   sigmoidr+  r,  r-  r   r  r.  )ru   r7   r   s      r)   r   z$Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H-''r+   r   r   r   s   @r)   r'  r'    s     @(r+   r'  c                   ~     e Zd Z	 	 ddedee   dee   f fdZddej                  deej                     fdZ xZ	S )	Emu3VQVAEResnetBlockr   r  quant_channelsc                    t         |           || _        ||n|}|| _        || _        |=t        j                  |ddd      | _        t        j                  |ddd      | _        n"t        ||      | _        t        ||      | _        t        j                  ||ddd      | _        t        j                  ||ddd      | _        | j                  | j                  k7  r t        j                  ||ddd      | _        y y )	Nr
  r   Tr  r   r   r   r   )rg   rh   r   r  r3  rR   r  r*  r,  r  r   r+  r-  r.  )ru   r   r  r3  rv   s       r)   rh   zEmu3VQVAEResnetBlock.__init__  s    	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nkJDJ-nlKDJYY

 YY

 t000 "		!D 1r+   r7   c                 v   | j                   dn|f}|} | j                  |g| }|t        j                  |      z  }| j	                  |      } | j
                  |g| }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S Nr   )
r3  r*  r$   r0  r+  r,  r-  r   r  r.  )ru   r7   r3  	norm_argsr   s        r)   r   zEmu3VQVAEResnetBlock.forward  s    --5BN;L	 "

==9=}55

=1"

==9=}55

=1t000((2H-''r+   r   r   )
r   r   r   r   r   rh   r$   r   r   r   r   s   @r)   r2  r2    sU     '+(,	** sm* !	*X(U\\ (8ELLCY (r+   r2  c            
            e Zd ZdZdef fdZ	 ddej                  deej                     de	ej                  eej                     f   fdZ
 xZS )	Emu3VQVAEAttentionBlockr`   ra   c                 &   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        d| _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rd   Fr   )rg   rh   ra   rj   r   rk   	num_headsr@   
ValueErrorscalerl   rH   rm   rR   rn   rq   rr   rp   out_projrO   r   s     r)   rh   z Emu3VQVAEAttentionBlock.__init__%  s$   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..A %&!r+   r7   rF   r9   c           
      :   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }||fS )z#Input shape: Batch x Time x Channelr   r    r{   r|   )rm   rG   rH   )r#   rp   rq   rr   r}   r;  r@   rQ   r]   ra   r   r   rm   r=  rN   rH   r<   rW   r>  )ru   r7   rF   rI   r   
seq_lengthr   querieskeysvaluesr   r\   rZ   s                r)   r   zEmu3VQVAEAttentionBlock.forward<  sa    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r+   r   )r   r   r   r   r   rh   r$   r   r   r   r   r   r   s   @r)   r9  r9  "  s\    G& &4 26$)||$) !.$)
 
u||Xell33	4$)r+   r9  c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 $    t        |   di | y r6  )rg   rh   )ru   rI   rv   s     r)   rh   zEmu3VQVAEGroupNorm.__init__j  s    "6"r+   c                     t        j                  || j                  | j                  | j                  | j
                        S r   )r   
group_normr  r   rf   r   )ru   inputr  s      r)   r   zEmu3VQVAEGroupNorm.forwardm  s)    ||E4??DKKDHHUUr+   r   )r   r   r   r   rh   r   r   r   s   @r)   rE  rE  c  s    #Vr+   rE  c                   `     e Zd Zd fd	Zddej
                  deej
                     fdZ xZS )Emu3VQVAEMiddleBlockc                     t         |           t        |||      | _        t	        |      | _        |t        |ddd      | _        nt        ||      | _        t        |||      | _	        y )Nr   r  r3  r
  r   Tr  )
rg   rh   r2  block_1r9  attn_1rE  	attn_normr  block_2)ru   ra   r   r3  rv   s       r)   rh   zEmu3VQVAEMiddleBlock.__init__r  so    +#$)

 .f5!/[UW]ajnoDN1.+NDN+#$)
r+   r7   r  c                 b   | j                  ||      }|}| j                  ||      }|j                  \  }}}}|j                  ||||z        j	                  dd      }| j                  |      d   }|j                  ||||      j                  dddd      }||z   }| j                  ||      }|S )Nr   r    r   r   )	rN  rP  r#   r}   rQ   rO  r<   r   rQ  )ru   r7   r  r   r   r   r   r   s           r)   r   zEmu3VQVAEMiddleBlock.forward  s    ]LA }lC.;.A.A+
Hfe%**:x%PZZ[\^_`M215%--j&%RZZ[\^_abdef =0]LAr+   r   )	r   r   r   rh   r$   r   r   r   r   r   s   @r)   rK  rK  q  s,    
(
U%6%6 
huO`O`Fa 
r+   rK  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEDownBlockc           
         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }dt        |      z   }|| _        t        j                         | _        t        | j                        D ]K  }t        j                         }t        j                         }t        j                         }|||   z  }	|||   z  }
t        | j
                        D ]~  }|j                  t        |	|
             |
}	|j                  .||j                  v s=|j                  t!        |             |j                  t        j"                  |	ddd              t        j$                         }||_        ||_        ||_        || j                  dz
  k7  rt-        |	      |_        | j                  j                  |       N y )N)r   r   r  r
  r   Tr  r   )rg   rh   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr   in_channel_multiplierrR   
ModuleListdownrangeappendr2  attn_resolutionsr9  r  Moduleblockattn
attn_normsr   
downsample)ru   ra   r[  rX  r\  i_levelrc  rd  re  block_in	block_outi_blockr^  rv   s                r)   rh   zEmu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112 	#GMMOE==?DJ$'<W'EEH%(:7(CCI !4!45 
q($,%. %**67fF]F];]KK 7 ?@%%bllUW]ajn&op
q 99;DDJDI(DO$..22"@"JIIT"1	#r+   r7   c                 >   t        | j                        D ]  \  }}t        | j                        D ]  } |j                  |   |      }t        |j                        dkD  s1|} |j                  |   |      }|j                  \  }}}}	|j                  ||||	z        j                  dd      } |j                  |   |      d   }|j                  |||	|      j                  dddd      }||z   } || j                  dz
  k7  s|j                  |      } |S )Nr   r   r    r   )	enumerater^  r_  rZ  rc  rW  rd  re  r#   r}   rQ   r<   r   rY  rf  )
ru   r7   rg  blocksrj  r   r   r   r   r   s
             r)   r   zEmu3VQVAEDownBlock.forward  s5   (3 	AOGV !4!45 = 5W 5m Dv{{#a',H$>F$5$5g$>}$MM:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= $..22 & 1 1- @	A" r+   r   r   r   rh   r$   r   r   r   r   s   @r)   rT  rT    s    ##JU%6%6 r+   rT  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Emu3VQVAEUpBlockc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  |j                  d   z  }t        j                         | _
        t        t        | j                              D ]5  }t        j                         }t        j                         }t        j                         }|j                  |j                  |   z  }t        | j
                  dz         D ]e  }	|j                  t        |||             |}||j                  v s1|j                  t!        |             |j                  t#        ||             g t        j$                         }
||
_        ||
_        ||
_        |dk7  rt-        |      |
_        | j                  j1                  d|
       8 y )Nr   r   rM  r   )rg   rh   rW  rX  rY  rZ  r   r[  rR   r]  upreversedr_  r`  r2  ra  r9  r  rb  rc  rd  re  r   upsampleinsert)ru   ra   r3  rh  rg  rc  rd  re  ri  rj  rr  rv   s              r)   rh   zEmu3VQVAEUpBlock.__init__  s   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;< 	"GMMOE==?DJ,,v/H/H/QQI !4!4q!89 V($,%.'5 %f555KK 7 ?@%%&:>8&TUV BBHBG&BM!|:8DGGNN1b!3	"r+   r7   r  c                 h   t        | j                  d d d         D ]  \  }}t        | j                  dz         D ]  } |j                  |   ||      }t        |j                        dkD  s2|} |j                  |   ||      }|j                  \  }}}	}
|j                  |||	|
z        j                  dd      } |j                  |   |      d   }|j                  ||	|
|      j                  dddd      }||z   } |t        | j                        dz
  k7  s|j                  |      } |S )Nr   r   r   r    r   )rl  rr  r_  rZ  rc  rW  rd  re  r#   r}   rQ   r<   r   rt  )ru   r7   r  rg  rm  rj  r   r   r   r   r   s              r)   r   zEmu3VQVAEUpBlock.forward  sD   (27 	?OGV !4!4q!89 = 5W 5m\ Rv{{#a',H$>F$5$5g$>}l$[M:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= #dgg,** & >	?  r+   rn  r   s   @r)   rp  rp    s(    #"JU%6%6 eFWFW r+   rp  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEEncoderc                    t         |           |j                  }|j                  }|j                  }|j
                  }|j                  }|rd|z  n|}||d   z  }t        j                  j                  ||ddd      | _
        t        |      | _        t        ||      | _        t        j                  j                  d|dd	      | _        t        j                  j                  ||ddd      | _        t%        t'        j(                  |j*                              }	t        j,                         | _        t        j,                         | _        t3        |	      D ])  }
t5        ||      }| j.                  j7                  |       + t3        |j8                        D ]*  }t;        ||
      }| j0                  j7                  |       , y )Nr    r   r   r   r   r
  r   T)r  r  r   r  rV  )rg   rh   r[  r   double_latentlatent_channelsrX  r$   rR   r   conv_inrT  
down_blockrK  middle_blockr  norm_outconv_outr   mathlog2temporal_downsample_factorr]  	time_convtime_res_stackr_  r#  r`  rZ  r'  )ru   ra   r[  r   rz  r{  rX  r  rh  temporal_down_blocksir   r   time_res_convrv   s                 r)   rh   zEmu3VQVAEEncoder.__init__	  s   ,,((,, 00#66.;q?* #5b#99xx{MqYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+, 	(A.|\JDNN!!$'	( v,,- 	6A8()M &&}5	6r+   pixel_valuesc                 h   |j                   d   } |j                  dg|j                   dd   }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }|t        j                  |      z  }| j                  |      } |j                  d|g|j                   dd   }|j                  ddddd      }| j                  D ]"  } ||      }|t        j                  |      z  }$ | j                  D ]
  } ||      } |j                  ddddd      }|S )Nr   r   r    r   r   r   )r#   r<   r|  r}  r~  r  r$   r0  r  r   r  r  )ru   r  temporal_dimr7   r   layers         r)   r   zEmu3VQVAEEncoder.forward0  sH   #))!,+|++BH1C1CAB1GH \26))-8 m4}55m4---b,YATATUVUWAXY%--aAq!< NN 	:D /MU]]=99M	: (( 	1E!-0M	1 &--aAq!<r+   )r   r   r   rh   r$   r   r   r   r   s   @r)   rx  rx    s    %6NE$4$4 r+   rx  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Emu3VQVAEDecoderra   c                    t         	|           |j                  }|j                  |j                  d   z  }t        j                         | _        t        |j                        D ]>  }t        |j                  |j                        }| j                  j                  |       @ t        t        j                  |j                               }t        j                         | _        t        |      D ]=  }t%        |j                  |j                        }| j"                  j                  |       ? t        j&                  |j                  |ddd      | _        t+        |||      | _        t/        |      | _        |j                  |j                  d   z  }t3        ||      | _        t        j&                  ||j6                  ddd      | _        y )Nr   rV  r   r   r   )r3  r   )rg   rh   r   r[  rX  rR   r]  r  r_  rZ  r'  r{  r`  r   r  r  r  r  r  r   r|  rK  r~  rp  up_blockr  r  r  r  )
ru   ra   r3  rh  r   r  temp_upsample_block_numr  r   rv   s
            r)   rh   zEmu3VQVAEDecoder.__init__O  s   ))''&*C*CB*GG mmov,,- 	6A8"22AWAWM &&}5		6 #&dii0Q0Q&R"S./ 	(A,V-C-CVE[E[\DNN!!$'	( yy""
 1R`a(0''&*C*CA*FF,^XF		
r+   r7   r  c                    t        j                  ||fd      }|j                  ddddd      }| j                  D ]
  } ||      } | j                  D ]"  } ||      }|t        j
                  |      z  }$ |j                  ddddd      }t        j                  |dd      \  }} |j                  dg|j                  dd   } |j                  dg|j                  dd   }| j                  |      }| j                  ||      }| j                  ||      }| j                  ||      }|t        j
                  |      z  }| j                  |      }|S )Nr   r!   r    r   r   r   r   )r$   r%   r   r  r  r0  chunkr<   r#   r|  r~  r  r  r  )ru   r7   r  hidden_quant_statesr  s        r)   r   zEmu3VQVAEDecoder.forwardv  sp   #ii(E1M199!Q1aH (( 	=E"'(;"<	= ^^ 	FE"'(;"<5==1D#EE	F 299!Q1aH&+kk2Eqa&P#|---bK=3F3Fqr3JK+|++BH1C1CAB1GH]3 ))-Fm\Bm\B}55m4r+   )	r   r   r   r   rh   r$   r   r   r   r   s   @r)   r  r  N  s+    %
 %
NU\\  r+   r  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                        e Zd ZU eed<   dZdZdZdZdZ	dZ
g dZd Zdef fdZdej                  dej                  fd	Zd
ej                  fdZ xZS )	Emu3VQVAEra   
emuvideovqr  T)r'  r9  r2  r   c                 |   t        |t        j                  t        j                  f      rt        j                  j                  |j                  dd       |j                  qt        j                  j                  |j                        \  }}dt        j                  |      z  }t        j                  j                  |j                  | |       y y t        |t        j                        rt        j                  j                  |j                  t        j                  d             |j                  xt        j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  j                  |j                  | |       y y t        |t        j                  t        j                  t        j                   f      rUt        j                  j#                  |j                  d       t        j                  j#                  |j                  d	       y t        |t        j$                        rc|j                  j&                  j)                          |j*                  2|j                  j&                  |j*                     j-                          y y y )
Nfan_outrelu)r   nonlinearityr      )ar   r   r|   )
isinstancerR   r   r   initkaiming_normal_r   rf   _calculate_fan_in_and_fan_outr  sqrtr   rn   kaiming_uniform_BatchNorm2dr)  r  	constant_r   r   normal_padding_idxzero_)ru   rB   fan_inr   bounds        r)   _init_weightszEmu3VQVAE._init_weights  s   fryy"))45GG##FMM	PV#W{{&GGAA&--P	DIIf--  ufe< ' 		*GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  NOGGfmmS1GGfkk3/-MM&&(!!-""6#5#56<<> . .r+   c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        dt        |j                        dz
  z  | _        t        |j                  |j                  dd      | _        t        |j                  |j                  dd      | _        dt        |j                        dz
  z  | _        | j%                          | j'                          y )Nr    r   )r   r   r   r  r  )rg   rh   ra   rx  encoderr  decoderr   quantizerW  rX  vision_spatial_factorr   r{  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initr   s     r)   rh   zEmu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r+   image_sizesc                    |j                   dk(  }|rL| j                  j                  }|j                  \  }}}}|j	                  d      j                  d|ddd      }n|j                  \  }}}}}| j                  |      }	|	j                  ddddd      }	| j                  |	      }	|	j                  ddddd      }	| j                  |	      }
|r|
j                  d      n|
}t        ||      D cg c]B  \  }}|d t        |d   | j                  z        d t        |d   | j                  z        f   D }}}|S c c}}w )Nr   r   r   r    r   )ndimra   r  r#   r-   repeatr  r   r  r  squeezer   r   r  )ru   r  r  is_imager   r   r   r   r   r7   codesimage_tokenssingle_imager  s                 r)   encodezEmu3VQVAE.encode  sX   $$){{==H2>2D2D/J&%'11!4;;AxAqQL<H<N<N9J(FE\2 &--aAq!<6 &--aAq!<m,+3u}}Q' '*,&D
"d D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr
 

 
s   1AD=r7   c                    |j                   dk(  }|r|j                  d      }|j                  \  }}}}| j                  j	                  |j                               }|j                  d   }|j                  |||||      j                  ddddd      j                         }| j                  |      }	|j                  ddddd      }|	j                  ddddd      }	| j                  |	|      }
|
j                  ||| j                  j                  z  | j                  j                  || j                  z  || j                  z        }
|r	|
d d df   S |
S )Nr   r   r   r   r   r    )r  r-   r#   r  r   flattenr}   r   rW   r  r  r<   ra   r  r  r  )ru   r7   r  r   r   r   r   quantr   
post_quantvideos              r)   decodezEmu3VQVAE.decode  sK    %%*)33A6M.;.A.A+
Hfe''(=(=(?@;;r?

:xIQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/t{{===KK$$T...D---
 'uQT{1E1r+   )r   r   r   r   __annotations__base_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr  rh   r$   r   r  r  r   r   s   @r)   r  r    sq     $$ON"&?* *5<< ell 82ELL 2r+   r  c                       e Zd ZdZd Zed        Zed        Zed        Zed        Z	ed        Z
ed        Zd	eej                     d
ej                  fdZd	ej                  d
ej                  fdZy)Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 j    || _         |j                  d      | _        |j                  d      | _        y )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)ru   r  s     r)   rh   z#Emu3ImageVocabularyMapping.__init__  s+    "%MM/:'mmI6r+   c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w Nz<|visual tokensortedr  items
startswithru   namevals      r)   r  z'Emu3ImageVocabularyMapping.image_tokens  s8    DNN,@,@,BhytSdooVfFgshiih
   A	
A	
c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w r  r  r  s      r)   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str  s8    T^^-A-A-Ci	ctWgGhtijjir  c                 t    | j                   D ci c]  }t        |dd       | j                  |     c}S c c}w )NirK   )r  r   r  )ru   tokens     r)   img2bpez"Emu3ImageVocabularyMapping.img2bpe  s5    FJF[F[\UE"RL!4>>%#88\\\s   #5c                 j    | j                   j                         D ci c]  \  }}||
 c}}S c c}}w r   )r  r  )ru   r/   vs      r)   bpe2imgz"Emu3ImageVocabularyMapping.bpe2img!  s+    !%!3!3!56A1666s   /c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S Nr   rL   )r$   zerosmaxr  rB  r   r  ru   mappingr/   r  s       r)   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensor%  [    ++c$,,"3"3"56:%))LLL&&( 	DAqGAJ	r+   c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S r  )r$   r  r  r  rB  r   r  r  s       r)   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor,  r  r+   	img_batchr9   c                 ,   |j                   }t        j                  |j                  d   dft        j                        | j
                  z  }| j                  |j                  d         }t        j                  ||gd      }|j                  |      S )Nr   r   r  cpur   r!   )	devicer$   r   r#   r   r  r  rV   r%   )ru   r  r  eol_row
img_tokenss        r)   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpe3  sw    !!**iooa0!4EIIFIZIZZ00e1DE
YY
G4"=
}}V$$r+   c                     |j                   }|dd df   }| j                  |j                  d         }|j                  |      S )N.r   r  )r  r  rV   )ru   r  r  r  s       r)   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2img:  sG    !!c3B3h'	00e1DE
}}V$$r+   N)r   r   r   r   rh   r   r  r  r  r  r  r  listr$   r   r  r  r   r+   r)   r  r    s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r+   r  c                   F    e Zd ZU eed<   dZdZdgZddgZdZ	dZ
dZdZdZdZy)	Emu3PreTrainedModelra   modelTr   past_key_valuesr[   FN)r   r   r   r   r  r  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _can_compile_fullgraph!_supports_param_buffer_assignmentr  r  r   r+   r)   r  r  A  sO    &*# $5m"DN!(-%"&r+   r  c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )Emu3RotaryEmbeddingra   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)rg   rh   hasattrr  r  dictr  r	  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenra   r   rope_init_fnattention_scalingregister_bufferr  original_inv_freq)ru   ra   r  r  rv   s       r)   rh   zEmu3RotaryEmbedding.__init__T  s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r+   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r   r   mpsr  F)device_typeenabledr    r!   r  )r  floatr;   r#   rV   r  r  r
  strr$   autocastrQ   r%   r0   r  r1   rL   )
ru   r&   r2   inv_freq_expandedposition_ids_expandedr  freqsembr0   r1   s
             r)   r   zEmu3RotaryEmbedding.forwarde  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r   )
r   r   r   r   rh   r$   no_gradr   r   r   r   s   @r)   r  r  S  s3    /z /" U]]_<  <r+   r  c                       e Zd ZeedZdef fdZee		 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
e   de
ej                     d	e
ej                     d
e
e   dee   defd              Z xZS )Emu3TextModel)r7   
attentionsra   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr   )ra   F)rg   rh   pad_token_idr  
vocab_sizerR   r   rj   embed_tokensr]  r_  num_hidden_layersr   layersr   r   normr  
rotary_embgradient_checkpointingr  rt   s      r)   rh   zEmu3TextModel.__init__|  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabYfi0b
   2 28K8KL	-V<&+# 	 cs   D	input_idsrF   r2   r   inputs_embedsry   r   rI   r9   c           
      *   |d u |d uz  rt        d      || j                  |      }|r|
t               }|F||j                         nd}	t	        j
                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
|}| j                  ||      }| j                  d | j                  j                   D ]  } ||f|
||||d|} | j                  |      }t        ||      S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )r  )ra   input_embedsrF   ry   r   r2   )rF   r2   rx   ry   rw   )last_hidden_stater   )r<  r)  r
   get_seq_lengthr$   aranger#   r  r-   r   ra   r-  r+  r*  r,  r   )ru   r/  rF   r2   r   r0  ry   r   rI   past_seen_tokensr[   r7   rw   decoder_layers                 r)   r   zEmu3TextModel.forward  sT    -t";<YZZ *.*;*;I*FM0*nO!CRC^==?de+0<< "2]5H5H5K"KTaThTh,N )33A6L(;;&))+%
 &"oom\J![[)H4;;+H+HI 		M)*).-$7 M		 		-0&++
 	
r+   )NNNNNNN)r   r   r   r   r_   _can_record_outputsr   rh   r   r   r   r$   r   r   r	   r   r   r   r   r   r   r   r   s   @r)   r$  r$  u  s     *#
z    151537+/5959$(8
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 !!1!128
 D>8
 +,8
 
!8
  8
r+   r$  c                   |    e Zd ZU dgZddiZddgdgfiZeed<    fdZd Z	d	 Z
ee	 	 	 	 	 	 	 	 	 dd
eej                     deej                      deej                     dee   deej$                     deej                     dee   deej                     deeej                   f   dee   defd              Z xZS )Emu3ForCausalLMlm_head.weightlm_headcolwise_repr7   logitsra   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y NFre   )
rg   rh   r$  r  r(  rR   rn   rj   r<  r  r   s     r)   rh   zEmu3ForCausalLM.__init__  sU     "6*
 ++yy!3!3V5F5FUS 	r+   c                     || _         y r   r  ru   r  s     r)   set_decoderzEmu3ForCausalLM.set_decoder  s	    
r+   c                     | j                   S r   rB  r   s    r)   get_decoderzEmu3ForCausalLM.get_decoder  s    zzr+   r/  rF   r2   r   r0  labelsr   ry   logits_to_keeprI   r9   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```r/  rF   r2   r   r0  r   ry   Nr>  rG  r(  lossr>  r   r7   r%  r   )r  r3  r  r   slicer<  loss_functionra   r(  r   r   r7   r%  )ru   r/  rF   r2   r   r0  rG  r   ry   rH  rI   outputsr7   slice_indicesr>  rM  s                   r)   r   zEmu3ForCausalLM.forward  s    @ ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r+   )	NNNNNNNNr   )r   r   r   _tied_weights_keys_tp_plan_pp_planr   r  rh   rD  rF  r   r   r   r$   r   r   r	   r   r   r   r   r   r   r   r   r   r   s   @r)   r:  r:    s@   *+=)H_-z:;H  151537+/59-1$(59348
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 ))*8
 D>8
 !!1!128
 c5<</08
 +,8
 
 8
  8
r+   r:  c                   `    e Zd ZddiZ fdZd Zd Zd Zd Zde	j                  d	e	j                  fd
Zde	j                  d	e	j                  fdZe	j                  de	j                  dedefd       Zde	j                  de	j                  de	j                  fdZee	 	 	 	 	 	 	 	 	 dde	j                  de	j                  d	e	j(                  dee	j(                     dee	j                     dee   dee	j                     dee   dee	j                     dee   deeef   fd              Z xZS )	Emu3Modelztext_model.model
text_modelc                     t         |   |       t        j                  |j                        | _        t        |j                        | _        t        |j                        | _        | j                          y r   )rg   rh   r$  _from_configtext_configrW  r  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  r   s     r)   rh   zEmu3Model.__init__  sY     '44V5G5GH !1!12"<V=R=R"S 	r+   c                 6    | j                   j                         S r   )rW  get_input_embeddingsr   s    r)   r`  zEmu3Model.get_input_embeddings(  s    3355r+   c                 :    | j                   j                  |       y r   )rW  set_input_embeddingsru   rE   s     r)   rb  zEmu3Model.set_input_embeddings+  s    ,,U3r+   c                     || _         y r   rW  rC  s     r)   rD  zEmu3Model.set_decoder.  s	    !r+   c                     | j                   S r   re  r   s    r)   rF  zEmu3Model.get_decoder1  s    r+   r  r  c                     | j                   j                  ||      }|D cg c]+  }| j                  j                  |      j	                         - }}t        j                  |      }|S c c}w )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        )r\  r  r^  r  r  r$   r%   )ru   r  r  image_tokens_listtokensbpe_tokens_list
bpe_tokenss          r)   get_image_tokenszEmu3Model.get_image_tokens4  sc     !LL//kJctuY_422BB6JRRTuuYY/
 vs   0A*c                    | j                  ||      }|D cg c];  \  }}|| j                  j                  z  || j                  j                  z  dz   z  = }}} | j                         |      }t	        j
                  ||      }|S c c}}w )a7  
        Tokenizes images into discrete tokens with VQGAN module and embeds
        them with text embeddings layer

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
                The tensors corresponding to the input images.
        r   )rl  r\  r  r`  r$   split)ru   r  r  r  r   r   split_sizesimage_featuress           r)   get_image_featureszEmu3Model.get_image_featuresE  s     ,,\;G "-
 t||999et||GiGi>ilm>mn
 
 5224\B^[A
s   A B	r  r   r   c                     |ddddf   j                  d||dz         }| j                  j                  |      }| j                  j	                  |      }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr   r   )r}   r^  r  r\  r  )ru   r  r   r   	sequencesimages         r)   decode_image_tokenszEmu3Model.decode_image_tokensW  sX     !CRC(--b&%!)D	..>>yI##L1r+   r/  r0  rp  c                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }||   j                         |j                         k7  rt        d| d|       |S )z
        Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )rL   r  r   r   r   z6Image features and image tokens do not match: tokens: z, features )r`  r$   tensorr^  r  longr  allr   r-   	expand_asrV   r#   numelr<  )ru   r/  r0  rp  special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_maskzEmu3Model.get_placeholder_maskj  s    !.2M$2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*d.E.E.T.T!T+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL+,2248L8L8NNHHXXcdtcuv  "!r+   rF   r2   r   r   ry   rI   r9   c
           
      2   |du |duz  rt        d      | | j                         |      }|O| j                  ||      }t        j                  |d      }| j                  |||      }|j                  ||      } | j                  d||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r!   )r0  rp  )rF   r2   r   r0  r   ry   r   )r<  r`  rq  r$   r%   r  masked_scatterrW  )ru   r/  r  r  rF   r2   r   r0  r   ry   rI   image_embedsr}  rP  s                 r)   r   zEmu3Model.forward  s    * -t";<s   7D557	BM#22<ML 99\q9L!%!:!:| "; " *889K\ZM "$// 
)%+')
 
 r+   )	NNNNNNNNN)r   r   r   _checkpoint_conversion_mappingrh   r`  rb  rD  rF  r$   r   r   rl  rq  r"  r   rv  r  r   r   r   r   r	   r   r   r   r   r   r   r   r   r   s   @r)   rV  rV    s   &8,%G"64"U->-> UM]M] "u/@/@ uO_O_ $ ]]0@0@ # VY  $"))":?:K:K"]b]n]n"0  '+*.$(1537+/59$(59.##. ''. \\	.
 !.. u//0. "%.   1 12. D>. !!1!12. +,. 
u,,	-.  .r+   rV  c                       e Zd ZdZdgZddddZ fdZd Zd	 Zd
e	j                  fdZd Zd Zed        Zed        Zed        Zd Zee	 	 	 	 	 	 	 	 	 	 	 d dej,                  dej.                  dej0                  deej0                     deej,                     dee   deej.                     dee   deej,                     deej,                     deeej0                  f   dee   d
ee e!f   fd              Z"	 	 	 	 	 	 	 d! fd	Z# xZ$S )"Emu3ForConditionalGeneration r;  zmodel.text_modelzmodel.vqmodelr<  )z^text_model.modelz^vqmodelz^text_model.lm_headc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y r@  )rg   rh   rV  r  rR   rn   rZ  rj   r(  r<  r  r   s     r)   rh   z%Emu3ForConditionalGeneration.__init__  sS     v&
yy!3!3!?!?ASASA^A^ejkr+   c                 6    | j                   j                         S r   )r  r`  r   s    r)   r`  z1Emu3ForConditionalGeneration.get_input_embeddings  s    zz..00r+   c                 :    | j                   j                  |       y r   )r  rb  rc  s     r)   rb  z1Emu3ForConditionalGeneration.set_input_embeddings  s    

''.r+   r9   c                     | j                   S r   )r<  r   s    r)   get_output_embeddingsz2Emu3ForConditionalGeneration.get_output_embeddings  s    ||r+   c                 :    | j                   j                  |       y r   )r  rD  rC  s     r)   rD  z(Emu3ForConditionalGeneration.set_decoder  s    

w'r+   c                 6    | j                   j                         S r   )r  rF  r   s    r)   rF  z(Emu3ForConditionalGeneration.get_decoder  s    zz%%''r+   c                 .    | j                   j                  S r   )r  rW  r   s    r)   rW  z'Emu3ForConditionalGeneration.text_model  s    zz$$$r+   c                 .    | j                   j                  S r   )r  r\  r   s    r)   r\  z$Emu3ForConditionalGeneration.vqmodel  s    zz!!!r+   c                 .    | j                   j                  S r   )r  r^  r   s    r)   r^  z/Emu3ForConditionalGeneration.vocabulary_mapping  s    zz,,,r+   c                 :     | j                   j                  di |S r6  )r  rv  )ru   rI   s     r)   rv  z0Emu3ForConditionalGeneration.decode_image_tokens  s    -tzz--777r+   r/  r  r  rF   r2   r   r0  r   ry   rG  rH  rI   c                     | j                   d|||||||	d|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|
4 | j
                  d||
| j                  j                  j                  d|}t        |||j                  |j                  |j                        S )at  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```rJ  r   NrK  rL  r   )r  r  r   rN  r<  rO  ra   rZ  r(  r   r   r7   r%  )ru   r/  r  r  rF   r2   r   r0  r   ry   rG  rH  rI   rP  r7   rQ  r>  rM  s                     r)   r   z$Emu3ForConditionalGeneration.forward  s    | $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r+   c	                 R    t        |   |f|||||||d|	}
|d   dk7  rd |
d<   |
S )N)r   rF   r0  ry   r2   r  r   r   r  )rg   prepare_inputs_for_generation)ru   r/  r   rF   r0  ry   r2   r   r  rI   model_inputsrv   s              r)   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generation@  sZ     w<

+)')%%

 

 !!+/L(r+   )NNNNNNNNNNr   )NNNNNTN)%r   r   r   r  rR  r  rh   r`  rb  rR   rb  r  rD  rF  propertyrW  r\  r^  rv  r   r   r$   r   r   r   r   r	   r   r   r   r   r   r   r   r   r  r   r   s   @r)   r  r    s   *+/#(&"1/ryy (( % % " " - -8  '+*.$(1537+/59$(59-134X
##X
 ''X
 \\	X

 !.X
 u//0X
 "%X
   1 12X
 D>X
 !!1!12X
 ))*X
 c5<</0X
 +,X
 
u,,	-X
  X
z  r+   r  )r  r:  r$  r  r  rV  )Nr   )r|   )Ur  	functoolsr   typingr   r   r   r$   torch.nnrR   torch.nn.functionalrS   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_emu3r   r   r   r*   r6   r   r   rA   rb  r  r]   r_   r   r   r   r   r   r   r   r  r  r#  r'  r2  r9  r  rE  rK  rT  rp  rx  r  r  r  r  r  r$  r:  rV  r  __all__r   r+   r)   <module>r     s$  .  % , ,     ! . ) 7 / 9 O K F & I I / K K(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4C)BII C)L Y'J")) J (J(bii  *1 *Z$ryy $D	RYY 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~>)bii >)BV V299 D8 8v7ryy 7tCryy CLCryy CL l2 l2l2^3% 3%l '/ ' '"<")) <D P
' P
 P
f O
)? O
 O
dV# Vrh#6 hVr+   