
    rh                     $   d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0  e(       rddl1m2Z2  e*jf                  e4      Z5e& G d de!             Z6 G d dejn                        Z8 G d dejn                        Z9 ed       G d dejn                               Z: G d dejn                        Z;d  Z<dCd!Z=d"ej|                  d#e?d$ej|                  fd%Z@	 dDd&ejn                  d'ej|                  d(ej|                  d)ej|                  d*eej|                     d+eAd,eAd-e#e%   fd.ZB G d/ d0ejn                        ZC G d1 d2ejn                        ZD G d3 d4e      ZE G d5 d6e6      ZF G d7 d8e      ZG G d9 d:e6      ZH e&d;<       G d= d>e6             ZI e&d?<       G d@ dAe6e0             ZJg dBZKy)E    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZddgZy)DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerN)__name__
__module____qualname__r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules     w/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.pyr(   r(   >   s<    &*#N!!O*,=>r;   r(   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r)   c                 ~   t         |           t        j                  |j                  |j
                  z  |j                        | _        |j                  | _        |j
                  | _        t        j                  |j
                  t        j                        |j                  z  }| j                  d|d       y )NdtypeoffsetsF
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr)   rB   	__class__s      r<   rF   z!DiaMultiChannelEmbedding.__init__Y   s    \\&"3"3f6I6I"I6K]K]^
!--"//,,v22%**EHYHYYYEBr;   audio_codesreturnc                 "   || j                   j                  |j                        z   j                  d      }| j	                  |      j                  |j                  d   |j                  d   d| j                        }|j                  d      S )Nr!   r      dim)	rB   todevicesqueezerK   viewshaperJ   sum)rP   rR   tokensembedss       r<   forwardz DiaMultiChannelEmbedding.forwarda   su    0B0B CCLLQOF#((a+:K:KA:NPRTXTdTdezzaz  r;   )
r.   r/   r0   __doc__r#   rF   rL   Tensorra   __classcell__rQ   s   @r<   r>   r>   K   s2    C/ C!5<< !ELL !r;   r>   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DiaMLPc                 *   t         |           || _        t        j                  |j
                  d|j                  z  d      | _        t        j                  |j                  |j
                  d      | _        t        |j                     | _        y )NrV   Fbias)rE   rF   r)   r   LinearrJ   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnrP   r)   rQ   s     r<   rF   zDiaMLP.__init__h   sp    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r;   hidden_statesrS   c                     | j                  |      }|j                  dd      \  }}|| j                  |      z  }| j                  |      S )NrV   rU   rW   )rm   chunkrp   rn   )rP   rr   	up_statesgates       r<   ra   zDiaMLP.forwardp   sL    %%m4	#//!/4i 2 24 88	~~i((r;   )r.   r/   r0   rF   rL   FloatTensorra   rd   re   s   @r<   rg   rg   g   s'    7)U%6%6 )5;L;L )r;   rg   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )
DiaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z9
        DiaRMSNorm is equivalent to T5LayerNorm
        N)rE   rF   r   	ParameterrL   onesweightvariance_epsilon)rP   rJ   epsrQ   s      r<   rF   zDiaRMSNorm.__init__{   s1     	ll5::k#:; #r;   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )NrV   rU   T)keepdim)	rA   rY   rL   float32powmeanrsqrtr   r~   )rP   rr   input_dtypevariances       r<   ra   zDiaRMSNorm.forward   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r;   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler~   r]   r   rP   s    r<   
extra_reprzDiaRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr;   )gư>)r.   r/   r0   rF   ra   r   rd   re   s   @r<   rz   rz   y   s    $;Jr;   rz   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )DiaRotaryEmbeddingr)   c                    t         |           t        |d      rUt        |j                  t
              r;|j                  j                  d|j                  j                  d            | _        nd| _        |j                  | _	        |j                  | _
        || _        t        | j                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                   | _        y )Nrope_scaling	rope_typetypedefaultinv_freqFrC   )rE   rF   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr)   r   rope_init_fnattention_scalingrO   r   original_inv_freq)rP   r)   rZ   r   rQ   s       r<   rF   zDiaRotaryEmbedding.__init__   s    6>*z&:M:Mt/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r;   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rU   r!   mpscpuF)device_typeenabledrV   rW   r@   )r   floatexpandr]   rY   rZ   r   r   strrL   autocast	transposecatcosr   sinrA   )
rP   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r<   ra   zDiaRotaryEmbedding.forward   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.N)
r.   r/   r0   r"   rF   rL   no_gradr   ra   rd   re   s   @r<   r   r      s3    /y /" U]]_<  <r;   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrU   rV   rW   )r]   rL   r   )r   x1x2s      r<   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r;   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r<   apply_rotary_pos_embr      sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr;   rr   n_reprS   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r]   r   reshape)rr   r   batchnum_key_value_headsslenhead_dims         r<   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr;   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrV   r   rU   )rX   rA   )ptrainingr!   )r   num_key_value_groupsrL   matmulr   r]   r   
functionalsoftmaxr   rY   rA   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r<   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r;   c                       e Zd ZdZddeeef   dedef fdZ		 	 dde
j                  dee
j                  e
j                  f   dee
j                     d	ee   d
ee
j                     dee   dee
j                  e
j                  f   fdZ xZS )DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperr)   	layer_idx	is_causalc                    t         |           || _        || _        |j                  | _        | j                  j
                  | _        | j                  j                  xs | j                  | _        | j                  | j                  z  | _        t        |d|j                  | j                  z        | _
        d| _        d| _        || _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr   r!           Fri   )rE   rF   r)   r   rJ   num_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   rk   q_projk_projv_projo_proj)rP   r)   r   r   rQ   s       r<   rF   zDiaSelfAttention.__init__   sF   "!--88#';;#B#B#Tdnn $(NNd6N6N$N!
F4F4F$..4XY!$"ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]r;   rr   position_embeddingsr   past_key_valuecache_positionr   rS   c                 4   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )NrU   r!   rV   )r   r   r   eagerr   )r   r   )r]   r   r   r\   r   r   r   r   updater   r   r)   _attn_implementationr   r   r   r   r   r   r   )rP   rr   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r<   ra   zDiaSelfAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r;   )FNN)r.   r/   r0   rb   r   r$   r#   intboolrF   rL   rc   r   r   r	   
LongTensorr   r   ra   rd   re   s   @r<   r   r      s    G^u%57G%GH ^UX ^ei ^. +/59))||)) #5<<#=>)) !.	))
 !)) !!1!12)) +,)) 
u||U\\)	*))r;   r   c                        e Zd ZdZdedef fdZ	 	 ddej                  dej                  de	ej                     de	e
   d	ee   d
eej                  e	ej                     f   fdZ xZS )DiaCrossAttentionr   r)   r   c                 f   t         |           || _        || _        |j                  | _        |j
                  | _        | j                  j                  | _        | j                  j                  | _	        | j                  | j                  z  | _
        |j                  | _        d| _        d| _        d| _        t!        j"                  | j                  | j                  | j                  z  d      | _        t!        j"                  | j
                  | j                  | j                  z  d      | _        t!        j"                  | j
                  | j                  | j                  z  d      | _        t!        j"                  | j                  | j                  z  | j                  d      | _        y )Nr!   r   Fri   )rE   rF   r)   r   rJ   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   rk   r   r   r   r   rP   r)   r   rQ   s      r<   rF   zDiaCrossAttention.__init__<  s?   "!--!'!9!9>>#';;#H#H $(NNd6N6N$N!--!$ii 0 0$..4==2PW\]ii 6 68P8PSWS`S`8`glmii 6 68P8PSWS`S`8`glmii >@P@PW\]r;   rr   cross_attention_statesr   past_key_valuesr   rS   c                 b   |j                   d d }g |d| j                  }g |j                   d d d| j                  }| j                  |      j                  |      j	                  dd      }	|%|j
                  j                  | j                        nd}
|]|
r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        }| j                   j"                  dk7  rt$        | j                   j"                     } || |	|||fd| j&                  i|\  }}|j)                  g |d      j+                         }| j-                  |      }||fS )NrU   r!   rV   FTr   r   )r]   r   r   r\   r   
is_updatedr   r   cross_attention_cachelayerskeysvaluesr   r   r   r   r)   r   r   r   r   r   r   )rP   rr   r  r   r  r   r   r   cross_shaper   r	  r   r   r   r   r   s                   r<   ra   zDiaCrossAttention.forwardO  s    $))#2.88b8$--8M.44Sb9M2Mt}}M{{=166|DNNqRSTGVGb_//33DNNChm
&:(>>EEdnnUZZJ*@@GGW^^L%;<AA+NXXYZ\]^J;;'=>CCKPZZ[\^_`L*+:+P+P+W+W NN,(
L >B**4>>:(?;;++w6"9$++:Z:Z"[$7%
 LL%
 %
!\ "))*<K*<*<=HHJkk+.L((r;   r   )r.   r/   r0   rb   r#   r   rF   rL   rc   r   r   r   r   r   ra   rd   re   s   @r<   r   r   9  s    G^/ ^C ^. 269=1)||1) !&1) !.	1)
 ""561) -.1) 
u||Xell33	41)r;   r   c                        e Zd Zdedef fdZ	 	 d
dej                  dee	ej                  ej                  f      deej                     de
e   de	ej                  eej                     f   f
d	Z xZS )r,   r)   r   c                     t         |           t        |j                  |j                        | _        t        ||d      | _        t        |j                  |j                        | _        t        |      | _
        y )Nr   Fr   )rE   rF   rz   rJ   norm_epspre_sa_normr   self_attentionpost_sa_normrg   mlpr  s      r<   rF   zDiaEncoderLayer.__init__  s\    %f&8&8fooN.vyER&v'9'9vO&>r;   rr   r   r   r   rS   c                     |}| j                  |      } | j                  |f||d|\  }}||z   }|}| j                  |      }| j                  |      }	||	z   }||fS )Nr   r   )r  r  r  r  )
rP   rr   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r<   ra   zDiaEncoderLayer.forward  s     !((7.Ad.A.A/
 3)/
 	/
++ !#33 ))-8((=) 7*///r;   r   )r.   r/   r0   r$   r   rF   rL   rc   r   r   r   r   ra   rd   re   s   @r<   r,   r,     s    "/ "C " LP15	0||0 &eELL%,,,F&GH0 !.	0
 -.0 
u||Xell33	40r;   r,   c                        e Zd Zdef fdZee	 	 	 ddej                  de	ej                     de	e
   de	e
   dee   d	eeef   fd
              Zdeej                  df   dej                  fdZ xZS )
DiaEncoderr)   c           	         t         |   |       || _        t        j                  |j
                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t!        |      | _        y c c}w Nr  )rE   rF   r)   r   rG   rH   rJ   	embedding
ModuleListrangenum_hidden_layersr,   r  rz   r  normr   rotary_embeddingsr  s      r<   rF   zDiaEncoder.__init__  s     f&7&79K9KLmmAFvG_G_A`aI_VY/a
 v11vG	!3F!; bs   -CNr+   r   output_attentionsoutput_hidden_statesr   rS   c                    | j                  |      }t        j                  |j                  d   |j                        d d d f   }| j                  ||      }| j                  ||      }|rdnd }	|rdnd }
| j                  D ]'  }|r|	|fz   }	 ||f||d|}|d   }|s|
|d   fz   }
) | j                  |      }|r|	|fz  }	t        ||	|
      S )NrU   rZ   r:   r  r   r!   last_hidden_staterr   
attentions)
r#  rL   rM   r]   rZ   r(  _update_full_maskr  r'  r   )rP   r+   r   r)  r*  r   rr   r   r   encoder_statesall_attentionsencoder_layerlayer_outputss                r<   ra   zDiaEncoder.forward  s    y1
 ||IOOB$7	@P@PQRVXYRYZ"44]LQ//

  40d![[ 	FM#!/=2B!B)$7- 	M *!,M !/=3C2E!E	F 		-0}..N+>Vd
 	
r;   inputs_embedsc                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S )Nflash_attention_2r   sdpaflex_attentionFr  	r)   r   r   rA   r   rL   rc   r&   r   )rP   r   r5  s      r<   r0  zDiaEncoder._update_full_mask  s    
 %{{//3FF343F  MQ  11V; "E^UbUhUh!i  115EEnell;%@[`%aN
  "<NML_L_!`r;   )NFF)r.   r/   r0   r$   rF   r   r   rL   rc   r   r   r   r   r   r   r   ra   r0  rd   re   s   @r<   r   r     s    	</ 	<  26,1/4.
<<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
  .
bellD01 ||r;   r   c                   l    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dee	ej                  ej                  f      deej                     deej                     deej                     d	ee
   d
eej                     de	ej                  eej                     eej                     f   fdZ xZS )r-   r)   r   c                    t         |           |j                  | _        t	        ||d      | _        t        ||      | _        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        y )NTr  r  )rE   rF   rJ   	embed_dimr   r  r   cross_attentionrz   r  r  pre_ca_normpre_mlp_normrg   r  r  s      r<   rF   zDiaDecoderLayer.__init__  s    ++.vyDQ0C%f&8&8fooN%f&8&8fooN&v'9'9vO&>r;   rr   r   r   encoder_hidden_statesencoder_attention_maskr  r   rS   c                 d   |}	t        |	t              r|	j                  }	|}
| j                  |      } | j                  ||||	fd|i|\  }}|
|z   }|}
| j                  |      } | j                  ||f||d|\  }}|
|z   }|}
| j                  |      }| j                  |      }|
|z   }|||fS )Nr   )r   r  )	r   r   self_attention_cacher  r  r?  r>  r@  r  )rP   rr   r   r   rA  rB  r  r   r   self_attn_cacher  r  r  r  cross_statescross_attn_weightsr  s                    r<   ra   zDiaDecoderLayer.forward  s    *o':;-BBO ((7.Ad.A.A 	/
 *	/
 	/
++ !#33 ((7+?4+?+?!,
 2+	,

 ,
(( !</ ))-8((=) 7*/1CCCr;   )NNNNNN)r.   r/   r0   r#   r   rF   rL   rc   r   r   r   r   ra   rd   re   s   @r<   r-   r-     s    "/ "C " LP158<9=9=59-D||-D &eELL%,,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!1!12-D 
u||Xell3Xell5KK	L-Dr;   r-   c                       e Zd ZdZdef fdZee	 	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     d	e
ej                     d
e
e   de
e   de
e   de
ej                     deeef   fd              Zdeej                  df   d	eej                  df   dej&                  dej                  fdZ xZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r)   c           	         t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        y c c}w r"  )rE   rF   rI   rH   r>   
embeddingsr   r(  r   r$  r%  r&  r-   r  rz   rJ   r  r'  r  s      r<   rF   zDiaDecoder.__init__8  s     "// ++26:!3F!;mmAFvG_G_A`aI_VY/a
 v11vG	 bs   9B?Nr+   r   r   rA  rB  r  r)  r*  r   rS   c
                    |j                         dd \  }}||j                         nd}|	%t        j                  |||z   |j                        }	|	|	dddf   }| j                  |      }| j                  ||      }|1t               s'||z   }t        j                  |||j                        }t        | j                  |||	||      }| j                  |||j                  dd |      }|rdnd}|rdnd}|r|dnd}| j                  D ]7  }|r||fz  } |||||f|||	d|
}|d   }|s#||d	   fz   }|/||d   fz   }9 | j                  |      }|r||fz  }t        |||||
      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrU   r   r,  )r)   input_embedsr   r   r  r   rV   r:   )rB  r  r   r!   )r.  r  rr   r/  cross_attentions)sizeget_seq_lengthrL   rM   rZ   rK  r(  r   r}   r   r)   _update_cross_attn_maskr]   r  r'  r   )rP   r+   r   r   rA  rB  r  r)  r*  r   r   
batch_size
seq_lengthpast_key_values_lengthrr   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr4  s                         r<   ra   zDiaDecoder.forwardC  s   , "+!1#2!6
JETE`!?!?!Afg!"\\&(>(KT]TdTdN )$'2L 	2"44]LQ!*B*D4zAO"ZZ
OIL\L\]N+;;&))+%
 "&!=!=!"#	"
 #7BD0d&7<Q<]rdh[[ 	VE#!m%55!!#%		
 (> /-	 	M *!,M !/=3C2E!E(4+?=QRCSBU+U()	V, 		-0-!118+++%1
 	
r;   r   r5  c                    ||| j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                  |d         }|S | j                   j                  dk(  r-t	        |t
        j                        rt        ||d   d      }|S t        ||j                  |d         }|S )	Nr7  r   r8  rU   )tgt_lenr9  F)query_lengthr   r:  )rP   rA  rB  r   r5  s        r<   rQ  z"DiaDecoder._update_cross_attn_mask  s     !,1G1S{{//3FFCDH^C^)?&. &%/ ei&. &%- 11V; *M*!'''O*&$ &% 115EE4ellC-H.%0_"'.* &%	 *D*M,?,?UW*& &%r;   )NNNNNFFN)r.   r/   r0   rb   r#   rF   r   r   rL   rc   r   r   rw   r   r   r   r   r   ra   SizerQ  rd   re   s   @r<   rI  rI  5  s[   7	H/ 	H  4815=A=A9=,1/459Z
<<Z
 u//0Z
 !.	Z

  ((9(9:Z
 !))9)9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!1!12Z
 
8%?	@Z
  Z
z!&$U\\4%78!& !&ellD&8 9!& ZZ	!&
 ||!&r;   rI  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   b    e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	eeef      de	e   de	e   de	e   de	e   de	e
j                     deeef   fd              Z xZS )DiaModelr)   c                     t         |   |       || _        t        |j                        | _        t        |j                        | _        | j                          y r   )
rE   rF   r)   r   encoder_configencoderrI  decoder_configdecoder	post_initrq   s     r<   rF   zDiaModel.__init__  sE     !&"7"78!&"7"78r;   c                     | j                   S r   )rc  r   s    r<   get_encoderzDiaModel.get_encoder      ||r;   c                     | j                   S r   )re  r   s    r<   get_decoderzDiaModel.get_decoder  ri  r;   r+   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr  	use_cacher)  r*  r   rS   c                 Z   ||t        d      |	|	n| j                  j                  }	|
|
n| j                  j                  }
||n| j                  j                  }| j
                  r%| j                  r|rt        j                  d       d}|r|t        t               t                     }| | j                  d|||	|
d|}nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd	      }|d   j                  d   d
| j                  j                   j"                  }}}|9t%        j&                  |d|f| j                  j(                  | j*                        }|j,                  dk(  r#|j/                  |||      j1                  dd      } | j2                  d||||d   |||	|
||d
|}t5        |j6                  |j8                  |j:                  |j<                  |j>                  |d   |j:                  |j<                        S )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r+   r   r)  r*  r   r!   rV   r-  rU   )rO  
fill_valuerZ   )
r+   r   r   rA  rB  r  r)  r*  rp  r   )r.  r  decoder_hidden_statesdecoder_attentionsrN  encoder_last_hidden_staterA  encoder_attentionsr:   ) 
ValueErrorr)   r)  r*  rp  is_gradient_checkpointingr   loggerwarning_oncer   r
   rc  r   r   lenr]   rd  rI   rL   fullbos_token_idrZ   ndimr   r   re  r   r.  r  rr   r/  rN  )rP   r+   r   rl  rm  rn  ro  r  rp  r)  r*  r   r   bszseq_lenchannelsdecoder_outputss                    r<   ra   zDiaModel.forward  s]   N !8j  2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	))dmm##p "	01,.,.QO"*dll #-"3%9	
 O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO #2!"4":":1"=r4;;C]C]CjCjhW$ %

1h'DKK4L4LUYU`U`! !!Q& 1 9 9#x Q [ [\]_` a&$,, 
'-1"1!"4#1+/!5)
 
 "-??+;;"1"?"?.99,==&5a&8"1"?"?.99	
 		
r;   )NNNNNNNNNNN)r.   r/   r0   r"   rF   rh  rk  r   r   r   rL   r   r   r   r   r   r   r   ra   rd   re   s   @r<   r`  r`    sJ   y   15598<;?=ACG9=$(,0/359k
E,,-k
 !!1!12k
 $E$4$45	k

 'u'7'78k
 !))9)9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!1!12k
 
u((	)k
  k
r;   r`  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       e Zd ZdZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
eeef      de
e   de
e   de
e   de
e   de
ej                     de
ej                     deeef   fd              Z xZS )DiaForConditionalGenerationr*   r)   c                 |   t         |   |       || _        t        |      | _        |j
                  j                  | _        |j
                  j                  | _        t        j                  |j
                  j                  | j                  | j                  z  d      | _        d| _        | j                          y )NFri   ForMaskedLM)rE   rF   r)   r`  r*   rd  rI   rH   r   rk   rJ   logits_dense	loss_typerf  rq   s     r<   rF   z$DiaForConditionalGeneration.__init__Q  s     f%
"11>> //::II!!--0A0ADOO0S[`
 ' 	r;   c                 6    | j                   j                         S r   )r*   rh  r   s    r<   rh  z'DiaForConditionalGeneration.get_encoder`      zz%%''r;   c                 6    | j                   j                         S r   )r*   rk  r   s    r<   rk  z'DiaForConditionalGeneration.get_decoderc  r  r;   r+   r   rl  rm  rn  ro  r  rp  r)  r*  labelsr   rS   c                 ^    | j                   d	|||||||||	|
|d|}|d   }|j                  d   }| j                  |      j                  |d| j                  | j
                  f      j                  dd      j                         j                  || j                  z  d| j
                        }d}|  | j                  d	||| j
                  d|}t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r+   r   rl  rm  rn  ro  r  rp  r)  r*  r   r   rU   r!   rV   N)logitsr  rH   )	lossr  r  rs  rt  rN  ru  rA  rv  r:   )r*   r]   r  r\   rI   rH   r   r   loss_functionr   r  rs  rt  rN  ru  rA  rv  )rP   r+   r   rl  rm  rn  ro  r  rp  r)  r*  r  r   r   outputsr.  rR  audio_logitsr  s                      r<   ra   z#DiaForConditionalGeneration.forwardf  sH   X $** 
)/!5#9++/!5)
 
 $AJ&,,Q/
 /0T:r4#4#4dooFGYq!_Z\T*t000"dooF 	 %4%%o\&UYUdUdohnoD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r;   )NNNNNNNNNNNN)r.   r/   r0   r2   r"   rF   rh  rk  r   r   r   rL   r   r   r   r   r   r   r   ra   rd   re   s   @r<   r  r  I  sj     y ((  15598<;?=ACG9=$(,0/3-159R
E,,-R
 !!1!12R
 $E$4$45	R

 'u'7'78R
 !))9)9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 ))*R
 !!1!12R
 
uo%	&R
  R
r;   r  )r`  r(   r  )Nr!   )r   )Ltypingr   r   r   rL   r   activationsr   cache_utilsr	   r
   r   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r    configuration_diar"   r#   r$   generation_diar%   integrations.flex_attentionr&   
get_loggerr.   ry  r(   Moduler>   rg   rz   r   r   r   rc   r   r   r   r   r   r   r,   r   r-   rI  r`  r  __all__r:   r;   r<   <module>r     sO  , - ,   ! C C 7 / g B 9  L F &  M L .  !J 
		H	% 	? 	? 	?!ryy !8)RYY )$ Y'J J (J(< <D(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4>)ryy >)BG)		 G)T00 0BS# Sl8D0 8DvN&# N&b 
{
! {

{
| 
l
"46H l

l
^ Lr;   