
    rh                        d Z ddlmZmZmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/  e        rddl0m1Z1  e"jd                  e3      Z4e G d de             Z5 G d dejl                        Z7 G d de)      Z8 G d de%      Z9 G d de&      Z: G d  d!e$ejl                        Z; G d" d#ejl                        Z< G d$ d%e      Z= G d& d'e5      Z> G d( d)e      Z? G d* d+e5      Z@ ed,-       G d. d/e5             ZA ed0-       G d1 d2e5e/             ZBg d3ZCy)4zPyTorch Dia model.    )CallableOptionalUnionN)nn   )DynamicCacheEncoderDecoderCache)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )LlamaAttentionLlamaRMSNormLlamaRotaryEmbeddingeager_attention_forward)Phi3MLP   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZddgZy)DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerN)__name__
__module____qualname__r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules     v/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.pyr(   r(   9   s<    &*#N!!O*,=>r;   r(   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r)   c                 ~   t         |           t        j                  |j                  |j
                  z  |j                        | _        |j                  | _        |j
                  | _        t        j                  |j
                  t        j                        |j                  z  }| j                  d|d       y )N)dtypeoffsetsF)
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr)   rA   	__class__s      r<   rD   z!DiaMultiChannelEmbedding.__init__T   s    \\&"3"3f6I6I"I6K]K]^
!--"//,,v22%**EHYHYYYEBr;   audio_codesreturnc                 "   || j                   j                  |j                        z   j                  d      }| j	                  |      j                  |j                  d   |j                  d   d| j                        }|j                  d      S )Nr!   r   r   )dim)	rA   todevicesqueezerI   viewshaperH   sum)rN   rP   tokensembedss       r<   forwardz DiaMultiChannelEmbedding.forward\   su    0B0B CCLLQOF#((a+:K:KA:NPRTXTdTdezzaz  r;   )
r.   r/   r0   __doc__r#   rD   rJ   Tensorr]   __classcell__rO   s   @r<   r>   r>   F   s2    C/ C!5<< !ELL !r;   r>   c                       e Zd Zy)DiaMLPNr.   r/   r0   r:   r;   r<   rc   rc   b       r;   rc   c                       e Zd Zy)
DiaRMSNormNrd   r:   r;   r<   rg   rg   f   re   r;   rg   c                       e Zd Zy)DiaRotaryEmbeddingNrd   r:   r;   r<   ri   ri   j   re   r;   ri   c                   0    e Zd ZdZddeeef   dedefdZ	y)DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperr)   	layer_idx	is_causalc                    t         j                  j                          || _        || _        |j
                  | _        | j                  j                  | _        | j                  j                  xs | j                  | _        | j                  | j                  z  | _	        t        |d|j
                  | j                  z        | _        d| _        d| _        || _        t        j                  | j
                  | j                  | j                  z  d      | _        t        j                  | j
                  | j                  | j                  z  d      | _        t        j                  | j
                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j
                  d      | _        y )Nhead_dimr!           Fbias)r   ModulerD   r)   rm   rH   num_attention_heads	num_headsnum_key_value_headsnum_key_value_groupsgetattrrp   scalingattention_dropoutrn   Linearq_projk_projv_projo_proj)rN   r)   rm   rn   s       r<   rD   zDiaSelfAttention.__init__q   sJ   
		"!--88#';;#B#B#Tdnn $(NNd6N6N$N!
F4F4F$..4XY!$"ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]r;   N)F)
r.   r/   r0   r^   r   r$   r#   intboolrD   r:   r;   r<   rk   rk   n   s1    G^u%57G%GH ^UX ^ei ^r;   rk   c                        e Zd ZdZdedef fdZ	 	 ddej                  dej                  de	ej                     de	e
   d	ee   d
eej                  e	ej                     f   fdZ xZS )DiaCrossAttentionrl   r)   rm   c                 f   t         |           || _        || _        |j                  | _        |j
                  | _        | j                  j                  | _        | j                  j                  | _	        | j                  | j                  z  | _
        |j                  | _        d| _        d| _        d| _        t!        j"                  | j                  | j                  | j                  z  d      | _        t!        j"                  | j
                  | j                  | j                  z  d      | _        t!        j"                  | j
                  | j                  | j                  z  d      | _        t!        j"                  | j                  | j                  z  | j                  d      | _        y )Nr!   rq   Frr   )rC   rD   r)   rm   rH   cross_hidden_sizecross_num_attention_headsrv   cross_num_key_value_headsrw   rx   cross_head_dimrp   rz   r{   rn   r   r|   r}   r~   r   r   rN   r)   rm   rO   s      r<   rD   zDiaCrossAttention.__init__   s?   "!--!'!9!9>>#';;#H#H $(NNd6N6N$N!--!$ii 0 0$..4==2PW\]ii 6 68P8PSWS`S`8`glmii 6 68P8PSWS`S`8`glmii >@P@PW\]r;   hidden_statescross_attention_statesattention_maskpast_key_valueskwargsrQ   c                 b   |j                   d d }g |d| j                  }g |j                   d d d| j                  }| j                  |      j                  |      j	                  dd      }	|%|j
                  j                  | j                        nd}
|]|
r[|j                  j                  | j                     j                  }|j                  j                  | j                     j                  }n| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|C|j                  j                  ||| j                        \  }}d|j
                  | j                  <   t        }| j                   j"                  dk7  rt$        | j                   j"                     } || |	|||fd| j&                  i|\  }}|j)                  g |d      j+                         }| j-                  |      }||fS )NrS   r!   r   FTeagerrz   )rY   rp   r}   rX   	transpose
is_updatedgetrm   cross_attention_cachelayerskeysvaluesr~   r   updater   r)   _attn_implementationr   rz   reshape
contiguousr   )rN   r   r   r   r   r   input_shapehidden_shapecross_shapequery_statesr   
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                   r<   r]   zDiaCrossAttention.forward   s    $))#2.88b8$--8M.44Sb9M2Mt}}M{{=166|DNNqRSTGVGb_//33DNNChm
&:(>>EEdnnUZZJ*@@GGW^^L%;<AA+NXXYZ\]^J;;'=>CCKPZZ[\^_`L*+:+P+P+W+W NN,(
L >B**4>>:(?;;++w6"9$++:Z:Z"[$7%
 LL%
 %
!\ "))*<K*<*<=HHJkk+.L((r;   NN)r.   r/   r0   r^   r#   r   rD   rJ   r_   r   r	   r   r   tupler]   r`   ra   s   @r<   r   r      s    G^/ ^C ^. 269=1)||1) !&1) !.	1)
 ""561) -.1) 
u||Xell33	41)r;   r   c                        e Zd Zdedef fdZ	 	 d
dej                  dee	ej                  ej                  f      deej                     de
e   de	ej                  eej                     f   f
d	Z xZS )r,   r)   rm   c                     t         |           t        |j                  |j                        | _        t        ||d      | _        t        |j                  |j                        | _        t        |      | _
        y )NepsFrn   )rC   rD   rg   rH   norm_epspre_sa_normrk   self_attentionpost_sa_normrc   mlpr   s      r<   rD   zDiaEncoderLayer.__init__   s\    %f&8&8fooN.vyER&v'9'9vO&>r;   r   position_embeddingsr   r   rQ   c                     |}| j                  |      } | j                  |f||d|\  }}||z   }|}| j                  |      }| j                  |      }	||	z   }||fS )Nr   r   )r   r   r   r   )
rN   r   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r<   r]   zDiaEncoderLayer.forward   s     !((7.Ad.A.A/
 3)/
 	/
++ !#33 ))-8((=) 7*///r;   r   )r.   r/   r0   r$   r   rD   rJ   r_   r   r   r   r   r]   r`   ra   s   @r<   r,   r,      s    "/ "C " LP15	0||0 &eELL%,,,F&GH0 !.	0
 -.0 
u||Xell33	40r;   r,   c                        e Zd Zdef fdZee	 	 	 ddej                  de	ej                     de	e
   de	e
   dee   d	eeef   fd
              Zdeej                  df   dej                  fdZ xZS )
DiaEncoderr)   c           	         t         |   |       || _        t        j                  |j
                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t!        |      | _        y c c}w Nr   )rC   rD   r)   r   rE   rF   rH   	embedding
ModuleListrangenum_hidden_layersr,   r   rg   r   normri   rotary_embeddingsr   s      r<   rD   zDiaEncoder.__init__   s     f&7&79K9KLmmAFvG_G_A`aI_VY/a
 v11vG	!3F!; bs   -CNr+   r   output_attentionsoutput_hidden_statesr   rQ   c                    | j                  |      }t        j                  |j                  d   |j                        d d d f   }| j                  ||      }| j                  ||      }|rdnd }	|rdnd }
| j                  D ]'  }|r|	|fz   }	 ||f||d|}|d   }|s|
|d   fz   }
) | j                  |      }|r|	|fz  }	t        ||	|
      S )NrS   rV   r:   r   r   r!   last_hidden_stater   
attentions)
r   rJ   rK   rY   rV   r   _update_full_maskr   r   r   )rN   r+   r   r   r   r   r   position_idsr   encoder_statesall_attentionsencoder_layerlayer_outputss                r<   r]   zDiaEncoder.forward   s    y1
 ||IOOB$7	@P@PQRVXYRYZ"44]LQ//

  40d![[ 	FM#!/=2B!B)$7- 	M *!,M !/=3C2E!E	F 		-0}..N+>Vd
 	
r;   inputs_embedsc                 f   || j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                        }|S | j                   j                  dk(  r)t	        |t
        j                        rt        |d      }|S t        ||j                        }|S )Nflash_attention_2r   sdpaflex_attentionFr   	r)   r   r   r@   
isinstancerJ   r_   r&   r   )rN   r   r   s      r<   r   zDiaEncoder._update_full_mask.  s    
 %{{//3FF343F  MQ  11V; "E^UbUhUh!i  115EEnell;%@[`%aN
  "<NML_L_!`r;   )NFF)r.   r/   r0   r$   rD   r   r   rJ   r_   r   r   r   r   r   r   r   r]   r   r`   ra   s   @r<   r   r      s    	</ 	<  26,1/4.
<<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
  .
bellD01 ||r;   r   c                   l    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dee	ej                  ej                  f      deej                     deej                     deej                     d	ee
   d
eej                     de	ej                  eej                     eej                     f   fdZ xZS )r-   r)   rm   c                    t         |           |j                  | _        t	        ||d      | _        t        ||      | _        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        y )NTr   r   )rC   rD   rH   	embed_dimrk   r   r   cross_attentionrg   r   r   pre_ca_normpre_mlp_normrc   r   r   s      r<   rD   zDiaDecoderLayer.__init__F  s    ++.vyDQ0C%f&8&8fooN%f&8&8fooN&v'9'9vO&>r;   r   r   r   encoder_hidden_statesencoder_attention_maskr   cache_positionrQ   c                 d   |}	t        |	t              r|	j                  }	|}
| j                  |      } | j                  ||||	fd|i|\  }}|
|z   }|}
| j                  |      } | j                  ||f||d|\  }}|
|z   }|}
| j                  |      }| j                  |      }|
|z   }|||fS )Nr   )r   r   )	r   r	   self_attention_cacher   r   r   r   r   r   )rN   r   r   r   r   r   r   r   r   self_attn_cacher   r   r   r   cross_statescross_attn_weightsr   s                    r<   r]   zDiaDecoderLayer.forwardP  s    *o':;-BBO ((7.Ad.A.A 	/
 *	/
 	/
++ !#33 ((7+?4+?+?!,
 2+	,

 ,
(( !</ ))-8((=) 7*/1CCCr;   )NNNNNN)r.   r/   r0   r#   r   rD   rJ   r_   r   r   r	   
LongTensorr]   r`   ra   s   @r<   r-   r-   E  s    "/ "C " LP158<9=9=59-D||-D &eELL%,,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!1!12-D 
u||Xell3Xell5KK	L-Dr;   r-   c                       e Zd ZdZdef fdZee	 	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     d	e
ej                     d
e
e   de
e   de
e   de
ej                     deeef   fd              Zdeej                  df   d	eej                  df   dej&                  dej                  fdZ xZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r)   c           	         t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        y c c}w r   )rC   rD   rG   rF   r>   
embeddingsri   r   r   r   r   r   r-   r   rg   rH   r   r   r   s      r<   rD   zDiaDecoder.__init__  s     "// ++26:!3F!;mmAFvG_G_A`aI_VY/a
 v11vG	 bs   9B?Nr+   r   r   r   r   r   r   r   r   rQ   c
                    |j                         dd \  }}||j                         nd}|	%t        j                  |||z   |j                        }	|	|	dddf   }| j                  |      }| j                  ||      }|1t               s'||z   }t        j                  |||j                        }t        | j                  |||	||      }| j                  |||j                  dd |      }|rdnd}|rdnd}|r|dnd}| j                  D ]7  }|r||fz  } |||||f|||	d|
}|d   }|s#||d	   fz   }|/||d   fz   }9 | j                  |      }|r||fz  }t        |||||
      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrS   r   r   )r)   input_embedsr   r   r   r   r   r:   )r   r   r   r!   )r   r   r   r   cross_attentions)sizeget_seq_lengthrJ   rK   rV   r   r   r   onesr
   r)   _update_cross_attn_maskrY   r   r   r   )rN   r+   r   r   r   r   r   r   r   r   r   
batch_size
seq_lengthpast_key_values_lengthr   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr   s                         r<   r]   zDiaDecoder.forward  s   , "+!1#2!6
JETE`!?!?!Afg!"\\&(>(KT]TdTdN )$'2L 	2"44]LQ!*B*D4zAO"ZZ
OIL\L\]N+;;&))+%
 "&!=!=!"#	"
 #7BD0d&7<Q<]rdh[[ 	VE#!m%55!!#%		
 (> /-	 	M *!,M !/=3C2E!E(4+?=QRCSBU+U()	V, 		-0-!118+++%1
 	
r;   r   r   c                    ||| j                   j                  dk(  rd|v r|}|S d }|S | j                   j                  dk(  rt        ||j                  |d         }|S | j                   j                  dk(  r-t	        |t
        j                        rt        ||d   d      }|S t        ||j                  |d         }|S )	Nr   r   r   rS   )tgt_lenr   F)query_lengthrn   r   )rN   r   r   r   r   s        r<   r   z"DiaDecoder._update_cross_attn_mask  s     !,1G1S{{//3FFCDH^C^)?&. &%/ ei&. &%- 11V; *M*!'''O*&$ &% 115EE4ellC-H.%0_"'.* &%	 *D*M,?,?UW*& &%r;   )NNNNNFFN)r.   r/   r0   r^   r#   rD   r   r   rJ   r_   r   r   FloatTensorr	   r   r   r   r   r]   Sizer   r`   ra   s   @r<   r   r     s[   7	H/ 	H  4815=A=A9=,1/459Z
<<Z
 u//0Z
 !.	Z

  ((9(9:Z
 !))9)9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!1!12Z
 
8%?	@Z
  Z
z!&$U\\4%78!& !&ellD&8 9!& ZZ	!&
 ||!&r;   r   z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   b    e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	eeef      de	e   de	e   de	e   de	e   de	e
j                     deeef   fd              Z xZS )DiaModelr)   c                     t         |   |       || _        t        |j                        | _        t        |j                        | _        | j                          y N)
rC   rD   r)   r   encoder_configencoderr   decoder_configdecoder	post_initrN   r)   rO   s     r<   rD   zDiaModel.__init__  sE     !&"7"78!&"7"78r;   c                     | j                   S r  )r  rN   s    r<   get_encoderzDiaModel.get_encoder      ||r;   c                     | j                   S r  )r  r  s    r<   get_decoderzDiaModel.get_decoder!  r  r;   r+   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher   r   r   rQ   c                 Z   ||t        d      |	|	n| j                  j                  }	|
|
n| j                  j                  }
||n| j                  j                  }| j
                  r%| j                  r|rt        j                  d       d}|r|t        t               t                     }| | j                  d|||	|
d|}nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd	      }|d   j                  d   d
| j                  j                   j"                  }}}|9t%        j&                  |d|f| j                  j(                  | j*                        }|j,                  dk(  r#|j/                  |||      j1                  dd      } | j2                  d||||d   |||	|
||d
|}t5        |j6                  |j8                  |j:                  |j<                  |j>                  |d   |j:                  |j<                        S )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r+   r   r   r   r   r!   r   r   rS   )r   
fill_valuerV   )
r+   r   r   r   r   r   r   r   r  r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentionsr:   ) 
ValueErrorr)   r   r   r  is_gradient_checkpointingtrainingloggerwarning_oncer	   r   r  r   r   lenrY   r  rG   rJ   fullbos_token_idrV   ndimr   r   r  r   r   r   r   r   r   )rN   r+   r   r  r  r  r  r   r  r   r   r   r   bszseq_lenchannelsdecoder_outputss                    r<   r]   zDiaModel.forward$  s]   N !8j  2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	))dmm##p "	01,.,.QO"*dll #-"3%9	
 O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO #2!"4":":1"=r4;;C]C]CjCjhW$ %

1h'DKK4L4LUYU`U`! !!Q& 1 9 9#x Q [ [\]_` a&$,, 
'-1"1!"4#1+/!5)
 
 "-??+;;"1"?"?.99,==&5a&8"1"?"?.99	
 		
r;   )NNNNNNNNNNN)r.   r/   r0   r"   rD   r  r  r   r   r   rJ   r   r   r   r   r	   r   r   r]   r`   ra   s   @r<   r  r    sJ   y   15598<;?=ACG9=$(,0/359k
E,,-k
 !!1!12k
 $E$4$45	k

 'u'7'78k
 !))9)9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!1!12k
 
u((	)k
  k
r;   r  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       e Zd ZdZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
eeef      de
e   de
e   de
e   de
e   de
ej                     de
ej                     deeef   fd              Z xZS )DiaForConditionalGenerationr*   r)   c                 |   t         |   |       || _        t        |      | _        |j
                  j                  | _        |j
                  j                  | _        t        j                  |j
                  j                  | j                  | j                  z  d      | _        d| _        | j                          y )NFrr   ForMaskedLM)rC   rD   r)   r  r*   r  rG   rF   r   r|   rH   logits_dense	loss_typer  r	  s     r<   rD   z$DiaForConditionalGeneration.__init__  s     f%
"11>> //::II!!--0A0ADOO0S[`
 ' 	r;   c                 6    | j                   j                         S r  )r*   r  r  s    r<   r  z'DiaForConditionalGeneration.get_encoder      zz%%''r;   c                 6    | j                   j                         S r  )r*   r  r  s    r<   r  z'DiaForConditionalGeneration.get_decoder  r/  r;   r+   r   r  r  r  r  r   r  r   r   labelsr   rQ   c                 ^    | j                   d	|||||||||	|
|d|}|d   }|j                  d   }| j                  |      j                  |d| j                  | j
                  f      j                  dd      j                         j                  || j                  z  d| j
                        }d}|  | j                  d	||| j
                  d|}t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r+   r   r  r  r  r  r   r  r   r   r   r   rS   r!   r   N)logitsr1  rF   )	lossr3  r   r  r  r   r  r   r  r:   )r*   rY   r,  rX   rG   rF   r   r   loss_functionr   r   r  r  r   r  r   r  )rN   r+   r   r  r  r  r  r   r  r   r   r1  r   r   outputsr   r   audio_logitsr4  s                      r<   r]   z#DiaForConditionalGeneration.forward  sH   X $** 
)/!5#9++/!5)
 
 $AJ&,,Q/
 /0T:r4#4#4dooFGYq!_Z\T*t000"dooF 	 %4%%o\&UYUdUdohnoD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r;   )NNNNNNNNNNNN)r.   r/   r0   r2   r"   rD   r  r  r   r   r   rJ   r   r   r   r   r	   r   r   r]   r`   ra   s   @r<   r)  r)    sj     y ((  15598<;?=ACG9=$(,0/3-159R
E,,-R
 !!1!12R
 $E$4$45	R

 'u'7'78R
 !))9)9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 ))*R
 !!1!12R
 
uo%	&R
  R
r;   r)  )r  r(   r)  )Dr^   typingr   r   r   rJ   r   cache_utilsr   r	   masking_utilsr
   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   llama.modeling_llamar   r   r   r   phi3.modeling_phi3r    configuration_diar"   r#   r$   generation_diar%   integrations.flex_attentionr&   
get_loggerr.   r  r(   rt   r>   rc   rg   ri   rk   r   r,   r   r-   r   r  r)  __all__r:   r;   r<   <module>rI     s    , ,   < / C 9  G & v v  ) L L .  !J 
		H	% 	? 	? 	?!ryy !8	W 		 		- 	^~ryy ^,G)		 G)T00 0BS# Sl8D0 8DvN&# N&b 
{
! {

{
| 
l
"46H l

l
^ Lr;   