
    rhh                         d Z ddlZddlmZ ddlmZ ddlmZ  ej                  e	      Z
g dZg dZd	gZg d
Zd Zd Zd Zd ZeeeedZ G d de      Z G d de      Z G d de      Zg dZy)zJukebox configuration    N)Union   )PretrainedConfig)logging)O
block_attntranspose_block_attnprev_block_attnr   r   r	   r   r   r	   r   r   r	   r   r   r	   r   r   r	   cross_attentionr   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   r   r   r	   r   r   r	   r   r   r	   r
   )r   r   r	   dense_attention)
prime_attnr   
dense_attnc                     t         d   S )Nr   )_FullDenseAttentionlayers    /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.pyfull_dense_attentionr   q   s    q!!    c                     t         | dz     S )N   )_RawColumnPreviousRowAttentionr   s    r   !raw_column_previous_row_attentionr   u   s    )%!)44r   c                     t         | dz     S )NO   )_LARGE_ATTENTIONr   s    r    large_separated_enc_dec_w_lyricsr   y   s    EBJ''r   c                 B    | dz  dk(  rt         | dz     S t        | dz     S )N      r   )_PrimePrimeDenseAttentionr   r   s    r   enc_dec_with_lyricsr!   }   s,    rzR(33)%!)44r   )r   r   r   r!   c            *            e Zd ZdZdZdddZdddd	d
ddddddddddddddddddddgdddddddddddg dg ddd dddf* fd!	Zed$d"ee	e
j                  f   fd#       Z xZS )%JukeboxPriorConfiga"  
        This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
        `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
        configuration with the defaults will yield a similar configuration to that of the top level prior from the
        [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
    -1b-lyrics) architecture.

        Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
        documentation from [`PretrainedConfig`] for more information.



    Args:
        act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
            Activation function.
        alignment_head (`int`, *optional*, defaults to 2):
            Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio
            alignment
        alignment_layer (`int`, *optional*, defaults to 68):
            Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the
            lyric to audio alignment
        attention_multiplier (`float`, *optional*, defaults to 0.25):
            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
            0.25*width of the model will be used.
        attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
            Which attention pattern to use for the decoder/
        attn_dropout (`int`, *optional*, defaults to 0):
            Dropout probability for the post-attention layer dropout in the decoder.
        attn_res_scale (`bool`, *optional*, defaults to `False`):
            Whether or not to scale the residuals in the attention conditioner block.
        blocks (`int`, *optional*, defaults to 64):
            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
            blocks]` in the `JukeboxAttention` layer.
        conv_res_scale (`int`, *optional*):
            Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
            conditioner, the default value is to None and should not be modified.
        num_layers (`int`, *optional*, defaults to 72):
            Number of layers of the transformer architecture.
        emb_dropout (`int`, *optional*, defaults to 0):
            Embedding dropout used in the lyric decoder.
        encoder_config (`JukeboxPriorConfig`, *optional*) :
            Configuration of the encoder which models the prior on the lyrics.
        encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
            Multiplication factor used in front of the lyric encoder loss.
        hidden_size (`int`, *optional*, defaults to 2048):
            Hidden dimension of the attention layers.
        init_scale (`float`, *optional*, defaults to 0.2):
            Initialization scales for the prior modules.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
            greater than 0, the `encoder` args should be specified for the lyric encoding.
        mask (`bool`, *optional*, defaults to `False`):
            Whether or not to mask the previous positions in the attention.
        max_duration (`int`, *optional*, defaults to 600):
            Maximum supported duration of the generated song in seconds.
        max_nb_genres (`int`, *optional*, defaults to 1):
            Maximum number of genres that can be used to condition the model.
        merged_decoder (`bool`, *optional*, defaults to `True`):
            Whether or not the decoder and the encoder inputs are merged. This is used for the separated
            encoder-decoder architecture
        metadata_conditioning (`bool`, *optional*, defaults to `True)`:
            Whether or not to condition on the artist and genre metadata.
        metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
            Number of genres and the number of artists that were used to train the embedding layers of the prior
            models.
        min_duration (`int`, *optional*, defaults to 0):
            Minimum duration of the generated audio on which the model was trained.
        mlp_multiplier (`float`, *optional*, defaults to 1.0):
            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
            the model will be used.
        music_vocab_size (`int`, *optional*, defaults to 2048):
            Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
        n_ctx (`int`, *optional*, defaults to 6144):
            Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
            generating music tokens.
        n_heads (`int`, *optional*, defaults to 2):
                Number of attention heads.
        nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
            Number of lyric tokens that are used when sampling a single window of length `n_ctx`
        res_conv_depth (`int`, *optional*, defaults to 3):
            Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
            `JukeboxMusicTokenConditioner`.
        res_conv_width (`int`, *optional*, defaults to 128):
            Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
            `JukeboxMusicTokenConditioner`.
        res_convolution_multiplier (`int`, *optional*, defaults to 1):
            Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
        res_dilation_cycle (`int`, *optional*):
            Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
            corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
            tokens.
        res_dilation_growth_rate (`int`, *optional*, defaults to 1):
            Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
            Downsampling rates used in the audio conditioning network
        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
            Striding used in the audio conditioning network
        resid_dropout (`int`, *optional*, defaults to 0):
            Residual dropout used in the attention pattern.
        sampling_rate (`int`, *optional*, defaults to 44100):
            Sampling rate used for training.
        spread (`int`, *optional*):
            Spread used in the `summary_spread_attention` pattern
        timing_dims (`int`, *optional*, defaults to 64):
            Dimension of the timing embedding.
        zero_out (`bool`, *optional*, defaults to `False`):
            Whether or not to zero out convolution weights when initializing.
    jukebox_priorn_positionsn_head)max_position_embeddingsnum_attention_heads
quick_gelur      D   g      ?r!   F@   NH   g?   皙?TP   iX     i\  i  g      ?i   i  r      r   r*   r*   r*   r*   r*   D  c+                    t        ,|   di |+ || _        || _        || _        || _        || _        || _        || _        |	| _	        |
| _
        || _        || _        || _        |t        di || _        nd | _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _         | | _!        |!| _"        |"| _#        |#| _$        |$| _%        |%| _&        |&| _'        |'| _(        |(| _)        |)| _*        || _+        |*| _,        y N )-super__init__act_fnalignment_headalignment_layerattention_multiplierattention_patternattn_dropoutattn_res_scaleblocksconv_res_scale
num_layersemb_dropoutmusic_vocab_sizer#   encoder_configencoder_loss_fraction
init_scaleis_encoder_decoderlyric_vocab_sizelevelmaskmax_durationmax_nb_genresmerged_decodermetadata_conditioningmetadata_dimsmin_durationmlp_multipliern_ctxn_headsnb_relevant_lyric_tokensres_conv_depthres_conv_widthres_convolution_multiplierres_dilation_cycleres_dilation_growth_rateres_downs_tres_strides_tresid_dropoutsampling_ratespreadtiming_dimshidden_sizezero_out)-selfr;   rL   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rG   rH   rc   rI   rJ   rK   rM   rN   rO   rP   rQ   rR   rS   rT   rF   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rd   kwargs	__class__s-                                               r   r:   zJukeboxPriorConfig.__init__   sr   \ 	"6",.$8!!2(,,$& 0%"4"F~"FD"&D%:"$"4 0
	(*,%:"*(,
(@%,,*D'"4(@%&***&& r   pretrained_model_name_or_pathc                 D   | j                  |        | j                  |fi |\  }}|j                  d      dk(  r|d|    }d|v rGt        | d      r;|d   | j                  k7  r)t
        j                  d|d    d| j                   d        | j                  |fi |S )N
model_typejukeboxprior_You are using a model of type   to instantiate a model of type N. This is not supported for all configurations of models and can yield errors._set_token_in_kwargsget_config_dictgethasattrrj   loggerwarning	from_dict)clsrh   rL   rf   config_dicts        r   from_pretrainedz"JukeboxPriorConfig.from_pretrained\  s      (1c112OZSYZV ??<(I5%ug&67K;&73+E+VbJcgjguguJuNN0\1J0KKk>>""pr
 s}}[3F33r   )r   )__name__
__module____qualname____doc__rj   attribute_mapr:   classmethodr   strosPathLikerz   __classcell__rg   s   @r   r#   r#      s    kZ !J#0'M !/!"Dk!$#$!"W[!z 4E#r{{BR<S 4 4r   r#   c                        e Zd ZdZdZddddddg d	d
dg dddddd
g dg ddddf fd	Zedeee	j                  f   fd       Z xZS )JukeboxVQVAEConfiga  
    This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
    `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the VQVAE from
    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        act_fn (`str`, *optional*, defaults to `"relu"`):
            Activation function of the model.
        nb_discrete_codes (`int`, *optional*, defaults to 2048):
            Number of codes of the VQVAE.
        commit (`float`, *optional*, defaults to 0.02):
            Commit loss multiplier.
        conv_input_shape (`int`, *optional*, defaults to 1):
            Number of audio channels.
        conv_res_scale (`bool`, *optional*, defaults to `False`):
            Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
        embed_dim (`int`, *optional*, defaults to 64):
            Embedding dimension of the codebook vectors.
        hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
            Fraction of non-intersecting window used when continuing the sampling process.
        levels (`int`, *optional*, defaults to 3):
            Number of hierarchical levels that used in the VQVAE.
        lmu (`float`, *optional*, defaults to 0.99):
            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
            of the original [VQVAE paper](https://huggingface.co/papers/1711.00937v2.pdf)
        multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
            Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
        res_conv_depth (`int`, *optional*, defaults to 4):
            Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
        res_conv_width (`int`, *optional*, defaults to 32):
            Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
        res_convolution_multiplier (`int`, *optional*, defaults to 1):
            Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
        res_dilation_cycle (`int`, *optional*):
            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
            reduced by a power of `res_dilation_cycle`.
        res_dilation_growth_rate (`int`, *optional*, defaults to 3):
            Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
            Downsampling rate for each level of the hierarchical VQ-VAE.
        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
            Stride used for each level of the hierarchical VQ-VAE.
        sample_length (`int`, *optional*, defaults to 1058304):
            Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
        init_scale (`float`, *optional*, defaults to 0.2):
            Initialization scale.
        zero_out (`bool`, *optional*, defaults to `False`):
            Whether or not to zero out convolution weights when initializing.
    jukebox_vqvaerelur.   g{Gz?r1   Fr,   )g      ?      ?r   r   gGz?)r*   r1   r1   r       Nr3   r4   i & r/   c                 <   t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |
| _        || _        || _        |	| _        || _        || _        || _        || _        || _        y r7   )r9   r:   hop_fractionconv_input_shapesample_lengthlevels	embed_dimnb_discrete_codesrY   rX   rZ   r\   r[   multipliersr]   r^   lmucommitrC   r;   rI   rd   )re   r;   r   r   r   rC   r   r   r   r   r   rX   rY   rZ   r[   r\   r]   r^   r   rI   rd   rf   rg   s                         r   r:   zJukeboxVQVAEConfig.__init__  s    0 	"6"( 0* "!2,,*D'(@%"4&&*,$ r   rh   c                 >   | j                  |        | j                  |fi |\  }}|j                  d      dk(  r|d   }d|v rGt        | d      r;|d   | j                  k7  r)t
        j                  d|d    d| j                   d        | j                  |fi |S )Nrj   rk   vqvae_configrm   rn   ro   rp   )rx   rh   rf   ry   s       r   rz   z"JukeboxVQVAEConfig.from_pretrained  s      (1c112OZSYZV ??<(I5%n5K;&73+E+VbJcgjguguJuNN0\1J0KKk>>""pr
 s}}[3F33r   )r{   r|   r}   r~   rj   r:   r   r   r   r   r   rz   r   r   s   @r   r   r   o  s}    4l !J &#$!"+.!` 4E#r{{BR<S 4 4r   r   c                   d     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 d fd	Zedee   de	fd       Z
 fdZ xZS )	JukeboxConfigaW  
    This is the configuration class to store the configuration of a [`JukeboxModel`].

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
    yield a similar configuration to that of
    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.


    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
    (5,3), and strides = (2, 2) will downsample the audio by 2^5 = 32 to get the first level of codes, and 2**8 = 256
    to get the second level codes. This is mostly true for training the top level prior and the upsamplers.

    Args:
        vqvae_config (`JukeboxVQVAEConfig`, *optional*):
            Configuration for the `JukeboxVQVAE` model.
        prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
            List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
        nb_priors (`int`, *optional*, defaults to 3):
            Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive
            (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
            trained using a top prior and 2 upsampler priors.
        sampling_rate (`int`, *optional*, defaults to 44100):
            Sampling rate of the raw audio.
        timing_dims (`int`, *optional*, defaults to 64):
            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
            layer. The timing embedding layer converts the absolute and relative position in the currently sampled
            audio to a tensor of length `timing_dims` that will be added to the music tokens.
        min_duration (`int`, *optional*, defaults to 0):
            Minimum duration of the audios to generate
        max_duration (`float`, *optional*, defaults to 600.0):
            Maximum duration of the audios to generate
        max_nb_genres (`int`, *optional*, defaults to 5):
            Maximum number of genres that can be used to condition a single sample.
        metadata_conditioning (`bool`, *optional*, defaults to `True`):
            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
            duration.

    Example:

    ```python
    >>> from transformers import JukeboxModel, JukeboxConfig

    >>> # Initializing a Jukebox configuration
    >>> configuration = JukeboxConfig()

    >>> # Initializing a model from the configuration
    >>> model = JukeboxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    rk   c
                 4   |i }t         j                  d       t        di || _        ||D cg c]  }t	        di | c}| _        nmg | _        t        |      D ]X  }|
j                  d| d       }|i }t         j                  d| d       | j
                  j                  t	        di |       Z | j                  j                  | _	        || _
        || _        || _        || _        || _        || _        |	| _        t#        | H  di |
 y c c}w )NzHvqvae_config is None. initializing the JukeboxVQVAE with default values.rl   zQ's  config is None. Initializing the JukeboxPriorConfig list with default values.r8   )ru   infor   r   r#   prior_configsrangepopappendr   	nb_priorsrO   r`   rb   rS   rN   rQ   r9   r:   )re   r   prior_config_listr   r`   rb   rS   rN   rO   rQ   rf   prior_config	prior_idxrg   s                r   r:   zJukeboxConfig.__init__$  s0    LKKbc.>>(Yj!k"4"D|"D!kD!#D"9- N	%zzF9+*>E'#%LKK  ,# # ""))*<*L|*LMN !--::" +*&((%:""6"3 "ls   Dr   r   c                 x    |D cg c]  }|j                          }} | d||j                         d|S c c}w )z
        Instantiate a [`JukeboxConfig`] (or a derived class) from clip text model configuration and clip vision model
        configuration.

        Returns:
            [`JukeboxConfig`]: An instance of a configuration object
        )r   vqvae_config_dictr8   )to_dict)rx   r   r   rf   configr   s         r   from_configszJukeboxConfig.from_configsR  sF     =JJ&V^^-JJk%6,J^J^J`kdjkk Ks   7c                     t         |          }|j                  d      D cg c]  }|j                          c}|d<   |S c c}w )Nr   r   )r9   r   r   )re   resultr   rg   s      r   r   zJukeboxConfig.to_dict^  sA    "FLjjQ`Fa&bFv~~'7&b"# 'cs   A)	NNr   r5   r,   r   g     @   T)r{   r|   r}   r~   rj   r:   r   listr#   r   r   r   r   r   s   @r   r   r     sj    4l J ",#\ 	l.@)A 	lQc 	l 	l r   r   )r   r#   r   )r~   r   typingr   configuration_utilsr   utilsr   
get_loggerr{   ru   r   r   r   r    r   r   r   r!   ATTENTION_PATTERNSr#   r   r   __all__r8   r   r   <module>r      s     	  4  
		H	%P b "[ () F "5(5 1)J(H.	 a4) a4Hy4) y4xw$ wt Hr   