
    rh                        d dl Z d dlmZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+  e#       r	d dl,mc m-Z.  e$j^                  e0      Z1e! G d de             Z2e e!d       G d de                    Z3e e!d       G d de                    Z4e e!d       G d de                    Z5 G d d ejl                        Z7d!ejp                  d"e9d#ejp                  fd$Z:	 dWd%ejl                  d&ejp                  d'ejp                  d(ejp                  d)eejp                     d*e;d+e;d,ee    fd-Z< G d. d/ejl                        Z= G d0 d1ejl                        Z> G d2 d3e      Z? G d4 d5ejl                        Z@e! G d6 d7e2             ZA G d8 d9ejl                        ZB G d: d;ejl                        ZC G d< d=ejl                        ZD G d> d?ejl                        ZE G d@ dAejl                        ZF G dB dCejl                        ZG G dD dEejl                        ZH G dF dGejl                        ZI G dH dIejl                        ZJ e!dJ       G dK dLe2             ZK G dM dNejl                        ZL G dO dPejl                        ZM e!dQ       G dR dSe2             ZN G dT dUe2e      ZOg dVZPy)X    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_availablelogging	torch_int   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   @    e Zd ZU eed<   dZdZddgZddgZdZ	dZ
dZdZy	)
JanusPreTrainedModelconfigmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFN)__name__
__module____qualname__r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment     {/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/janus/modeling_janus.pyr$   r$   8   sB    &*#,.GH#4m"DN!(-%r8   r$   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   \    e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   y)JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
r+   r,   r-   __doc__r=   r   torchFloatTensorr.   r>   r7   r8   r9   r<   r<   F   s/     9=(5#4#45<(,NE%%,r8   r<   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       e Zd ZU dZdZeej                     ed<   dZ	ee
e
ej                           ed<   dZee
ej                        ed<   dZee
ej                        ed<   dZee
ej                        ed<   y)JanusBaseModelOutputWithPastal  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater)   hidden_states
attentionsimage_hidden_states)r+   r,   r-   r?   rD   r   r@   rA   r.   r)   tuplerE   rF   rG   r7   r8   r9   rC   rC   X   s    , 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;Br8   rC   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)	JanusCausalLMOutputWithPasta  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr)   rE   rF   rG   )r+   r,   r-   r?   rK   r   r@   rA   r.   rL   r)   listrE   rH   rF   rG   r7   r8   r9   rJ   rJ   |   s    $ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;Br8   rJ   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e	dej                  fd
Z
 xZS )JanusVisionEmbeddingsr%   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       y )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   F)
persistent)super__init__r%   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr@   arangeexpandselfr%   	__class__s     r9   r[   zJanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr8   
embeddingsheightwidthreturnc                    |j                   d   }| j                  j                  j                   d   }t        j                  j                         s%||k(  r ||k(  r| j                  | j                        S | j                  j                  j                  d      }|j                   d   }|| j                  z  }|| j                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   rX   g      ?r   r   bicubicF)sizemodealign_corners)shaperf   weightr@   jit
is_tracingrW   	unsqueezer_   r   reshapepermuter   
functionalinterpolateview)rk   rm   rn   ro   rc   rd   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r9   interpolate_pos_encodingz.JanusVisionEmbeddings.interpolate_pos_encoding   sE    !&&q)//66<<Q? yy##%+*F6UZ?**4+<+<==1188BB1Er"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr8   pixel_valuesr   c                 X   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r| j                  |||      }	n| j                  | j                        }	||	z   }|S )N)dtyper   r   )
rv   rb   rw   r   toflatten	transposer   rf   rW   )
rk   r   r   _rn   ro   target_dtypepatch_embedsrm   
pos_embedss
             r9   forwardzJanusVisionEmbeddings.forward   s    *001fe++2288++LOO,O,OP!))!,66q!<
#66z65QJ001B1BCJ*,
r8   F)r+   r,   r-   r!   r[   r@   Tensorintr   boolr   __classcell__rl   s   @r9   rO   rO      se    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i r8   rO   rE   n_reprp   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rv   ri   r{   )rE   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr   r   rX   )r   r   )ptrainingr   )r   num_key_value_groupsr@   matmulr   rv   r   r}   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr*   attn_outputs                r9   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                   t     e Zd ZdZdef fdZ	 ddej                  deej                     de	e
   fdZ xZS )	JanusVisionAttentionz(Attention Class for Janus Vision Encoderr%   c                 F   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        d| _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                        | _        |dkD  rt        j,                  |      nt        j.                         | _        |rt        j0                  | j                        nt        j.                         | _        |r%t        j0                  | j                        | _        y t        j.                         | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr   biasr   )rZ   r[   r%   r\   r]   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rk   r%   proj_dropoutqk_normrl   s       r9   r[   zJanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r8   rE   r   r   c                 >   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  d| j
                  | j                        }| j                  |      }|j	                  d| j
                  | j                        }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|f| j                  sdn| j                   | j"                  | j$                  d|\  }}|j	                  ||| j&                        }| j)                  |      }| j+                  |      }||fS )NrX   r   r   eager        )r   r   r   )rs   r   r   r   r{   r   r   r   r   r   r   r   r%   _attn_implementationr   r   r   r   r   r]   r   r   )rk   rE   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 r9   r   zJanusVisionAttention.forward.  s    "/!3!3!5
GQ{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJnn
%
 
%
!\ "))*gt~~N&&{3((0|##r8   N)r+   r,   r-   r?   r!   r[   r@   r   r   r   r   r   r   r   s   @r9   r   r     sO    2Q0 Q@ 26)$||)$ !.)$ +,	)$r8   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )JanusVisionMLPr%   c                    t         |           || _        t        |j                  |j
                  z        | _        t        |j                     | _	        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y r   )rZ   r[   r%   r   r\   	mlp_ratiointermediate_sizer	   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2rj   s     r9   r[   zJanusVisionMLP.__init__[  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r8   rE   rp   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   rk   rE   s     r9   r   zJanusVisionMLP.forwarde  sP    /**=9m4/m4r8   )	r+   r,   r-   r!   r[   r@   r   r   r   r   s   @r9   r   r   Z  s+    ?0 ?U\\ ell r8   r   c            
            e Zd Zdef fdZ	 ddej                  dej                  dee   de	ej                     fdZ xZS )	r(   r%   c                 R   t         |           |j                  | _        t	        j
                  | j                  |j                        | _        t        |      | _	        t	        j
                  | j                  |j                        | _
        t        |      | _        || _        y N)eps)rZ   r[   r\   r]   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr%   rj   s     r9   r[   z JanusVisionEncoderLayer.__init__o  st    ++<<F<Q<QR-f5<<F<Q<QR!&)r8   rE   r   output_attentionsrp   c                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rE   r   r   )r   r   r   r   )rk   rE   r   r   residualr   outputss          r9   r   zJanusVisionEncoderLayer.forwardx  s      !((7&*nn')/ '5 '
#|
 !=0 ((7/ =0 "&Gr8   r   )r+   r,   r-   r!   r[   r@   r   r   r   rH   rA   r   r   r   s   @r9   r(   r(   n  sW    0  -2	$||$ $ $D>	$
 
u  	!$r8   r(   c            
       x     e Zd ZdZdef fdZe	 	 	 d	deej                     dee
   dee
   defd       Z xZS )
JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r%   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
rZ   r[   r%   r   
ModuleListrangenum_hidden_layersr(   layersgradient_checkpointingrk   r%   r   rl   s      r9   r[   zJanusVisionEncoder.__init__  sP    mmeTZTlTlNm$n%<V%D$no&+# %os   A#r   r   output_hidden_statesrp   c                    ||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}| j                  D ]&  }|r||fz   } ||||      }	|	d   }|s||	d   fz   }( |r||fz   }t	        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr7   )r   r   r   )rD   rE   rF   )r%   r   r   r   r   )
rk   inputs_embedsr   r   r   encoder_statesall_attentionsrE   encoder_layerlayer_outputss
             r9   r   zJanusVisionEncoder.forward  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	  40d%![[ 	FM#!/=2B!B)"3M *!,M !/=3C2E!E	F  +}.>>N+(%
 	
r8   NNN)r+   r,   r-   r?   r!   r[   r   r   r@   r   r   r   r   r   r   s   @r9   r   r     sm    ,0 ,  26,0/3<
 !.<
 $D>	<

 'tn<
 
<
 <
r8   r   c                        e Zd ZU dZeed<   def fdZe	 	 	 	 	 ddee	j                     dee   dee   dee   dedeeef   fd	       Zd
 Z xZS )JanusVisionModelr   r%   c                     t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        | j                          y r   )rZ   r[   r%   r\   rO   rm   r   encoderr   r   r   post_layernorm	post_init)rk   r%   r]   rl   s      r9   r[   zJanusVisionModel.__init__  s]     &&	/7)&1 ll9&:O:OPr8   r   r   return_dictr   rp   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  ||||      }|d   }| j                  |      }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r   r   r   r  r   r   )rD   pooler_outputrE   rF   )r%   r   r   use_return_dictr   rm   r	  r
  r   rE   rF   )
rk   r   r   r   r  r   rE   encoder_outputsrD   pooled_outputs
             r9   r   zJanusVisionModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%}58KKK)/')77&11	
 	
r8   c                     | j                   S r   )rm   rk   s    r9   get_input_embeddingsz%JanusVisionModel.get_input_embeddings*  s    r8   )NNNNF)r+   r,   r-   main_input_namer!   r.   r[   r   r   r@   rA   r   r   rH   r   r   r  r   r   s   @r9   r  r    s    $O	0 	  59,0/3&*).(
u001(
 $D>(
 'tn	(

 d^(
 #'(
 
u00	1(
 (
Tr8   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr%   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w Nr   )rZ   r[   r   r   r\   projection_dimr   r   r   depthhidden_layersr	   r   r   r   s      r9   r[   zJanusVisionAlignerMLP.__init__/  s    99V//1F1FG]]NSTUW]WcWcNdeRYYv,,f.C.CDe
 $F$5$56 f   &1B<c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r   r  r   rk   rE   layers      r9   r   zJanusVisionAlignerMLP.forward8  G    /'' 	1E ..}=M!-0M	1 r8   )r+   r,   r-   r!   r[   r   r   r   s   @r9   r  r  .  s    70 7r8   r  c                        e Zd ZdZdef fdZdej                  fdZdej                  dej                  fdZ xZS )	JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r%   c                    t         |           |j                  | _        |j                  | _        t        |dd      | _        t        j                  | j                  | j                        | _	        |j                  gdz  | _        y )Nbetag      ?r   )rZ   r[   num_embeddingsr]   embedding_dimgetattrr&  r   re   	embeddingrc   quant_state_dimsrj   s     r9   r[   z"JanusVQVAEVectorQuantizer.__init__K  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8r8   hidden_statec           
      L   |j                  dddd      j                         }|j                  d| j                        }t	        j
                  |dz  dd      t	        j
                  | j                  j                  dz  d      z   dt	        j                  d	|| j                  j                  j                  dd            z  z
  }t	        j                  |d      }| j                  |      j                  |j                        }t	        j                  |j                         |z
  dz        | j                  t	        j                  ||j                         z
  dz        z  z   }|||z
  j                         z   }|j                  dddd      j                         }|||fS )
Nr   r   r   r   rX   T)r   keepdimr   z	bd,dn->bn)r|   r   r   r(  r@   sumr*  rw   einsumr   argminrv   meandetachr&  )rk   r,  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrK   s          r9   r   z!JanusVQVAEVectorQuantizer.forwardT  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BDNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e,"5"5"77A=\
 P
 

 *-?,-N,V,V,XX 0771aCNNP!4)===r8   image_tokensrp   c                 B   |j                   d   }| j                  j                  j                   d   }| j                  |      }t        j                  |dd      }|j                  |g| j                  |      }|j                  dddd      j                         }|S )Nr   rX   r   )r   r   r   r   )	rv   r*  rw   F	normalizer   r+  r|   r   )rk   r9  r   emb_dimr8  s        r9   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entryo  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r8   )r+   r,   r-   r?   r"   r[   r@   r   r   
LongTensorrA   r>  r   r   s   @r9   r$  r$  @  sD    9/ 9>ELL >6"u/?/? "EDUDU "r8   r$  c                   *     e Zd Z	 	 d fd	Zd Z xZS )JanusVQVAEResnetBlockc                    t         |           || _        ||n|| _        || _        t
        j                  j                  d|dd      | _        t
        j                  j                  ||ddd      | _
        t
        j                  j                  d|dd      | _        t
        j                  j                  |j                        | _        t
        j                  j                  ||ddd      | _        | j                  | j                  k7  r`| j                  r*t
        j                  j                  ||ddd      | _        y t
        j                  j                  ||ddd      | _        y y )	N    ư>T
num_groupsra   r   affiner   r   rT   rU   rV   r   )rZ   r[   rR   rS   use_conv_shortcutr@   r   	GroupNormnorm1r`   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rk   r%   rR   rS   rO  rl   s        r9   r[   zJanusVQVAEResnetBlock.__init__  s1    	&+7+?K\!.XX''2KUYbf'g
XX__[,AVWab_c
XX''2LVZcg'h
xx''7XX__\<QWXbc_d
t000%%%*XX__[,\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r8   c                    |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  |      }| j                  | j                  k7  r3| j                  r| j                  |      }||z   S | j                  |      }||z   S r   )rK  r@   sigmoidrL  rM  r   rN  rR   rS   rI  rO  rP  )rk   rE   r   s      r9   r   zJanusVQVAEResnetBlock.forward  s     

=1}55

=1

=1}55]3

=1t000%%--h7 -''  ,,X6-''r8   r   r+   r,   r-   r[   r   r   r   s   @r9   rA  rA    s    
 s.(r8   rA  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEAttnBlockc                    t         |           || _        t        j                  j                  d|dd      | _        t        j                  j                  ||ddd      | _        t        j                  j                  ||ddd      | _	        t        j                  j                  ||ddd      | _
        t        j                  j                  ||ddd      | _        y )NrC  rD  TrE  r   r   rH  )rZ   r[   rR   r@   r   rJ  normr`   qkvproj_outrk   rR   rl   s     r9   r[   zJanusVQVAEAttnBlock.__init__  s    &HH&&";TXae&f	kqQR\]^kqQR\]^kqQR\]^[aXYcder8   c                 t   |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }|j                  \  }}}}	|j                  ||||	z        j                  ddd      }|j                  ||||	z        }t        j                  ||      }
|
t        |      dz  z  }
t        j                  |
d      }
|j                  ||||	z        }|
j                  ddd      }
t        j                  ||
      j                  ||||	      }| j                  |      }||z   S )Nr   r   r   r   r/  )rW  rX  rY  rZ  rv   r{   r|   r@   bmmr   r;  r   r[  )rk   rE   r   r   r   r   r   channelsrn   ro   r   r   s               r9   r   zJanusVQVAEAttnBlock.forward  s5    		-0vvm,VVM*
vvm, /;.@.@+
Hfe#++J&5.QYYZ[]^`ab''
HfunM
yyz:#s8}'>?yy15 $++J&5.Q#++Aq!4iil;CCJPXZ`bghmmK0+%%r8   rS  r   s   @r9   rU  rU    s    f&r8   rU  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   r   rH  )rZ   r[   r   r`   convr\  s     r9   r[   z!JanusVQVAEConvDownsample.__init__  s'    IIk;AaYZ[	r8   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r   r   r   constantr   )padrt   r   )r;  rf  rc  r   s     r9   r   z JanusVQVAEConvDownsample.forward  s+    mJVWX		-0r8   rS  r   s   @r9   ra  ra    s    \r8   ra  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                 t    t         |           t        j                  j	                  ||ddd      | _        y )Nr   r   rH  )rZ   r[   r@   r   r`   rc  r\  s     r9   r[   zJanusVQVAEConvUpsample.__init__  s.    HHOOK!TU_`Oa	r8   c                 X    t        j                  |dd      }| j                  |      }|S )Ng       @nearest)scale_factorrt   )r;  r~   rc  r   s     r9   r   zJanusVQVAEConvUpsample.forward  s(    m#IV		-0r8   rS  r   s   @r9   rh  rh    s    br8   rh  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEMidBlockr%   r_  c                     t         |           t        |||      | _        t	        |      | _        t        |||      | _        y )Nr%   rR   rS   )rZ   r[   rA  block_1rU  attn_1block_2)rk   r%   r_  rl   s      r9   r[   zJanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
r8   rE   rp   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rq  rr  rs  r   s     r9   r   zJanusVQVAEMidBlock.forward  s2    ]3M2]3r8   )
r+   r,   r-   r"   r   r[   r@   r   r   r   r   s   @r9   rn  rn    s2    
/ 
3 
U\\ ell r8   rn  c                   >     e Zd Z fdZdej
                  fdZ xZS )JanusVQVAEEncoderc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }|j                  }|j                  }t        j                  j                  ||ddd      | _        dt        |      z   }|| _        t        j                          | _        t%        | j                        D ]   }t        j                          }	t        j                          }
|||   z  }|||   z  }t%        | j
                        D ]N  }|	j'                  t)        |||             |}|| j                  dz
  k(  s5|
j'                  t+        |             P t        j,                         }|	|_        |
|_        || j                  dz
  k7  rt3        |      |_        | j"                  j'                  |        t7        |      | _        t        j                  j;                  d|dd	      | _        t        j                  j                  ||rd
|z  n|ddd      | _        y )Nr   r   rH  )r   rp  rC  rD  TrE  r   ) rZ   r[   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrR   double_latentlatent_channelsr@   r   r`   conv_inrH   in_channel_multiplierr   downr   appendrA  rU  Moduleblockattnra  
downsamplern  midrJ  norm_outconv_out)rk   r%   r|  rR   r}  r~  ry  r  i_levelr  r  block_in	block_outi_blockr  rl   s                  r9   r[   zJanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{MqYZdef $u-?'@ @%:"MMO	T112 	#GMMOE==?D$'<W'EEH%(:7(CCI !4!45 
?)%$,%. %d22Q66KK 3H =>
? 99;DDJDI$..22":8"DIIT"-	#0 &fh7**bxUYbf*g#0Ao ( 
r8   r   c                    | j                  |      g}t        | j                        D ]  }t        | j                        D ]  } | j                  |   j
                  |   |d         }t        | j                  |   j                        dkD  r" | j                  |   j                  |   |      }|j                  |        || j                  dz
  k7  s|j                  | j                  |   j                  |d                 |d   }| j                  |      }| j                  |      }|t        j                  |      z  }| j                  |      }|S )NrX   r   r   )r  r   rz  r{  r  r  rx  r  r  r  r  r  r@   rR  r  )rk   r   rE   r  r  r,  rD   s          r9   r   zJanusVQVAEEncoder.forward/  sT   l34T112 		WG !4!45 3@tyy177@!"%  tyy)../!3#C499W#5#:#:7#CL#QL$$\23 $..22$$TYYw%7%B%B=QSCT%UV		W *"- HH%67 !MM*;<U]]+<== MM*;<  r8   )r+   r,   r-   r[   r@   r?  r   r   r   s   @r9   rv  rv    s    1
f!E$4$4 !r8   rv  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusVQVAEDecoderc           	      v   t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }||j                  | j                  dz
     z  }t        j                  j                  ||ddd      | _        t        ||      | _        t        j                         | _        t#        t%        | j                              D ]  }t        j                         }t        j                         }||j                  |   z  }	t%        | j
                  dz         D ]N  }
|j'                  t)        |||	             |	}|| j                  dz
  k(  s5|j'                  t+        |             P t        j,                         }||_        ||_        |dk7  rt3        |      |_        | j                   j'                  |        t        j                  j7                  d|dd	      | _        t        j                  j                  ||ddd      | _        y )
Nr   r   rH  rp  r   rC  rD  TrE  )rZ   r[   rx  ry  rz  r{  r|  r~  rS   r@   r   r`   r  rn  r  r   upreversedr   r  rA  rU  r  r  r  rh  upsamplerJ  r  r  )rk   r%   r|  r~  rS   r  r  r  r  r  r  r  rl   s               r9   r[   zJanusVQVAEDecoder.__init__I  s   "6#<#<=$33,, 00** !6#<#<T=Q=QTU=U#VV xxaXYcde &fh7 --/d&:&: ;< 	GMMOE==?D%(A(A'(JJI !4!4q!89 
?)%$,%. %d22Q66KK 3H =>
? BBHBG!|4X>GGNN2)	. **bxUYbf*g,AVWabcr8   r,  rp   c                 b   | j                  |      }| j                  |      }t        | j                        D ]  }t        | j                  dz         D ]l  } | j
                  |   j                  |   |      }t        | j
                  |   j                        dkD  sK | j
                  |   j                  |   |      }n || j                  dz
  k7  s| j
                  |   j                  |      } | j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr   r   )r  r  r   rz  r{  r  r  rx  r  r  r  r@   rR  r  )rk   r,  r  r  s       r9   r   zJanusVQVAEDecoder.forwardw  s    ||L1 xx- T112 	GG !4!4q!89 P>twww/55g>|Ltwww',,-1#A4777#3#8#8#A,#OLP $..22#www/88F	G }}\2l33}}\2r8   )r+   r,   r-   r[   r@   rA   r   r   r   s   @r9   r  r  H  s)    ,d\E$5$5 %:K:K r8   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                        e Zd ZU eed<   g dZdZdef fdZdej                  fdZ
dej                  dej                  fdZeedej                  deej                  ej                  f   fd	              Z xZS )

JanusVQVAEr%   )rU  rA  r$  r   c                    t         |   |       t        |      | _        t	        |      | _        t        j                  j                  |j                  |j                  d      | _        t        j                  j                  |j                  |j                  d      | _        | j                          t        |      | _        d| _        | j#                          y )Nr   F)rZ   r[   rv  r	  r$  quantizer@   r   r`   r~  r]   
quant_convpost_quant_convevalr  decoderr   r  rj   s     r9   r[   zJanusVQVAE.__init__  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+# 	r8   c                 z    | j                  |      }| j                  |      }| j                  |      \  }}}|||fS r   )r	  r  r  )rk   r   rE   quantemb_lossindicess         r9   encodezJanusVQVAE.encode  s@    \26#'==#? xh''r8   r9  rp   c                    |j                   d   | j                  j                  d   | j                  j                  d   z  k7  rMt        d| j                  j                  d   | j                  j                  d   z   d|j                    d      | j                  j	                  |      }| j                  |      }| j                  |      }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)rv   r  r+  r   r>  r  r  )rk   r9  codebook_entryrE   r   s        r9   decodezJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r8   c                     |j                   d   }| j                  |      \  }}}| j                  |j                  |d            }t	        ||      S )Nr   rX   )rv   r  r  r   r<   )rk   r   r   r  r>   r  r=   s          r9   r   zJanusVQVAE.forward  sQ     "''*
)-\)B&~w#{{7<<
B+GH 4nEEr8   )r+   r,   r-   r"   r.   r1   r  r[   r@   r?  r  rA   r  r   r   rH   r   r   r   s   @r9   r  r    s     
 %O/ (5#3#3 (5#3#3 8I8I & F''F 
u  %"3"33	4F  Fr8   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr%   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w r  )rZ   r[   r   r   r]   r  r   r   r   r   r  r	   r   r   r   s      r9   r[   zJanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqRYYv,,f.C.CDq
 $F$5$56 rr  c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r  r   s      r9   r   zJanusVQVAEAlignerMLP.forward  r"  r8   )r+   r,   r-   r"   r[   r   r   r   s   @r9   r  r    s    7/ 7r8   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r%   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j
                  |j                        | _        y r   )rZ   r[   r   r   image_token_embed_dimr  r[  r	   r   r   r'  vision_headrj   s     r9   r[   zJanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr8   rE   rp   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r[  r   r  r   s     r9   r   zJanusVQVAEHead.forward  s6    m4**=9((7r8   )r+   r,   r-   r?   r"   r[   r@   r   tensorr   r   r   s   @r9   r  r    s0    YS/ SU\\ ell r8   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       e Zd Zdef fdZd Zd Zd Zdej                  dej                  dej                  fd	Zee	 	 	 	 	 	 	 	 	 ddej                  d
ej                  deej                     deej                     dee   deej                     deej                     dee   deeej                  f   fd              Z xZS )
JanusModelr%   c                    t         |   |       || _        t        j	                  |j
                        | _        t        | j                  j                        | _        t        j	                  |j                        | _        t        j                  | j                  j                  j                  | j                  j                  j                        | _        t#        | j                  j                        | _        t'        | j                  j                        | _        t+        j,                  |j.                        | _        d| _        | j5                          y )N)r%   F)rZ   r[   r%   r  _from_configvision_configvision_modelr  alignerr  	vq_configvqmodelr   re   r'  r]   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr   r  rj   s     r9   r[   zJanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r8   c                 6    | j                   j                         S r   )r  r  r  s    r9   r  zJanusModel.get_input_embeddings  s    ""7799r8   c                 :    | j                   j                  |       y r   )r  set_input_embeddingsrk   r   s     r9   r  zJanusModel.set_input_embeddings  s    007r8   c                 ^    | j                  |      }| j                  |j                        }|S r   )r  r  rD   )rk   r   image_embedss      r9   get_image_featureszJanusModel.get_image_features  s,    ((6||L$B$BCr8   	input_idsr   image_featuresc                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }||   j                         |j                         k7  r0|j                  d   |j                  d   z  }t        d| d|       |S )z
        Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   devicerX   r   r   z6Image features and image tokens do not match: tokens: z, features )r  r@   r  r%   image_token_idlongr  allr0  rz   	expand_asr   numelrv   r   )rk   r  r   r  special_image_maskn_image_tokensn_image_featuress          r9   get_placeholder_maskzJanusModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno+,2248L8L8NN-33A69M9Ma9PPHHXXcdtcuv  "!r8   r   r   rW   r)   cache_position	use_cachelogits_to_keepc
                    |d u |d uz  rt        d      | | j                         |      }||| j                  |      }|j                  d|j                  d         }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||||	d|
}t        |j                  |j                  |j                  |j                  |      S d       S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onerX   )r   r  )r   r   rW   r)   r  r  r  )rD   r)   rE   rF   rG   r7   )r   r  r  r{   rv   r   r  r   r  masked_scatterr  rC   rD   r)   rE   rF   )rk   r  r   r   rW   r)   r  r   r  r  r   r  r  image_attention_mask	lm_outputs                  r9   r   zJanusModel.forward1  sH    -t";<s   7D557	BM#22<@L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M~^M'D'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r8   )	NNNNNNNNr   )r+   r,   r-   r    r[   r  r  r  r@   r?  rA   r  r   r   r   r   r
   r   r   r   r   r   r   s   @r9   r  r    s*   { *:8
"))":?:K:K"]b]n]n"0  '+*.1537+/5959$(34.
##.
 ''.
 !.	.

 u//0.
 "%.
 !!1!12.
   1 12.
 D>.
 c5<</0.
  .
r8   r  c                   b    e Zd ZddgZdZdef fdZd Zd Zde	j                  d	e	j                  fd
Zd Zd Zee	 	 	 	 	 	 	 	 	 	 dde	j                   de	j"                  dee	j                     dee	j                      dee   dee	j                      dee	j"                     dee	j                      dee   deee	j                  f   dee   fd              Z	 	 	 	 	 	 d fd	Zde	j                  fdZe	j8                  	 	 	 d de	j                  dee	j                      dee   f fd       Z xZS )!JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr%   c                     t         |   |       || _        t        |      | _        t        j                  |j                  j                  |j                  j                  d      | _
        | j                          y )NFr   )rZ   r[   r%   r  r&   r   r   r  r\   
vocab_sizelm_headr  rj   s     r9   r[   z&JanusForConditionalGeneration.__init__h  s\     '
yy!3!3!?!?ASASA^A^ejk 	r8   c                 J    | j                   j                  j                         S r   )r&   r  r  r  s    r9   r  z2JanusForConditionalGeneration.get_input_embeddingsq  s    zz((==??r8   c                 N    | j                   j                  j                  |       y r   )r&   r  r  r  s     r9   r  z2JanusForConditionalGeneration.set_input_embeddingst  s    

!!66u=r8   inputsrp   c                 r    | j                   j                  |      }| j                   j                  |      }|S r   )r&   r  r  )rk   r  r,  s      r9   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generationw  s0    zz77?zz44\Br8   c                     || _         y r   r&   )rk   r  s     r9   set_decoderz)JanusForConditionalGeneration.set_decoder|  s	    
r8   c                     | j                   S r   r  r  s    r9   get_decoderz)JanusForConditionalGeneration.get_decoder  s    zzr8   r  r   r   rW   r)   r  r   labelsr  r  r   c                     | j                   d|||||||	|d|}|j                  }t        |
t              rt	        |
 d      n|
}| j                  |dd|ddf         }d}|4 | j                  d||| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   rW   r)   r   r  r  N)rL   r  r  )rK   rL   r)   rE   rF   rG   r7   )r&   rD   
isinstancer   slicer  loss_functionr%   r  r  rJ   r)   rE   rF   rG   )rk   r  r   r   rW   r)   r  r   r  r  r  r   r   rE   slice_indicesrL   rK   s                    r9   r   z%JanusForConditionalGeneration.forward  s    , $** 

%)%+')

 

  118B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r8   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r)   r   r   r  r  r   r   )rZ   prepare_inputs_for_generation)rk   r  r   r)   r   r   r  r  r   model_inputsrl   s             r9   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation  sT     w<
+')))
 
 !!+7L(r8   r9  c                 x    | j                   j                  j                  |      }|j                  dddd      }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   r   )r&   r  r  r|   )rk   r9  decoded_images      r9   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9r8   logits_processorc           	         |j                  d| j                        }t        j                  |      }|j                  dd      }|dk(  rt	        %|   d|||d d|S  |j                  di |}|j                         t        j                  t        j                  fvrt        d      |j                          | j                  |j                                ||n	t               }d|d<   |j                  t         j#                  d       d	|_        |j                  |d
<   | j%                  ||j&                  |      \  }}	}|j(                  |j*                  }}
t-        |j.                        dk7  rt        d|j.                   d      |d u}| j1                  |||j*                         |j                  r:|j                  dkD  r+|j3                  t5        |j                               d |_        | j7                  ||j.                  d   |d ||      } | j8                  d|||j:                  d|\  }}| j<                  j>                  j@                  jB                  }|j.                  \  }}|jE                  dd      }|j                  dd       }|jE                  dd      }||d<   ||d d d f   |j&                  k7  ||d d d f   |jF                  d   k7  z  }||d d d f   jI                  ||jJ                          | jM                         |      }| jO                  |||      }|jQ                  dd       A| jS                  |jT                  xs d|dz  tW        |jX                  ||z         ||      |d<   t[        j\                  ||f|
|      }|j^                  }|j`                  }|jb                  }|jd                  }|jf                  }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti        |      D ]x  } | jj                  d||d|}|d   jm                  |j*                        |d<   |d   jm                  |j*                        |d<    | j<                  jn                  di |||d}| jq                  ||      }|jr                  d d dd d f   ju                         } | j<                  jw                  |       }! |||!      }"|jx                  r>t[        jz                  |"d      }#t[        j|                  |#d      j                  d      }$nt[        j                  |"d      }$|$|d d |f<   t[        j                  |$|$g      }$|$j                  d      }$| j                  |$      }{ |r@|r|!fz  }|r| j                         fz  }|r|j                  z  }|r|j                  z  }|rt        |!|||j                        S |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr)   static)cache_implementationr   max_cache_lenr  model_kwargsr  r7   )r   r  r  )r   r   rX   r/  )num_samples)	sequencesscoresrL   rF   rE   r)   )Ipopr  copydeepcopyrZ   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  loggerwarning_prepare_model_inputsbos_token_idr   r  rx  rv   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr&   r  r%   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionget
_get_cacher  max
max_lengthr@   zerosr   r   output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationrD   cloner  	do_sampler   multinomialsqueezeargmaxcatrz   r  floatrF   rE   r   r)   )&rk   r  r   r  r   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskr  r   r   input_tokensmaskr   generated_tokensr   r   r(  r)  r*  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r   r,  r
  next_token_scoresprobs
next_tokenrl   s&                                        r9   r  z&JanusForConditionalGeneration.generate  s    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   0(//9&9 002>;P;PR`RnRn:ool  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N%22L5
1	#\ ")9)9vy1$MiooM^EF  %3$$>!$$%68QZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #E$"D"D #
))>>#
 	#
	<  ::2299JJ'oo
G ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW3113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@PSZ@Z[) /> /L*+ !;;
4D'EU[ab .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'( #	UA=4== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*/djj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG#	UJ #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r8   )
NNNNNNNNNr   )NNNNNNr  ) r+   r,   r-   _tied_weights_keysr5   r    r[   r  r  r@   r   r  r  r  r   r   r?  rA   r   r
   r   r   r   r   r   r   r  r  no_gradr   r  r   r   s   @r9   r  r  d  s   DFVW!{ @>ell u|| 
  '+*.1537+/5959-1$(341
##1
 ''1
 !.	1

 u//01
 "%1
 !!1!121
   1 121
 ))*1
 D>1
 c5<</01
 +,1
  1
l <
 
 ]]  $59:>	}$}$ !!1!12}$ ##67	}$ }$r8   r  )r$   r  r  r  r  )r   )Qr  dataclassesr   typingr   r   r   r@   r   activationsr	   cache_utilsr
   
generationr   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_janusr    r!   r"   torch.nn.functionalr}   r;  
get_loggerr+   r  r$   r<   rC   rJ   r  rO   r   r   r   r2  r   r   r   r(   r   r  r  r$  rA  rU  ra  rh  rn  rv  r  r  r  r  r  r  __all__r7   r8   r9   <module>rR     sL  ,  ! , ,   !   u u 9 9 X X F &   Q Q ## 
		H	% 
.? 
. 
. 
	-{ 	- 	- 
C; C C< 
C+ C C6HBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4I$299 I$XRYY (.8 .bM
 M
` ;+ ; ;|BII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH :F% :F:Fz299 $RYY   
i
% i

i
X{$$8/ {$|	 tr8   