"""PyTorch Qwen3 model."""

from typing import Callable, Optional

import torch

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import LlamaAttention
from ..qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2ForCausalLM,
    Qwen2ForQuestionAnswering,
    Qwen2ForSequenceClassification,
    Qwen2ForTokenClassification,
    Qwen2Model,
    Qwen2PreTrainedModel,
    Qwen2RMSNorm,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"


class Qwen3RMSNorm(Qwen2RMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass
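
# Note on the modular pattern (explanatory comment, not part of the upstream
# file): in "modular transformers" sources like this one, an empty subclass
# such as `Qwen3MLP(GemmaMLP)` means "same implementation as the parent,
# renamed for Qwen3". The converter script (utils/modular_model_converter.py
# in the transformers repo) expands these stubs into full, standalone class
# definitions in the generated modeling_qwen3.py.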
   de	ej                     d	ee   d
eej                  e	ej                     e	eej                        f   fdZ xZS )Qwen3Attentionconfig	layer_idxc                    t         |   ||       t        | j                  |j                        | _        t        | j                  |j                        | _        |j                  |   dk(  r|j                  | _        y d | _        y )N)epssliding_attention)	super__init__r   head_dimrms_norm_epsq_normk_normlayer_typessliding_window)selfr*   r+   	__class__s      r$   r0   zQwen3Attention.__init__<   sk    +"4==f6I6IJ"4==f6I6IJ7=7I7I)7TXk7kf33qur#   hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc                    |j                   d d }g |d| j                  }| j                  | j                  |      j	                  |            j                  dd      }	| j                  | j                  |      j	                  |            j                  dd      }
| j                  |      j	                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  rt        | j                  j                     } || |	|
||f| j                   sdn| j"                  | j$                  | j&                  d|\  }} |j(                  g |d j+                         }| j-                  |      }||fS )Nr   r   )sincosr=   eagerg        )dropoutscalingr6   )shaper1   r3   q_projview	transposer4   k_projv_projr   updater+   r   r*   _attn_implementationr	   trainingattention_dropoutrF   r6   reshape
contiguouso_proj)r7   r9   r:   r;   r<   r=   r>   input_shapehidden_shapequery_states
key_statesvalue_statesrC   rB   cache_kwargsattention_interfaceattn_outputattn_weightss                     r$   forwardzQwen3Attention.forwardB   s    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ *k));;;;FFHkk+.L((r#   )NN)r   r    r!   r   intr0   torchTensortupler   r   
LongTensorr
   r   r]   __classcell__r8   s   @r$   r)   r)   ;   s    v{ vs v +/59*)||*) #5<<#=>*) !.	*)
 !*) !!1!12*) -.*) 

class Qwen3DecoderLayer(Qwen2DecoderLayer):
    pass


class Qwen3PreTrainedModel(Qwen2PreTrainedModel):
    pass

class Qwen3Model(Qwen2Model):
    pass


class Qwen3ForCausalLM(Qwen2ForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(Qwen2ForSequenceClassification):
    pass


class Qwen3ForTokenClassification(Qwen2ForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(Qwen2ForQuestionAnswering):
    pass
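
# Illustrative sketch (not part of the upstream file): how config.layer_types
# gates the sliding window, mirroring the check in Qwen3Attention.__init__.
# With the default Qwen3Config (sliding window disabled) every layer uses
# "full_attention", so each entry below stays None.
def _sliding_window_demo() -> list[Optional[int]]:
    config = Qwen3Config(num_hidden_layers=2)
    return [
        config.sliding_window if layer_type == "sliding_attention" else None
        for layer_type in config.layer_types
    ]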
   utilsr   r   gemma.modeling_gemmar   llama.modeling_llamar   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   r   r   configuration_qwen3r   
get_loggerr   logger_CHECKPOINT_FOR_DOCr   r'   r)   rf   rh   rj   rl   rp   rr   rt   __all__r"   r#   r$   <module>r      s     %    B 6 5 & 0 +   - 
		H	%% 	< 		x 	1)^ 1)h	) 		/ 		 	/' /<	%C 		"= 		 9 	r#   