
from typing import TYPE_CHECKING, Any, Optional

from ..utils import is_accelerate_available, is_torch_available, is_torch_xpu_available, logging
from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)


class FineGrainedFP8HfQuantizer(HfQuantizer):
    """
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)."
            )

        if not is_accelerate_available():
            raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")

        if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
            raise ValueError(
                "Converting into FP8 weights from tf/flax weights is currently not supported, "
                "please make sure the weights are in PyTorch format."
            )

        if not (torch.cuda.is_available() or is_torch_xpu_available()):
            raise RuntimeError("No GPU or XPU found. A GPU or XPU is needed for FP8 quantization.")

        if torch.cuda.is_available():
            compute_capability = torch.cuda.get_device_capability()
            major, minor = compute_capability
            if major < 8 or (major == 8 and minor < 9):
                raise ValueError(
                    "FP8 quantized models are only supported on GPUs with compute capability >= 8.9 "
                    f"(e.g. 4090/H100), actual = `{major}.{minor}`"
                )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        elif not self.pre_quantized and isinstance(device_map, dict):
            if "cpu" in device_map.values() or "disk" in device_map.values():
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the cpu/disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            logger.info("Setting torch_dtype to torch.float32 as no torch_dtype was specified in from_pretrained")
            torch_dtype = torch.float32
        return torch_dtype

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
    ):
        """
        Quantizes weights to FP8 format using block-wise quantization.
        """
        from ..modeling_utils import _load_parameter_into_model

        param_value = param_value.to(target_device)

        # Representable range of the target FP8 format
        fp8_min = torch.finfo(torch.float8_e4m3fn).min
        fp8_max = torch.finfo(torch.float8_e4m3fn).max

        block_size_m, block_size_n = self.quantization_config.weight_block_size
        rows, cols = param_value.shape[-2:]

        if rows % block_size_m != 0 or cols % block_size_n != 0:
            raise ValueError(
                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})"
            )
        param_value_orig_shape = param_value.shape

        # Split the matrix into (block_size_m, block_size_n) tiles
        param_value = param_value.reshape(
            -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n
        ).permute(0, 1, 3, 2, 4)

        # One scaling factor per tile, chosen so the largest magnitude maps to fp8_max
        max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
        scale = fp8_max / max_abs
        scale_orig_shape = scale.shape
        scale = scale.unsqueeze(-1).unsqueeze(-1)

        # Quantize the weights
        quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

        # Restore the original layout
        quantized_param = quantized_param.permute(0, 1, 3, 2, 4)
        quantized_param = quantized_param.reshape(param_value_orig_shape)

        # Store the inverse scale (one value per block), used to dequantize at runtime
        scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()

        _load_parameter_into_model(model, param_name, quantized_param)
        _load_parameter_into_model(model, param_name.rsplit(".", 1)[0] + ".weight_scale_inv", scale)

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ) -> bool:
        from ..integrations.finegrained_fp8 import FP8Linear

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, FP8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale_inv":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations.finegrained_fp8 import replace_with_fp8_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        model = replace_with_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
        )

        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import FP8Linear

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, FP8Linear):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "Qwen3" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn.o_proj.weight_scale_inv": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.mlp.gate_proj.weight": "local_colwise",
                "layers.*.mlp.gate_proj.weight_scale_inv": "local_colwise",
                "layers.*.mlp.up_proj.weight": "local_colwise",
                "layers.*.mlp.up_proj.weight_scale_inv": "local_colwise",
                "layers.*.mlp.down_proj.weight": "local_rowwise",
                "layers.*.mlp.down_proj.weight_scale_inv": "local_rowwise",
                "layers.*.mlp": "gather",
            }
            config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False

    def get_cuda_warm_up_factor(self):
        return 2
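

# Illustrative sketch only, not part of the quantizer API above: a self-contained
# block-wise e4m3fn quantize/dequantize round trip mirroring the math in
# `create_quantized_param`. It assumes torch >= 2.1 with CPU support for
# torch.float8_e4m3fn; the (128, 128) block size and tensor shape are arbitrary
# example values, not values read from a real quantization_config.
if __name__ == "__main__":
    import torch

    block_m, block_n = 128, 128
    weight = torch.randn(256, 512)

    fp8_min = torch.finfo(torch.float8_e4m3fn).min
    fp8_max = torch.finfo(torch.float8_e4m3fn).max

    rows, cols = weight.shape
    # Split the matrix into (block_m, block_n) tiles, one scaling factor per tile
    blocks = weight.reshape(rows // block_m, block_m, cols // block_n, block_n).permute(0, 2, 1, 3)
    scale = fp8_max / torch.amax(blocks.abs(), dim=(-1, -2), keepdim=True)

    quantized = torch.clamp(blocks * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
    # The per-block reciprocal is what the quantizer stores as `.weight_scale_inv`
    weight_scale_inv = scale.squeeze(-1).squeeze(-1).reciprocal()

    # Dequantize and report the reconstruction error
    dequantized = quantized.to(torch.float32) * weight_scale_inv.unsqueeze(-1).unsqueeze(-1)
    dequantized = dequantized.permute(0, 2, 1, 3).reshape(rows, cols)
    print("max abs reconstruction error:", (weight - dequantized).abs().max().item())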