from typing import TYPE_CHECKING, Any, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["fbgemm-gpu", "accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)."
            )
        if not is_fbgemm_gpu_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires the fbgemm-gpu library. "
                "Please install the latest version of fbgemm-gpu by following: "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )
        if not is_accelerate_available("0.32.2"):
            raise ImportError(
                "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)"
            )

        if not torch.cuda.is_available():
            raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU")

        compute_capability = torch.cuda.get_device_capability()
        major, minor = compute_capability
        if major < 9:
            raise ValueError(
                "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)"
            )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to "
                "requirements of `fbgemm-gpu` to enable model loading in fp8. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers, or pass "
                "torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        elif torch_dtype == torch.float16:
            raise ValueError(
                "You cannot use FP8 with torch_dtype=torch.float16. "
                "We recommend passing torch_dtype=torch.bfloat16."
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ):
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if self.pre_quantized or tensor_name == "bias":
                return False
            else:
                if tensor_name == "gate_up_proj_scale" or tensor_name == "down_proj_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
    ):
        """
        Quantizes weights into weight and weight_scale
        """
        from ..integrations import FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if tensor_name == "gate_up_proj":
                # Transpose the last two dimensions so quantization happens per row
                transposed_param = param_value.transpose(1, 2)

                # Flatten to 2D for per-row quantization
                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])

                # Quantize per row instead of per column
                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)

                # Restore the original expert layout
                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], 1, original_shape[1])
            elif tensor_name == "down_proj":
                # Same per-row scheme for the down projection
                transposed_param = param_value.transpose(1, 2)

                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])

                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)

                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], original_shape[1], 1)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(weight_scale.to(target_device))
        else:
            # Plain linear layers: one FP8 tensor plus one scale per output row
            new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(
                weight_scale.view(weight_scale.shape[0], 1).to(target_device)
            )

        module._parameters[tensor_name] = torch.nn.Parameter(new_value.to(target_device))

        if unexpected_keys is not None and param_name in unexpected_keys:
            unexpected_keys.remove(param_name)

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_fbgemm_fp8_linear

        tp_plan = model._tp_plan
        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        config = model.config
        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
            config=config,
            tp_plan=tp_plan,
        )

        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "Llama4" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.input_layernorm.weight": "sequence_parallel",
                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
                "norm.weight": "sequence_parallel",
                "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts": "local",
                "layers.*.feed_forward": "gather",
                "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
                "layers.*.feed_forward.experts.down_proj": "local_rowwise",
            }

            if config.get_text_config() is not None:
                config.get_text_config().base_model_tp_plan = text_plan
            else:
                config.base_model_tp_plan = text_plan
            return config

        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False
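
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the upstream module): this
# quantizer is not constructed by hand; it is selected automatically when a
# model is loaded with an `FbgemmFp8Config`. A minimal example, assuming a
# CUDA GPU with compute capability >= 9.0, `fbgemm-gpu` installed, and an
# example checkpoint id:
#
#     from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#     quantization_config = FbgemmFp8Config()
#     model = AutoModelForCausalLM.from_pretrained(
#         "meta-llama/Meta-Llama-3-8B",  # example checkpoint
#         torch_dtype="bfloat16",
#         device_map="cuda",
#         quantization_config=quantization_config,
#     )
#
# For intuition, `torch.ops.fbgemm.quantize_fp8_per_row` used above returns a
# float8_e4m3fn tensor plus one scale per row, so a weight can be approximately
# reconstructed with something like:
#
#     w = torch.randn(4, 8, dtype=torch.bfloat16, device="cuda")
#     w_fp8, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
#     w_approx = w_fp8.to(torch.bfloat16) * w_scale.view(-1, 1)
# ---------------------------------------------------------------------------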