
import logging
from typing import Callable, Optional

import torch

from ..cache_utils import DynamicCache, EncoderDecoderCache, HybridCache, StaticCache
from ..generation.configuration_utils import GenerationConfig
from ..masking_utils import (
    ALL_MASK_ATTENTION_FUNCTIONS,
    _ignore_causal_mask_sdpa,
    _is_torch_greater_or_equal_than_2_5,
    prepare_padding_mask,
)
from ..modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ..pytorch_utils import is_torch_greater_or_equal, is_torch_greater_or_equal_than_2_3

class TorchExportableModuleForVLM:
    """
    A wrapper class for exporting Vision-Language Models (VLMs) like SmolVLM2 for ExecuTorch.

    This class handles the export of three main components:
        1. Vision encoder (processes images to visual features)
        2. Connector/projector (maps visual features to text embedding space)
        3. Text decoder (generates text from combined visual and text tokens)
    """

    def __init__(self, model, max_batch_size: int = 1, max_cache_len: int = 4096):
        """
        Initialize the exportable VLM module.

        Args:
            model: The VLM (e.g. SmolVLM) model instance
            max_batch_size: Maximum batch size. Always 1 for ExecuTorch
            max_cache_len: Maximum cache length for text generation
        """
        self.model = model
        self.max_batch_size = max_batch_size
        self.max_cache_len = max_cache_len
        self.config = model.config

        # Individual components of the VLM
        self.vision_encoder = model.model.vision_model
        self.connector = model.model.connector
        self.text_decoder = model.model.text_model

        # Exported programs, filled in by the export_* methods below
        self.exported_vision_encoder = None
        self.exported_connector = None
        self.exported_text_decoder = None

    def export_vision_encoder(self, **kwargs):
        """Export the vision encoder component."""
        self.vision_encoder.eval()

        # Nominal example image; the spatial dimensions are exported as dynamic
        pixel_values = torch.randn(1, 3, 512, 512, dtype=torch.float32)
        dynamic_shapes = {"pixel_values": {2: torch.export.Dim.AUTO, 3: torch.export.Dim.AUTO}}
        self.exported_vision_encoder = torch.export.export(
            self.vision_encoder,
            args=(pixel_values,),
            dynamic_shapes=dynamic_shapes,
            strict=False,
        )
        return self.exported_vision_encoder

    def export_connector(self, **kwargs):
        """Export the connector component."""
        self.connector.eval()

        vision_hidden_size = self.config.vision_config.hidden_size
        image_size = self.config.vision_config.image_size
        patch_size = self.config.vision_config.patch_size
        patches_per_dim = image_size // patch_size
        num_patches = patches_per_dim * patches_per_dim

        image_hidden_states = torch.randn(1, num_patches, vision_hidden_size, dtype=torch.float32)
        dynamic_shapes = {"image_hidden_states": {1: torch.export.Dim.AUTO}}
        self.exported_connector = torch.export.export(
            self.connector,
            args=(image_hidden_states,),
            dynamic_shapes=dynamic_shapes,
            strict=False,
        )
        return self.exported_connector

    def export_text_decoder(self, **kwargs):
        """Export the text decoder component."""
        self.exportable_text_decoder = TorchExportableModuleForDecoderOnlyLM(
            model=self.text_decoder,
            max_batch_size=self.max_batch_size,
            max_cache_len=self.max_cache_len,
        )

        seq_length = 3
        input_ids = torch.zeros((1, seq_length), dtype=torch.long)
        cache_position = torch.arange(seq_length, dtype=torch.long)
        max_seq_length = min(self.max_cache_len, self.config.text_config.max_position_embeddings)
        seq_len_dim = torch.export.Dim("seq_length_dim", max=max_seq_length - 1)
        dynamic_shapes = {
            "input_ids": {1: seq_len_dim},
            "cache_position": {0: seq_len_dim},
        }
        self.exported_text_decoder = self.exportable_text_decoder.export(
            input_ids=input_ids,
            cache_position=cache_position,
            dynamic_shapes=dynamic_shapes,
            strict=False,
        )
        return self.exported_text_decoder

    def export(self, **kwargs):
        """Export all components of the VLM model."""
        self.export_vision_encoder(**kwargs)
        self.export_connector(**kwargs)
        self.export_text_decoder(**kwargs)
        return {
            "vision_encoder": self.exported_vision_encoder,
            "connector": self.exported_connector,
            "text_decoder": self.exported_text_decoder,
        }

    def forward(self, pixel_values, input_ids, cache_position):
        """
        Simplified forward pass for inference with guaranteed non-null input_ids and cache_position.

        Args:
            pixel_values: Input images [1, channels, height, width] (optional)
            input_ids: Text token IDs [1, seq_len] (required - won't be None)
            cache_position: Cache positions [seq_len] (required - won't be None)

        Returns:
            Output with logits for text generation
        """

    def generate(
        self, pixel_values=None, input_ids=None, max_new_tokens=50, do_sample=False, temperature=1.0, **kwargs
    ):
        """
        Simplified generate method with guaranteed non-null input_ids.

        Args:
            pixel_values: Input images [1, channels, height, width] (optional)
            input_ids: Initial text tokens [1, seq_len] (required - won't be None)
            max_new_tokens: Maximum number of tokens to generate
            do_sample: Whether to use sampling or greedy decoding
            temperature: Temperature for sampling

        Returns:
            Generated sequences
        """


class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
    """
    A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
    specifically for decoder-only LM with cache. This module ensures that the
    exported model is compatible with further lowering and execution in `ExecuTorch`.
    """

    def __init__(
        self,
        model: PreTrainedModel,
        max_batch_size: int = 1,
        max_cache_len: int = 4096,
    ):
        """
        Initializes the exportable module with the cache implementation appropriate for the model.

        Args:
            model (`PreTrainedModel`): The pretrained model to wrap.
            max_batch_size (int): Maximum batch size for the cache.
            max_cache_len (int): Maximum sequence length for the cache.

        Raises:
            ValueError: If the model is configured with an unsupported cache implementation.
        """
        super().__init__()

        if not hasattr(model.config, "use_cache") or model.config.use_cache is False:
            raise ValueError("The model must have caching enabled to be performant.")

        if hasattr(model.config, "layer_types") and getattr(model.config, "sliding_window", None) is not None:
            self.model = TorchExportableModuleWithHybridCache(model, max_batch_size, max_cache_len)
        else:
            # If `layer_types` is not specified or there is no sliding window, a plain `StaticCache` is enough.
            logging.info(
                "Using `StaticCache` for export as `layer_types` is not specified or `sliding_window` is `null` in the config."
            )
            self.model = TorchExportableModuleWithStaticCache(model, max_batch_size, max_cache_len)

        # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable.
        ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
        ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
        self.model.model.config._attn_implementation = "sdpa_without_vmap"

    def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the module, which is compatible with the ExecuTorch llm runner.

        Args:
            input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
            cache_position (`torch.Tensor`): Tensor representing current input position in the cache.

        Returns:
            torch.Tensor: Logits output from the model.
        """
        return self.model.forward(input_ids, cache_position)

    def export(
        self,
        input_ids: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        dynamic_shapes: Optional[dict] = None,
        strict: Optional[bool] = None,
    ) -> torch.export.ExportedProgram:
        """
        Export the wrapped module using `torch.export`.

        Args:
            input_ids (`Optional[torch.Tensor]`):
                Tensor representing current input token id to the module. If not provided, a default tensor will be used.
            cache_position (`Optional[torch.Tensor]`):
                Tensor representing current input position in the cache. If not provided, a default tensor will be used.
            dynamic_shapes (`Optional[dict]`):
                Dynamic shapes to use for export if specified.
            strict (`Optional[bool]`):
                Flag to instruct `torch.export` to use `torchdynamo`.
        """
        if hasattr(self.model, "base_model_prefix"):
            base = getattr(self.model, self.model.base_model_prefix, self.model)
            model_device = base.device
        elif hasattr(self.model, "model"):
            model_device = self.model.model.device
        else:
            model_device = "cpu"
            logging.warning(
                "TorchExportableModuleForDecoderOnlyLM.export Can't infer device from the model. Set to CPU by default."
            )

        example_input_ids = (
            input_ids if input_ids is not None else torch.tensor([[1]], dtype=torch.long, device=model_device)
        )
        example_cache_position = (
            cache_position
            if cache_position is not None
            else torch.tensor([0], dtype=torch.long, device=model_device)
        )

        exported_program = torch.export.export(
            self.model,
            args=(example_input_ids, example_cache_position),
            kwargs={},
            dynamic_shapes=dynamic_shapes,
            strict=strict if strict is not None else True,
        )
        return exported_program

    @staticmethod
    def generate(
        exported_program: torch.export.ExportedProgram,
        tokenizer,
        prompt: str,
        max_new_tokens: int = 20,
        do_sample: bool = False,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 1.0,
        device: str = "cpu",
    ) -> str:
        """
        Generate a sequence of tokens using an exported program.

        Args:
            exported_program (`torch.export.ExportedProgram`): The exported model being used for generate.
            tokenizer: The tokenizer to use.
            prompt (str): The input prompt.
            max_new_tokens (int): Maximum number of new tokens to generate.
            do_sample (bool): Whether to use sampling or greedy decoding.
            temperature (float): The temperature for sampling.
            top_k (int): The number of highest probability tokens to keep for top-k sampling.
            top_p (float): The cumulative probability for nucleus sampling.
            device (str): The device to use.

        Returns:
            str: The generated text.
        """
        exported_module = exported_program.module()
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        generated_ids = input_ids.clone()

        # Prefill the cache one prompt token at a time
        curr_position = 0
        for i in range(input_ids.shape[1]):
            curr_input_ids = input_ids[:, i : i + 1]
            curr_cache_position = torch.tensor([curr_position], dtype=torch.long, device=device)
            _ = exported_module(curr_input_ids, curr_cache_position)
            curr_position += 1

        # Decode new tokens one at a time
        for _ in range(max_new_tokens):
            curr_input_ids = generated_ids[:, -1:]
            curr_cache_position = torch.tensor([curr_position], dtype=torch.long, device=device)
            outputs = exported_module(curr_input_ids, curr_cache_position)

            # Use the logits of the last position in the window
            logits = outputs[:, -1, :]
            if do_sample:
                if temperature > 0:
                    logits = logits / temperature

                if top_k > 0:
                    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
                    logits[indices_to_remove] = float("-inf")

                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    # Shift so that the first token above the threshold is always kept
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                    logits[indices_to_remove] = float("-inf")

                probs = torch.softmax(logits, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)
            else:
                next_token_id = logits.argmax(dim=-1, keepdim=True)

            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)
            curr_position += 1

            if next_token_id.item() == tokenizer.eos_token_id:
                break

        return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


class TorchExportableModuleWithStaticCache(torch.nn.Module):
    """
    A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
    specifically for decoder-only LM to `StaticCache`. This module ensures that the
    exported model is compatible with further lowering and execution in `ExecuTorch`.

    Note:
        This class is specifically designed to support export process using `torch.export`
        in a way that ensures the model can be further lowered and run efficiently in `ExecuTorch`.
    """

    def __init__(
        self,
        model: PreTrainedModel,
        max_batch_size: int = 1,
        max_cache_len: int = 4096,
    ):
        """
        Initializes the wrapper module with the pretrained model.

        Args:
            model (`PreTrainedModel`): The pretrained model to wrap. The model must have caching
                enabled and use a 'static' caching implementation.

        Raises:
            AssertionError: If the pretrained model does not have caching enabled or if it does
                not use a 'static' caching implementation in `model.generation_config`.
        """
        super().__init__()

        if model.generation_config is None:
            # Fall back to a static-cache generation config built from the constructor arguments
            model.generation_config = GenerationConfig(
                use_cache=model.config.use_cache,
                cache_implementation="static",
                max_length=max_cache_len,
                cache_config={
                    "batch_size": max_batch_size,
                    "max_cache_len": max_cache_len,
                },
            )

        if not model.generation_config.use_cache:
            raise AssertionError(
                "The model must have caching enabled to be exported with static caching. "
                "Please set `generation_config.use_cache=True`."
            )
        if model.generation_config.cache_implementation != "static":
            raise AssertionError(
                "The model must use a 'static' caching implementation to be exported with static caching. "
                "Please set `generation_config.cache_implementation='static'`."
            )

        self.model = model
        self.static_cache = StaticCache(
            config=self.model.config,
            max_batch_size=self.model.generation_config.cache_config.get("batch_size"),
            max_cache_len=self.model.generation_config.cache_config.get("max_cache_len"),
            device=self.model.generation_config.cache_config.get("device"),
            dtype=self.model.dtype,
        )
        # Register the cache tensors as buffers so they become state of the exported program
        for i in range(len(self.static_cache)):
            self.register_buffer(f"key_cache_{i}", self.static_cache.layers[i].keys, persistent=False)
            self.register_buffer(f"value_cache_{i}", self.static_cache.layers[i].values, persistent=False)

    def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor):
        """
        Forward pass of the module, which is compatible with the ExecuTorch runtime.

        Args:
            input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
            cache_position (`torch.Tensor`): Tensor representing current input position in the cache.

        Returns:
            torch.Tensor: Logits output from the model.

        This forward adapter serves two primary purposes:

        1. **Making the Model `torch.export`-Compatible**:
            The adapter hides unsupported objects, such as the `Cache`, from the graph inputs and outputs,
            enabling the model to be exportable using `torch.export` without encountering issues.

        2. **Ensuring Compatibility with `ExecuTorch` runtime**:
            The adapter matches the model's forward signature with that in `executorch/extension/llm/runner`,
            ensuring that the exported model can be executed in `ExecuTorch` out-of-the-box.
        """
        _, seqlen = input_ids.shape
        position_ids = cache_position.unsqueeze(0)
        past_key_values = self.static_cache

        outs = self.model(
            input_ids=input_ids,
            attention_mask=None,
            position_ids=position_ids,
            cache_position=cache_position,
            past_key_values=past_key_values,
            use_cache=True,
        )
        return outs.logits if hasattr(outs, "logits") else outs.last_hidden_state

    @staticmethod
    def generate(
        exported_program: torch.export.ExportedProgram, prompt_token_ids: torch.Tensor, max_new_tokens: int
    ) -> torch.Tensor:
        """
        Generate a sequence of tokens using an exported program.

        This util function is designed to test exported models by simulating the generation process.
        It processes the input prompt tokens sequentially (no parallel prefill).
        This generate function is not intended to replace the original `generate` method, and the support
        for leveraging the original `generate` is potentially planned!

        Args:
            exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
            prompt_token_ids (`torch.Tensor`): Tensor representing the input prompt token IDs.
            max_new_tokens (`int`): Maximum number of new tokens to generate. Note that the total generation
                length is limited by both `max_new_tokens` and the model's cache size.

        Returns:
            torch.Tensor: A tensor containing the generated sequence of token IDs, including the original prompt tokens.
        """
        device = prompt_token_ids.device
        prompt_token_len = prompt_token_ids.shape[-1]
        max_generation_length = prompt_token_len + max_new_tokens

        # The total generation length cannot exceed the static cache size baked into the exported program
        for buffer_name, buffer in exported_program.named_buffers():
            if buffer_name.startswith("key_cache"):
                max_cache_len = buffer.shape[2]
                max_generation_length = min(max_generation_length, max_cache_len)
                break

        response_tokens = []
        # Feed the prompt token by token (sequential prefill)
        for input_pos in range(min(max_generation_length, prompt_token_len)):
            result = exported_program.module().forward(
                input_ids=prompt_token_ids[:, input_pos : input_pos + 1],
                cache_position=torch.tensor([input_pos], dtype=torch.long, device=device),
            )
            response_tokens.append(prompt_token_ids[0][input_pos].item())

        current_token = torch.argmax(result[:, -1, :], dim=-1).item()
        response_tokens.append(current_token)

        # Greedy decoding until the cache is full or `max_new_tokens` is reached
        while len(response_tokens) < max_generation_length:
            result = exported_program.module().forward(
                input_ids=torch.tensor([[current_token]], dtype=torch.long, device=device),
                cache_position=torch.tensor([len(response_tokens)], dtype=torch.long, device=device),
            )
            current_token = torch.argmax(result[:, -1, :], dim=-1).item()
            response_tokens.append(current_token)

        return torch.tensor([response_tokens], dtype=torch.long, device=device)


class TorchExportableModuleWithHybridCache(torch.nn.Module):
    """
    A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
    specifically for decoder-only LM to `HybridCache`. This module ensures that the
    exported model is compatible with further lowering and execution in `ExecuTorch`.
    """

    def __init__(
        self,
        model: PreTrainedModel,
        max_batch_size: int = 1,
        max_cache_len: int = 4096,
    ):
        """
        Initializes the exportable module with `HybridCache`.

        Args:
            model (`PreTrainedModel`): The pretrained model to wrap.
            max_batch_size (int): Maximum batch size for the cache.
            max_cache_len (int): Maximum sequence length for the cache.

        Raises:
            AssertionError: If the model doesn't have the expected configuration for HybridCache.
        """
        super().__init__()
        self.model = model

        if not self.model.config.use_cache:
            raise AssertionError("Model must have caching enabled")

        # Initialize the HybridCache
        self.cache = HybridCache(
            config=self.model.config,
            max_batch_size=max_batch_size,
            max_cache_len=max_cache_len,
            device=self.model.device,
            dtype=self.model.dtype,
        )
        # Register all key and value cache tensors as buffers
        for i in range(len(self.cache)):
            self.register_buffer(f"key_cache_{i}", self.cache.layers[i].keys, persistent=False)
            self.register_buffer(f"value_cache_{i}", self.cache.layers[i].values, persistent=False)

    def forward(self, input_ids: torch.Tensor, cache_position: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the module, which is compatible with the ExecuTorch llm runner.

        Args:
            input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
            cache_position (`torch.Tensor`): Tensor representing current input position in the cache.

        Returns:
            torch.Tensor: Logits output from the model.
        """
        batch_size = input_ids.shape[0]

        # Generate position_ids from cache_position
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=None,
            position_ids=position_ids,
            past_key_values=self.cache,
            use_cache=True,
            cache_position=cache_position,
        )

        # Return only the logits to simplify the export
        return outputs.logits


def convert_and_export_with_cache(
    model: PreTrainedModel,
    example_input_ids: Optional[torch.Tensor] = None,
    example_cache_position: Optional[torch.Tensor] = None,
    dynamic_shapes: Optional[dict] = None,
    strict: Optional[bool] = None,
):
    """
    Convert a `PreTrainedModel` into an exportable module and export it using `torch.export`,
    ensuring the exported model is compatible with `ExecuTorch`.

    Args:
        model (`PreTrainedModel`): The pretrained model to be exported.
        example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
        example_cache_position (`Optional[torch.Tensor]`): Example current cache position used by `torch.export`.
        dynamic_shapes (`Optional[dict]`): Dynamic shapes used by `torch.export`.
        strict (`Optional[bool]`): Flag to instruct `torch.export` to use `torchdynamo`.

    Returns:
        Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
    """
    if not is_torch_greater_or_equal_than_2_3:
        raise ImportError("torch >= 2.3 is required.")

    import torch.export._trace

    # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable.
    ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
    ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
    model.config._attn_implementation = "sdpa_without_vmap"

    with torch.no_grad():
        example_input_ids = (
            example_input_ids
            if example_input_ids is not None
            else torch.tensor([[1]], dtype=torch.long, device=model.device)
        )
        example_cache_position = (
            example_cache_position
            if example_cache_position is not None
            else torch.tensor([0], dtype=torch.long, device=model.device)
        )

        if is_torch_greater_or_equal("2.6.0"):
            exported_program = torch.export.export(
                TorchExportableModuleWithStaticCache(model),
                args=(example_input_ids, example_cache_position),
                kwargs={},
                dynamic_shapes=dynamic_shapes,
                strict=strict if strict is not None else True,
            )
        else:
            if dynamic_shapes is not None:
                logging.warning(
                    "Dynamic shapes spec will be ignored by convert_and_export_with_cache for torch < 2.6.0."
                )
            if strict is not None:
                logging.warning("The strict flag will be ignored by convert_and_export_with_cache for torch < 2.6.0.")
            # Keep the legacy private export entry point for older torch versions.
            exported_program = torch.export._trace._export(
                TorchExportableModuleWithStaticCache(model),
                args=(example_input_ids,),
                kwargs={"cache_position": example_cache_position},
                pre_dispatch=False,
                strict=True,
            )
        return exported_program


class Seq2SeqLMEncoderExportableModule(torch.nn.Module):
    """
    A wrapper module designed to make a Seq2Seq LM encoder exportable with `torch.export`.
    This module ensures that the exported encoder model is compatible with ExecuTorch.
    """

    def __init__(self, encoder_model):
        super().__init__()
        self.encoder = encoder_model

    def forward(self, input_ids):
        # Only the last hidden state is needed by the decoder
        return self.encoder(input_ids=input_ids).last_hidden_state


class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
    """
    A wrapper module designed to make a Seq2Seq LM decoder exportable with `torch.export`,
    specifically for use with static caching. This module ensures the exported decoder
    is compatible with ExecuTorch.
    """

    def __init__(self, model, max_static_cache_length, batch_size):
        super().__init__()

        # Get the decoder component
        self.decoder = model.get_decoder()
        self.lm_head = model.lm_head
        self.config = model.config

        # Initialize static cache
        self.static_cache = StaticCache(
            config=self.config,
            max_batch_size=batch_size,
            max_cache_len=max_static_cache_length,
            device="cpu",
            dtype=torch.float32,
        )

        # Register cache buffers to make them exportable
        for i in range(len(self.static_cache)):
            self.register_buffer(f"key_cache_{i}", self.static_cache.layers[i].keys, persistent=False)
            self.register_buffer(f"value_cache_{i}", self.static_cache.layers[i].values, persistent=False)

    def forward(self, decoder_input_ids, encoder_hidden_states, cache_position):
        # Get outputs from decoder
        outputs = self.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=self.static_cache,
            use_cache=True,
            cache_position=cache_position,
        )

        # Apply language model head
        lm_logits = self.lm_head(outputs[0])
        return lm_logits


class Seq2SeqLMExportableModule(torch.nn.Module):
    def __init__(
        self, model, batch_size=1, max_hidden_seq_length=4096, cache_implementation="static", max_cache_length=1024
    ):
        super().__init__()

        self.full_model = model
        self.encoder = model.get_encoder()
        self.config = model.config
        self.max_hidden_seq_length = max_hidden_seq_length
        self.generation_config = GenerationConfig(
            use_cache=True,
            max_length=max_cache_length,
            cache_implementation=cache_implementation,
            cache_config={"batch_size": batch_size, "max_cache_len": max_cache_length},
        )
        self.exported_encoder = None
        self.exported_decoder = None

    def _export_encoder(self, encoder_input_ids):
        wrapped_encoder = Seq2SeqLMEncoderExportableModule(self.encoder).to("cpu").eval()

        # Define dynamic sequence length for the encoder input
        seq_len_dim = torch.export.Dim("encoder_seq_length", max=self.max_hidden_seq_length)

        with torch.no_grad():
            exported_encoder = torch.export.export(
                wrapped_encoder, (encoder_input_ids,), dynamic_shapes={"input_ids": {1: seq_len_dim}}, strict=True
            )
        return exported_encoder

    def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_position):
        wrapped_decoder = (
            Seq2SeqLMDecoderExportableModuleWithStaticCache(
                model=self.full_model,
                max_static_cache_length=self.generation_config.cache_config.get("max_cache_len"),
                batch_size=self.generation_config.cache_config.get("batch_size"),
            )
            .to("cpu")
            .eval()
        )

        # Define dynamic dimension for the encoder output sequence length
        encoder_seq_len_dim = torch.export.Dim("encoder_hidden_seq_length", max=self.max_hidden_seq_length)

        with torch.no_grad():
            exported_decoder = torch.export.export(
                wrapped_decoder,
                (decoder_input_ids, encoder_hidden_states, cache_position),
                dynamic_shapes={
                    "decoder_input_ids": None,
                    "encoder_hidden_states": {1: encoder_seq_len_dim},
                    "cache_position": None,
                },
                strict=True,
            )
        return exported_decoder

    def export(self, encoder_input_ids=None, decoder_input_ids=None, encoder_hidden_states=None, cache_position=None):
        example_encoder_input_ids = (
            encoder_input_ids if encoder_input_ids is not None else torch.ones((1, 10), dtype=torch.long)
        )
        example_decoder_input_ids = (
            decoder_input_ids if decoder_input_ids is not None else torch.tensor([[0]], dtype=torch.long)
        )
        example_cache_position = cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long)
        example_encoder_hidden_states = (
            encoder_hidden_states
            if encoder_hidden_states is not None
            else torch.zeros(
                (self.generation_config.cache_config.get("batch_size"), 10, self.config.d_model),
                dtype=torch.float32,
            )
        )

        self.exported_encoder = self._export_encoder(example_encoder_input_ids)
        self.exported_decoder = self._export_decoder(
            example_decoder_input_ids, example_encoder_hidden_states, example_cache_position
        )
        return self

    def generate(self, prompt_token_ids, max_new_tokens):
        with torch.no_grad():
            # Run encoder
            encoder_output = self.exported_encoder.module()(prompt_token_ids)

            # Initialize with the decoder start token (0 for T5-style models)
            decoder_input_ids = torch.tensor([[0]], dtype=torch.long)
            generated_ids = [0]

            # Generate tokens one by one
            for i in range(max_new_tokens - 1):
                logits = self.exported_decoder.module()(
                    decoder_input_ids, encoder_output, torch.tensor([i], dtype=torch.long)
                )

                next_token = torch.argmax(logits[:, -1, :], dim=-1).item()
                generated_ids.append(next_token)

                # Update input for next iteration
                decoder_input_ids = torch.tensor([[next_token]], dtype=torch.long)

                if next_token == self.config.eos_token_id:
                    break

            return generated_ids


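# Hypothetical usage sketch (not part of the original module): exporting an encoder-decoder
# model with `Seq2SeqLMExportableModule`. The checkpoint name is an assumption.
def _example_export_seq2seq():  # illustrative only, never called at import time
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    model_id = "google-t5/t5-small"  # assumed checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    wrapper = Seq2SeqLMExportableModule(model, batch_size=1, max_cache_length=128).export()
    prompt_ids = tokenizer("translate English to German: Hello, how are you?", return_tensors="pt").input_ids
    token_ids = wrapper.generate(prompt_ids, max_new_tokens=32)
    return tokenizer.decode(token_ids, skip_special_tokens=True)

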
def export_with_dynamic_cache(
    model: PreTrainedModel,
    example_input_ids: Optional[torch.Tensor] = None,
    example_attention_mask: Optional[torch.Tensor] = None,
):
    """
    Export a model with DynamicCache using `torch.export`, ensuring the exported model is compatible with `ExecuTorch`.

    Args:
        model (`PreTrainedModel`): The pretrained model to be exported.
        example_input_ids (`Optional[torch.Tensor]`): Example input token id used by `torch.export`.
        example_attention_mask (`Optional[torch.Tensor]`): Example attention mask used by `torch.export`.

    Returns:
        Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
    """
    if not is_torch_greater_or_equal_than_2_3:
        raise ImportError("torch >= 2.3 is required.")

    # This is the same as sdpa, but mask creation does not use `vmap` which is not exportable.
    ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", sdpa_mask_without_vmap)
    ALL_ATTENTION_FUNCTIONS.register("sdpa_without_vmap", ALL_ATTENTION_FUNCTIONS["sdpa"])
    model.config._attn_implementation = "sdpa_without_vmap"

    with torch.no_grad():
        exported_program = torch.export.export(
            model,
            (),
            {
                "input_ids": example_input_ids,
                "attention_mask": example_attention_mask,
                "past_key_values": DynamicCache(),
                "use_cache": True,
            },
            strict=False,
        )
        return exported_program


def sdpa_mask_without_vmap(
    batch_size: int,
    cache_position: torch.Tensor,
    kv_length: int,
    kv_offset: int = 0,
    mask_function: Optional[Callable] = None,
    attention_mask: Optional[torch.Tensor] = None,
    local_size: Optional[int] = None,
    allow_is_causal_skip: bool = True,
    allow_torch_fix: bool = True,
    **kwargs,
) -> Optional[torch.Tensor]:
    """
    Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that
    the element should take part in the attention computation, and False that it should not.

    This is similar to `masking_utils.sdpa_mask` but does not use `vmap` which is incompatible with export.

    Args:
        batch_size (`int`):
            The batch size of the input sequence.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`, optional):
            An optional offset to indicate at which first position the key and values states will refer to.
        mask_function (`Callable`):
            The mask factory function describing the mask pattern.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
        local_size (`int`, optional):
            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
            to try to skip mask creation if possible.
        allow_is_causal_skip (`bool`, optional):
            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
            `torch.sdpa` instead. Default to `True`.
        allow_torch_fix (`bool`, optional):
            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
            versions. We need an arg to skip it when using eager. By default `True`.
    """
    q_length = cache_position.shape[0]
    # Potentially pad the 2D mask and slice it to the current key/value window
    padding_mask = prepare_padding_mask(attention_mask, kv_length, kv_offset)

    # Under specific conditions, we can avoid materializing the mask and rely on the `is_causal` argument instead
    if allow_is_causal_skip and _ignore_causal_mask_sdpa(padding_mask, q_length, kv_length, kv_offset, local_size):
        return None

    # Equivalent to `torch.arange(kv_offset, kv_offset + kv_length)` but without data-dependent slicing
    kv_arange = torch.arange(kv_length, device=cache_position.device)
    kv_arange += kv_offset
    reshaped_cache_position = cache_position.view(-1, 1)

    sliding_window = getattr(kwargs["config"], "sliding_window", None)
    chunk_size = getattr(kwargs["config"], "attention_chunk_size", None)
    if sliding_window is not None and chunk_size is not None:
        raise ValueError("Cannot use both `sliding_window` and `attention_chunk_size`")

    # Simplest and most efficient way to obtain a causal mask
    causal_mask = kv_arange <= reshaped_cache_position
    # If using local (sliding-window) or chunked attention, apply it on top of the causal pattern
    if sliding_window is not None:
        sliding_mask_overlay = kv_arange > reshaped_cache_position - sliding_window
        causal_mask *= sliding_mask_overlay
    elif chunk_size is not None:
        chunked_mask_overlay = kv_arange // chunk_size == reshaped_cache_position // chunk_size
        causal_mask *= chunked_mask_overlay

    causal_mask = causal_mask[None, None, :, :].expand(batch_size, -1, -1, -1)
    if padding_mask is not None:
        causal_mask = causal_mask * padding_mask[:, None, None, :]

    # Older torch versions mishandle rows that attend to no token at all; force at least the fix below
    if not _is_torch_greater_or_equal_than_2_5 and allow_torch_fix:
        causal_mask |= torch.all(~causal_mask, dim=-1, keepdim=True)
    return causal_mask