
from typing import Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import (
    ImagesKwargs,
    MultiModalData,
    ProcessingKwargs,
    ProcessorMixin,
    Unpack,
    VideosKwargs,
)
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...video_utils import VideoInput


class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[list[float], float]


class Glm4vImagesKwargs(ImagesKwargs):
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]


class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: Glm4vImagesKwargs
    videos_kwargs: Glm4vVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_mm_token_type_ids": False,
        },
    }


class Glm4vProcessor(ProcessorMixin):
    r"""
    Constructs a GLM-4V processor which wraps a GLM-4V image processor, a GLM-4V video processor, and a GLM-4
    tokenizer into a single processor. See [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more
    information.
    Args:
        image_processor ([`Glm4vImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`Glm4vVideoProcessor`], *optional*):
            The video processor is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
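
    Example:

    A minimal usage sketch. The checkpoint name below is illustrative (any GLM-4V checkpoint that ships a
    processor config should work), and `image` is assumed to be an already-loaded `PIL.Image.Image`:

    ```python
    >>> from transformers import AutoProcessor

    >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")  # assumed checkpoint
    >>> inputs = processor(text=["Describe this image: <|image|>"], images=[image], return_tensors="pt")
    >>> sorted(inputs.keys())  # doctest: +SKIP
    ['attention_mask', 'image_grid_thw', 'input_ids', 'pixel_values']
    ```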
    """

    attributes = ["image_processor", "tokenizer", "video_processor"]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        # Fall back to GLM-4V's default multimodal tokens when the tokenizer does not define its own.
        self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        self.video_token_id = (
            tokenizer.video_token_id
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Glm4vProcessorKwargs],
    ) -> BatchFeature:
        r"""
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of 3D grids (temporal, height, width) of the images fed to the LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of 3D grids (temporal, height, width) of the videos fed to the LLM. Returned when `videos` is not `None`.
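
        Example:

        A sketch of the placeholder expansion this method performs; `processor`, `image`, and `video` are assumed
        to be defined, and the exact token counts depend on the input resolution and the processor configuration:

        ```python
        >>> # Each <|image|> placeholder is expanded to image_grid_thw[i].prod() // merge_size**2 image tokens.
        >>> out = processor(text=["<|image|> What is shown here?"], images=[image])  # doctest: +SKIP

        >>> # A <|video|> placeholder is expanded frame by frame, each frame wrapped in
        >>> # <|begin_of_image|> ... <|end_of_image|> followed by its timestamp in seconds.
        >>> out = processor(text=["<|video|> Summarize the clip."], videos=[video], fps=2.0)  # doctest: +SKIP
        ```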
        """
        output_kwargs = self._merge_kwargs(
            Glm4vProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            timestamps = videos_inputs.pop("timestamps")
            video_grid_thw = videos_inputs["video_grid_thw"]
        else:
            videos_inputs = {}
            timestamps = []
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]
        text = text.copy()  # the replacements below mutate the list in place

        if image_grid_thw is not None:
            # Expand each image token to one slot per merged patch, using a temporary placeholder
            # so already-expanded tokens are not matched again.
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    num_image_tokens = image_grid_thw[index].prod() // merge_length
                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.video_processor.merge_size**2
            video_index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    num_frames = video_grid_thw[video_index][0]
                    video_structure = ""

                    if hasattr(timestamps, "tolist"):
                        timestamps_list = timestamps.tolist()[0]
                    else:
                        timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps

                    unique_timestamps = []
                    for idx in range(0, len(timestamps_list)):
                        unique_timestamps.append(timestamps_list[idx])

                    selected_timestamps = unique_timestamps[:num_frames]
                    while len(selected_timestamps) < num_frames:
                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)

                    # Each frame becomes an image span followed by its timestamp in seconds.
                    for frame_idx in range(num_frames):
                        timestamp_sec = selected_timestamps[frame_idx]
                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
                        video_structure += frame_structure

                    text[i] = text[i].replace(self.video_token, video_structure, 1)
                    num_image_tokens = (
                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
                    )
                    for frame_idx in range(num_frames):
                        if self.image_token in text[i]:
                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                    video_index += 1

                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

        if return_mm_token_type_ids:
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(array_ids)
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)

    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
            video_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (num_frames, height, width) per each video.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        """
        vision_data = {}
        if image_sizes is not None:
            images_kwargs = Glm4vProcessorKwargs._defaults.get("images_kwargs", {})
            images_kwargs.update(kwargs)
            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        if video_sizes is not None:
            videos_kwargs = Glm4vProcessorKwargs._defaults.get("videos_kwargs", {})
            videos_kwargs.update(kwargs)
            # Resolve the merge size here as well, so video-only calls do not depend on the image branch above.
            merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size

            num_video_patches = [
                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                for video_size in video_sizes
            ]
            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
            vision_data["num_video_tokens"] = num_video_tokens

        return MultiModalData(**vision_data)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
        # GLM-4V additionally consumes per-video timestamps, exposed as `second_per_grid_ts`.
        return names_from_processor + ["second_per_grid_ts"]


__all__ = ["Glm4vProcessor"]