
    rh.C                        d dl mZmZ d dlZddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ  e       rddlmZ  e       r e       rd dlmZ  nd dl!mZ   e       rd dl"Z"ddlm#Z#  e#jH                  e%      Z&dZ'dZ(dZ)dZ*dZ+de,d   de,e-   fdZ.de-de/e-e-f   fdZ0 G d de      Z1 ed       G d de             Z2dgZ3y)    )OptionalUnionN   )BatchFeatureget_size_dict)IMAGENET_STANDARD_MEANIMAGENET_STANDARD_STDSizeDict)UnpackVideosKwargs)
TensorTypeis_torch_availableis_torchvision_availableis_torchvision_v2_availableis_vision_available)requires)BaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos)PILImageResampling)
functional)loggingzYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.zgYou are provided the following series of {frame_count} frames from a {video_duration} [H:MM:SS] video.
z

z
Frame from {timestamp}:i   videostorch.Tensorreturnc                     t        d      x}}| D ]0  }|j                         dd \  }}t        ||      }t        ||      }2 ||fS )zH
    Get the maximum height and width across all videos in a batch.
    z-infN)floatsizemax)r   
max_height	max_widthvideoheightwidths         /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/smolvlm/video_processing_smolvlm.pyget_max_height_widthr(   G   s\     #6]*J *

RS),
y)	* 	""    resolution_max_sidec                 :   | j                         dd \  }}t        t        |      }|t        ||      n|}||z  }||k\  r|}t	        ||z        }|dz  dk7  r(|dz  }n"||kD  r|}t	        ||z        }|dz  dk7  r|dz  }t        |d      }t        |d      }||fS )a  
    Get the output size of the video after resizing given a dictionary specifying the max and min sizes.
    Args:
        video (`np.ndarray`):
            Video to resize.
        resolution_max_side (`int`):
            The longest edge of the video will be resized to this value. The shortest edge will be resized to keep the
            input aspect ratio.
    Returns:
        The output size of the video after resizing.
    r   N   r      )r    minMAX_IMAGE_SIZEr!   int)r$   r*   r%   r&   aspect_ratios        r'   get_resize_output_image_sizer2   S   s     JJL%MFE n.AB0C0K#fe,Qd6>L#U\)*A:?aKF	%$F\)*19>QJE^FqME5=r)   c                   &    e Zd ZU dZeeef   ed<   y)SmolVLMVideoProcessorInitKwargsNmax_image_size)__name__
__module____qualname__r5   dictstrr0   __annotations__ r)   r'   r4   r4   {   s    %)NDcN)r)   r4   )torchvision)backendsc            &           e Zd Zej                  ZddiZddiZeZ	e
ZdZdZdZdZdZdZeZddgZdee   f fd	Z	 	 d*d
ddedddeddf
dZ	 	 d+d
ddeeef   dededef
dZ	 	 	 d,d
ddeeef   de e   de eee!f      de e   f
dZ"	 	 	 	 	 d-de#d   dee#e   e#e   f   dededede d   d ed!e!d"ed#ed$ed%e ee!e#e!   f      d&e ee!e#e!   f      de eee!f      de e   de e   d'e ee$e%f      d(e d   f$d)Z& xZ'S ).SmolVLMVideoProcessorlongest_edgei  il  TFpixel_valuespixel_attention_maskkwargsc                     t        |   di | d|v rd|v r|d   |d   d<   d|v r=|d   d   | _        |d   d   | _        t	        |d   d   | j
                        | _        y y )Nr    video_sampling
video_size
max_framesfps)default_to_squarer<   )super__init__
num_framesrI   r   rJ   r    )selfrD   	__class__s     r'   rL   zSmolVLMVideoProcessor.__init__   s    "6" V 0F :5;F^F#$\2v%$%56|DDO./6DH%f-=&>|&L`d`v`vwDI &r)   r$   r   r    interpolationzF.InterpolationMode	antialiasr   c                 B   ||nt         j                  j                  }|t         j                  j                  k(  r/t        j                  d       t         j                  j                  }|j                  rt        ||j                        }n@|j                  r%|j                  r|j                  |j                  f}nt        d| d      t        j                  ||||      }| j                  d   | j                  d   f}t        j                  ||||      }|S )a9  
        Resize an video to `(size["height"], size["width"])`.
        Args:
            video (`torch.Tensor`):
                Video to resize.
            size (`SizeDict`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video.
            resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
                `InterpolationMode` filter to use when resizing the video e.g. `InterpolationMode.BICUBIC`.
        Returns:
            `torch.Tensor`: The resized video.
        zYou have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. BICUBIC resample will be used as an alternative. Please fall back to image processor if you want full consistency with the original model.)r*   zHSize must contain 'height' and 'width' keys, or 'longest_edge' key. Got .)rP   rQ   rA   )FInterpolationModeBILINEARLANCZOSloggerwarning_onceBICUBICrA   r2   r%   r&   
ValueErrorresizer5   )rN   r$   r    rP   rQ   rD   new_sizemax_sizes           r'   r\   zSmolVLMVideoProcessor.resize   s   ( *7)BH[H[HdHdA//777A
 //77M 4$($5$5H [[TZZTZZ0HghlgmmnoppQZ[ &&~68K8KN8[[QZ[r)   padded_sizemax_num_framesfillreturn_pixel_maskc                    |j                         dd }|d   |d   z
  }|d   |d   z
  }||j                  d   z
  }	|dk  s|dk  rt        d| d| d      ||k7  r"d|d|ddd|	g}
t        j                  ||
|      }d}|rBt        j                  |d	dddddf   t
        j                  
      }d|d	d|d   d|d   f<   ||fS )a"  Pads the sample with empty video to the padded_size
        Args:
            video (`torch.Tensor`):
                Video to pad.
            padded_size (`tuple[int, int]`):
                Height and width to pad.
            max_num_frames (`int`):
                The maximum number of frames to which video will be padded.
            fill (`int`, *optional*):
                The value to use for the padding.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
        r   Nr   r-   zzPadding dimensions are negative. Please make sure that the padded size is larger than the original size. Got padded size: z, original size: rS   )ra   .dtype)r    shaper[   rT   padtorch
zeros_likeint64)rN   r$   r_   r`   ra   rb   original_sizepadding_heightpadding_widthpadding_framepadding
pixel_masks               r'   rg   zSmolVLMVideoProcessor.pad   s   * 

RS)$Q-*::#Aq)99&Q71 233>-?PQ^P__`b  K'-NAq!]SGEE%t4E 
))%Q1*=U[[QJFGJs.mA..0B-2B0BBCj  r)   metadatarM   rI   	skip_secsc                    ||n| j                   }||n| j                  }|j                  d   }t        t	        ||d   z              }t        ||      }|dk  rd}d}	|dz
  }
|dkD  r6|d   d|z  z
  ||z  kD  r%t        ||d   z        }	t        |||d   z  z
        }
t        d|	      }	t        |
|dz
        }
|	|
k\  rd|dz
  }
}	t        j                  |	|
|t              }t        j                  |      }||   j                         }g }|D ]9  }||d   z  }t        |dz        }t        |dz        }|j                  ||g       ; ||t        |d         fS )al  
        Video sampling function which:
            - Uses `num_frames` (if provided) or calculates it from `fps` and metadata.
            - Applies a basic center-skip if fewer frames than available, otherwise
                optionally skips `skip_secs` from both the start and end.
            - Uniformly samples the desired number of frames between the start and end indices.

        Args:
            video (`torch.Tensor`):
                Video that need to be sampled.
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            num_frames (`int`, *optional*):
                Maximum number of frames to sample. Defaults to `self.num_frames`.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
            skip_secs (`float`, *optional*, defaults to `1`):
                Number of seconds to skip from the start and end if the video is long enough.

        Returns:
            torch.Tensor:
                Sampled video frames.
        r   durationr-   r,   rI   rd   <   )rM   rI   rf   r0   roundr.   r!   nplinspaceunique
contiguousappend)rN   r$   rq   rM   rI   rr   total_num_framesestimated_framesdesired_frames	start_idxend_idxindices
timestampsidxsecmmsss                    r'   sample_framesz#SmolVLMVideoProcessor.sample_frames   s   > $.#9Zt
_c$(( ;;q> uS8J+?%?@A -z:AN 	"Q&q=hz2Q]BzTWGWXI78I*Y%-HHIG9%	g/!34!"$4q$8wI++i.L))G$g))+
 	(C'CSBYBS2XBr2h'		(
 j#hz&:";;;r)   r   video_metadatado_convert_rgb	do_resize
do_rescalerescale_factordo_normalizedo_paddo_sample_frames
image_mean	image_stdreturn_tensorsdevicec                 @   |rw|d   t        d      g }g g }}t        ||      D ]Q  \  }}| j                  |||||      \  }}}|j                  |       |j                  |       |j                  |       S nn|}|D cg c]D  }t	        t        |            D cg c]$  }t        |dz  dz        t        |dz  dz        f& c}F }}}|D cg c]  }t        |      dz   }}||D cg c]  }|j                  |       }}t        |      \  }}i }|j                         D ]3  \  }} |r| j                  |       } |r| j                  | ||      } | ||<   5 t        ||      }!t        |!      \  }}i }"|j                         D ]   \  }} | j                  | |||	||      } | |"|<   " t        |"|      }|
rt        |      }#t        d |D              }$t        |      \  }}i }%i }"|j                         D ]&  \  }} | j!                  | |#|$      \  } }&| |"|<   |&|%|<   ( t        |"|      }t        |%|      }'|rt#        j$                  |d      n|}|||d	}(|
r |
r|t#        j$                  'd      n'|(d
<   t'        |(|      S c c}w c c}}w c c}w c c}w )Nr   zFrame sampling is enabled but no video metadata was found. SmolVLM requires metadata to correctly sample frames. Please pass in `VideoMetadata` object per each input video or set `do_sample_frames=False`   ru   )r    rP   c              3   2   K   | ]  }t        |        y w)N)len).0r$   s     r'   	<genexpr>z4SmolVLMVideoProcessor._preprocess.<locals>.<genexpr>  s      JU Js   )r_   r`   )dim)rB   r   	durationsrC   )tensor_type)r[   zipr   r{   ranger   r0   tor   itemsconvert_to_rgbr\   r   rescale_and_normalizer(   r!   rg   rh   stackr   ))rN   r   r   r   r   r    rP   r   r   r   r   r   r   r   rI   rM   rr   r   r   rD   processed_videostimestamps_listdurations_listr$   rq   r   rt   r   grouped_videosgrouped_videos_indexresized_videos_groupedrf   stacked_videosresized_videosprocessed_videos_groupedpad_sizer`   processed_padded_mask_groupedpadded_masksrC   datas)                                            r'   _preprocessz!SmolVLMVideoProcessor._preprocess<  sJ   . a ( q   ".0"^O#&v~#> /x.2.@.@R\^acl.m+z8&&z2%%h/ ''.	/  &jpaf%PSTYPZJ[\3#sRxB&'cBh"_)=>\O  =CC5c%jB.CNC 4:;5ehhv&;F;/DEU/V,,!#%3%9%9%; 	;!E>!%!4!4^!D!%^$Vc!d,:"5)	; ((>@TU/D^/T,,#% %3%9%9%; 	=!E>!77
NL*V_N /=$U+		= **BDXY+,<=H  J9I JJN3HIY3Z0N0,.)')$)7)=)=)? D%~/3xx" 08 0, 3A(/7C-e4D  ..FH\]#12OQe#f CQ5;;'7Q?Wg 0^lm n8 0a8) '(
 Dn==o ] D
 <s$   J)JJJ.JJ)NT)r   T)NNr-   )NNr   NN)(r6   r7   r8   r   rW   resampler    r5   r   r   r	   r   r   r   r   r   r   r   r4   valid_kwargsmodel_input_namesr   rL   r
   boolr\   tupler0   rg   r   r   r9   r   r   r   listr:   r   r   __classcell__)rO   s   @r'   r@   r@      s   !))HG$D$c*N'J%IIJLNF2L')?@
x(G!H 
x  04// / -	/
 / 
/l "&)!)! 38_)! 	)!
 )!  )!^ %)+/#$C<C< t+,C< SM	C<
 eCJ'(C< C=C<h ,0$(#$;?+/'_>^$_> d=14:=>_> 	_>
 _> _>   56_> _> _> _> _> _> U5$u+#567_> E%e"456_> eCJ'(_>  SM!_>" C=#_>$ !sJ!78%_>& ('_>r)   r@   )4typingr   r   numpyrw   image_processing_utilsr   r   image_utilsr   r	   r
   processing_utilsr   r   utilsr   r   r   r   r   utils.import_utilsr   video_processing_utilsr   video_utilsr   r   r   r   torchvision.transforms.v2r   rT   torchvision.transformsrh   r   
get_loggerr6   rX   DEFAULT_SYSTEM_MESSAGEDEFAULT_VIDEO_INTRODEFAULT_MEDIA_OUTTROFRAME_TIMESTAMP_MESSAGEr/   r   r0   r(   r   r2   r4   r@   __all__r<   r)   r'   <module>r      s    #  
 5  + P O 1"$=:   
		H	% V n   5 	#n!5 	#$s) 	#%% 38_%P*l * 
#$[>. [> %[>| #
#r)   