
"""PyTorch PatchTST model."""

import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2CLS
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_patchtst import PatchTSTConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class PatchTSTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[PatchTSTConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided, this layer is used as a cross-attention layer
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.shape
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
        kv_input_shape = (bsz, src_len, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)

        current_states = key_value_states if is_cross_attention else hidden_states
        key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
        value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights, None


class PatchTSTBatchNorm(nn.Module):
    """
    Compute batch normalization over the sequence length (time) dimension.
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.batchnorm = nn.BatchNorm1d(config.d_model, eps=config.norm_eps)

    def forward(self, inputs: torch.Tensor):
        """
        Parameters:
            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
                input for Batch norm calculation
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
        """
        output = inputs.transpose(1, 2)  # output: (batch_size, d_model, sequence_length)
        output = self.batchnorm(output)
        return output.transpose(1, 2)
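# Illustrative usage sketch (not part of the original module): `PatchTSTBatchNorm` applies
# `nn.BatchNorm1d` over the time dimension by transposing to channels-first and back, so the
# input shape is preserved. The sizes below are assumptions chosen only for this example.
#
#   >>> config = PatchTSTConfig(d_model=128)
#   >>> norm = PatchTSTBatchNorm(config)
#   >>> x = torch.randn(4, 32, 128)   # (batch_size, sequence_length, d_model)
#   >>> norm(x).shape
#   torch.Size([4, 32, 128])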
def random_masking(
    inputs: torch.Tensor,
    mask_ratio: float,
    unmasked_channel_indices: list = None,
    channel_consistent_masking: bool = False,
    mask_value: int = 0,
):
    """random_masking: Mask the input considering the control variables.

    Args:
        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
            The input tensor to mask.
        mask_ratio (`float`):
            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
        unmasked_channel_indices (list, *optional*):
            Indices of channels that will not be masked.
        channel_consistent_masking (bool, *optional*, defaults to `False`):
            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
            across channels.
        mask_value (int, *optional*, defaults to 0):
            Define the value of masked patches for pretraining.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
        n]
    """
    if mask_ratio <= 0 or mask_ratio >= 1:
        raise ValueError(f"Mask ratio {mask_ratio} has to be between 0 and 1.")

    batch_size, num_channels, sequence_length, num_features = inputs.shape
    device = inputs.device

    len_keep = int(sequence_length * (1 - mask_ratio))

    if channel_consistent_masking:
        noise = torch.rand(batch_size, 1, sequence_length, device=device)  # noise in [0, 1], bs x 1 x L
        noise = noise.repeat(1, num_channels, 1)  # bs x num_channels x time
    else:
        # noise in [0, 1], bs x num_channels x L
        noise = torch.rand(batch_size, num_channels, sequence_length, device=device)

    # mask: [bs x num_channels x num_patch]
    mask = torch.ones(batch_size, num_channels, sequence_length, device=device)
    mask[:, :, :len_keep] = 0

    # sort noise for each sample
    ids_shuffle = torch.argsort(noise, dim=-1)  # ascend: small is keep, large is remove
    ids_restore = torch.argsort(ids_shuffle, dim=-1)  # ids_restore: [bs x num_channels x L]

    mask = torch.gather(mask, dim=-1, index=ids_restore)
    mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features)  # mask: [bs x num_channels x num_patches x patch_length]
    if unmasked_channel_indices is not None:
        mask[:, unmasked_channel_indices, :, :] = 0

    inputs_mask = inputs.masked_fill(mask.bool(), mask_value)
    return inputs_mask, mask[..., 0]
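# Illustrative usage sketch (not part of the original module): expected shapes for `random_masking`.
# The tensor sizes below are assumptions chosen only for this example.
#
#   >>> patches = torch.randn(2, 7, 16, 12)   # (batch, channels, num_patches, patch_length)
#   >>> masked, mask = random_masking(patches, mask_ratio=0.4)
#   >>> masked.shape   # same shape as the input, masked positions filled with `mask_value`
#   torch.Size([2, 7, 16, 12])
#   >>> mask.shape     # one flag per patch
#   torch.Size([2, 7, 16])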
def forecast_masking(
    inputs: torch.Tensor,
    num_forecast_mask_patches: Union[list, int],
    unmasked_channel_indices: list = None,
    mask_value: int = 0,
):
    """Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

    Parameters:
        inputs (`torch.Tensor`):
            Input of shape `(bs, num_channels, num_patch, patch_length)`
        num_forecast_mask_patches (`list`):
            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
        unmasked_channel_indices (`list`, *optional*):
            Indices of channels that are not masked.
        mask_value (`int`, *optional*, defaults to 0):
            Values in the masked patches will be filled by `mask_value`.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
    """
    if isinstance(num_forecast_mask_patches, int):
        num_forecast_mask_patches = [num_forecast_mask_patches]
    forecast_mask_ratios = [1 for _ in num_forecast_mask_patches]

    batch_size, num_channels, sequence_length, num_features = inputs.shape
    mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device)

    t_list = []
    total_length = 0
    total_ratio = sum(forecast_mask_ratios)

    for patch_length, ratio in zip(num_forecast_mask_patches, forecast_mask_ratios):
        if patch_length <= 0 or patch_length >= sequence_length:
            raise ValueError(
                f"num_forecast_mask_patches {patch_length} should be greater than 0 and less than total patches."
            )
        temp_len = int(batch_size * ratio / total_ratio)
        t_list.append([patch_length, ratio, temp_len])
        total_length += temp_len

    t_list = sorted(t_list, key=lambda x: x[2])

    if total_length < batch_size:
        t_list[0][2] = t_list[0][2] + (batch_size - total_length)
    elif total_length > batch_size:
        t_list[-1][2] = t_list[-1][2] + (total_length - batch_size)

    batch1 = 0
    for patch_len, _, temp_len in t_list:
        batch2 = batch1 + temp_len
        mask[batch1:batch2, :, -patch_len:] = 1
        batch1 = batch2

    perm = torch.randperm(mask.shape[0])
    mask = mask[perm]

    mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features)  # mask: [bs x num_channels x num_patch x patch_len]
    if unmasked_channel_indices is not None:
        mask[:, unmasked_channel_indices, :, :] = 0

    inputs_mask = inputs.masked_fill(mask.bool(), mask_value)
    return inputs_mask, mask[..., 0]
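# Illustrative usage sketch (not part of the original module): `forecast_masking` always masks the
# *last* patches of each series. The tensor sizes below are assumptions chosen only for this example.
#
#   >>> patches = torch.randn(4, 7, 16, 12)   # (batch, channels, num_patches, patch_length)
#   >>> masked, mask = forecast_masking(patches, num_forecast_mask_patches=5)
#   >>> bool(mask[:, :, -5:].all())            # the trailing 5 patches of every sample are masked
#   True
#   >>> bool(mask[:, :, :-5].any())            # earlier patches are untouched
#   False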
class PatchTSTPatchify(nn.Module):
    """
    A class to patchify the time series sequence into different patches

    Returns:
        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()

        self.sequence_length = config.context_length
        self.patch_length = config.patch_length
        self.patch_stride = config.patch_stride

        if self.sequence_length <= self.patch_length:
            raise ValueError(
                f"Sequence length ({self.sequence_length}) has to be greater than the patch length ({self.patch_length})"
            )

        # get the number of patches
        self.num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.patch_stride + 1
        new_sequence_length = self.patch_length + self.patch_stride * (self.num_patches - 1)
        self.sequence_start = self.sequence_length - new_sequence_length

    def forward(self, past_values: torch.Tensor):
        """
        Parameters:
            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
                Input for patchification

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
        """
        sequence_length = past_values.shape[-2]
        if sequence_length != self.sequence_length:
            raise ValueError(
                f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})."
            )
        # output: [bs x new_sequence_length x num_channels]
        output = past_values[:, self.sequence_start :, :]
        # output: [bs x num_patches x num_input_channels x patch_length]
        output = output.unfold(dimension=-2, size=self.patch_length, step=self.patch_stride)
        # output: [bs x num_input_channels x num_patches x patch_length]
        output = output.transpose(-2, -3).contiguous()
        return output
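# Illustrative sketch (not part of the original module) of the patch-count arithmetic used above,
# with assumed values context_length=512, patch_length=12, patch_stride=12:
# num_patches = (512 - 12) // 12 + 1 = 42, and 512 - (12 + 12 * 41) = 8 leading time steps are
# dropped from the start of the sequence (`sequence_start = 8`).
#
#   >>> config = PatchTSTConfig(context_length=512, patch_length=12, patch_stride=12)
#   >>> patchify = PatchTSTPatchify(config)
#   >>> patchify.num_patches, patchify.sequence_start
#   (42, 8)
#   >>> patchify(torch.randn(2, 512, 7)).shape   # (batch, channels, num_patches, patch_length)
#   torch.Size([2, 7, 42, 12])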
class PatchTSTMasking(nn.Module):
    """
    Class to perform random or forecast masking.

    Parameters:
        config (`PatchTSTConfig`): model config
    Returns:
        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
            Masked patched input
        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
            Bool tensor indicating True on masked points
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.random_mask_ratio = config.random_mask_ratio
        self.channel_consistent_masking = config.channel_consistent_masking
        self.mask_type = config.mask_type
        self.num_forecast_mask_patches = config.num_forecast_mask_patches
        self.unmasked_channel_indices = config.unmasked_channel_indices
        self.mask_value = config.mask_value
        if self.unmasked_channel_indices is not None:
            self.unmasked_channel_indices = sorted(self.unmasked_channel_indices)

    def forward(self, patch_input: torch.Tensor):
        """
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input

        Return:
            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
                Masked patched input
            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
                Bool tensor indicating True on masked points

        """
        if self.mask_type == "random":
            masked_input, mask = random_masking(
                inputs=patch_input,
                mask_ratio=self.random_mask_ratio,
                unmasked_channel_indices=self.unmasked_channel_indices,
                channel_consistent_masking=self.channel_consistent_masking,
                mask_value=self.mask_value,
            )
        elif self.mask_type == "forecast":
            masked_input, mask = forecast_masking(
                inputs=patch_input,
                num_forecast_mask_patches=self.num_forecast_mask_patches,
                unmasked_channel_indices=self.unmasked_channel_indices,
                mask_value=self.mask_value,
            )
        else:
            raise ValueError(f"Invalid mask type {self.mask_type}.")

        # mask: [bs x num_input_channels x num_patch]
        mask = mask.bool()
        return masked_input, mask
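# Illustrative sketch (not part of the original module): the masking strategy is driven entirely by
# the config. The values below are assumptions chosen only for this example.
#
#   >>> config = PatchTSTConfig(mask_type="random", random_mask_ratio=0.4)            # random patch masking
#   >>> config = PatchTSTConfig(mask_type="forecast", num_forecast_mask_patches=5)    # mask trailing patches
#   >>> masking = PatchTSTMasking(config)
#   >>> masked, mask = masking(torch.randn(2, 7, 16, 12))
#   >>> mask.dtype   # boolean mask, True on masked patches
#   torch.bool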
class PatchTSTEncoderLayer(nn.Module):
    """
    PatchTST encoder layer
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()

        self.channel_attention = config.channel_attention
        # Multi-Head attention
        self.self_attn = PatchTSTAttention(
            embed_dim=config.d_model,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )

        # Add & Norm of the sublayer 1
        self.dropout_path1 = nn.Dropout(config.path_dropout) if config.path_dropout > 0 else nn.Identity()
        if config.norm_type == "batchnorm":
            self.norm_sublayer1 = PatchTSTBatchNorm(config)
        elif config.norm_type == "layernorm":
            self.norm_sublayer1 = nn.LayerNorm(config.d_model, eps=config.norm_eps)
        else:
            raise ValueError(f"{config.norm_type} is not a supported norm layer type.")

        # Add & Norm of the sublayer 2
        if self.channel_attention:
            self.dropout_path2 = nn.Dropout(config.path_dropout) if config.path_dropout > 0 else nn.Identity()
            if config.norm_type == "batchnorm":
                self.norm_sublayer2 = PatchTSTBatchNorm(config)
            elif config.norm_type == "layernorm":
                self.norm_sublayer2 = nn.LayerNorm(config.d_model, eps=config.norm_eps)
            else:
                raise ValueError(f"{config.norm_type} is not a supported norm layer type.")

        # Position-wise Feed-Forward
        self.ff = nn.Sequential(
            nn.Linear(config.d_model, config.ffn_dim, bias=config.bias),
            ACT2CLS[config.activation_function](),
            nn.Dropout(config.ff_dropout) if config.ff_dropout > 0 else nn.Identity(),
            nn.Linear(config.ffn_dim, config.d_model, bias=config.bias),
        )

        # Add & Norm of the sublayer 3
        self.dropout_path3 = nn.Dropout(config.path_dropout) if config.path_dropout > 0 else nn.Identity()
        if config.norm_type == "batchnorm":
            self.norm_sublayer3 = PatchTSTBatchNorm(config)
        elif config.norm_type == "layernorm":
            self.norm_sublayer3 = nn.LayerNorm(config.d_model, eps=config.norm_eps)
        else:
            raise ValueError(f"{config.norm_type} is not a supported norm layer type.")

        self.pre_norm = config.pre_norm

    def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] = None):
        """
        Parameters:
            hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
                Past values of the time series
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
        Return:
            `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`

        """
        batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape

        # First sublayer: attention across time
        # hidden_state: [(bs*num_channels) x sequence_length x d_model]
        hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model)

        if self.pre_norm:
            # Norm and Multi-Head attention and Add residual connection
            attn_output, attn_weights, _ = self.self_attn(
                hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions
            )
            hidden_state = hidden_state + self.dropout_path1(attn_output)
        else:
            # Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT
            attn_output, attn_weights, _ = self.self_attn(
                hidden_states=hidden_state, output_attentions=output_attentions
            )
            hidden_state = self.norm_sublayer1(hidden_state + self.dropout_path1(attn_output))

        # hidden_state: [bs x num_channels x sequence_length x d_model]
        hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model)

        # Second sublayer: attention across variables at any given time
        if self.channel_attention:
            # hidden_state: [bs x sequence_length x num_channels x d_model]
            hidden_state = hidden_state.transpose(2, 1).contiguous()
            # hidden_state: [(bs*sequence_length) x num_channels x d_model]
            hidden_state = hidden_state.view(batch_size * sequence_length, num_input_channels, d_model)
            if self.pre_norm:
                attn_output, channel_attn_weights, _ = self.self_attn(
                    hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions
                )
                hidden_state = hidden_state + self.dropout_path2(attn_output)
            else:
                attn_output, channel_attn_weights, _ = self.self_attn(
                    hidden_states=hidden_state, output_attentions=output_attentions
                )
                hidden_state = self.norm_sublayer2(hidden_state + self.dropout_path2(attn_output))

            # hidden_state: [bs x sequence_length x num_channels x d_model]
            hidden_state = hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model)
            # hidden_state: [bs x num_channels x sequence_length x d_model]
            hidden_state = hidden_state.transpose(1, 2).contiguous()

        # Third sublayer: mixing across hidden dimension
        # hidden_state: [(bs*num_channels) x sequence_length x d_model]
        hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model)
        if self.pre_norm:
            hidden_state = hidden_state + self.dropout_path3(self.ff(self.norm_sublayer3(hidden_state)))
        else:
            hidden_state = self.norm_sublayer3(hidden_state + self.dropout_path3(self.ff(hidden_state)))

        # hidden_state: [bs x num_channels x sequence_length x d_model]
        hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model)

        outputs = (hidden_state,)
        if output_attentions:
            outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights,)

        return outputs
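# Shape summary (a sketch, not part of the original module): one encoder layer keeps the embedding
# shape and optionally returns attention weights; the config values are assumptions for the example.
#
#   >>> config = PatchTSTConfig(d_model=128, num_attention_heads=4, channel_attention=True)
#   >>> layer = PatchTSTEncoderLayer(config)
#   >>> out = layer(torch.randn(2, 7, 16, 128), output_attentions=True)
#   >>> out[0].shape                  # (batch, channels, num_patches, d_model) unchanged
#   torch.Size([2, 7, 16, 128])
#   >>> len(out)                      # (hidden_state, time attention, channel attention)
#   3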
@auto_docstring
class PatchTSTPreTrainedModel(PreTrainedModel):
    config: PatchTSTConfig
    base_model_prefix = "model"
    main_input_name = "past_values"
    supports_gradient_checkpointing = False

    def _init_weights(self, module: nn.Module):
        """
        Initialize weights
        """
        if isinstance(module, PatchTSTPositionalEncoding):
            num_patches = (
                max(self.config.context_length, self.config.patch_length) - self.config.patch_length
            ) // self.config.patch_stride + 1
            if self.config.use_cls_token:
                # initialize cls_token
                nn.init.normal_(module.cls_token, std=0.02)
                num_patches += 1
            # initialize positional encoding
            module.position_enc = module._init_pe(self.config, num_patches)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, PatchTSTBatchNorm):
            module.batchnorm.bias.data.zero_()
            module.batchnorm.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, PatchTSTEncoder):
            module.gradient_checkpointing = value


class PatchTSTEmbedding(nn.Module):
    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.num_input_channels = config.num_input_channels
        self.share_embedding = config.share_embedding
        # Input encoding: projection of feature vectors onto a d-dim vector space
        if self.share_embedding:
            self.input_embedding = nn.Linear(config.patch_length, config.d_model)
        else:
            self.input_embedding = nn.ModuleList()
            for _ in range(config.num_input_channels):
                self.input_embedding.append(nn.Linear(config.patch_length, config.d_model))

    def forward(self, patch_input: torch.Tensor):
        """
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input for embedding
        return:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
        """
        # Input encoding
        num_input_channels = patch_input.shape[1]
        if num_input_channels != self.num_input_channels:
            raise ValueError(
                f"The defined number of input channels ({self.num_input_channels}) in the config "
                f"has to be the same as the number of channels in the batch input ({num_input_channels})"
            )
        if self.share_embedding:
            embeddings = self.input_embedding(patch_input)  # x: [bs x num_channels x num_patches x d_model]
        else:
            embeddings = [self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)]
            embeddings = torch.stack(embeddings, dim=1)
        return embeddings
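# Illustrative sketch (not part of the original module): with `share_embedding=True` a single
# `nn.Linear(patch_length, d_model)` is applied to every channel; with `share_embedding=False`
# each channel gets its own projection. The sizes below are assumptions for this example.
#
#   >>> config = PatchTSTConfig(num_input_channels=7, patch_length=12, d_model=128, share_embedding=False)
#   >>> embed = PatchTSTEmbedding(config)
#   >>> len(embed.input_embedding)             # one Linear layer per channel
#   7
#   >>> embed(torch.randn(2, 7, 16, 12)).shape
#   torch.Size([2, 7, 16, 128])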
class PatchTSTPositionalEncoding(nn.Module):
    """
    Class for positional encoding
    """

    def __init__(self, config: PatchTSTConfig, num_patches: int):
        super().__init__()
        self.use_cls_token = config.use_cls_token
        self.num_input_channels = config.num_input_channels
        if config.use_cls_token:
            # cls_token: [1 x 1 x 1 x d_model]
            self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model))
            num_patches += 1
        # positional encoding: [num_patches x d_model]
        self.position_enc = self._init_pe(config, num_patches)
        # positional dropout
        self.positional_dropout = (
            nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()
        )

    @staticmethod
    def _init_pe(config: PatchTSTConfig, num_patches: int) -> nn.Parameter:
        # Positional encoding
        if config.positional_encoding_type == "random":
            position_enc = nn.Parameter(torch.randn(num_patches, config.d_model), requires_grad=True)
        elif config.positional_encoding_type == "sincos":
            position_enc = torch.zeros(num_patches, config.d_model)
            position = torch.arange(0, num_patches).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, config.d_model, 2) * -(math.log(10000.0) / config.d_model))
            position_enc[:, 0::2] = torch.sin(position * div_term)
            position_enc[:, 1::2] = torch.cos(position * div_term)
            position_enc = position_enc - position_enc.mean()
            position_enc = position_enc / (position_enc.std() * 10)
            position_enc = nn.Parameter(position_enc, requires_grad=False)
        else:
            raise ValueError(
                f"{config.positional_encoding_type} is not a valid positional encoder. Available types are 'random' and 'sincos'."
            )
        return position_enc

    def forward(self, patch_input: torch.Tensor):
        if self.use_cls_token:
            # patch_input: [bs x num_channels x num_patches x d_model]
            patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :])
            # append cls token where cls_token: [1 x 1 x 1 x d_model]
            cls_token = self.cls_token + self.position_enc[:1, :]
            # get the same copy of cls_token for all the samples in batch: [bs x num_channels x 1 x d_model]
            cls_tokens = cls_token.expand(patch_input.shape[0], self.num_input_channels, -1, -1)
            # hidden_state: [bs x num_channels x (num_patches+1) x d_model]
            hidden_state = torch.cat((cls_tokens, patch_input), dim=2)
        else:
            # hidden_state: [bs x num_channels x num_patches x d_model]
            hidden_state = self.positional_dropout(patch_input + self.position_enc)
        return hidden_state


class PatchTSTEncoder(PatchTSTPreTrainedModel):
    """
    PatchTST Encoder
    """

    def __init__(self, config: PatchTSTConfig, num_patches: int):
        super().__init__(config)
        self.gradient_checkpointing = False

        # Input embedding: projection of feature vectors onto a d-dim vector space
        self.embedder = PatchTSTEmbedding(config)
        # Positional encoding
        self.positional_encoder = PatchTSTPositionalEncoding(config, num_patches)
        # Encoder
        self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.num_hidden_layers)])

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        patch_input: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
    ) -> BaseModelOutput:
        """
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Past values of the time series
            output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
            output_attentions (bool, optional): Indicates if attentions should be outputted.

        return:
            `BaseModelOutput`
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # Input embedding
        patch_input = self.embedder(patch_input)
        # Positional encoding
        hidden_state = self.positional_encoder(patch_input)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_state,)

            layer_outputs = encoder_layer(hidden_state=hidden_state, output_attentions=output_attentions)
            # hidden_state: [bs x num_channels x num_patches x d_model]
            # or [bs x num_channels x (num_patches+1) x d_model] if cls_token is used
            hidden_state = layer_outputs[0]
            # append attention matrix at each layer
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions)
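# Note (a sketch, not part of the original module): for `positional_encoding_type="sincos"`,
# `PatchTSTPositionalEncoding._init_pe` above builds the standard Transformer encoding
#
#     PE[p, 2i]   = sin(p / 10000^(2i / d_model))
#     PE[p, 2i+1] = cos(p / 10000^(2i / d_model))
#
# and then standardizes it (subtract the mean, divide by 10 * std) before freezing it as a
# non-trainable parameter; with `"random"` it is instead a trainable Gaussian-initialized table.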
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for model's outputs, with potential hidden states.
    """
)
class PatchTSTModelOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
        the model at the output of each layer plus the optional initial embedding outputs.
    mask (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*):
        Bool masked tensor indicating which patches are masked
    loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Patched input to the Transformer
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    mask: Optional[torch.FloatTensor] = None
    loc: Optional[torch.FloatTensor] = None
    scale: Optional[torch.FloatTensor] = None
    patch_input: Optional[torch.FloatTensor] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`PatchTSTForPretraining`].
    """
)
class PatchTSTForPretrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    prediction_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction outputs of the time series modeling heads.
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`PatchTSTForRegression`].
    """
)
class PatchTSTForRegressionOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Regression outputs of the time series modeling heads.
    """

    loss: Optional[torch.FloatTensor] = None
    regression_outputs: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`PatchTSTForPrediction`].
    """
)
class PatchTSTForPredictionOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
        Prediction outputs of the time series modeling heads.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_outputs: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    loc: Optional[torch.FloatTensor] = None
    scale: Optional[torch.FloatTensor] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`PatchTSTForClassification`].
    """
)
class PatchTSTForClassificationOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Prediction scores of the PatchTST modeling head (scores before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.
    """
)
class SamplePatchTSTOutput(ModelOutput):
    r"""
    sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, num_targets)`):
        Sampled values from the chosen distribution.
    """

    sequences: Optional[torch.FloatTensor] = None


def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
    """
    Computes the negative log likelihood loss from input distribution with respect to target.
    """
    return -input.log_prob(target)


def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
    """
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    """
    if weights is not None:
        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
    else:
        return input_tensor.mean(dim=dim)
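# Worked example (a sketch, not part of the original module): weights of zero remove their entries
# from both numerator and denominator, so a NaN under a zero weight does not poison the result.
#
#   >>> x = torch.tensor([2.0, float("nan"), 4.0])
#   >>> w = torch.tensor([1.0, 0.0, 1.0])
#   >>> weighted_average(x, weights=w)   # (2.0 + 4.0) / max(2, 1) = 3.0
#   tensor(3.)
#   >>> x.mean()                         # a plain mean would propagate the NaN
#   tensor(nan)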
class PatchTSTStdScaler(nn.Module):
    """
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
        denominator = denominator.clamp_min(1.0)
        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator

        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
        scale = torch.sqrt(variance + self.minimum_scale)
        return (data - loc) / scale, loc, scale
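# Worked example (a sketch, not part of the original module), scaling over the sequence dimension:
# for the fully observed series [1, 2, 3], loc = 2 and scale = sqrt(2/3 + minimum_scale) ≈ 0.8165.
#
#   >>> config = PatchTSTConfig(scaling="std")
#   >>> scaler = PatchTSTStdScaler(config)
#   >>> data = torch.tensor([[[1.0], [2.0], [3.0]]])   # (batch=1, seq=3, channels=1)
#   >>> scaled, loc, scale = scaler(data, torch.ones_like(data))
#   >>> loc.squeeze(), scale.squeeze()
#   (tensor(2.), tensor(0.8165))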
class PatchTSTMeanScaler(nn.Module):
    """
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
        num_observed = observed_indicator.sum(self.dim, keepdim=True)

        scale = ts_sum / torch.clamp(num_observed, min=1)

        # If `default_scale` is provided, we use it, otherwise we use the scale of the batch.
        if self.default_scale is None:
            batch_sum = ts_sum.sum(dim=0)
            batch_observations = torch.clamp(num_observed.sum(0), min=1)
            default_scale = torch.squeeze(batch_sum / batch_observations)
        else:
            default_scale = self.default_scale * torch.ones_like(scale)

        # apply default scale where there are no observations
        scale = torch.where(num_observed > 0, scale, default_scale)

        # ensure the scale is at least `self.minimum_scale`
        scale = torch.clamp(scale, min=self.minimum_scale)
        scaled_data = data / scale

        if not self.keepdim:
            scale = scale.squeeze(dim=self.dim)

        return scaled_data, torch.zeros_like(scale), scale


class PatchTSTNOPScaler(nn.Module):
    """
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True

    def forward(
        self, data: torch.Tensor, observed_indicator: Optional[torch.Tensor] = None
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
        return data, loc, scale


class PatchTSTScaler(nn.Module):
    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        if config.scaling == "mean" or config.scaling is True:
            self.scaler = PatchTSTMeanScaler(config)
        elif config.scaling == "std":
            self.scaler = PatchTSTStdScaler(config)
        else:
            self.scaler = PatchTSTNOPScaler(config)

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Input for scaler calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        """
        data, loc, scale = self.scaler(data, observed_indicator)
        return data, loc, scale
@auto_docstring
class PatchTSTModel(PatchTSTPreTrainedModel):
    def __init__(self, config: PatchTSTConfig):
        super().__init__(config)

        self.scaler = PatchTSTScaler(config)
        self.patchifier = PatchTSTPatchify(config)
        self.do_mask_input = config.do_mask_input
        # get num_patches information from PatchTSTPatchify
        num_patches = self.patchifier.num_patches

        if self.do_mask_input:
            self.masking = PatchTSTMasking(config)
        else:
            self.masking = nn.Identity()
        self.encoder = PatchTSTEncoder(config, num_patches=num_patches)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        past_values: torch.Tensor,
        past_observed_mask: Optional[torch.Tensor] = None,
        future_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, PatchTSTModelOutput]:
        r"""
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTModel

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if past_observed_mask is None:
            past_observed_mask = torch.ones_like(past_values)

        # scaled_past_values: [bs x sequence_length x num_input_channels]
        scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask)

        # patched_values: [bs x num_input_channels x num_patches x patch_length]
        patched_values = self.patchifier(scaled_past_values)
        if self.do_mask_input:
            masked_values, mask = self.masking(patched_values)
        else:
            masked_values, mask = self.masking(patched_values), None

        encoder_output = self.encoder(
            patch_input=masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )

        if not return_dict:
            outputs = (encoder_output.last_hidden_state, encoder_output.hidden_states, encoder_output.attentions)
            outputs = outputs + (mask, loc, scale, patched_values)
            return tuple(v for v in outputs if v is not None)

        return PatchTSTModelOutput(
            last_hidden_state=encoder_output.last_hidden_state,
            hidden_states=encoder_output.hidden_states,
            attentions=encoder_output.attentions,
            mask=mask,
            loc=loc,
            scale=scale,
            patch_input=patched_values,
        )


class PatchTSTMaskPretrainHead(nn.Module):
    """
    Pretraining head for mask modelling
    """

    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()
        self.linear = nn.Linear(config.d_model, config.patch_length)
        self.use_cls_token = config.use_cls_token

    def forward(self, embedding: torch.Tensor) -> torch.Tensor:
        """
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True

        """
        embedding = self.linear(self.dropout(embedding))  # [bs x num_channels x num_patches x patch_length]
        if self.use_cls_token:
            embedding = embedding[:, :, 1:, :]  # remove the first cls token
        return embedding


@auto_docstring(
    custom_intro="""
    The PatchTST for pretrain model.
    """
)
class PatchTSTForPretraining(PatchTSTPreTrainedModel):
    def __init__(self, config: PatchTSTConfig):
        super().__init__(config)

        config.do_mask_input = True
        self.model = PatchTSTModel(config=config)
        self.head = PatchTSTMaskPretrainHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        past_values: torch.Tensor,
        past_observed_mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, PatchTSTForPretrainingOutput]:
        r"""
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPretraining

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Config for random mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='random',
        ...     random_mask_ratio=0.4,
        ...     use_cls_token=True,
        ... )
        >>> # Config for forecast mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='forecast',
        ...     num_forecast_mask_patches=5,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForPretraining(config)

        >>> # during training, one provides both past and future values
        >>> outputs = model(past_values=batch["past_values"])

        >>> loss = outputs.loss
        >>> loss.backward()
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        model_output = self.model(
            past_values=past_values,
            past_observed_mask=past_observed_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
        )

        # x_hat: [bs x num_channels x num_patches x patch_length]
        x_hat = self.head(model_output.last_hidden_state)

        # calculate masked_loss
        loss = nn.MSELoss(reduction="none")
        loss_val = loss(x_hat, model_output.patch_input)
        masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10)

        encoder_states = model_output.hidden_states
        if not return_dict:
            outputs = (x_hat,) + model_output[1:-4]
            outputs = (masked_loss,) + outputs if masked_loss is not None else outputs
            return outputs
        return PatchTSTForPretrainingOutput(
            loss=masked_loss,
            prediction_output=x_hat,
            hidden_states=encoder_states,
            attentions=model_output.attentions,
        )


class PatchTSTClassificationHead(nn.Module):
    def __init__(self, config: PatchTSTConfig):
        super().__init__()
        self.use_cls_token = config.use_cls_token
        self.pooling_type = config.pooling_type
        self.flatten = nn.Flatten(start_dim=1)
        self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()
        self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_targets)

    def forward(self, embedding: torch.Tensor):
        """
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_targets)`

        """
        if self.use_cls_token:
            # use the first output token, pooled_embedding: [bs x num_channels x d_model]
            pooled_embedding = embedding[:, :, 0, :]
        elif self.pooling_type == "mean":
            pooled_embedding = embedding.mean(dim=2)
        elif self.pooling_type == "max":
            pooled_embedding = embedding.max(dim=2).values
        else:
            raise ValueError(f"pooling operator {self.pooling_type} is not implemented yet")
        # pooled_embedding: [bs x (num_channels * d_model)]
        pooled_embedding = self.flatten(pooled_embedding)
        # output: [bs x num_targets]
        output = self.linear(self.dropout(pooled_embedding))
        return output
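
# Pooling sketch for the classification head (illustrative only; all sizes are assumptions).
# With use_cls_token=False and pooling_type="mean", an embedding of shape
# (bs=4, num_channels=2, num_patches=32, d_model=64) is pooled over the patch axis to (4, 2, 64),
# flattened to (4, 128), and projected to (4, num_targets):
#
#   config = PatchTSTConfig(num_input_channels=2, d_model=64, num_targets=3)
#   head = PatchTSTClassificationHead(config)
#   logits = head(torch.randn(4, 2, 32, 64))    # -> shape (4, 3)
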

@auto_docstring(
    custom_intro="""
    The PatchTST for classification model.
    """
)
class PatchTSTForClassification(PatchTSTPreTrainedModel):
    def __init__(self, config: PatchTSTConfig):
        super().__init__(config)

        # Turn off masking
        if config.do_mask_input:
            logger.warning("Setting `do_mask_input` parameter to False.")
            config.do_mask_input = False

        self.model = PatchTSTModel(config)
        self.head = PatchTSTClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        past_values: torch.Tensor,
        target_values: Optional[torch.Tensor] = None,
        past_observed_mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, PatchTSTForClassificationOutput]:
        r"""
        past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
            Input sequence to the model
        target_values (`torch.Tensor`, *optional*):
            Labels associated with the `past_values`
        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:

            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForClassification

        >>> # classification task with two input channels and 3 classes
        >>> config = PatchTSTConfig(
        ...     num_input_channels=2,
        ...     num_targets=3,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForClassification(config=config)

        >>> # during inference, one only provides past values
        >>> past_values = torch.randn(20, 512, 2)
        >>> outputs = model(past_values=past_values)
        >>> labels = outputs.prediction_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        model_output = self.model(
            past_values=past_values,
            past_observed_mask=past_observed_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
        )
        y_hat = self.head(model_output.last_hidden_state)

        loss_val = None
        if target_values is not None:
            loss = nn.CrossEntropyLoss()
            loss_val = loss(y_hat, target_values)

        if not return_dict:
            outputs = (y_hat,) + model_output[1:-3]
            outputs = (loss_val,) + outputs if loss_val is not None else outputs
            return outputs
        return PatchTSTForClassificationOutput(
            loss=loss_val,
            prediction_logits=y_hat,
            hidden_states=model_output.hidden_states,
            attentions=model_output.attentions,
        )

@auto_docstring(
    custom_intro="""
    The PatchTST for regression Model.
    """
)
class PatchTSTPredictionHead(nn.Module):
    def __init__(self, config: PatchTSTConfig, num_patches, distribution_output=None):
        """
        num_patches (`int`):
            The number of patches in the input sequence.
        distribution_output (`DistributionOutput`, *optional*):
            The distribution output layer for probabilistic forecasting. If None, a linear output layer is used.
        """
        super().__init__()

        self.share_projection = config.share_projection
        self.num_input_channels = config.num_input_channels
        self.use_cls_token = config.use_cls_token
        self.pooling_type = config.pooling_type
        if self.pooling_type or self.use_cls_token:
            head_dim = config.d_model
        else:
            head_dim = config.d_model * num_patches

        if not self.share_projection:
            # each channel has its own head
            self.projections = nn.ModuleList()
            self.dropouts = nn.ModuleList()
            self.flattens = nn.ModuleList()
            for i in range(self.num_input_channels):
                self.flattens.append(nn.Flatten(start_dim=1))
                if distribution_output is None:
                    # use linear head
                    self.projections.append(nn.Linear(head_dim, config.prediction_length))
                else:
                    # use distribution head
                    self.projections.append(distribution_output.get_parameter_projection(head_dim))
                self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity())
        else:
            # all the channels share the same head
            self.flatten = nn.Flatten(start_dim=2)
            if distribution_output is None:
                # use linear head
                self.projection = nn.Linear(head_dim, config.prediction_length)
            else:
                # use distribution head
                self.projection = distribution_output.get_parameter_projection(head_dim)
            self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()

    def forward(self, embedding: torch.Tensor):
        """
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, forecast_len, num_channels)`

        """
        if self.use_cls_token:
            # pooled_embedding: [bs x num_channels x d_model]
            pooled_embedding = embedding[:, :, 0, :]
        elif self.pooling_type == "mean":
            pooled_embedding = embedding.mean(dim=2)
        elif self.pooling_type == "max":
            pooled_embedding = embedding.max(dim=2).values
        else:
            # no pooling: [bs x num_channels x num_patches x d_model]
            pooled_embedding = embedding

        if not self.share_projection:
            output = []
            for i in range(self.num_input_channels):
                # per-channel input: [bs x (d_model * num_patches)] or [bs x d_model]
                pooled_embedding_i = self.flattens[i](pooled_embedding[:, i, :])
                pooled_embedding_i = self.dropouts[i](pooled_embedding_i)
                # [bs x forecast_len], or a tuple of such tensors for a distribution head
                output.append(self.projections[i](pooled_embedding_i))
            # output: [bs x num_channels x forecast_len]
            output = torch.stack(output, dim=1)
        else:
            # pooled_embedding: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model]
            pooled_embedding = self.flatten(pooled_embedding)
            pooled_embedding = self.dropout(pooled_embedding)
            # output: [bs x num_channels x forecast_len], or a tuple of such tensors for a distribution head
            output = self.projection(pooled_embedding)

        if isinstance(output, tuple):
            # output: tuple of [bs x forecast_len x num_channels]
            output = tuple(z.transpose(2, 1) for z in output)
        else:
            # output: [bs x forecast_len x num_channels]
            output = output.transpose(2, 1)
        return output
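
# Output-shape sketch for the prediction head (illustrative only; every size below is an
# assumption). With a plain linear head the projection yields point forecasts; with a
# distribution head it returns a tuple of distribution parameters instead:
#
#   config = PatchTSTConfig(num_input_channels=7, d_model=128, prediction_length=96)
#   head = PatchTSTPredictionHead(config, num_patches=42)
#   point = head(torch.randn(4, 7, 42, 128))                 # (4, 96, 7)
#
#   dist_out = StudentTOutput(dim=config.prediction_length)
#   head = PatchTSTPredictionHead(config, 42, distribution_output=dist_out)
#   df, loc, scale = head(torch.randn(4, 7, 42, 128))        # three tensors, each (4, 96, 7)
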

@auto_docstring(
    custom_intro="""
    The PatchTST for prediction model.
    """
)
class PatchTSTForPrediction(PatchTSTPreTrainedModel):
    def __init__(self, config: PatchTSTConfig):
        super().__init__(config)

        # Turn off masking
        if config.do_mask_input:
            logger.warning("Setting `do_mask_input` parameter to False.")
            config.do_mask_input = False

        self.model = PatchTSTModel(config)

        if config.loss == "mse":
            self.distribution_output = None
        else:
            if config.distribution_output == "student_t":
                self.distribution_output = StudentTOutput(dim=config.prediction_length)
            elif config.distribution_output == "normal":
                self.distribution_output = NormalOutput(dim=config.prediction_length)
            elif config.distribution_output == "negative_binomial":
                self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length)
            else:
                raise ValueError(f"Unknown distribution output {config.distribution_output}")

        self.head = PatchTSTPredictionHead(
            config, self.model.patchifier.num_patches, distribution_output=self.distribution_output
        )

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        past_values: torch.Tensor,
        past_observed_mask: Optional[torch.Tensor] = None,
        future_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, PatchTSTForPredictionOutput]:
        r"""
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPrediction

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Prediction task with 7 input channels and prediction length is 96
        >>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> loss = outputs.loss
        >>> loss.backward()

        >>> # during inference, one only provides past values, the model outputs future values
        >>> outputs = model(past_values=batch["past_values"])
        >>> prediction_outputs = outputs.prediction_outputs
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # get model output
        model_output = self.model(
            past_values=past_values,
            past_observed_mask=past_observed_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
        )
        # get output head
        y_hat = self.head(model_output.last_hidden_state)

        loss_val = None
        if self.distribution_output:
            y_hat_out = y_hat
        else:
            y_hat_out = y_hat * model_output.scale + model_output.loc

        if future_values is not None:
            if self.distribution_output:
                distribution = self.distribution_output.distribution(
                    y_hat, loc=model_output.loc, scale=model_output.scale
                )
                loss_val = nll(distribution, future_values)
                # take average of the loss
                loss_val = weighted_average(loss_val)
            else:
                loss = nn.MSELoss(reduction="mean")
                loss_val = loss(y_hat_out, future_values)

        loc = model_output.loc
        scale = model_output.scale

        if not return_dict:
            outputs = (y_hat_out,) + model_output[1:-1]
            outputs = (loss_val,) + outputs if loss_val is not None else outputs
            return outputs
        return PatchTSTForPredictionOutput(
            loss=loss_val,
            prediction_outputs=y_hat_out,
            hidden_states=model_output.hidden_states,
            attentions=model_output.attentions,
            loc=loc,
            scale=scale,
        )

    @torch.no_grad()
    def generate(
        self,
        past_values: torch.Tensor,
        past_observed_mask: Optional[torch.Tensor] = None,
    ) -> SamplePatchTSTOutput:
        """
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
            for multivariate predictions.
        """
        # get number of samples
        num_parallel_samples = self.config.num_parallel_samples

        # get model output
        outputs = self(
            past_values=past_values,
            future_values=None,
            past_observed_mask=past_observed_mask,
            output_hidden_states=False,
        )
        if self.distribution_output:
            # get distribution
            distribution = self.distribution_output.distribution(
                outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale
            )
            # get samples: list of [bs x forecast_len x num_channels]
            samples = [distribution.sample() for _ in range(num_parallel_samples)]
            # samples: [bs x num_samples x forecast_len x num_channels]
            samples = torch.stack(samples, dim=1)
        else:
            samples = outputs.prediction_outputs.unsqueeze(1)

        return SamplePatchTSTOutput(sequences=samples)
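
# Probabilistic-forecast sketch (illustrative only; the configuration is an assumption and the
# model is untrained). With `loss="nll"` the model carries a distribution head, so `generate`
# draws `num_parallel_samples` trajectories that can be reduced to a median and quantile bands:
#
#   config = PatchTSTConfig(num_input_channels=7, context_length=512, prediction_length=96, loss="nll")
#   model = PatchTSTForPrediction(config)
#   samples = model.generate(past_values=torch.randn(2, 512, 7)).sequences   # (2, num_parallel_samples, 96, 7)
#   median = samples.quantile(0.5, dim=1)
#   p10, p90 = samples.quantile(0.1, dim=1), samples.quantile(0.9, dim=1)
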

class PatchTSTRegressionHead(nn.Module):
    """
    Regression head
    """

    def __init__(self, config: PatchTSTConfig, distribution_output=None):
        super().__init__()
        self.y_range = config.output_range
        self.use_cls_token = config.use_cls_token
        self.pooling_type = config.pooling_type
        self.distribution_output = distribution_output

        head_dim = config.num_input_channels * config.d_model

        self.flatten = nn.Flatten(start_dim=1)
        self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()

        if distribution_output is None:
            self.projection = nn.Linear(head_dim, config.num_targets)
        else:
            self.projection = distribution_output.get_parameter_projection(head_dim)

    def forward(self, embedding: torch.Tensor):
        """
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, output_dim)`

        """
        if self.use_cls_token:
            # use the first output token, pooled_embedding: [bs x num_channels x d_model]
            pooled_embedding = embedding[:, :, 0, :]
        elif self.pooling_type == "mean":
            pooled_embedding = embedding.mean(dim=2)
        elif self.pooling_type == "max":
            pooled_embedding = embedding.max(dim=2).values
        else:
            raise ValueError(f"pooling operator {self.pooling_type} is not implemented yet")
        # flatten the input: [bs x (num_channels * d_model)]
        pooled_embedding = self.dropout(self.flatten(pooled_embedding))
        # projection: [bs x output_dim], or a tuple of such tensors for a distribution head
        output = self.projection(pooled_embedding)
        # bound the output to the configured range when a plain linear head is used
        if (self.distribution_output is None) and (self.y_range is not None):
            output = torch.sigmoid(output) * (self.y_range[1] - self.y_range[0]) + self.y_range[0]
        return output
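
# Bounded-output sketch (illustrative numbers, not defaults): with output_range=(0.0, 10.0) and a
# plain linear head, a raw projection value of 0.0 maps to sigmoid(0.0) * (10.0 - 0.0) + 0.0 = 5.0,
# i.e. predictions are squashed into the configured range.
#
#   config = PatchTSTConfig(num_targets=1, output_range=(0.0, 10.0))
#   head = PatchTSTRegressionHead(config)
#   y = head(torch.randn(4, config.num_input_channels, 42, config.d_model))   # values lie in (0.0, 10.0)
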

@auto_docstring(
    custom_intro="""
    The PatchTST for regression model.
    """
)
class PatchTSTForRegression(PatchTSTPreTrainedModel):
    def __init__(self, config: PatchTSTConfig):
        super().__init__(config)

        # Turn off masking
        if config.do_mask_input:
            logger.warning("Setting `do_mask_input` parameter to False.")
            config.do_mask_input = False

        self.model = PatchTSTModel(config)

        if config.loss == "mse":
            self.distribution_output = None
        else:
            if config.distribution_output == "student_t":
                self.distribution_output = StudentTOutput(dim=config.num_targets)
            elif config.distribution_output == "normal":
                self.distribution_output = NormalOutput(dim=config.num_targets)
            elif config.distribution_output == "negative_binomial":
                self.distribution_output = NegativeBinomialOutput(dim=config.num_targets)
            else:
                raise ValueError(f"Unknown distribution output {config.distribution_output}")

        self.head = PatchTSTRegressionHead(config, self.distribution_output)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        past_values: torch.Tensor,
        target_values: Optional[torch.Tensor] = None,
        past_observed_mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, PatchTSTForRegressionOutput]:
        r"""
        past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
            Input sequence to the model
        target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
            Target values associated with the `past_values`
        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:

            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        return_dict (`bool`, *optional*):
            Whether or not to return a `ModelOutput` instead of a plain tuple.

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForRegression

        >>> # Regression task with 6 input channels and regress 2 targets
        >>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")

        >>> # during inference, one only provides past values, the model outputs future values
        >>> past_values = torch.randn(20, 512, 6)
        >>> outputs = model(past_values=past_values)
        >>> regression_outputs = outputs.regression_outputs
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        model_output = self.model(
            past_values=past_values,
            past_observed_mask=past_observed_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
        )
        # get output head. y_hat is of shape [bs x num_targets] or a tuple of distribution parameters
        y_hat = self.head(model_output.last_hidden_state)

        loss = None
        if target_values is not None:
            if self.distribution_output:
                distribution = self.distribution_output.distribution(y_hat)
                # reshape each distribution parameter to [bs x num_targets]
                y_hat = tuple([item.view(-1, self.config.num_targets) for item in y_hat])
                loss = nll(distribution, target_values)
                # take average of the loss
                loss = weighted_average(loss)
            else:
                loss = nn.MSELoss(reduction="mean")
                loss = loss(y_hat, target_values)

        if not return_dict:
            outputs = (y_hat,) + model_output[1:-3]
            outputs = (loss,) + outputs if loss is not None else outputs
            return outputs
        return PatchTSTForRegressionOutput(
            loss=loss,
            regression_outputs=y_hat,
            hidden_states=model_output.hidden_states,
            attentions=model_output.attentions,
        )

    @torch.no_grad()
    def generate(
        self,
        past_values: torch.Tensor,
        past_observed_mask: Optional[torch.Tensor] = None,
    ) -> SamplePatchTSTOutput:
        """
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, num_targets)`.
        """
        # get number of samples
        num_parallel_samples = self.config.num_parallel_samples

        # get model output
        outputs = self(
            past_values=past_values,
            target_values=None,
            past_observed_mask=past_observed_mask,
            output_hidden_states=False,
        )

        # get distribution
        distribution = self.distribution_output.distribution(outputs.regression_outputs)
        # get samples: list of [bs x num_targets]
        samples = [distribution.sample() for _ in range(num_parallel_samples)]
        # samples: [bs x num_samples x num_targets]
        samples = torch.stack(samples, dim=1).view(-1, num_parallel_samples, self.config.num_targets)
        return SamplePatchTSTOutput(sequences=samples)


__all__ = [
    "PatchTSTModel",
    "PatchTSTPreTrainedModel",
    "PatchTSTForPrediction",
    "PatchTSTForPretraining",
    "PatchTSTForRegression",
    "PatchTSTForClassification",
]