"""PyTorch OpenAI ImageGPT model."""

import math
import os
from typing import Any, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import auto_docstring, logging, torch_float
from .configuration_imagegpt import ImageGPTConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path):
    """
    Load tf checkpoints in a pytorch model
    """
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(imagegpt_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []

    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        name = name[6:]  # skip "model/"
        name = name.split("/")

        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v, which are not
        # required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ) or name[-1] in ["_step"]:
            logger.info("Skipping {}".format("/".join(name)))
            continue

        pointer = model
        if name[-1] not in ["wtet"]:
            pointer = getattr(pointer, "transformer")

        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]:
                pointer = getattr(pointer, "c_attn")
                pointer = getattr(pointer, "weight")
            elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "wtet":
                pointer = getattr(pointer, "lm_head")
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "sos":
                pointer = getattr(pointer, "wte")
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if len(name) > 1 and name[1] == "attn" or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte":
            pass  # the array only initializes part of the pointer, so shapes are not expected to match
        else:
            try:
                assert pointer.shape == array.shape
            except AssertionError as e:
                e.args += (pointer.shape, array.shape)
                raise

        logger.info(f"Initialize PyTorch weight {name}")

        if name[-1] == "q_proj":
            pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
        elif name[-1] == "k_proj":
            pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy(
                array.reshape(config.n_embd, config.n_embd)
            ).T
        elif name[-1] == "v_proj":
            pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
        elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj":
            pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd))
        elif name[-1] == "wtet":
            pointer.data = torch.from_numpy(array)
        elif name[-1] == "wte":
            pointer.data[: config.vocab_size - 1, :] = torch.from_numpy(array)
        elif name[-1] == "sos":
            pointer.data[-1] = torch.from_numpy(array)
        else:
            pointer.data = torch.from_numpy(array)

    return model


class ImageGPTLayerNorm(nn.Module):
    def __init__(self, hidden_size: tuple[int], eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.Tensor(hidden_size))

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        # input is not mean centered: rescale by the root mean square and apply a learned gain only
        tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps)
        tensor = tensor * self.weight
        return tensor


class ImageGPTAttention(nn.Module):
    def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
        super().__init__()
        max_positions = config.max_position_embeddings
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention

        # Layer-wise attention scaling, reordering, and upcasting
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if self.scale_attn_weights:
            attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5)

        # Layer-wise attention scaling
        if self.scale_attn_by_inverse_layer_idx:
            attn_weights = attn_weights / float(self.layer_idx + 1)

        if not self.is_cross_attention:
            # only the "normal" attention layer implements the causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            # the mask value needs to be a tensor on the same device/dtype as the attention weights
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
        bsz, num_heads, q_seq_len, dk = query.size()
        _, _, k_seq_len, _ = key.size()

        # Preallocate attn_weights for `baddbmm`
        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

        # Compute scale factor
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5
        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)

        # Upcast (turn off autocast) and reorder
        with torch.autocast(query.device.type, enabled=False):
            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

        if not self.is_cross_attention:
            # only the "normal" attention layer implements the causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple:
        is_cross_attention = encoder_hidden_states is not None
        batch_size, seq_len, _ = hidden_states.shape

        if layer_past is not None:
            if isinstance(layer_past, EncoderDecoderCache):
                is_updated = layer_past.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value states from cache
                    curr_past_key_value = layer_past.cross_attention_cache
                else:
                    curr_past_key_value = layer_past.self_attention_cache
            else:
                curr_past_key_value = layer_past

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention:
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`."
                )
            query = self.q_attn(hidden_states)
            attention_mask = encoder_attention_mask
            if layer_past is not None and is_updated:
                # reuse k, v from cache
                key = curr_past_key_value.layers[self.layer_idx].keys
                value = curr_past_key_value.layers[self.layer_idx].values
            else:
                key, value = self.c_attn(current_states).split(self.split_size, dim=2)
                key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
                value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
            key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if layer_past is not None:
            # save all key/value states to the cache for fast auto-regressive generation
            cache_position = cache_position if not is_cross_attention else None
            key, value = curr_past_key_value.update(key, value, self.layer_idx, {"cache_position": cache_position})
            # mark the cross-attention cache of this layer as filled so it can be re-used in subsequent calls
            if is_cross_attention:
                layer_past.is_updated[self.layer_idx] = True

        query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        if self.reorder_and_upcast_attn:
            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
        else:
            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        return attn_output, attn_weights


class ImageGPTMLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ImageGPTBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

        self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = ImageGPTAttention(config, layer_idx=layer_idx)
        self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        if config.add_cross_attention:
            self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx)
            self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = ImageGPTMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention "
                    "layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            cross_attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = residual + cross_attn_output
            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        return (hidden_states,) + outputs


@auto_docstring
class ImageGPTPreTrainedModel(PreTrainedModel):
    config: ImageGPTConfig
    load_tf_weights = load_tf_weights_in_imagegpt
    base_model_prefix = "transformer"
    main_input_name = "input_ids"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ImageGPTBlock"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, Conv1D)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ImageGPTLayerNorm):
            module.weight.data.fill_(1.0)

        # Reinitialize selected weights subject to the OpenAI GPT-2 paper scheme: scale the weights of residual
        # layers at initialization by a factor of 1/sqrt(N) where N is the number of residual layers.
        for name, p in module.named_parameters():
            if "c_proj" in name and "weight" in name:
                # Special scaled initialization --> there are 2 layer norms per transformer block
                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))


@auto_docstring
class ImageGPTModel(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        # Attention mask: turn the 2D padding mask into a broadcastable additive mask
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # If a 2D or 3D attention mask is provided for the cross-attention, make it broadcastable as well
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            # Model parallel: move inputs to the device hosting this block
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                past_key_values,
                attention_mask,
                head_mask[i],
                encoder_hidden_states,
                encoder_attention_mask,
                use_cache,
                output_attentions,
                cache_position,
            )

            hidden_states = outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[2],)

            # Model parallel: hand the hidden states over to the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.transformer = ImageGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 8 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    """
)
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = ImageGPTModel(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Any,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        # average-pool the hidden states along the sequence dimension
        pooled_hidden_states = hidden_states.mean(dim=1)
        # project from (batch_size, hidden_size) to (batch_size, num_labels)
        logits = self.score(pooled_hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    "ImageGPTForCausalImageModeling",
    "ImageGPTForImageClassification",
    "ImageGPTModel",
    "ImageGPTPreTrainedModel",
    "load_tf_weights_in_imagegpt",
]