
    rh                    ~   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Zd dl
mZ d dlmZmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZm Z  d dl!m"Z" d dl#m$Z% d dl&m'Z' d d	l(m)Z) d d
l*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> ddl?m$Z$ ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZL ddlMmNZNmOZOmPZP erd dlQZQe$j                  ZSeTeUd<    ej                  eW      ZXej                  eUd<   ej                  j                  Z[ej                  j                  Z\e G d d             Z]e G d d             Z^e G d d             Z_d ej                  d!eTfd"Zad#ej                  d!eTfd$Zcd#ej                  d!eTfd%Zdd ej                  d!eefd&Zf G d' d(      Zg eg       Zh	 dd)ej                  d*ejej                     d+ejej                     d,eek   d!ej                  f
d-Zld ej                  d!eTfd.Zmd ej                  d!eTfd/Znd ej                  d!eTfd0Zod ej                  d!eTfd1Zpd ej                  d!eTfd2Zqd ej                  d!eTfd3Zrd ej                  d!eTfd4Zsd ej                  d!eTfd5Ztd6ej                  d!euejej                     ejej                     f   fd7Zvd8ejej                     d9ekfd:Zwd;eejej                     euej                     f   d!eefd<Zx	 	 dd=ej2                  j                  d ej2                  j                  d>eyd?eyfd@Zzd=ej2                  j                  d ej2                  j                  dAej2                  j                  dBej                  dCeydDeyd!ej2                  j                  fdEZ|dFej                  d!eyfdGZ~d!ejej                     fdHZd ej2                  j                  d!eTfdIZd!ej                  fdJZdKej                  d!eufdLZd=ej2                  j                  d!dfdMZd=ej2                  j                  d!dfdNZ	 dd8ejej                     dOej                  dPej                  dQee<ej                        d!df
dRZddSd6ej                  d8ejej                     dTejej                     dUeedQee<ej                        d!euej                  ej                  f   fdVZdddWd6ej                  dXeejee      dQee<ej                        d!euej                  ej                  f   fdYZ eedZ      Zd[eed!eefd\Zd ej                  d!eefd]Zd=ej                  fd^Zej                  d_        Zd`eej                  eef   d!ejeuej                  eef      fdaZdbej                  d!ej                  fdcZddej2                  j                  deej2                  j                  dfej2                  j                  dgej2                  j                  dhej"                  dieedjej2                  j                  dkej2                  j                  fdlZd6ej                  ddej                  deej                  dmeed!euej                  ej                  f   f
dnZd6ej                  d!dfdoZd6ej                  d!ej                  fdpZ	 dd)ej                  dqe^dre_dsee<ej                        fdtZdu Zd!e]fdvZd=ej                  fdwZd)ej                  dxejey   dyejey   dzeydqe^d{ejej                     d!eueyejee   ejee   f   fd|Zd d}lmZ d~ej                  deed!ej                  fdZd Z	 dd)ej                  dqe^d!ejej                     fdZd)ej2                  j                  d8ejej2                  j                     fdZ	 dddd6ej                  dXeejee      d!euej                  ej                  f   fdZ	 	 	 	 	 ddej2                  j                  dekdekdeTdeeekejek   f      deTdeek   d!dfdZy)    Ndefaultdict)	dataclassreplace)AnyCallableOptionalTYPE_CHECKINGUnion)countersis_node_meta_valid)(create_structured_trace_for_min_cut_info)config)trace_structured)extract_tensor_metadata)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_nodestatically_known_falsestatically_known_true)graph_drawer)
OrderedSet)CheckpointPolicy   )GraphInfoProvider)dp_knapsackgreedy_knapsackilp_knapsack)KnapsackEvaluator)get_aot_graph_name)get_cuda_generator_meta_valis_with_effects)fx_graph_cseget_aten_targetraise_getitemsAOT_PARTITIONER_DEBUGlogc                      e Zd ZU dZee   ed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   dej                  fdZ	dej                  fd	Z
dej                  fd
Zdej                  fdZdej                  fdZy)OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 0    t        |      | j                  v S N)r*   r0   selfr5   s     p/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusibleM   s    t$(8(888    c                 0    t        |      | j                  v S r7   )r*   r1   r8   s     r:   is_compute_intensivezOpTypes.is_compute_intensiveP   s    t$(B(BBBr<   c                 0    t        |      | j                  v S r7   )r*   r2   r8   s     r:   	is_randomzOpTypes.is_randomS   s    t$77r<   c                 0    t        |      | j                  v S r7   )r*   r3   r8   s     r:   is_viewzOpTypes.is_viewV   s    t$55r<   c                 0    t        |      | j                  v S r7   )r*   r4   r8   s     r:   is_recomputablezOpTypes.is_recomputableY   s    t$(=(===r<   N)__name__
__module____qualname____doc__r   r   __annotations__fxNoder;   r>   r@   rB   rD    r<   r:   r/   r/   C   s    BH%%%h//8$$"" **9rww 9C C8bgg 86BGG 6>BGG >r<   r/   c                      e Zd ZU eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                  e	f   ed<   eej
                     ed<   e
j                  deej
                     fd       Zd	ej
                  defd
Zd	ej
                  defdZd	ej
                  defdZd	ej
                  de	fdZy)NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderstatic_lifetime_input_nodesreturnc                 F     t        d  j                  D         fd      S )Nc              3       K   | ]  }|  y wr7   rL   .0ns     r:   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>l   s     01Q0s   c                 "    j                   |    S r7   )rS   )rZ   r9   s    r:   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>l   s    a@P r<   key)sortedrP   r9   s   `r:   required_fw_nodeszNodeInfo.required_fw_nodesi   s!    0//06P
 	
r<   rZ   c                     || j                   v S r7   )rP   r9   rZ   s     r:   is_required_fwzNodeInfo.is_required_fwo   s    D++++r<   c                     || j                   v S r7   )rQ   rd   s     r:   is_required_bwzNodeInfo.is_required_bwr   s    D****r<   c                     || j                   v S r7   )rR   rd   s     r:   is_unclaimedzNodeInfo.is_unclaimedu   s    D((((r<   c                 R    || j                   v sJ d| d       | j                  |   S )NNode z not in fw nodes!)rP   rS   rd   s     r:   get_fw_orderzNodeInfo.get_fw_orderx   s4    D+++IuQC7H-II+}}Qr<   N)rE   rF   rG   listrJ   rK   rI   r   dictint	functoolscached_propertyrb   boolre   rg   ri   rl   rL   r<   r:   rN   rN   ]   s     M"277++!"''**((277C<  !+BGG!44
4= 
 

, ,D ,+ +D +)bgg )$ ) bgg  #  r<   rN   c                   @    e Zd ZU eed<   eed<   eed<   eed<   eed<   y)MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)rE   rF   rG   rr   rI   rL   r<   r:   rt   rt   }   s      $$"&&!!r<   rt   r5   rU   c                 z    | j                   j                  dd       t        j                  t        j                  fv S )N	recompute)metagetr   MUST_RECOMPUTEPREFER_RECOMPUTEr5   s    r:   must_recomputer      s5    99==d+''))0  r<   fx_gc                 T    | j                   j                  D ]  }t        |      s y y)NTF)graphnodesr   r   r5   s     r:   has_recomputable_opsr      s+    

   $ r<   c                     | j                   j                  D ]W  }t        |      st        |j                  d      s&t
        j                  j                  |j                  j                  v sW y y)NtagsTF)	r   r   r   hasattrtargettorchTagnondeterministic_seededr   r   s     r:   has_recomputable_rng_opsr      sU    

   4 V,		11T[[5E5EE r<   c                     t        | j                  d   t        j                  t        j                  f      ryt        | j                  d   t        j
                        sJ y)Nvalr       )
isinstancer|   r   SymIntSymBoolSymFloatr   s    r:   sym_node_sizer      sE    $))E"U\\5==$ABdii&777r<   c                       e Zd Zd Zy)InvalidNodeBasec                      y)NzInvalid NoderL   ra   s    r:   __repr__zInvalidNodeBase.__repr__   s    r<   N)rE   rF   rG   r   rL   r<   r:   r   r      s    r<   r   joint_graphrO   outputssubgraphc                 `  
 t        j                         }i 
|D ]3  }|j                  |j                        }|j                  |_        |
|<   5 | j
                  D ]  }t        |      r|dk7  r
t        
|<   |
v r#|j                  dk(  r
t        
|<   <|j                  dk(  rt        j                  |j                  i |j                  }|D cg c]/  }t        |t         j                        rt        
|   t              1 }}t!        |      r
t        
|<   |j#                  |
fd      
|<   |j                  dk(  r|j#                  |
fd      
|<   |j                  dk(  s g }	|D ]s  }t        |t         j                        rF|
vrt%        d| d	      t        
|   t              rJ d| d
       |	j'                  
|          c|	j'                  |       u |j)                  t+        |	             |j-                          |j/                          |S c c}w )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardplaceholdercall_functionc                     |    S r7   rL   xenvs    r:   r]   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>       CF r<   get_attrc                     |    S r7   rL   r   s    r:   r]   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>   r   r<   outputrk   z couldn't be found in envz was invalid, but is output)rJ   Graphr   namer|   r   _must_be_in_backwardInvalidNodeoppytreearg_tree_leavesargskwargsr   rK   r   any	node_copyRuntimeErrorappendr   tupleeliminate_dead_codelint)r   rO   r   r   	new_graphr5   new_nodeall_argsr   output_valuesr   s             @r:   "_extract_graph_with_inputs_outputsr      s%     
I
C  ((3		D		 !! %(j*@#CI3; WW%#CIWW'--tyyHDKKHH "a) 3q6?3H 
 8}'D	!++D2BCCIWW
"!++D2BCCIWW 56 M 	$a!|"U1#-F#GHH!A 6qc456    Q(  #	$ U=)*!!#NN9s   4H+c                     | j                   dk(  xr3 dt        | j                        vxr t        |        xr t	        |        S Nr   tangents)r   strr   _is_bwd_seed_offset_is_fwd_seed_offsetr   s    r:   
_is_primalr      sK    =  	*c$++..	*#D))	* $D))	r<   c                 R    | j                   dk(  xr dt        | j                        v S r   r   r   r   r   s    r:   _is_tangentr      s$    77m#F
c$++6F(FFr<   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   bwd_seedbwd_base_offsetr   r   s    r:   r   r     =    77m# c$++&&O*;s4;;?O*Or<   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   fwd_seedfwd_base_offsetr   r   s    r:   r   r     r   r<   c                 v    | j                   dk(  xr) t        | j                  j                  d      t              S )Nr   r   )r   r   r|   r}   r   r   s    r:   _is_backward_stater     s*    77m#W
499==3G(WWr<   c                 @    | j                   j                  dd       dk(  S )Npartitioner_tagis_backwardr|   r}   r   s    r:   _has_tag_is_backwardr     s    99==*D1]BBr<   c                 @    | j                   j                  dd       dk(  S )Nr   must_be_in_backwardr   r   s    r:   _has_tag_must_be_in_backwardr     s    99==*D15JJJr<   c                 L    t        |       xs t        |       xr t        |       S r7   )r   r   r(   r   s    r:   r   r     s&    '- T"<t'<r<   joint_modulec                    t        j                  d | j                  j                  d      D         }|d | }||d  }||fS )Nc              3   4   K   | ]  }|j                     y wr7   r   rY   r5   s     r:   r[   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>$  s     	K$))	K   r   r   )r   r   r   
find_nodes)r   num_fwd_outputsr   fwd_outputsbwd_outputss        r:   _extract_fwd_bwd_outputsr      sW     $$	K 2 2 = = = J	KG *?+K/*+K##r<   saved_valuesr   c                 V    | D ]$  }|j                   |k(  s| j                  |        y  y r7   )r   remove)r   r   saved_values      r:   _remove_by_namer   +  s0    # t#,r<   fwd_module_outputsc                     t        |       }t        t        |       dz
  dd      D ]  }t        | |         r|dz   } |S  |S )Nr    )lenranger   )r   idxis      r:   find_first_sym_noder   2  sX      
!C3)*Q.B7 -a01a%CJ	 Jr<   r   maxminc           	         | j                  |      5  | j                  t        j                  j                  j
                  j                  |f      }t        j                  j                  j
                  j                  |j                  d         |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  |dgdf      }t        j                  j                  j                  j                  |j                  d   dgd      |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  |t        j                  f      }t        j                  j                  j                  j                  |j                  d   t        j                        |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }t        j                  j                  j                  j                  |j                  d   |      |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  |f      }t        j                  j                  j                  j                  |j                  d         |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                   ||f      }	t        j                  j                  j                  j!                  |j                  d   |      |	j                  d<   t        |	j                  d         |	j                  d<   d d d        | j                  	      5  | j                  t        j                  j                  j                  j                  |	t        j"                  fdt%        |j&                        z         }
t        j                  j                  j                  j                  |	j                  d   t        j"                        |
j                  d<   t        |
j                  d         |
j                  d<   d d d        |
S # 1 sw Y   1xY w# 1 sw Y   lxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   NxY w# 1 sw Y   
S xY w)Nr   r   tensor_metar   T
fp8_scale_r   r   )inserting_afterr   r   opsatenabsdefaultr|   r   amaxprimsconvert_element_typefloat64	clamp_min
reciprocalmulTensorfloat32r   r   )r   r5   r   r   abs_node	amax_nodeamax_64_nodeclamp_min_nodereciprocal_nodemul_node
scale_nodes              r:   calculate_quantization_scalingr  =  s     
		t	$ U&&IINN&& ' 
  %yy~~1199$))E:JKe'>x}}U?S'Tm$U 
		x	( W''IINN''RD$' ( 
	 !&		 3 3 ; ;MM% 2$!
	u )@	u@U(V	}%W 
		y	) 

**IIOO0088U]]+ + 
 $)99??#G#G#O#ONN5!5==$
%  ,Ce$,
-(

 
		|	, 

,,IINN$$,,$ - 
 &+YY^^%=%=%E%Ee$c&
E" .E&.
M*

 
		~	. 

--IINN%%-- " . 
 ',iinn&?&?&G&G&'
U# /F  '/
]+

 
			/ U&&IINN%%!3' ' 
  %yy~~1188  ' 
e (?x}}U?S'Tm$U 
		x	( 	Y((IIOO0088EMM*DII. ) 


 "'!E!E!M!MMM% %--"

 *AQVAW)X
&	Y IU UW W

 



 



 

U U	Y sZ   B3W4B9X0CXB5X(B3X(4B5X5C(Y4W>XXX%(X25X?Yr  
quant_typer  	clamp_maxc           	      	   | j                  |      5  | j                  t        j                  j                  j
                  j                  |t        j                  f      }t        j                  j                  j
                  j                  |j                  d   t        j                        |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }t        j                  j                  j                  j                  |j                  d   |j                  d         |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }t        j                  j                  j                  j                  |j                  d   |      |j                  d<   t        |j                  d         |j                  d<   d d d        | j                        5  | j                  t        j                  j                  j                  j                  ||f      }	t        j                  j                  j                  j                  |j                  d   |      |	j                  d<   t        |	j                  d         |	j                  d<   d d d        | j                  	      5  | j                  t        j                  j                  j
                  j                  |	|fdt        |j                         z         }
t        j                  j                  j
                  j                  |	j                  d   |      |
j                  d<   t        |
j                  d         |
j                  d<   d d d        |
S # 1 sw Y   hxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   
S xY w)Nr   r   r   
fp8_quant_r   )r   r   r   r   r  r  r   r  r|   r   r   r  r  r  r  r   r   )r   r5   r  r  r  r  target_node_32scaled_target_nodeclamp_min_scaled_nodeclamp_max_scaled_nodequant_activation_nodes              r:   perform_quantizationr    s    
		z	* 

,,IIOO0088& - 
 &+YY__%I%I%Q%QIIeemm&
E" .E&.
M*

 
		~	. 

"00IINN%% *- 1 
 */););)B)B&
(>*
& 2I##E*2
.

 
		1	2 

 % 3 3IINN$$,,$i0 !4 !
 -2IINN,D,D,L,L##E*I-
""5) 5L!&&u-5
""=1

 
		4	5 

 % 3 3IINN$$,,'3 !4 !
 -2IINN,D,D,L,L!&&u-y-
""5) 5L!&&u-5
""=1

 
		4	5 
 % 3 3IIOO0088'4DII. !4 !
 IIOO0088!&&u-z
 	""	

 5L!&&u-5
""=1
 ! u

 



 



 



 


 ! s@   CQ	<CQB5Q#%B5Q03CQ=	QQ #Q-0Q:=Rtensorc                 R    | j                         }| j                         }||z  dz  S )z
    Calculate the size of a PyTorch tensor in megabytes (MB).

    Args:
        tensor (torch.Tensor): Input tensor

    Returns:
        float: Memory size in MB
    i   )numelelement_size)r  num_elementsr  s      r:   calculate_tensor_sizer     s-     <<>L&&(L<'K88r<   c            	          t         j                  j                  j                  d   j	                  dd      } | j                  d      D cg c]$  }t        t         |j                  d      d         & } }| S c c}w )N!activation_quantization_aten_passallowed_dtypesztorch.bfloat16;.r   )r   	_inductorr   post_grad_fusion_optionsr}   splitgetattr)r#  dtypes     r:   get_allowed_dtypesr+    sz    __++DD+	c
,-  ;I:N:Ns:S16u{{3'+,N  s   )A8c                 B   t               }t        |       r| j                  d   j                  |vryt        j
                  j                  j                  d   j                  dd      }t        | j                  d         }t        j
                  j                  j                  d   j                  dd      s||k\  S t        j
                  j                  j                  d   j                  dd      rt        ||k\        xs t        ||k\         S t        ||k\        S )Nr   Fr"  
size_in_mbd   skip_dynamo_guardsquantize_dynamic_shape)r+  r   r|   r*  r   r&  r   r'  r}   r   r   r   )r5   r#  size_thresholdr-  s       r:   should_quantizer2    s   ')Nd#tyy'7'='=^'S__++DD+	c,  'tyy'78J??!!::+	c
&' ^++ ??!!::/

#&
./ )n, J+J.,HIIJ
 )~)EFFr<   c                      t         j                  j                  j                  d   j	                  dd      } t        t         | j                  d      d         S )Nr"  r  ztorch.float8_e5m2r%  r   )r   r&  r   r'  r}   r)  r(  )r  s    r:   get_quant_typer4    sN    ''@@+	c,+,  5***3/344r<   r*  c                 \    t        j                  |       }|j                  |j                  fS )z
    Calculate the range of values for a given torch.dtype.
    Args:
        dtype (torch.dtype): The input dtype.
    Returns:
        tuple: A tuple containing the minimum and maximum values.
    )r   finfor   r   )r*  infos     r:   calculate_ranger8    s%     ;;uD88TXXr<   c           
         | j                  d      d   }|j                  d   }t               }t        |      \  }}t	               }g g }}|D ]  }	|	j
                  j                  dd      s!t        j                  j                  j                  d   j                  dd      rMt        | |	|d	      }
t        | |	|
|||      }t        |
      s|j                  |
       n|j                  |
       n| j                  |	      5  | j!                  t        j"                  j$                  j&                  j(                  |	|fd
t+        |	j,                        z         }t        j"                  j$                  j&                  j)                  |	j
                  d   |      |j
                  d<   t/        |j
                  d         |j
                  d<   d d d        ||	<    |D 	cg c]  }	|	|v r||	   n|	 }}	t1        |      }||z   }|r|d | |z   ||d  z   }|j3                  dt5        |             t6        d   dxx   dz  cc<   y # 1 sw Y   yxY wc c}	w )Nr   r   r   saved_for_quantizationFr"  use_scalingT-q=r  r   r   r   inductor%activation_quantization_fwd_aten_passr    )r   r   r4  r8  rn   r|   r}   r   r&  r   r'  r  r  r   r   r   r   r   r  r  r   r   r   r   r   
update_argr   r   )r   r   r   r  r  r  node_to_quanttensor_scale_nodessym_scale_nodesr5   r  
quant_nodeoutput_updated_argsr   scale_nodess                  r:   quantize_activation_fwrF    sd   *1-F++a.K!J*:6IyFM*,b #-99==159%%>>3c-&' <4E
 24ZI
 #:.&--j9#**:6 **40 !&!4!4		<<DD"J/)C		N: "5 "J 		<<DD		%(* OO
 6M".6JOOM2 #-M$G#-L LWCGt}4d$>  1
2C$6K%36I#$6OO 	 a234Z@AQFA9  s   CI#II	c           
      
  	 | j                   D cg c]  }|j                  dk(  s| }}d }|D ]~  }|j                  j                  dd      s!|j                  j	                  d       |j                  j	                  d      }t
        j                  j                  j                  d   j                  dd      r| j                  |      5  d|j                  j                  dd	      z   	t        	fd
|D              }d d d        | j                        5  | j                  t
        j                  j                  j                   j"                  ||f      }t
        j                  j                  j                   j#                  |j                  d   |      |j                  d<   t%        |j                  d         |j                  d<   d d d        | j                  |      5  | j                  t
        j                  j&                  j(                  j*                  ||f      }t
        j                  j&                  j(                  j+                  |j                  d   |j                  d         |j                  d<   t%        |j                  d         |j                  d<   d d d        | j                        5  | j                  t
        j                  j                  j                   j"                  ||f      }t
        j                  j                  j                   j#                  |j                  d   |      |j                  d<   t%        |j                  d         |j                  d<   d d d        n| j                  |      5  | j                  t
        j                  j                  j                   j"                  ||fdt-        |j                        z         }t
        j                  j                  j                   j#                  |j                  d   |      |j                  d<   t%        |j                  d         |j                  d<   d d d        t/        |j0                  j3                               D ]   }|k7  s	||k7  s|j5                  ||       "  t6        d   dxx   dz  cc<   y c c}w # 1 sw Y   xY w# 1 sw Y   	xY w# 1 sw Y   ;xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr   r:  Fdequant_typer"  r;  r   r   c              3   @   K   | ]  }|j                   k(  r|  y wr7   r   )rY   	bwd_input
scale_names     r:   r[   z)quantize_activation_bw.<locals>.<genexpr>b  s%      &%$>>Z7 "&s   r   r   r   dequant_r   r=  %activation_quantization_bwd_aten_passr    )r   r   r|   r}   popr   r&  r   r'  r   r   r   nextr   r   r  r  r   r   r   divr  r   rm   userskeysreplace_input_withr   )
r   r5   	bw_inputsactivation_noderH  r  divided_target_node_32dequant_nodeuserrM  s
            @r:   quantize_activation_bwr[  S  sy   "'++J$M1IJIJO H@99==159IIMM2399==8L%%>>3c-'( **40 !-		0A0A,PR0S!SJ!% &)2& "J **:6 &+&9&9		<<DD"L1 ': 'O 		<<DD		%(, $((
 ;R',,U3;O((7 **?; 
T-2-@-@		**11-z: .A .* :?9K9K9R9R',,U3Z__U5K:*//6
 00F0K0KE0RS +//%
T **+AB #(#6#6		<<DD4lC $7 $L 		<<DD.33E:L !%%
 8O$))%08L%%m4  **40 #(#6#6		<<DD"L1'#dii.8 $7 $L 		<<DD		%(, !%%
 8O$))%08L%%m4 TZZ__./ @<'DO,C++D,?@MH@T Z@AQFAY K  
T 
T  sJ   SS4SB5S*CS%B5S2CS>S	S"	%S/	2S;	>T	
fwd_module
bwd_modulerT   c                 	   t         j                  j                  dd       	 y |r|D cg c]  }|j                   c}ng }| D ci c]  }|j                  | }}t        j
                  j                  j                  d   j                  dd      r)| D ci c]  }d|j                  vs|j                  |  }}j                  j                  d      d   j                  d   }j                  j                  d      D ci c]  }|j                  | }}|D ]  }|j                  |v st        |      s|j                  |v r!t        j                  d	|j                         Md
|j                  d<   |j                  d   j                  |j                  d<   d
||j                     j                  d<   |j                  d   j                  ||j                     j                  d<    t        dd fd       t!        j                         t        dd fd       t        dd fd       j                  j                  d      d   j                  d   }	|	D ]  }
d|
j                  v s||
j                  j#                  dd         }j                  j%                  |      5  j                  j'                  |
j                        }d d d        |j                  d   }j                  j)                  |
j                         d
|j                  d<   ||j                  d<   |j+                  |       j                  j-                  |        t        j
                  j                  j                  d   j                  dd
      rt/        j                  j                  d            }|d   }t1        |      D ]  }t3        |      r|} n j                  j                  d      d   j                  d   }|D ]  }
d|
j                  v sj                  j%                  |      5  j                  j'                  |
j                        }d d d        j                  j)                  |
j                         |} t5        j                         t        dd fd       y c c}w c c}w c c}w c c}w # 1 sw Y   xY w# 1 sw Y   {xY w)Nr"  exclude_primalsFprimalsr   r   r   r   z*Skipping quantization of static input %s: Tr:  r   rH  artifactc                      dddS )N,before_activation_quantization_fwd_aten_passstringr   encodingrL   rL   r<   r:   r]   z0enable_activation_quantization.<locals>.<lambda>      B 
 r<   c                  ,     j                  ddd      S NFT)print_outputinclude_strideinclude_deviceprint_readabler\  s   r:   r]   z0enable_activation_quantization.<locals>.<lambda>      :44tD 5 
 r<   )metadata_fn
payload_fnc                      dddS )N+after_activation_quantization_fwd_aten_passrd  re  rL   rL   r<   r:   r]   z0enable_activation_quantization.<locals>.<lambda>      A 
 r<   c                  ,     j                  ddd      S ri  rm  ro  s   r:   r]   z0enable_activation_quantization.<locals>.<lambda>  rp  r<   c                      dddS )N,before_activation_quantization_bwd_aten_passrd  re  rL   rL   r<   r:   r]   z0enable_activation_quantization.<locals>.<lambda>  rg  r<   c                  ,     j                  ddd      S ri  rm  r]  s   r:   r]   z0enable_activation_quantization.<locals>.<lambda>  rp  r<   r  rI  rK  r;  r   r   c                      dddS )N+after_activation_quantization_bwd_aten_passrd  re  rL   rL   r<   r:   r]   z0enable_activation_quantization.<locals>.<lambda>  ru  r<   c                  ,     j                  ddd      S ri  rm  rz  s   r:   r]   z0enable_activation_quantization.<locals>.<lambda>  rp  r<   )inductor_configr'  r}   r   r   r&  r   r   r   r   r2  r-   debugr|   r*  r   rF  r   r   r   updatereplace_all_uses_with
erase_noderm   reversedr   r[  )r   r\  r]  rT   r5   static_input_namessaved_values_namesr   bwd_module_inputsquant_fwd_module_outputsfwd_noderL  quant_bwd_inputrH  quant_bwd_module_inputsbwd_input_locbw_inputscaled_fwd_module_outputsscale_bwd_inputs    ``                r:   enable_activation_quantizationr    s    	0044/	
 	
 	 '  ;;t; 
 7CCd$))T/CC66+	c
U#$ )5
 $	8RDIItO
 
 $))444A!DII!L$.$4$4$?$?=$?$Q 		4  # 	W99**t/Dyy..		F		R26DII./(,		%(8(>(>DIIn%JNdii(--.FG@D		%@P@V@Vdii(--n=	W 

	 :++,

	 

	  *//::h:GJOOPQR, 
38==()(--*?*?b*QRI!!11)< S","2"2">">HMM">"RS$>>.9L  ''6=AO  !9:3?O  0++O<''	2
3 66+	c- #'z'7'7'B'Bm'B'T"U/3 !89 	Hx( (	
 %/$4$4$?$?8$?$LQ$O$T$TUV$W!1 	0Hx}},%%55mD W&0&6&6&B&B&B&VOW$$++HMM: /	0 :++,

	C 	< D
lS S0W Ws5   SSS,SS7'S'S"S	"S+	)rT   saved_sym_nodesr   c                H   t        | |      \  }}| j                  j                  d      }g t        t        |      }g t        t
        |      }	g t        t        |      }
g t        t        |      }g t        t        |      }t        | j                  ||z   |	z   |z   |d      }t        j                  j                         }|j                  d      D ]  }|j                  s-t        ||j                         t        ||j                         <|rIt!        d |j                  D              r-t        ||j                         t        ||j                         t        |      st        ||j                         |rJ  t#               }g }g }|D ]C  }t%        |      }|r#|j'                  |       |j)                  |       3|j)                  |       E t+        | j                        }t-        j.                  |||	      D ]]  }d|j0                  vrt3        |j0                  d         |z
  }t5        |d       D ]  }||vr|j)                  ||           ||z  }_ |j7                          |j9                  ||z          t        | j                  ||
z   ||z   |z   d	      }t        | j                  ||z   |	z   |z   |z   |d      }t:        j<                  j?                  | |      }t:        j<                  j?                  | |      }tA        ||||       ||fS )
Nr   r   r   r   c              3      K   | ]X  }|j                   t        j                  j                  j                  j
                  u xr t        |j                        d k(   Z ywr   N)r   r   r   _c10d_functionalwait_tensorr   r   rS  rX   s     r:   r[   z+_extract_fwd_bwd_modules.<locals>.<genexpr>C  sS      )
  HH		22>>FFF "AGG!")
s   AA r   c                     | j                   S r7   rK  )ss    r:   r]   z*_extract_fwd_bwd_modules.<locals>.<lambda>i  s
    166 r<   r^   forward)!r   r   r   filterr   r   r   r   r   r   r   distributedis_availablerS  r   r   allr   r   addr   r   	itertoolschainr|   r   r`   clearextendrJ   _lazy_graph_module_make_graph_moduler  )r   r   r  r   rT   r   r   placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphdistributed_enabledr5   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr  	fwd_graphr\  r]  s                             r:   _extract_fwd_bwd_modulesr    sI     8o K  %%00M0BL7fZ67M9vk<89NIv&9<HIIv&9<HIGf%7FG2,&7:PP	I  ++88:$$$6 )zzL$))4OTYY7
 !S )
 ZZ)
 &

 L$))4OTYY7%L$))4(((')2 /9lM     1*40f%#**40#**401 3<3E3EFO 7~V %		!"499U#34}D)9: 	?A '#**?1+=>	? 	$%" 25LLM 3..l"_4	I 3
	
	 !	!  		 
 		I &&99,	RJ&&99,	RJ"j*.I z!!r<   )static_lifetime_input_indicesrT   r  c                `   t        |       rt        | |||      S t        t        t        | j
                  j                              }t        t        t        | j
                  j                              }||z   }t        | |      \  }}	t        | j
                  ||d      }
t        d |
j                  D              }g }g }| j
                  j                  D ]  }|j                  |vrt        |      r|j                  |       /d|j                  vrA|j                  dk(  r2|j                   }t#        d |D              sJ |j%                  |       ~|j                   D cg c]  }|j                  |vs| }}d|j                  v r$t#        d |D              r|j%                  |       |j                  |        t        t&        j)                  |      j+                               }t        t&        j)                  |      j+                               }t-        | ||||	      S c c}w )
a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    )r   r  r  r  c              3   T   K   | ]   }|j                   d k7  s|j                   " ywr   Nr   r   r   s     r:   r[   z$default_partition.<locals>.<genexpr>  s$      $$''X:M		$   ((r   r   c              3   V   K   | ]!  }|j                   t        j                  k(   # y wr7   )r   operatorgetitemrY   rZ  s     r:   r[   z$default_partition.<locals>.<genexpr>  s     I4t{{h&6&66Is   ')c              3   2   K   | ]  }t        |        y wr7   r   rX   s     r:   r[   z$default_partition.<locals>.<genexpr>  s      2#$A2   r  r   rT   )r   #min_cut_rematerialization_partitionrm   r  r   r   r   r   r   r   r   r   r   r   r|   r   rS  r  r  rn   fromkeysrT  r  )r   _joint_inputsr   r  rT   r  r  rO   r   r   forward_only_graphforward_node_namesr   r  r5   rS  rZ   backward_usagess                     r:   default_partitionr    s   > L)2+*G	
 	
 
L,>,>,D,DEFM!&)<l>P>P>V>V"WX33F7o K <FK $ $066$  LO""(( *99..t ""4($))+?0JJJEI5IIII&  ::7I)IO  		)c 2(72 /  &&7##D);*< l388:;L4==9>>@AO#''$? )s   $H+8H+g    .Ar  c                      | |j                   z  S r7   )itemsize)r  r*  s     r:   _tensor_nbytesr    s    5>>!!r<   c                 V   dt         fdd| j                  v r| j                  d   }t        |t              ryt        |t        t
        f      rt        fd|D              S t        |t              r"t        fd|j                         D              S t        |t        j                        r |      S t        dt        |       d|        | j                  d	k(  s:| j                  t        j                  j                   j"                  j$                  u ry
t        d|  d      )NrU   c                     t        | t        j                        syt        t	        | j                         d      | j                        S )Nr      fallback)r   r   r  r  r   r  r*  r   s    r:   object_nbytesz_size_of.<locals>.object_nbytes  s1    !U\\*hqwwy4@!''JJr<   r   r    c              3   .   K   | ]  } |        y wr7   rL   )rY   rZ   r  s     r:   r[   z_size_of.<locals>.<genexpr>  s     5A}Q'5   c              3   4   K   | ]  \  }} |        y wr7   rL   )rY   _rZ   r  s      r:   r[   z_size_of.<locals>.<genexpr>	  s     @DAq}Q'@   zUnknown metadata type z	 on node r   r   rk   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)ro   r|   r   r   rm   r   sumrn   itemsr   r  r   typer   r   r   r   _assert_scalarr   )r5   r   r  s     @r:   _size_ofr    s    KC K
 		iic<( dE]+5555T"@CIIK@@@U\\* %%3DI;ivNOOww*uyy~~/L/L/T/T T

vde r<   c           	      2   ddl m}  |t              }| j                  D ]3  }|j                  dk(  s||j
                  j                  xx   dz  cc<   5 t        j                  dt        |j                         t        j                  d      d             y )Nr   r   r   r    z%sTr_   reverse)collectionsr   ro   r   r   r   rE   r-   r7  r`   r  r  
itemgetter)r   r   cntr5   s       r:   
_count_opsr    sr    '%c*C +77o%$$%*%+ HHT6#))+8+>+>q+A4PQr<   c                     g } t        t        j                  j                        D ]  }t	        t        j                  j                  |      }t        |t        j                  j                        sL|j                         D ]G  }t	        ||      }t        j                  j                  |j                  v s6| j                  |          | S r7   )dirr   r   r   r)  r   _opsOpOverloadPacket	overloadsr   	pointwiser   r   )r   	attr_nameopoverloadpacketoverloadop_overloads        r:   pointwise_opsr     s    
C( 
	"599>>9=*EJJ,G,GH(224 	H!"2H=Kyy""k&6&66

+,	
 Jr<   	depth_mapc                     | D ci c]7  }t        |t        j                  j                  j                        s2|||   9 }}t        |j                         t        j                  d      d      S c c}w )Nr    Tr  )	r   r   rJ   r5   rK   r`   r  r  r  )r   r  arg
arg_depthss       r:   sort_depthsr  2  sf    '+ #z#uxx}}?Q?Q/RYs^J  *""$(*=*=a*@$OOs
   3A2A2gmc                   
 t        j                         i 
| j                  j                  d      D ]  }j	                  |
fd      
|<    t        | j                  j                        D ci c]  \  }}||
 c}}
fd}t        t        t        | j                  j                              }d}t        j                  }|D ]#  }|j                  D ]  }|   |k  s|   }|} % || S t        | j                  j                        d|    D ]V  }|j                  dk(  s|j                  t        j                   j"                  j$                  j&                  k(  sO ||       X t        | j                  j                        |   d D ]
  } ||        t        j                   j)                  |       }	|	S c c}}w )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traveral, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r   c                     |    S r7   rL   r   s    r:   r]   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>U  s    A r<   c                 *   | g}t               }t        |      dkD  rH|j                         } | |v s| v r'|j                  |        || j                  z  }t        |      dkD  rHt        |fd      }|D ]  } j                  | fd      | <    y )Nr   c                     |    S r7   rL   )rZ   orders    r:   r]   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>f  s    %( r<   r^   c                     |    S r7   rL   r   s    r:   r]   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>h  r   r<   )r   r   rP  r  all_input_nodesr`   r   )r5   	cur_nodesinsertable_nodesr   r   r  s      r:   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graphY  s    F	0:)nq ==?D''43;  & ---I )nq  ""28JK$ 	DD!++D2BCCI	Dr<   Nr   )rJ   r   r   r   r   	enumerater   rm   r  r   mathinfrS  r   r   r   r   r   copy_r   GraphModule)r  r5   r   r  r  first_node_in_bwdminimum_ordertangentrZ  new_gmr   r   r  s             @@@r:   #reordering_to_mimic_autograd_enginer	  9  s   . 
I"$C ##}#5 @''.>?D	@ )2"((..(AB93T3YBED$ &bhhnn=>NHHM! )MM 	)DT{]* %d$(!	))  	 RXX^^$%?u->'?@ '77o%$++9M9M9U9U*U &' RXX^^$U+<%=%?@ #T"# XX!!"i0FMY Cs   5G
	fw_module	bw_modulefw_nodebw_nodedevice	rng_countlast_fwd_inputlast_bwd_inputc                    |j                   }|J | j                  }	|j                  }
t        j                  j                  j
                  }| j                  j                  |      5  | j                  j                  d|       }t        |      |j                  d<   |}ddd       |j                  j                  |      5  |j                  j                  d|       }t        |      |j                  d<   |}ddd       t        |j                        }|d<   | j                  j                  |      5  |	j                  d||j                  g|j                  |      }ddd       |j                         |	j!                  |       t        |j                        }|d<   |
j#                  |      5  |
j                  d||j                  g|j                  |      }|j                  |       |
j!                  |       ddd       ||fS # 1 sw Y   zxY w# 1 sw Y   +xY w# 1 sw Y   xY w# 1 sw Y   ||fS xY w)a%  
    Note [CUDA Graph Safe RNG Functionalization]

    CUDA Graph capture doesn't work with get_rng_state and set_rng_state because these functions operate on CPU values,
    while CUDA Graph RNG capture uses on-device CUDA tensors. To solve this, we use graphsafe_set_state with a
    CUDA Generator registered to the CUDA Graph before capture begins. graphsafe_set_state updates the generator's pointer
    to reference a different GeneratorImpl, ensuring subsequent calls are correctly forwarded to the desired generator
    (and its cuda-tensor RNG state during graph capture).

    For each RNG operation's forward/backward pair:

    - We create two generators initialized with identical values
    - Each forward and backward call advances its respective generator equally
    - This keeps generators synchronized so forward and backward operations use matching RNG values

    When forward is called multiple times before backward (causing desynchronization):

    - We save the forward RNG state
    - We update the backward Generator's state before executing backward

    Before each CUDA Graph replay, replay_prologue updates captured RNG pointers with current states, ensuring backward Generator
    changes are reflected during replay.

    This function modifies both forward and backward computation graphs by:

    Creating RNG state placeholders for both passes
    Updating the forward node to use graph-safe RNG state
    Updating the backward node to use graph-safe RNG state

    For more details: https://github.com/pytorch/pytorch/issues/113541
    Nfwd_rng_state_r   bwd_rng_state_	rng_stater   r   r   )indexr   r   _prims	rng_primsgraphsafe_run_with_rng_stater   r   r'   r|   rn   r   create_noder   r   r  r  inserting_before)r
  r  r  r  r  r  r  r  
device_idxfw_graphbw_graphr  fwd_rng_statebwd_rng_state	fw_kwargsfunctional_fw_node
bwd_kwargs
rng_outputs                     r:   %apply_graphsafe_rng_functionalizationr&    s2   R J!!!HH#(<<#9#9#V#V  
	(	(	8 '!33nYK4PQ$?
$K5!&' 
	(	(	8 '!33nYK4PQ$?
$K5!&	' W^^$I*Ik		(	(	1 
%11(..07<<0	 2 

 !!"45  gnn%J+J{		"	"7	+ %))(..07<<0	 * 

 	%%j1G$% >))M' '' '
 
% >))s1   (9H9H&:.H3=AH?H#&H03H<?Inum_sym_nodesc                   ' t        j                         }d }dt        t        j                     fd'dt        t        j                     fd} ||       } ||      } ||      }	i }
| j
                  j                  D ]  }t        |      st        |j                  d      s&t        j                  j                  |j                  j                  v sW||j                     }||j                     }|	|j                     }||d|
|<    t        j                  j                  j                   }t        j                  j                  j"                  }d }|j
                  j%                  d	      D ]  }d
|j                  v s|} n |t'        d      g }t)        t+        |j
                  j%                  d	                  }t)        t+        |j
                  j%                  d	                  }t-        'fd|
j/                         D              }|j1                  t        j                  d             t3        |      dkD  }t        j4                  j6                  }t6        j8                  xr* | xr% |j:                   xs |j<                  j>                  }tA        |
jC                               D ]  \  }\  }}|d   }|d   } '|      }|j
                  }|j
                  }|r'|%|jD                  dk(  rtG        ||||||||      \  }}]|jI                  |      5  |jK                  d||j                  g|jL                  |jN                        }|jK                  dtP        jR                  |dfi       }|jK                  dtP        jR                  |dfi       } |jU                  |        |jW                  |       |jY                  |       d d d        |jI                  |      5  dt)        |       }!|j[                  |!      }" ||      |"j\                  d<   d d d        |jI                  |      5  |jK                  d|"|j                  g|jL                  |jN                        } |jU                  |        |jW                  |       d d d         |rt)        t_        |j
                  j%                  d	                  }#|#jL                  d   }$t3        |$      |z
  }%|$d |% ta        |      z   |$|%d  z   }&|j
                  jc                  |&       |j
                  jW                  |#       |je                          |je                          ||fS # 1 sw Y   xY w# 1 sw Y   LxY w# 1 sw Y   xY w)Nc                    i }| j                   j                  D ]i  }|j                  dk(  st        |j                  d      s*t
        j                  j                  |j                  j                  v s[|||j                  <   k |S )Nr   r   )
r   r   r   r   r   r   r   r   r   r   )gmodrandom_nodesr5   s      r:   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops  sl    JJ$$ 	/D?*DKK0II559I9II*.TYY'	/ r<   rU   c                     d| j                   vry| j                   d   }t        |t              s|f}|D ]D  }t        |t        j                        s|j
                  j                  dk(  s8|j
                  c S  t        j
                  d      S )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)r|   r   r   r   r  r  r  )r5   
candidates	candidates      r:   
get_devicez)functionalize_rng_ops.<locals>.get_device	  s     		!YYu%
*e,$J# 	,I)U\\2##((F2$+++	,
 ||E""r<   r  c                     | -| j                   dk(  rt        j                  j                         S t        j                         S )Nr.  )r  r   r.  get_rng_state)r  s    r:   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_state  s5    &++"7::++--""$$r<   r   )fwdbwdr   r   r  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisc              3   4   K   | ]  } |d            yw)r6  NrL   )rY   	node_pairr2  s     r:   r[   z(functionalize_rng_ops.<locals>.<genexpr>B  s       )2
9U#$r  r/  r    r6  r7  r.  r   r  r   rng_state_output_r   r   )3r  countr	   r   r  r   r   r   r   r   r   r   r   r   r  r  run_and_save_rng_staterun_with_rng_stater   r   rQ  r  r   valuesdiscardr   r&  r   graphsafe_rng_functionalizationfallback_randomtest_configs*graphsafe_rng_func_ignores_fallback_randomr   r  r  r&  r  r  r   r   r  r  r  r  r   r   r|   iterr   r   	recompile)(r   r
  r  r'  uidr,  r5  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_mapr5   	base_noder  r  run_and_save_rngr=  bw_tangent_start_nodefw_rng_state_outputsr  r  devicesmulti_cuda_devices
ind_config'use_rng_graphsafe_rng_functionalizationr  r9  r  r  r  r#  stater%  
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   r2  s(                                          @r:   functionalize_rng_opsrY    s   2 //
C	#HU\\2 #$%Xell%; % &l3"9-"9-!""(( 	S4 V,		11T[[5E5EE+DII6I&tyy1G&tyy1G:A'2R$Y/	S ||--DD//BB **m*< 		!$(! $o
 	
 (9??#=#=#=#OPQN(9??#=#=#=#OPQN 6N6U6U6W G OOELL'( W) ''J.. 	
""	
 *** R&&QQ , .7 &&(. D-)	)Iy E"E"G$???? 4"v%-R	.*NN **73 3%-%9%9#$!..87<<8">>	 &: &" !,,#$$,a0	 -  &11#$$*  2 
 --j9##G,$++E2136 **+@A M0c<
$,$8$8$D!0DV0L!&&u-M
 **73 	-%11#&+W^^KgllK">>	 2 
 --j9##G,	- 	-wD-R d9??#=#=#=#JKL#((+
 _}<**+()*+,-. 	
 	w'"">2iw3 36M M
	- 	-s&   &B5U&45U3AV &U0	3U=	 V
	c                    | j                   j                  D ]t  }t        |j                  t        j
                  j                        s2|j                  j                  dk(  sLt        |      rXt        j                  |j                  d<   v y)z
    By default, the partitioner is not allowed to recompute collectives
    unless they come from a user-annotated AC region.
    See Note [Recomputing collectives in the partitioner]
    r  r{   N)r   r   r   r   r   r  
OpOverload	namespacer   r   	MUST_SAVEr|   )r   r5   s     r:   force_save_collectivesr^    sh     ""(( @t{{EJJ$9$9:%%);;"4(%5%?%?DIIk"@r<   c                    | j                   j                  D ]  }t        |      s|j                  D ]K  }t        |      s|j                  d   |j                  d   kD  s/t
        j                  |j                  d<   M |j                  j                  dd      st        d |j                  D              rt
        j                  |j                  d<    | S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idr{   has_backward_hookFc              3   2   K   | ]  }t        |        y wr7   )r   r  s     r:   r[   z)cleanup_recompute_tags.<locals>.<genexpr>  s      E)-t$Er  )	r   r   r   rS  r|   r   r]  r}   r   )r   r5   rZ  s      r:   cleanup_recompute_tagsrc    s     ""(( D$

 H"4(		-0499]3KK-=-G-GDIIk*H yy}}0%8 E15E B& *:)C)C		+&7D8 r<   	node_infomin_cut_optionsdont_banc                   %&'()*+,-./0 
t               t               /t        rQt        d | j                  D              }|t        d /j                  D              z
  }t
        j                  d|       d &d '&'/fd(	 dd l}(/fd	**/fd
}(fd)dt        f)/fd}	|j                         .t               %%./fd}
| j                  D ]L  }|j                  dk(  r|j                  v rm|j                  vr0.j                  |j                   dz   dt"        j$                         `.j                  |j                   dz   dt"        j$                         t'        |      r0.j                  |j                   dz   dt"        j$                         t)        |      st+        |      r |
|       j-                  |      r ||      r |
|       d|j.                  vxr d|j.                  vxs8 d|j.                  v xr( t1        |j.                  d   t2        j4                         }t7        |      rt        t9        |            }nQ|r<t1        |j.                  j;                  d      t<              rdnt"        j$                  }n |	|j>                        }.j                  |j                   dz   |j                   dz   |       |j@                  D ]>  }.j                  |j                   dz   |j                   dz   t"        j$                         @ O dtB        tD        jF                     dtH        dtH        f(fd}jJ                  r(jL                  D ]  }|j@                  D cg c]$  }j-                  |      rjO                  |      & }}|j@                  D cg c]  }j-                  |      s| }}tQ        |      dkD  sw ||tS        |            }tU        |j@                        D ]x  }j-                  |      sjO                  |      |kD  s* (||      s4|%v r9t
        j                  d|jO                  |      ||jO                  |              |
|       z  jV                  r^t               }| j                  D ]D  }j-                  |      sjO                  |      |fg}jO                  |      }tQ        |      dkD  sJtY        jZ                  |      \  }}||v r,|j]                  |       jO                  |      |dz   kD  rNtQ        |      dk(  r@t
        j                  d||jO                  |      jO                  |              |
|       |j@                  D ]J  }j-                  |      s (||      s|%vs$tY        j^                  |jO                  |      |f       L tQ        |      dkD  rG 	 |ja                  .dd      \  }}|\  }-t               }.fd |D        D ]   \  0}|jo                  -0fd!|D               " t               }|D ](  \  } }!| d d" |!d d# k(  sJ | d d" }"|j]                  |"       * tq        |       +ts        | j                        D #ci c]  \  }#}||#
 c}}#,tu        +fd$|D        ,fd%&      }$|$%fS # t        $ r}t        d      |d }~ww xY wc c}w c c}w # tb        $ ri t
        j                  d       t
        j                  dje                  |jf                  jh                  jk                  .                   tm        .        w xY wc c}}#w )'Nc              3      K   | ]H  }|j                   d k(  r7t        |j                  d      r!t        |j                  j                         J yw)r   _overloadpacketN)r   r   r   r   ri  r   s     r:   r[   z solve_min_cut.<locals>.<genexpr>  sA      &
ww/)gdkkCT.U ++,&
s   AAc              3   2   K   | ]  }t        |        y wr7   )r   rY   r   s     r:   r[   z solve_min_cut.<locals>.<genexpr>  s      4
CF4
r  z&Ops banned from re-materialization: %sc                 D   |j                   t        j                  j                  j                  k7  ry|j
                  d   }t        j                  j                  j                  |      \  }}|D ].  }|j                  |   }| |u r yt        |t              s)| |v s. y yNFr   T)r   r   r   higher_orderauto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   rm   )ab
mutable_opmutable_arg_namesr  r   r  s          r:   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalized  s    88uyy--AAAVVAY
 ##66GG
S	
% 	 D((4.CCx#t$8	  r<   c                     |j                   t        j                  j                  j                  k7  ry|j
                  d   }|D ]  }|j
                  d   |   }| |u s y y)NFtensors_to_cloner   T)r   r   r   rn   triton_kernel_wrapper_functionalr   )rs  rt  rv  r   r  s        r:   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functional	  sb    88uyy--NNNHH%78% 	D((8$T*CCx	 r<   c                 b   t        |      t        j                  k(  ry | |      ry | |      ry| j                  t        j
                  u r>| j                  d   j                  t        j                  j                  j                  u ryj                  |       xr j                  |      S )NTr   F)r*   r   catr   r  r  r   r   r   rn  rz  r;   )rs  rt  rw  r{  op_typess     r:   r;   z!solve_min_cut.<locals>.is_fusible  s     1),Q29!Q?HH(((q	  yy%%FFG
 ""1%@(*=*=a*@@r<   r   zANeed networkx installed to perform smart recomputation heuristicsc                 <   j                  |       ryt        | g      }t        |      dkD  ro|j                         }|j                  D ]A  }j                  |      s ||      s yj                  |      s1|j                  |       C t        |      dkD  royrm  )rB   r   r   rP  rS  re   r  )r5   r  currZ  r;   rd  r~  s       r:   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwards-  s    D!v&	)nq --/C		 ( //5jd>S##D)MM$'	( )nq  r<   c                 T   | j                   dk7  ry| j                  t        j                  k(  ry| j                  j                  dd       t        j                  k(  ryt        j                  rj                  |       ry| j                  t        j                  j                  t        j                  j                  fv ryj                  rj!                  |       s$yj#                  |       sj%                  |       ryj&                  r3 |       r+t(        j+                  d| t-        | j.                               y| j0                  dk  r| j0                  t        j2                  kD  ryj4                  r/t7        d | j8                  D              }t;        |       }|dz  |k  S y)	Nr   Fr{   Tzmaterialized backwards: %s %si  c              3   h   K   | ]*  }t        |t        j                        st        |       , y wr7   )r   rJ   rK   r  rk  s     r:   r[   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>f  s&      % !*Q2H%s   22r   )r   r   r  r  r|   r}   r   r]  r   recompute_viewsrB   r   lift_fresh_copyr   
lift_freshrx   rD   r@   r>   rw   r-   r  r   rS  dist_from_bwmax_dist_from_bwry   r  r   r  )r5   input_tensors_sizeoutput_sizer  re  r~  s      r:   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputation;  sc   77o%;;(***99==d+/?/I/II!!h&6&6t&<;;4//779P9PQQ22++D1!!$'8+H+H+N 77<U=
 II5tU4::=NO t#(9(9F<S<S(S ++!$ %%)YY% " #4.K?%777r<   c                 f      j                   dk(  ryt         fd j                  D               S )Nr   Tc              3   0   K   | ]  } |        y wr7   rL   )rY   rZ  r;   r5   s     r:   r[   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>q  s     E$z$-Es   )r   r  rS  )r5   r;   s   `r:   is_materializedz&solve_min_cut.<locals>.is_materializedm  s*    77m#E$**EEEEr<   rU   c           
         t         j                  r| |v ryt        |       }t         j                  r!j	                  |       rt
        j                  S t        | j                  d   t              r-t        | j                  d   t        j                        st        S t        |dt        t        | j                   d      d      z  z        } |       r|S |dz  S )Nr   r   g?r.  r       )r    treat_parameters_as_free_to_saver  r  rB   r  r  r   r|   r   r   r   INT_INFro   r   r   r  )r5   rT   mem_szr  r~  s      r:   get_node_weightz&solve_min_cut.<locals>.get_node_weights  s    3333$!!h&6&6t&< 88Odii&5dii.= Vsc#d.?.?*Eq&IIJK4 MA:r<   c                    j                  |       ry| v r\t        | j                  t        j                  j
                        xr | j                  j                  dk(  }t        j                  s|syt        |       ryd| j                  v r(t        | j                  d   t        j                        ryj                  |        j                  d| j                  dz   t        j                          y)NFr  r   source_incapacityT)rB   r   r   r   r  r[  r\  r   (unsafe_allow_optimization_of_collectivesr   r|   r   r  add_edger   r  r  )r5   is_collectivebanned_nodesrf  nx_graphr~  s     r:   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowed  s    D!8 4;;

(=(=> @KK))-??  >>m $DII*TYYu-=u~~"N
 	(DII$5Ir<   r   r  sinkr  _outr   r           start_nodes	max_rangec                    g }| D ]*  }t        j                  |
j                  |      |df       , t        |      dkD  rt        j                  |      \  }}}|s
j                  |      S |j
                  D ]_  }
j                  |      s
j                  |      |kD  r*
j                  |      | 	||      f}||vsJt        j                  ||       a t        |      dkD  r|S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushrl   r   heappoprS  re   )r  r  sorted_nodesrZ   r  r5   node_is_fusiblerZ  r   r;   rd  s            r:   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusible  s    
 9; 	OANN<)*@*@*CQ)MN	O ,!#',}}\'B$At_" --d33

 
:++D1 --d3i? !..t4"4.6C
 ,.|S9
:	 ,!# r<   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)r.  ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c              3   ,   K   | ]  }||   f  y wr7   rL   )rY   rZ   r  s     r:   r[   z solve_min_cut.<locals>.<genexpr>g  s     8Q$8s   c              3   0   K   | ]  }|v s|f  y wr7   rL   )rY   vnon_reachableus     r:   r[   z solve_min_cut.<locals>.<genexpr>h  s     Aa=.@q!fAs   		c              3   (   K   | ]	  }|     y wr7   rL   rY   r5   name_to_nodes     r:   r[   z solve_min_cut.<locals>.<genexpr>t  s     2d	2s   c                     |    S r7   rL   )r   node_idxs    r:   r]   zsolve_min_cut.<locals>.<lambda>t  s    (1+ r<   r^   );r   get_default_op_listr,   r   r4   r-   r7  networkxImportErrorr   floatDiGraphr   rQ   rO   r  r   r  r  r   r   r   re   r|   r   r   r  r   r   r}   r   rT   rS  rm   rJ   rK   ro   ru   rb   rl   r   r   r   rv   r  r  r  r  minimum_cut	Exceptionjoin	readwriteedgelistgenerate_edgelistvisualize_min_cut_graphr  get_name_to_noder   r`   )1r   rd  re  rf  joint_module_opsops_ignorednxer  r  r  r5   is_non_tensor_nodeweightrZ  r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr  r  	cut_value	partition	reachablecutsetnbrs	cut_nodesnode_innode_out	node_namer   r   r  rw  r{  r;   r  r  r  r  r  r  r~  r  s1    ```                                 @@@@@@@@@@@@r:   solve_min_cutr    s    <"$H% &
#))&
 

 ' 4
$554
 *
 
 	9;G"A&0dFe < zz|H(2L6 !! 3X77h9...9+++!!$))e"3Vdhh!O dii&0&488L$
 dii%/$((Kd248(.
 ##D).Ft.L(. "E}DII'EUtyy SDIIe4Dell)S%S 	 t=./F!$))--"6FDHH  %T9+P+PQF$))e+TYY-?&QJJ 	XDdii&0$))e2CdhhW	Xe3XL$rww- C C 4 ,,"44 	;I &OO++D1 &&t,F  "+I4L4LT4RH  6{Q&:8S[&Q#!)//2 ;D!006%2248;NN&y$7</$O%%229=/ %2248 5T:!;	;P 11'1|%++ !	VJ++J7''
3Z@2G $00<Kg,"w/3'>C  **3/+2CCG)HH."!..s3!..z: 15II VD!006&sD1 4w1G1G1Mt0TUV) g,"!	VF!~~h&I	9  )I}*4,F8i8 B4AdAAB ",I# !s|x},,,CRL	i !
 $K0L+4[5F5F+GHic4c	HH2	28ML %%]
  O
	R
R  @A2<<00BB8LMN)	& Is=   [ )[0;[5[58[: %]/	[-[(([-:A2],c                    dd l }dd l}|j                  j                  |       j	                         }|j                  |      d   }|j                         D ]c  }| |j                            |j                            d   }|j                  t        |             |t        d      k(  sS|j                  d       e t        j                  d       |j                  d       y )Nr   r  r  redz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotnx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_colorr-   r7  	write_svg)r  r  r  
dot_format	dot_graphedger  s          r:   r  r  y  s    %%h/99;J))*5a8I##% "$//+,T-A-A-CDZPs6{#U5\!NN5!" HHAB,-r<   c                  x   g t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                   t         j"                  t         j$                  t         j&                  t         j(                  t         j*                  t         j,                  t         j.                  t         j0                  t         j2                  t         j4                  t         j6                  t         j8                  t         j:                  t         j<                  t         j>                  t         j@                  t         jB                  t         jD                  t         jF                  t         jH                  t         jJ                  t         jL                  t         jN                  t         jP                  t         jR                  t         jT                  t         jV                  t         jX                  t         jZ                  t         j\                  t         j^                  t         j`                  t         jb                  t         jd                  t         jf                  t         jh                  t         jj                  t         jl                  t         jn                  t         jp                  t         jr                  t         jt                  t         jv                  t         jx                  t         jz                  t         j|                  t         j~                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  } t         j                  t         j                  t         j                  g}|t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  t         j                  g	z  }|}| g t        j                  t        j                  t         j                  t         j                  t         j                  t        j                  t        j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t        j                  z  } | t         j                  t         j                  gz  } | |z  } | t               z  } | t         j                  gz  } | t        D cg c]  }t        |       c}z  } t        |       }t        t        dt        f      t         j                  t         j                  t         j                  g      }t         j                  t         j                  t         j                  t         j                   t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  g}||z  }t        |t        |      |t        |      |      S c c}w )N.)r   r  subrR  atan2r  r   r   pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltr   bitwise_notceilfloorfracnegreluroundsilutruncr-   log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrtr  sigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr  mean_grad_sum_to_sizesum_to_sizer   totype_asr  r  squeeze	unsqueezersub_to_copyaliasviewslicetr  broadcast_in_dimexpand
as_stridedpermuteselectr(  r  clone	full_likevarstd_unsafe_viewreshapebroadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota'_low_memory_max_pool_offsets_to_indicesr  gatherr  
zeros_liker   r   r   r   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr/   )default_recomputable_opsrecomputable_view_opsr3   mr4   r2   r1   r0   s           r:   r  r    s   L0L0L0 	L0 	

	L0
 	L0 	L0 	L0 	L0 	L0 			L0 	L0 	L0 	L0 	L0 	L0  	!L0" 	#L0$ 	%L0& 	'L0( 	)L0* 	+L0, 	-L0. 	/L00 			1L02 	

3L04 			5L06 	7L08 			9L0: 	

;L0< 			=L0> 	

?L0@ 	AL0B 	

CL0D 	

EL0F 			GL0H 	IL0J 	KL0L 	

ML0N 	OL0P 			QL0R 	SL0T 			UL0V 			WL0X 	YL0Z 			[L0\ 			]L0^ 	_L0` 			aL0b 			cL0d 	

eL0f 			gL0h 	

iL0j 	kL0l 	mL0n 	oL0p 	qL0r 	sL0t 	

uL0v 	

wL0x 			yL0z 	{L0| 			}L0~ 	L0@ 	AL0B 			CL0D 	EL0F 	GL0H 			IL0J 	KL0L 	ML0N 	OL0P 	QL0R 	SL0T 			UL0V 	WL0Z "\\4>>4::F		




 
 %H $!		$!""$! 	

$! 		$!
 	$! 			$! 			$! 	$! 	$! 	$! 	$! 	$! 			$! 	$! 	

$!  	!$!" 	#$!$ 	%$!& 			'$!( 	)$!* 	+$!, 	-$!. 			/$!0 	1$!2 	

3$!4 	5$!6 			7$!8 	9$!: 	

;$!< 	

=$!> 	?$!@ 	A$!B 	C$!D 	

E$!F 	55G$! $L T[[ 99(/!   N1!3A!6 NN!":;HS#X./			dnndoo>J 	!!

0044%%))   #Z/K()8 + !Os   7d7c                 J    i }| j                   D ]  }|||j                  <    |S r7   )r   r   )r   r  r5   s      r:   r  r  2  s.    L '"&TYY'r<   memoryruntimes
max_memoryall_recomputable_banned_nodesc                    t         j                  }|dk(  rt        |||      S |dk(  rt        |||      S |dk(  rt	        |||      S |dk(  rZt
        j                  d       t        j                  | |||      }t	        ||t        |      j                  t        |            S t        |      r ||| |||      \  }}	d	||	fS t        d
|       )Ngreedyilpdpdynamic_memory_budget_dpzdynamic_memory_budget_dp is an experimental solver. It does not guarantee performance improvements. Additionally, it is not guaranteed to be stable.)r   re   recorded_knapsack_input_memories recorded_knapsack_input_runtimes)graph_info_provider)knapsack_algomax_mem_budgetr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr#   r$   r"   r-   warningr!   inialize_from_graphr%   get_knee_point_memory_budgetcallabler   )
r   rb  rc  rd  rd  re  SOLVERrm  saved_node_idxrecomp_node_idxs
             r:   #_optimize_runtime_with_given_memoryrx  9  s    33Fvx<<	5FHj99	468Z88	-	-?	

 0CC#*G-3-5	
 $7**)) + 	
 		
 
&	*0KY8U+
' ^_55I&RSSr<   no_dispatchr   r  c                     t        | j                        }fd}|D cg c]
  } ||       }}| j                         D cg c]
  } ||       }}| j                  ||      S c c}w c c}w )Nc                     t        |       S )Nr  )r   )dr  s    r:   realize_symbolz8_remove_symbols_without_guarding.<locals>.realize_symboln  s    H--r<   )stride)rm   shaper  new_empty_strided)r   r  r  r~  r  r  s    `    r:    _remove_symbols_without_guardingr  k  sk    ME. )..1^A.E.)*4AnQ4F4uV44 /4s   A'A,c                 F   	 t         j                  }d }|dk(  ry|dk(  rat               5  ddlm} t        j                  | j                   j                  f      \  	|j                  	 fd      }|cd d d        S |dk(  rudd	l
m} t        j                  | j                   j                  f      \  	 |d
      5 }  j                  i 	 d d d        j                         }t        |d      S t        d|       # 1 sw Y   y xY w# 1 sw Y   ?xY w)Nc                 z   t        | t        j                        rAt        | j                  d   t        j
                        rt        | j                  d   d      S t        | t        j                        rAt        | j                  d   t        j                        rt        | j                  d   d      S t        | t        j                        r(t        | j                  d   t        j                        ryt        | t        j                        r(t        | j                  d   t        j                        ry| S )Nr   r  r        ?T)r   rJ   rK   r|   r   r  r  r   r   r   r   r  s    r:   materialize_argz)estimate_runtime.<locals>.materialize_argy  s    a!j&M3AFF5MDQQ277#
166%=%,,(OAFF5MD99277#
166%=%..(Q277#
166%=%--(PHr<   testingr    profiler   )benchmarkerc                  (     j                    i S r7   )r   )r   r   r5   s   r:   r]   z"estimate_runtime.<locals>.<lambda>  s    ;4;;3O3O r<   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   *activation_memory_budget_runtime_estimatorrz  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  r   get_total_flopsr   r   )
r5   RUNTIME_MODEr  r  msr  modecounted_flopsr   r   s
   `       @@r:   estimate_runtimer  v  s   DDL
 y 		"] 	H!???TYY<TULD&**+OPB	 	 
	 <DKK8PQfU+ 	)tDKK((	),,.=!$$=l^LMM#	 		) 	)s   ADDDD c                 P    !"#$%&'()* |dkD  s|dk  rt        d|       t        t        j                  t        j                  t        j
                  t        j                  t        j                        }t        j                  rt        |dddd      }|dk(  rj                  S t         |      \  }}|dk(  r|S dt        t        j                     dt        fd	! !j                        ( !|      &&(k  r|S &(fd
}dt        t        j                     f!&(fd"t        |ddd      }t         |      \  }} "|      |k  r|S t        |d      t               \  }	}
 "|	      |k  r|	S ddlm# t%        #fdj                  D              %dt$        t        j                     dt        t        j                     f#%fd} ||
      }|D cg c]1  }|j&                  j)                  dd      t*        j,                  k(  r|3 }}|D cg c]	  }||vs| }}t/        |t0        d       t3               dk(  rj                  |z   S  D cg c]  } |t1        |             c}' D cg c]  }t5        |       c}*ddlm)  ')*fd$t        j:                  r"$ *fd} |d       |d      g}|d   dd  |d   dd  k7  r|d   |d   fg}|r|j=                         \  }}|d   |d   z
  dk  r#|j?                  |       |j?                  |       F ||d   |d   z   dz        }|dd  |dd  k7  r|j?                  ||f       |dd  |dd  k7  r|j?                  ||f       |r|jA                          dd l!m"} |D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|jG                  d       |jI                  ||d        tK        |      D ]"  \  }}|jM                  |d!|||   fd"d#d$%       $ |jO                  d&       |jQ                  d'       |jS                  d(       |jU                  d       |jW                         }|jY                          t[        j\                         }t        j^                  't        j^                  }t[        j`                  |d)       d*}tb        jd                  jg                         r?tb        jd                  ji                         r!d+tb        jd                  jk                          }tZ        jl                  jo                  |d,| d-tq                d.      }|js                  |       tt        jw                  d/|        $| 0      d   S c c}w c c}w c c}w c c}w c c}w c c}w )1Nr    r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )ru   rv   rw   rx   ry   F)ru   rv   rw   rx   r   rU   c                 :    t        t        t        |             dz  S N    eA)r  mapr  )r   s    r:   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size  s    3x./#55r<   c                     | dz  z
  z  S r  rL   )szmax_act_sizemin_act_sizes    r:   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size  s    S\L899r<   activationsc                 &     |       z
  z
  z  S r7   rL   )r  r  r  r  s    r:   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio  s"    )+6E<'
 	
r<   )ru   rv   rw   )rx   )get_node_storagec              3   .   K   | ]  } |        y wr7   rL   )rY   r5   r  s     r:   r[   z*choose_saved_values_set.<locals>.<genexpr>  s     T4 0 6Tr  r  c                 r    | D cg c]&  }|j                   t        d      k  r |      vr|( c}S c c}w r  )r  ro   )r  r   r  input_storagess     r:   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodes  sD    
 "
 S)$Q'~= 
 	
 
s   +4r{   Tr  ry  c           
      d           5  t        |t        | d      |      \  }}}d d d        t               }D ]  }	 |j                  |           |j                        sJ t        ||
|      \  }}	t        rt        |||       |fS # 1 sw Y   pxY w# t        $ r Y rw xY w)Nr   )r   re  saved_node_idxsrecomputable_node_idxsexpected_runtimememories_banned_nodesruntimes_banned_nodesmin_cut_saved_values)	rx  r   r   r  BaseExceptionissubsetr  r,   r   )memory_budgetrd  r   r  r  r  rf  r   r   r  aggressive_optionsre  r  rz  r  s             r:   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsack	  s    ] 	
 4%%M1%-	 &		 )3) 	C:3?@	   !>???'	
a !4'.K /'=!1&;&;%1	 ---Q	 	$ ! s   B B#B #	B/.B/c                 N     |       \  }}| t              |z
   |      fS )N)rd  r   )r  )rt  r   r  r  r  r   rd  r  s      r:   estimate_for_budgetz4choose_saved_values_set.<locals>.estimate_for_budgetC	  s@    -FYK.*L* )*-==l+ r<   r  r  gMbP?r  )
      )figsizeo)markerz.4fzoffset points)r   r  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtime)exist_okrI  _rank_memory_budget_paretor  z.svgz%Generated Pareto frontier curve at %s)r  rd  r   )<r   rt   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rO   r  rm   rJ   rK   r  torch._inductor.fx_utilsr  r   r|   r}   r   r]  r`   r  r   r  torch.utils._mode_utilsrz  visualize_memory_budget_paretorP  r   sortmatplotlib.pyplotpyplotfigureplotr   annotatexlabelylabeltitlegridgcfshowosgetcwdmemory_budget_pareto_dirmakedirsr   r  r  is_initializedget_rankpathr  r&   savefigr-   rq  )+r   rd  r  re  runtime_optimized_saved_valuesr  r  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r  recomputable_banned_nodesr   must_save_nodesr5   r  optionsbisectslhsrhsmidpltitemx_valuesy_valuestxtfigfig_dirrank_suffixfig_namer  re  r  r  r  r  r  r  r  r  rz  r  s+   ``                             @@@@@@@@@@@@r:   choose_saved_values_setr    s   
 qMA-XYfXgh
 	
 $$AA#)#K#K%+%O%O & E E88O &&!"'',).$)
 (5)%"A --6RWW 6% 6 -Y-=-=>L,-KLL|#--:
4= 

 &##(%*	 '4Y 7'# ! 12]B++  % ;HY 2;7)< :;mK449T9CSCSTTN
 )
	bgg
 !>l K +66::k5)-=-G-GG 	
O  -!0H! ! %+!x%! ()Q./112O-.HQK( ,I#' 4). ).V ,,	 	 's+-@-EF1:ab>WQZ^+
GAJ/0G";;=Sq6CF?T)NN3'NN3')3q6CF?a*?@qr7c!"g%NNC:.qr7c!"g%NNC:.  	'(/0DG00(/0DG00 	

7
#8C0  ) 	FAsLLs)hqk"*  	 	

?#

56		NOggi
))+**655GKK$/))+0A0A0P0P0R"5#4#4#=#=#?"@AK77<<+K=:L:N9OtT
 	H;XF %#yk	 	]
!^ 10s*   '6V
#	V-V&VV"V4V#c                 J  	 ddl m	 d }	fd}t        j                  j	                         rt        j                  j                         rt        j                  j                         dkD  r ||       r ||       rt               5   	       5  |D cg c]  }|j                   c}g}t        j                  j                  |d       |d   }t        |       }|D cg c]  }||   	 }}d d d        d d d        |S |S c c}w c c}w # 1 sw Y   xY w# 1 sw Y   |S xY w)Nr   )unset_fake_temporarilyc                     | j                   D ]K  }t        |j                  t        j                  j
                        s2|j                  j                  dv sK y y)N>   c10d_functionalr  TF)r   r   r   r   r  r[  r\  )r   r5   s     r:   has_collectivesz2_broadcast_rank0_decision.<locals>.has_collectives	  sM    %% 	DUZZ22++''+RR		
 r<   c                     dj                  d | j                  D              }t        j                  |j	                  d            j                         }t        t        j                  j                               D cg c]  }d  c}t               5          5  t        j                  j                  |       d d d        d d d        t        fdD              S c c}w # 1 sw Y   *xY w# 1 sw Y   .xY w)N/c              3   4   K   | ]  }|j                     y wr7   rK  )rY   r   s     r:   r[   zD_broadcast_rank0_decision.<locals>.has_same_nodes.<locals>.<genexpr>	  s     >qAFF>r   zutf-8c              3   .   K   | ]  }d    |k(    ywr  rL   )rY   r   
all_inputss     r:   r[   zD_broadcast_rank0_decision.<locals>.has_same_nodes.<locals>.<genexpr>	  s     :!:a=A%:r  )r  r   hashlibsha256encode	hexdigestr   r   r  get_world_sizerz  all_gather_objectr  )r   node_strrO   r  r  r  s       @r:   has_same_nodesz1_broadcast_rank0_decision.<locals>.has_same_nodes	  s    
 88>K,=,=>> 89CCE$)%*;*;*J*J*L$MNqdN
] 	D24 	D//
FC	D 	D :z:::	 O	D 	D 	D 	Ds*    	C#C4!C(?C4(C1	-C44C=r    )src)torch._subclasses.fake_tensorr  r   r  r  r  r  rz  r   broadcast_object_listr  )
r   r   r  r	  r   objectsr  r  rZ   r  s
            @r:   _broadcast_rank0_decisionr  	  s    E; 	&&(,,.,,.2K(;'] 	I24 	I(45156G33G3C!(+K8L5GHLOHLH	I 	I < 6
 I	I 	I 	I sB   DDD'9D D,D.D
DD	DD")r  c          	         | j                   j                          | j                          | j                   }t        j                  rt        |      }|| _         | j                   }t        |       }t        |       }	|rt        |       } t        j                  st        |        fd}
|g } |
| |      }t        |j                        dk(  rt        | |||j                        S t        | j                   j                         D ]  }|j"                  dk(  rt%        d      |_        #|j)                  |      sd|_        <t%        d      |_        |j*                  D ]*  }t-        |j&                  |j&                  dz         |_        ,  t        j.                  }|j                   D ]=  }t1        |j2                  j5                  dd      t6              s.|j2                  d   } n t9        |||	      }t        j:                  rt;        ||      }t=        t?        t@        |            }t=        t?        d
 |            }tC        | |||j                        \  }}|r|	rtE        | ||t        |            \  }}tG        |      }tI        |      }tI        |      }tJ        rtM        |D cg c]  }tO        |      tQ        |      f c}      }tS        d |D              dz  }tT        jW                  d|       tT        jW                  d|       tY        d |j                   j                   D              }tY        d |j                   j                   D              }||z  }t[        t$              }|j                   j                   D ]R  }|j\                  |v st_        |j`                  d      s)|tQ        |j`                  jb                        xx   dz  cc<   T tT        jW                  dt        |      t        |      t        |             tM        |je                         tg        jh                  d      d      }tT        jW                  d|       ||fS c c}w )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    c                    t        | j                        t               | j                  j                  D ]m  }|j                  dk(  r d|j
                  v rj                  |       nt        |      rj                  |       |v sSj                  |j                         o t        t        t        | j                  j                              }t        t        t        | j                  j                              }||z   }t        |       \  }}j                  d |D               t        | j                  ||d      }t        fd|j                  D              t        fd| j                  j                  D              }	t        fdt!        |      D              }
d	}i }| j                  j                  D ]  }|v s|||<   |d
z  } t#        ||	||
      S )Nr   r   r  c              3   F   K   | ]  }||j                   dk7  s|  y w)Nr   r   )rY   r  s     r:   r[   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  s$      !
am8HA!
s   !!!r  c              3   Z   K   | ]"  }|j                   d k7  r|j                      $ ywr  r  r  s     r:   r[   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>	
  s.      <
ww(" #<
s   (+c              3   2   K   | ]  }|vr|vr|  y wr7   rL   )rY   r5   rQ   rb   s     r:   r[   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  s*      :
,,=N1N :
s   c              3   2   K   | ]  \  }}|v s|  y wr7   rL   )rY   r   pr  s      r:   r[   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>
  s"      1
!Qa;X6XA1
s   r   r    )r  r   r   r   r   r   r  r   r  rS  rm   r  r   r   r   r   r   rN   )r   r  r5   r  r  rO   r   r   r  rR   rT   fw_cntrS   r  rQ   rb   r   s    `           @@@r:   classify_nodesz;min_cut_rematerialization_partition.<locals>.classify_nodes	  s   '(:(:;1; &&,, 	5Dww-'J$++,E!%%d+%d+!%%d+((!((4	5 VJ0B0B0H0HIJ!%&(:(:(@(@A"
 !77#;/$
 [ 	   !
"!
 	
 @Y
 2< <
*00<
 2

 0: :
$**00:
 0

 '1 1
#M21
 '
#  &&,, 	D((!'!	 '
 	
r<   Nr   )r   r  rT   r   r  r    r  )r  c                     t        |        S r7   r  )rZ   s    r:   r]   z5min_cut_rematerialization_partition.<locals>.<lambda>M
  s    [^); r<   r  c              3   2   K   | ]  }t        |        y wr7   )r  rk  s     r:   r[   z6min_cut_rematerialization_partition.<locals>.<genexpr>h
  s     'J'Jr  z'Theoretical Activations Stored: %.2f GBz,Theoretical Per Activation Storage Sizes: %sc              3   T   K   | ]   }|j                   d k(  s|j                   " ywr   Nr  r   s     r:   r[   z6min_cut_rematerialization_partition.<locals>.<genexpr>m
  $      %
477o;UDII%
r  c              3   T   K   | ]   }|j                   d k(  s|j                   " ywr  r  r   s     r:   r[   z6min_cut_rematerialization_partition.<locals>.<genexpr>p
  r  r  ri  z# remat/fw/bw: %d/%d/%dTr  zCount of Ops Rematerialized: %s)5r   r   rE  r   cser)   r   r   rc  r  r^  r   rQ   r  rT   r  r   r   ro   r  re   rS  r   activation_memory_budgetr   r|   r}   r  r  r  rm   r  r   r  rY  r	  r+   r,   r`   r  r   r  r-   r7  r   r   r   r   r   ri  r  r  r  )r   r  compilerr   r  r   	cse_graphr   graph_has_recomputable_opsgraph_has_recomputable_rng_opsr  rd  r5   rZ  r  r   r  r
  r  r   sorted_sizestotal_activations_size_gbfw_module_nodesbw_module_nodesremat_nodescountsrematerialized_opss      `                       r:   r  r  	  s   D **,D zz &	&$$K!5l!C%=l%K"!-l;::|,4
l %,(*%|-JKI
 9&&'1, +*G(1(M(M
 	
 ++112 R77h #CD))$/ !D #CD

 R$'(9(94;L;Lq;P$Q!RR 33M!! diimmOT:EB IIo6M +#L
 ''0lK6+|<=O;\JKL 4''$-$I$IIy ")#8iC4H$ Iy 4I>I y)Iy)IlKSV4KL %('J\'J$JS$P!:<UV 	?N$ %
"+//"7"7%
 
 % %
"+//"7"7%
 
 &7!,S!1OO)) 	>DyyK'GDKKAR,Ss4;;6678A=8	> 	%  		
 $LLN 3 3A 6
 	24FGi= Ls   /Qtracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shapec                    |rWt        j                  | j                        }t        j                  | |      } | j                  j
                  D ]	  }i |_         t        j                  j                  |      \  }	}
|
sdt        j                  z   }
t        j                  d|	|
       t        j                  | |||      }|j!                         }t#        |d|
j%                  d      z         }|	 |
 }|	 ||       y  |||       y )Nr%  zWriting FX graph to file: %s%s)r0  r1  write_)r/  )copydeepcopyr   rJ   r  r   r|   r  r  splitextr   torch_compile_graph_formatr-   r7  r   FxGraphDrawerget_main_dot_graphr)  lstrip)r+  r,  r-  r.  r/  r0  r1  r   r5   baseextgr   write_methods                 r:   
draw_graphr?  
  s     MM&,,/		2LL&& 	DDI	  'ID#F555HH-tS9""+'		A 	
A1hC89LfSENE|UU&r<   r7   )g      @r<  )r    )r=  )fx_graphTNFN)r4  rp   r  r  r  loggingr  r  r  os.pathr  r   dataclassesr   r   typingr   r   r	   r
   r   r   torch._inductor.inductor_primstorch.distributedtorch.fxrJ   torch.utils._pytreeutils_pytreer   torch._dynamo.utilsr   r   ;torch._functorch._activation_checkpointing.ac_logging_utilsr   torch._inductorr   r~  torch._loggingr   r  r   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   r   torch.fx.passesr   torch.utils._ordered_setr   torch.utils.checkpointr   rI  -_activation_checkpointing.graph_info_providerr!   "_activation_checkpointing.knapsackr"   r#   r$   ,_activation_checkpointing.knapsack_evaluatorr%   _aot_autograd.logging_utilsr&   _aot_autograd.utilsr'   r(   compile_utilsr)   r*   r+   sympydebug_partitionerr,   rr   rI   	getLoggerrE   r-   Loggerr   r   r  r/   rN   rt   rK   r   r  r   r   ro   r   r   r   r   rm   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r*  r  r  r   r+  r2  r4  r8  rF  r[  r  r  r  r  r  r  r  cacher  rn   r  r	  r  r&  rY  r^  rc  r  r  r  r  rx  r  rz  r  r  r  r  r  r?  rL   r<   r:   <module>ra     s
           	  # * @ @  %   $ $ < 6 + A ? H L  ) / 3  L 
 L ; M H H  %66 t 6'g''1W^^ 1yy~~		 > > >2      >    T r~~ $ 2>> d  C  
  #	DDMD "'']D sm	D
 XXDNRWW  Gbgg G$ Gbgg $ bgg $ XRWW X XCrww C4 CKrww K4 Krww 4 $..$
4=$rww-'($$rww- s d277mU277^;< 	J88>>J
((--J 
J 
	JZB!88>>B!
((--B! B! 	B!
 B! B! XX]]B!J9%,, 95 9"D- G%((-- GD G45 5	5;; 	5 	8G%((.. 8GT 8GvMG%((.. MGT MGh BF	yrww-yy y "**RWW*=!>	y
 
yD BFq"..q"rww-q" "'']q"
 q" "**RWW*=!>q" 2>>2>>)*q"r :>AE\..\
 $,DI#6\ "**RWW*=!>\ 2>>2>>)*\~ c("# " "277 s :Rbhh R  "Pbggsl!3 PU277C<=P8Q PJBNN Jr~~ JZZ*xx##Z*xx##Z* XX]]Z* XX]]	Z*
 LLZ* Z* HHMMZ* HHMMZ*zH ..H ~~H  ~~H  	H 
 2>>2>>)*H V@ @D @# #BNN #T /3	S&S&S& #S& z"''*+	S&l."eW ePBHH +T+TK+T 5k+T 	+T
 +T $(=+T 5$s)T#Y&'+T\ 05 5 5 5$NT o	o	o	 
"'']	o	d))/3EHHMM/B)^ H  :>H ..H  $,DI#6H  2>>2>>)*H \ ,0#%)'HH  '' ' 	'
 5d3i(
)' ' c]' 
'r<   