
    rh&e                        d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZm	Z	 d dl
mZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z"m#Z# ddl$m%Z%  ed      Z& ed      Z'e(ejR                  df   Z*e(e+ejX                     e+ejR                     f   Z-ej\                  j_                  e0d      Z1d dlm2Z2m3Z3 erd dl4m5Z5m6Z6 dejR                  deejR                     fdZ7dejR                  deejR                     fdZ8dejR                  de9ejR                  e:f   deejR                     fdZ; e jx                  d       G d d             Z=edd d!ejR                  d"ejR                  d#ed   dee(e-e-f      f
d$       Z>e	 dBdd d!ejR                  d"ejR                  d#ed%   de(e-e-f   f
d&       Z>	 dBdd d!ejR                  d"ejR                  d#e?dee(e-e-f      f
d'Z> G d( d)      Z@ej                  d*k\  r ej                  eCd+      ZDnd,ee&   d-ee'   de	e(e&e'f      fd.ZDd/e+ejX                     d0e+ejX                     d1e+ejX                     d2e+ejX                     d3e+e+ejR                        d4e+e+ee+ejR                     gejR                  f         de9ejX                  ejR                  f   fd5ZEd6ed   dee=   fd7ZFd8ejR                  de9ejX                  e:f   de:fd9ZGd:eejR                  e:f   de:fd;ZH e jx                  d       G d< d=             ZI e jx                  d       G d> d?             ZJd@ed   deeJ   fdAZKy)C    N)Counterdefaultdict)IterableIterator)CallableLiteralOptionaloverloadTYPE_CHECKINGTypeVarUnion)config)index_vars_no_squeeze)sympy_product
sympy_subs)
OrderedSet)Identity)	try_solve)symbol_is_typeSymT   VTU.loop_tiling)FloorDivModularIndexingFusedSchedulerNodeSchedulerNodeexprreturnc                    | j                         ryt        | t              ryt        | j                        dk(  sJ t        t        | j                              }t        | t              r;t        t        j                  | j                  d   | j                  d         |      }n t        t        j                  | d      |      }|r|d   j                         sy|d   S )zw
    Given an expr with a single free symbol, solve for a constant relation that would make
    this expression 0.
    Nr   r      )is_constant
isinstancer   lenfree_symbolsnextiterr   r   sympyEqargs)r"   free_symbolouts      o/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/tiling_utils.pysolve_for_zeror2   '   s    
 	D(	#t  !Q&&&tD--./K$(1tyy|<kJq);7c!f((*q6M    c           	      
   t        | j                        dk(  ryt        t        | j                              dt        j
                  dt        t        j
                     ffd}| j                  t              s| j                  t              s ||       S g }g }t        j                  j                  |       D ]x  }t        |t        j                        rKd}|j                  D ]5  }t        |      }||j!                         sJ d}|j#                  |       7 |rf y|j#                  |       z |syt%        |      }	 ddt        j
                  d	t        j
                  d
t        t        j
                     dt        j
                  fd}	|j'                  t        |	      j'                  t        |	      }
 ||
      }|rt)        ||i      dk(  sy|j#                  |       t        t+        |            dk(  r|d   S y)a  
    Giving an expr with a single free symbol, try to find a tiling that would
    make the expression coalesced with respect to that symbol.

    Tiling an expression `x` by `y` means that the expression will now be indexed
    by both the original (x) and by (x * y). So we are looking for a
    multiplicative factor that will make ((x + 1) * y) - (x * y) == 1.

    To simplify things for sympy, we'll try just x * y == 1, check x(1) and x(0).
    r   Nr"   r#   c                    | j                  t              s| j                  t              rJ t        | j                        dk7  ry t        t        j                  | d            }|r|d   j                         sy |d   S )Nr   )	hasr   r   r(   r)   r   r,   r-   r&   )r"   r0   r/   s     r1   _solve_simple_exprz,solve_for_tiling.<locals>._solve_simple_exprM   sk    88O,TXXh5GGGt  !Q&q);7#a&,,.1vr3   FTxyzc                     | |z  S N )r8   r9   r:   s      r1   indexing_div_repz*solve_for_tiling.<locals>.indexing_div_rep{   s    
 1ur3   r   r<   )r(   r)   r*   r+   r,   Exprr	   r6   r   r   Add	make_argsr'   Mulr.   r2   r&   appendsumreplacer   r   )r"   r7   required_valueseq_1_expressionsargseenmul_argr0   	eq_1_exprr>   eq_1_expr_simplifiedr/   s              @r1   solve_for_tilingrM   <   s    4"tD--./K 0D  88O$TXXh-?!$''O
 yy""4( )c599%D 88 ,$W-;(((&&s+, ##C(%)( $%I
 #':::: EJJ 
	 %,,_>NOWW" 1
2Cz)k3-?@QF3
:o&'1,q!!r3   index
var_rangesc                    t         j                  j                  |       }|D ]
  }||v s|c S  i }| j                  D ]  }||v rd||<   t	        |      ||<    t        | |      }|j                         D ]<  }d||<   	 t        | |      }||z
  dk(  rd||<   t        | |      |z
  dk(  r|c S d||<   > y# t        $ r t        j                  d| |       Y bw xY w)z;
    Try to find the symbol which coalesces this index
    r   r   zzero division error %s %sr%   N)
r,   r@   rA   r)   get_hintr   keysZeroDivisionErrorloop_tiling_loginfo)rN   rO   top_level_termsv	variables
zero_indexnew_vals          r1   find_coalesced_varr[      s    ii))%0O H
 *,I '
?IaL#A;IaL	' E9-J__ 	!	 	2G Z1$IaL 5),w61<	!  ! 	  !<eYO	s   ?B66 CCT)frozenc                       e Zd ZU dZeej                     ed<   eej                     ed<   eej                  ee
   f   ed<   eej                  ee
   f   ed<   eej                  ef   ed<   y)FusedNormalizedReadsWriteszO
    Normalized reads and writes for nodes in the same FusedSchedulerNode.
    
index_varsreduce_varsreadswritesrO   N)__name__
__module____qualname____doc__r   r,   Symbol__annotations__dictr?   strintr=   r3   r1   r^   r^      sk     5<<((ELL))

JsO+,,Z_,--U\\3&''r3   r^   nr!   pointwise_numel	red_numelnone_if_not_divisiblec                      y r<   r=   rl   rm   rn   ro   s       r1   get_pw_red_splitsrr      s     58r3   Fc                      y r<   r=   rq   s       r1   rr   rr      s     +.r3   c                    | j                         s%t        | j                  j                  d         |k(  r^| j                  j                  | j                  j                  d   f| j                  j
                  | j                  j                  d   ffS t        | j                  j                  d         ||z  k(  sJ t        | j                  j                  d         dz
  }d}|dk\  r0|| j                  j                  d   |   z  }||k(  rn|dz  }|dk\  r0|dk\  rr| j                  j                  d   d| }| j                  j                  d| }| j                  j                  d   |d  }| j                  j                  |d  }	||f|	|ffS |ry | j                  j                  | j                  j                  d   f| j                  j
                  | j                  j                  d   ffS )Nr   r   )is_reductionr   _bodysizes	iter_varsr`   r(   )
rl   rm   rn   ro   iprod	pw_splitsrx   
red_splitsred_varss
             r1   rr   rr      s    	~~=q)9:oMWWa 01WW  !''--"23
 	

 q)*o	.IIIIAGGMM!!AD
q&a ##9	Q	 q& 	AvGGMM!$Qq)	GG%%a*	WW]]1%ab)
77$$QR(9%*'=== WWa 01WW  !''--"23
 	
r3   c            	       \    e Zd ZdZded   fdZdeeef   fdZdedede	eeef      fd	Z
y
)NodeSplitGetterz_
    Finds a Pointwise, Reduction Split that compatible with all nodes in a SchedulerNode.
    noder   c                    || _         |j                  d   d   | _        |j                  d   d   | _        t	        t
              | _        d| _        t               | _        |j                  d   }t        |j                               D ]W  }t        |t        j                  j                  j                        s3t!        || j                  | j                  d      }|0| j                  j#                  |j$                  j&                         |\  \  }}\  }}t        j                  j(                  j*                  j,                  j/                  |||f| j                        \  }}| j                  t1        |         j#                  t3        |             |dk7  rt5        |      f| _        t3        |      t3        |      f}| j                  j#                  |       Z t               | _        y )Nr   r   r=   T)ro   )r   grouprm   rn   r   r   pw_split_optionsreduction_splitall_node_sizesreversed	get_nodesr'   torch	_inductor	schedulerr!   rr   addrv   rw   codegensimd
SIMDKernelprepare_split_iteration_lengthsr(   tupler   seen_pw_splits)	selfr   fused_grouprl   maybe_splits_n_pw_splitsn_red_splitsn_sizes	            r1   __init__zNodeSplitGetter.__init__  s    	+/::a=+;%)ZZ]1%5>I*>U&(?I|jjm$..*+ $	,Aa!:!:!H!HI
 -4''tL ###''62>/Q/q, '',,77WW+|!<dnn &K !!#k"2377k8JK r!(5l(C'E$K(%*=>F##F+I$	,L 2<r3   r#   c                 ~   t        | j                        dk(  rt        t        | j                              S t	        | j
                  j                               }t        |dd      D ]  }| j
                  |   D ]&  }| j                  || j                        x}s"|c c S  | j
                  |   D ]o  }t        t        |      dz
        D ]S  }t        |d| t        |||dz          fz   ||dz   d z         }| j
                  t        |         j                  |       U q  | j                  f| j                  ffS )zI
        Get a compatible pointwise, reduction split of the node
        r   r   r%   N)r(   r   r*   r+   maxr   rR   range	try_splitr   r   r   r   rm   rn   )r   max_pw_splitpw_split_lenpw_splitr0   ry   	new_splits          r1   get_node_splitszNodeSplitGetter.get_node_splits;  sV   
 t""#q(T001224005578!,26 	IL 11,? ..43G3GHH3HJ
 !11,? Is8}q01 IA % 1(!a!e)<=?@"1q57+,!I
 ))#i.9==iHII	I  %%'$..):;;r3   pwredc                    ddl m}m} || j                  v ry| j                  j	                  |       | j
                  D ]  \  }}	 ||z   }||f}|j                  ||      \  }	}
t        |
      dk(  sJ |	dt        |       }t        t        j                  j                  |            }||k7  sq| j                  ||      x}s|c S  ||fS # |$ r Y  yw xY w)zs
        See if this split is compatible, and potentially returning a longer split
        than the input.
        r   )	CantSplitr   Nr%   )torch._inductor.codegen.simdr   r   r   r   r   _split_iteration_rangesr(   r   	itertoolschainfrom_iterabler   )r   r   r   r   r   n_pwn_redgroupslengthssplitsgetterspw_group_splitsflattened_pw_splitsr0   s                 r1   r   zNodeSplitGetter.try_splitV  s     	G$$$#.. 	KD%c-","D"DVW"U w<1$$$$Ys2w/O
 #(	(E(Eo(V"W"b(..)<cBB3BJ#	& 3w  s   CCCN)rc   rd   re   rf   r   r   r   Splitr   r	   r   r=   r3   r1   r   r      sX    4>9:4>l<ue|!4 <6E  (5;N2O r3   r   )   
   )strictit1it2c                     t        |       t        |      k7  r#t        dt        |        dt        |             t        | |      S )zP
        Zip two iterables, raising ValueError if their lengths differ.
        zLengths differ: z != )r(   
ValueErrorzip)r   r   s     r1   	zip_equalr   }  sA     s8s3x/Czc#hZHII3}r3   rx   r}   norm_pw_varsnorm_red_vars
new_rangesreturn_getters_groupsc           	         t        d |D              }t        j                  d|       }d}t        |       dk(  rt        |      dk(  ri S t        |      t        ||z         k(  sJ g }	|D ]'  }
|	j	                  |
D cg c]
  } ||       c}       ) i }t        t        |	| |f            D ]f  \  }\  }
}t        |
      t        |      k7  r|dk(  sJ t        |      dk(  sJ 8|j                  t        |
|      D ci c]  \  }}||
 c}}       h d}i }t        |||z         D ]l  \  }}g }t        t        |            D ]  }|j	                  ||          |dz  } d}t        t        |      dz
  dd      D ]  }||z  |||   <   ||   |z  } n |j                         D ci c]  \  }}|t        ||       c}}S c c}w c c}}w c c}}w )zBMaps original variables to expressions using normalized variables.c              3   2   K   | ]  }t        |        y wr<   )r(   ).0ss     r1   	<genexpr>z$apply_var_mapping.<locals>.<genexpr>  s     .a3q6.s   zv_0:r   r   r   )rD   r,   symbolsr(   rC   	enumerater   updater   r   itemsr   )rx   r}   r   r   r   r   num_vars	flat_varscountapply_groupsr   giter_vars_to_flat_varsry   	var_grouprW   flat_vars_to_new_vars	new_rangenew_var
range_varsrz   ks                         r1   apply_var_mappingr     s   . .:..HXJ/0IE
9~s8}1	z?c,">????L& ;59aQy\9:;  !*,)X!68" 
PE9
 u:Y'6M6y>Q&&&%%E98M&N1q!t&NO
P E'
L=4PQ 	'	7
s9~& 	Ai./QJE	 s9~)2r2 	'A3:T>!*Q-0Q<$&D	'	' +002Aq 	
:a.// 9 : 'Os   =G
 G,Gr   c           
      	  ) t        t              }t        t              }| j                         }| j                         }t               }t               )|D ]O  }t        j
                  j                  j                  ||      r)j                  |       ?|j                  |       Q t        )fd| j                  j                  D              }| j                  d   d   }| j                  d   d   }	t        d ||	fD              ryt        |       j                         \  }
}t        |
|d      \  \  }}}| } t!        | j#                               D ]T  }t%        |t&        j(                  j                  j*                        s3|j,                  }|j.                  r yt        t              }t        t              }|D ],  }|j1                  |      D ]  }||   j                  |        . |D ],  }|j3                  |      D ]  }||   j                  |        . |s|st5        |||	      \  \  }}\  }}|
|z   }||f}t&        j(                  j6                  j8                  j:                  j=                  |||	      }t&        j(                  j6                  j8                  j:                  j?                  ||      \  }}tA        ||||||      }dtB        jD                  d	tB        jD                  fd
}|jG                         D  !ci c]  \  } }!tI         ||       |      |! }"} }!|jG                         D #!ci c]  \  }#}!tI         ||#      |      |! }$}#}!|"jG                         D ]  \  }}%||xx   |%z  cc<    |$jG                         D ]  \  }}%||xx   |%z  cc<    W |jG                         D &!ci c]0  \  }&}!t        j
                  jJ                  jM                  |&|      |!2 }}&}!|jG                         D '!ci c]0  \  }'}!t        j
                  jJ                  jM                  |'|      |!2 }}'}!tO        |||||      }(tP        jS                  d|(       |(S c c}!} w c c}!}#w c c}!}&w c c}!}'w )zjExtracts index variables, reduce variables, read/write expressions, and variable ranges from a fused node.c              3   T   K   | ]  }|j                   vs|j                    ! y wr<   )name)r   depremoved_bufferss     r1   r   z1extract_normalized_read_writes.<locals>.<genexpr>  s%      chho6Us   ((r   r   c              3   v   K   | ]1  }t        |t        j                        xr |j                           3 y wr<   )r'   r,   r?   r&   )r   vars     r1   r   z1extract_normalized_read_writes.<locals>.<genexpr>  s5       
C	$	>S__->)>	>s   79Nrl   )prefixr"   r#   c                 0    | j                  t        d       S )Nc                     | S r<   r=   )r8   s    r1   <lambda>zIextract_normalized_read_writes.<locals>.remove_identity.<locals>.<lambda>'  s    A r3   )rE   r   )r"   s    r1   remove_identityz7extract_normalized_read_writes.<locals>.remove_identity&  s    <<+66r3   zNormalized Fused reads: %s)*r   r   get_buffer_namesget_operation_namesr   graphr   $can_buffer_be_removed_through_fusionr   read_writesra   r   anyr   r   r   listr   r'   r   r   r!   rv   indirect_varsget_all_read_exprget_all_write_exprrr   r   r   r   r   r   r   r,   r?   r   r   sizevarssimplify_with_rangesr^   rT   rU   )*r   ra   rb   all_output_namesop_namesoutputsbuf_nameinputsrm   rn   r{   r|   r   r   rangesrl   bodyn_readsn_writesinpr"   r0   rx   r   r}   r   r   r   r   r   var_mapr   readrW   n_reads_newwriten_writes_new	buf_namesrw	fused_outr   s*                                            @r1   extract_normalized_read_writesr    s    0;:/FE0;J0GF,,.'')H)|G'1|O$ "77AA(HU)KK!	"   ,,22 F #'**Q-"2O JJqM!,I  #Y/  +D1AACIz -B:c-)!\=6 D$.."# E&!U__66DDEww 5@5L6A*6M  	'C..s3 '!!#&'	'  	(C//4 (""3'(	( x=N	>
: K":8\ Z'/OO##((33SS 	 OO##((33KK 	*
)
 $!
	7%** 	7 	7 JQ
>EdAJt,g69
 

 %NN,
q u-w7:
 

  +002 	%OD)$K9$K	%  ,113 	&OD)4LI%L	&IE&P IN@D1--a8!;E  IO@D1--a8!;F  +I 5yA;

s   Q)?Q/?5Q5
5Q;addrc                 2   g }| j                   D ]C  }|j                  |d      }t        |t        j                        r0|3|j                  |       E ddlm} |j                  j                  j                  t        |      t        j                        S )z6
    Score addr according to its approximate size
    Nr   r   fallback)r)   getr   r   INDIRECTrC   virtualizedr   r   r   atomically_apply_size_hintr   r   unbacked_symint_fallback)r  rO   	var_sizesrW   v_sizer   s         r1   	get_scorer  I  s     I %4(a/F4FV$	%
 7766i 6+J+J 7  r3   rW   c                     t        | t              r| S t        j                  j                  j                  | t        j                        S )Nr  )r'   rk   r   r   r   	size_hintr   r  )rW   s    r1   rQ   rQ   \  s7    !Sww))!f6U6U)VVr3   c                   D    e Zd ZU dZej
                  ed<   eed<   eed<   y)	VarTilingzm
    Tiling of a var by `tiling_factor` that yields additional coalesced mem accesses by `benefit_score`
    r   tiling_factorscoreN)rc   rd   re   rf   r,   rg   rh   rk   r=   r3   r1   r  r  c  s     
Jr3   r  c                   T    e Zd ZU eej
                  ef   ed<   eed<   dZ	e
e   ed<   y)CoalesceVarAnalysiscoalesced_by_varnorm_read_writesNsuggested_split)rc   rd   re   ri   r,   r?   rk   rh   r^   r  r	   r  r=   r3   r1   r  r  n  s,    
 5::s?++00+/OXi(/r3   r  
fused_nodec           	         t        |       }|y|j                  }|j                  }|j                  }t	               }t	               }t        j                  d |j                         D        d |j                         D              D ]  \  }\  }}	t        |j                  |j                  j                         z
        }
|
r<t        ||      }|dk(  rNt        ||      }d}|	D ]=  }t        j                  j                  |      x}s%||j                   j"                  z  }? ||rdndz  }|r||xx   ||z  z  cc<   ||xx   ||z  z  cc<    |st%        ||      S t'        t              }|j                         D ]  \  }}t(        j+                  |j                  d      }|j                  D ]  }||vr|dk(  r||= t-        ||      }d||<   t/        |      }||j1                         r|j2                  sLt5        |      }t        j                  j6                  j9                  |||         sdt;        fd	|||   |z  fD              s||   |xx   |z  cc<     t=        |      dk(  rt%        ||      S d}d}|j                         D ])  \  }}|j                         D ]  \  }}||kD  s||f}|} + |t%        ||      S t%        ||t?        |d   |d   |      
      S )a[  
    Find variables that coalesce the reads and writes and score the total size.

    If uncoalesced memory expressions are found, look for additionally tiling of variables
    which will coalesce memory accesses.

    For instance - for the following expression:

    (32*p0) // 2048

    Tiling p0 by 64 will make this expression coalesced.
    Nc              3   $   K   | ]  }d |f 
 yw)TNr=   r   items     r1   r   z,analyze_memory_coalescing.<locals>.<genexpr>  s     0$$0   c              3   $   K   | ]  }d |f 
 yw)FNr=   r  s     r1   r   z,analyze_memory_coalescing.<locals>.<genexpr>  s     24%2r!  r   r   r%   )r  r     c              3   r   K   | ].  }t         j                  j                  j                  |       0 y wr<   )r   r   r   statically_known_lt)r   blockMIN_TILING_BLOCKs     r1   r   z,analyze_memory_coalescing.<locals>.<genexpr>  s1         445EuMs   47)r  r  r  ) r  ra   rb   rO   r   r   r   r   boolr)   rR   r  r[   r   r   try_get_bufferdtypeitemsizer  r   ri   fromkeysr   rM   r&   
is_integerrk   r   r%  allr(   r  )r  r  ra   rb   rO   r  uncoalesced_addrsis_readmemory_exprr   indirect_exprsizemaybe_coalesced_varbyte_multiplerr   buftiling_scoresuncoalesced_expr
addr_score	expr_subsrW   single_var_exprr  best_tilingbest_tiling_scorer   tiling_countertile
tile_scorer'  s                                @r1   analyze_memory_coalescingrA  z  sa     6jA""E$$F!,,J07	/6y-6__0%++-026<<>2. D))+y
 $$'7'B'B'G'G'II
 j1190jI! 	5Hgg,,X66s6#))"4"44	5
 	w!A-01TN5JJ1k*d^.CC*;D> "-@P
 	

 7B'6JM(9(?(?(A ":$*MM"2"?"?C	!..  	:A
"Q!()99EOIaL,_=M%$002$//.M77##77zRS}U
  ! +Z]m-KL  !]+z9+A 	:":H =Q"-@P
 	
 59K,224 /^ . 4 4 6 	/D*--"Dk$.!	// "-@P
 	
 ))!+a.+a.BST r3   )F)Ldataclasses	functoolsr   syscollectionsr   r   collections.abcr   r   typingr   r   r	   r
   r   r   r   r,   r   torch._inductorr   torch._inductor.dependenciesr   torch._inductor.utilsr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   torch.utils._sympy.solver   torch.utils._sympy.symbolr   r   r  r   r   r   r   r?   r   r   rg   VarsAndRanges_logginggetArtifactLoggerrc   rT   r   r   torch._inductor.schedulerr    r!   r2   rM   ri   rk   r[   	dataclassr^   rr   r(  r   version_infopartialr   r   r   r  r  rQ   r  r  rA  r=   r3   r1   <module>rV     sR      
 , . W W W   " > ; / 1 . :  CLCL 	ejj#od5<<($uzz*::; ..228]K B K (< *V5:: V(5::*> Vr#::##'

C#8#ejj#L d#	( 	( $	( 
88ZZ8 zz8 #4=	8
 eM=0128 
8 

 -2	..ZZ. zz. #5>	.
 =-'(. 
. #(	#
#
ZZ#
 zz#
  	#

 eM=012#
Lu up w!	!!#d3Ix{ ! %1+9N @ELL!@5<< @ u||$@ %	@
 T%**%&@  XtEJJ/?.@%**.L%M NO@ 
%,,


"#@F}
5
6}()}@EJJ Ds1B,C  &Wejj#o& W3 W d#  $ d#0 0 $0B;<B!"Br3   