
    rh
K                        d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ d
efdZde
j@                  fdZ! ejD                         d        Z# ejD                         de$fd       Z%de$fdZ& ejD                         de$fd       Z' ejD                  d      d        Z(de$fdZ) ed       G d d             Z* G d de      Z+y)    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 >    dt         t        t        t        f   fd}|S )Nreturnc                     | j                   j                  }|j                   j                  }||k(  sJ d       |dk(  ryy)Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidths       r/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibilityz-min_dot_size.<locals>.check_dot_compatibility   sB    9999|+T-TT+1    )r   int)r   r!   s     r    min_dot_sizer$      s!     uS#s]7K   #"r"   r   c                  6    t         j                  j                  S N)r
   r	   ptxas r"   r    	get_ptxasr)   !   s    <<r"   c                      t         j                  j                  } | | S t        j                  t               j                  dg      j                  d      }|S )Nz	--versionutf-8)r
   r	   mock_ptx_version
subprocesscheck_outputr)   pathdecode)mock_verversions     r    get_ptxas_versionr3   %   sI    ||,,H%%y{'7'7&EFMMgVGNr"   c                     t        | t              sJ t        t        | j	                  d            \  }}|dk(  r|dk  rd|z   S d|z   dz
  S |dk(  rd|z   S |dk(  rd	|z   S t        d
| z         )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P         F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr#   splitRuntimeError)cuda_versionmajorminors      r    ptx_get_versionrF   .   s    
 lC(((sL..s34LE5{19::>!{Ez{Ez
X[gg
hhr"   archc                 `    | j                   }|t               j                  }t        |      }|S r&   )ptx_versionr)   r2   rF   )optionsrG   rI   rC   s       r    get_ptx_version_from_optionsrK   A   s0    %%K {**%l3r"   c                 @    t        | |      }t        d|      }d| }|S )NV   z+ptx)rK   min)rJ   rG   rI   llvm_ptx_versionfeaturess        r    get_featuresrQ   I   s0    .w=K 2{+&'(HOr"   c                     t        | d      5 }t        j                  |j                               j	                         cd d d        S # 1 sw Y   y xY w)Nrb)openhashlibsha256read	hexdigest)r/   fs     r    	file_hashrZ   W   s>    	dD	 4Q~~affh'1134 4 4s   1AA
capabilityc                 "    | dk\  rdnd}d|  | S )NZ   a sm_r(   )r[   suffixs     r    sm_arch_from_capabilityrb   ]   s!    "$S"FVH%%r"   T)frozenc                   ^   e Zd ZU dZeed<   dZeed<   dZeed<   dZe	e   ed<   d	Z
eed
<   dZeed<   dZeed<   dZe	e   ed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZee   ed<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed <   dZeed!<   d" Z d# Z!y)$CUDAOptions   	num_warpsr9   num_ctas   
num_stagesNmaxnreg)r9   r9   r9   cluster_dimsrI   ptx_optionsir_overrideTenable_fp_fusionFlaunch_cooperative_grid
launch_pdl)fp8e5fp8e4b15supported_fp8_dtypesr(   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)rv   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowrG   c                    t        t              j                  dz  }| j                  i nt	        | j                        }|j                  dd       s-t        j                  j                  xs t        |dz        |d<   t        j                  | dt        |j                                      | j                  dkD  r| j                  | j                  dz
  z  dk(  sJ d       y )Nlib	libdevicezlibdevice.10.bcr|   r   r9   znum_warps must be a power of 2)r   __file__parentr|   dictgetr
   r	   libdevice_pathr?   object__setattr__tupleitemsrg   )selfdefault_libdirr|   s      r    __post_init__zCUDAOptions.__post_init__}   s    h..6 ,,4b$t?O?O:P{D1',||'B'B'mc.[lJlFmK$4k6G6G6I0JK~~!t~~!9K'LQR&R 	0/	0R&Rr"   c           	      ^   t        | j                        }t        d t        |d         D              |d<   dj	                  t        |j                               D cg c]  \  }}| d|  c}}      }t        j                  |j                  d            j                         S c c}}w )Nc              3   <   K   | ]  \  }}|t        |      f  y wr&   )rZ   ).0kvs      r    	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>   s     (htq!!Yq\):(hs   r|   _-r+   )
r   __dict__r   sortedjoinr   rU   rV   encoderX   )r   	hash_dictnamevalkeys        r    hashzCUDAOptions.hash   s    '	#((hviXeNfGg(h#h	- hh	@Q9RSID#4&#ST~~cjj12<<>> Ts   B)
)"__name__
__module____qualname__rg   r#   __annotations__rh   rj   rk   r   rl   r   rI   rm   r?   rn   ro   boolrp   rq   rt   r   ru   rw   rz   r{   r|   r   r}   r   r   rG   r   r   r(   r"   r    re   re   c   s    IsHcJ "GXc]!#L%#KK!%K#%!d!$)T)J'<%*<46%uSz6'--/I %*I*.!4.KE4L#"t"D#0?r"   re   c                        e Zd Zedefd       Zd ZdefdZdeddf fdZ	de
fdZd	 Zd
 Zdeeef   fdZd Zed        Zed        Zd Zd Zd Zd Zd Z ej4                         d        Z xZS )CUDABackendr   c                      | j                   dk(  S )Nr~   )backend)r   s    r    supports_targetzCUDABackend.supports_target   s    ~~''r"   c                     d}t        j                  ||      }|st        d|       t        |j	                  d            S )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r9   )re	fullmatch
ValueErrorr#   group)r   rG   patternmatchs       r    _parse_archzCUDABackend._parse_arch   s@    Wd+GyQRR5;;q>""r"   r   c                 B    | j                  |j                        }d| S )Ncuda:)r   rG   )r   rJ   r[   s      r    get_target_namezCUDABackend.get_target_name   s#    %%gll3
zl##r"   Nc                 2    t         |   |       d| _        y )Ncubin)super__init__
binary_ext)r   r   	__class__s     r    r   zCUDABackend.__init__   s     !r"   c                 h   dt         j                  j                  xs d| j                  j                   i}|j                  t        j                  j                         D ci c]  }||v s||   |||    c}       t        | j                  |d               }d|vrFt        t        j                        }|dk\  r|j                  d       t        t        |            |d<   d|vr
|dk\  rd|d<   d	|vrt         j                   j"                  |d	<   |dk(  rd
nd|d<   t        di |S c c}w )NrG   smrt   Y   fp8e4nvru   r]   )rs   ro   i   @r   r{   r(   )r
   runtimeoverride_archr   rG   updatere   __dataclass_fields__keysr#   r   setrt   addr   r   languagedefault_fp_fusion)r   optsargsr   r[   rt   s         r    parse_optionszCUDABackend.parse_options   s.   33NDKK<L<L;M7NO)I)I)N)N)PuATUY]T]aefgahatQQZuv))$v,78
!-#&{'G'G#H R$((3+08L1M+ND'(.d:R<J89T)',~~'G'GD#$9Cr9Iq,-"T""% vs   #	D/-D/3D/c                     |j                   |j                  |j                  |j                  d   |j                  d   |j                  d   fS )Nr   r9      )rg   rh   sharedrl   )r   metadatas     r    pack_metadatazCUDABackend.pack_metadata   sO    OO!!!$!!!$!!!$
 	
r"   c                     dd l mc mc m} t	        | j                  |j                              }|dk\  r|j                  n|j                  t        | j                        d}|S )Nr   r8   )convert_custom_typesr$   )triton.language.extra.cudar   extrar~   r#   r   rG   convert_custom_float8_sm80convert_custom_float8_sm70r$   r   )r   rJ   r~   r[   codegen_fnss        r    get_codegen_implementationz&CUDABackend.get_codegen_implementation   sV    11))',,78
 0:R/?D++TEdEd%

 r"   c                     ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r    get_module_mapzCUDABackend.get_module_map   s    819==r"   c                 .    t        j                  |       y r&   )r	   load_dialects)r   ctxs     r    r   zCUDABackend.load_dialects   s    S!r"   c                    t        j                  | j                        }|j                          t        j
                  j                  |       t        j                  j                  |       |dz  dk  rt        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j
                  j                  |       t        j                  j                  |       |j!                  |        | S )Nr<   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optr[   pms        r    	make_ttirzCUDABackend.make_ttir   s    __S[[)
!!"%..r2aKK@@D''+#))"-b!$$R(##B'
s
r"   c                    |j                   H| j                  dt        j                  | j                        j                  |j                                t        j                         }|j                  <|j                  d   |_	        |j                  d   |_
        |j                  d   |_        t        j                  | j                        }|j                         }t        j                  j!                  |d| |j"                  d|j$                         t        j&                  j)                  |       |dz  dk\  rt        j&                  j+                  |       t        j                  j,                  j/                  ||       t        j&                  j1                  |       t        j&                  j3                  |       t        j&                  j5                  |       t        j&                  j1                  |       t        j&                  j7                  ||d	k\         t        j                  j,                  j9                  |       t        j                  j;                  |       |dz  d
v rFt        j&                  j=                  |       t        j>                  jA                  |       t        j                  jC                  |       t        j>                  jA                  |       t        j&                  jE                  |       t        j                  jF                  jI                  ||jJ                  |       t        j&                  jM                  ||jJ                         t        j&                  jO                  |       t        j&                  jQ                  ||jJ                  |       n|dz  dk\  rt        j&                  j=                  |       t        j>                  jA                  |       t        j                  jC                  |       t        j&                  jS                  |       t        j&                  jU                  |       t        j                  j,                  jW                  |       t        j&                  jM                  ||jJ                         t        j&                  jO                  |       t        j&                  jY                  ||jJ                         t        j&                  jQ                  ||jJ                  |       t        j&                  jE                  |       t        j                  j,                  j[                  |       nt        j                  jC                  |       t        j>                  jA                  |       t        j                  j;                  |       t        j&                  j]                  |       t        j&                  j7                  ||d	k\         t        j&                  j_                  |       t        j                  j,                  ja                  |       t        j&                  j1                  |       t        j                  j,                  jc                  |       t        j&                  je                  |       t        j&                  jg                  |       t        j                  j;                  |       t        j>                  ji                  |       |dz  dk\  rRt        j                  j,                  jk                  |       t        j                  j,                  jm                  |       t        j>                  jo                  |       t        j>                  jA                  |       |jq                  |        |j                  |j                  |j                  f|d<   | js                         }||d<   | S )Nzttg.maxnregr   r9   r   r   r   r<   r   r8   )r   r   r   rl   tensordesc_meta):rk   set_attrr   builderr   get_int32_attrr	   ClusterInforl   clusterDimXclusterDimYclusterDimZr   r   r   r   add_convert_to_ttgpuirrg   rh   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operands add_optimize_descriptor_encodingadd_loop_aware_cseadd_fuse_nested_loopsr   r   add_triton_licm add_combine_tensor_select_and_ifhopperadd_hopper_warpspecrj   add_assign_latenciesadd_schedule_loopsadd_pipelineadd_optimize_accumulator_initadd_hoist_tmem_allocadd_promote_lhs_to_tmemadd_warp_specializeadd_remove_tmem_tokensadd_prefetchadd_coalesce_async_copyadd_optimize_tmem_layoutsadd_interleave_tmemadd_reduce_data_duplicationadd_reorder_instructionsr   add_tma_loweringadd_fence_insertionadd_sccpr   get_tensordesc_metadata)r   r   r   r[   cluster_infor   dump_enabledr   s           r    
make_ttgirzCUDABackend.make_ttgir   s'    ;;"LL

3;;(?(N(Ns{{([\))+''*'7'7':L$'*'7'7':L$'*'7'7':L$__S[[)(**2zl/CS]]TVX[XdXde##B'q NN))"-,,R>44R833B7,,R044R800Z25EF@@D&&r*v%NN004MM++B/KK''+MM++B/NN;;B?MM  44RVNN//CNNCNN--b1NN''CNNLI2#NN004MM++B/KK''+NN88<NN//3MM##;;B?NN//CNNCNN--b1NN..r3>>BNN''CNNLINN;;B?MM##::2>KK''+''+&&r*##B'00Z25EF..r299"=44R833B72226//3&&r*$$R(q MM##44R8MM##77;r"''+
s$0$<$<l>V>VXdXpXp#q 557&5"#
r"   c                    |}t        j                  |j                        }|j                          t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j
                  j                  |       |j                  |       |j                         |d<   |S )Nr   )r   r   r   r   r   r   r   r   r  r   r  r   r
  r   r  )r   srcr   rJ   r[   r   r   s          r    	ttgir_optzCUDABackend.ttgir_opt0  s    __S[[)
""2&r"&&r*((,77;
s&)&A&A&C"#
r"   c                    t        || j                  j                        }|}t        j                  |j
                        }|j                          t        j                  j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j!                  |       t        j                  j                  j#                  |       t        j                  j%                  |       t        j                  j                  j'                  |||       t        j(                  j+                  |       t        j(                  j-                  |       t        j                  j                  j/                  |       t        j                  j                  j1                  |       t        j(                  j+                  |       t        j(                  j-                  |       t        j(                  j3                  |       t4        j6                  j8                  st        j:                  j=                  |       |j?                  |       tA        jB                          tA        j
                         }t4        j6                  jD                  rtG        d      tA        jH                  ||      }	tK        |      }
tM        || j                  j                        }d}t        jN                          tA        jP                  |	||
|       t        jR                  |	       |jT                  r4|jT                  D cg c]  \  }}|	 }}}tA        jV                  |	|       tA        jX                  |	t@        jZ                         |j]                  d      }|||d<   |j]                  d      |d<   |j]                  d      |d<   |j]                  d	      |d
<   |j]                  d      |d<   t_        |	      }~	~|S c c}}w )NzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudazttg.total-num-warpsrg   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_align)0rK   r   rG   r   r   r   r   r	   r   r   add_lower_mmar   r
  add_allocate_warp_groupsconvertadd_scf_to_cfadd_allocate_shared_memoryadd_allocate_tensor_memory"add_allocate_global_scratch_memoryadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   r
   compilationdisable_line_infollvmiradd_di_scoper   r   init_targetsenable_asanrB   	to_modulerb   rQ   set_short_ptrattach_datalayoutset_nvvm_reflect_ftzr|   link_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrr?   )r   r#  r   rJ   r[   rI   r   r   r   llvm_modprocrP   tripler   r/   pathstotal_num_warpsrets                     r    	make_llirzCUDABackend.make_llir?  s   27DKK<L<LM__S[[)
--b177;//3$$R(11"5::2>99"=++B
KH''+b!11"5;;B?''+b!$$R(  22MM&&r*
s,,.((km m>>#w/&z2)9)9:&xx@##H-.5.A.ABltTTBEB!!(E2Xt'7'78 **+@A&$3H[! --l; # 0 01I J*-*:*:;[*\&'+.+;+;<a+b'((m
# Cs   Qc           	         t        || j                  j                        }d}t        |      }t	        || j                  j                        }t        j                  ||||g |j                  d      }	t        j                  d|	      }
t        |
      dk(  sJ |
d   |d<   |dz   d|dz   }t        j                  d	d
| |	t        j                        }	t        j                  dd| |	t        j                        }	t        j                  dd|	      }	t        j                  j                  rt!        d       t!        |	       |	S )Nr&  Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r9   r   r   r<   r5   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*r_   z // -----// NVPTX Dump //----- //)rK   r   rG   rb   rQ   r   translate_to_asmro   r   findalllensub	MULTILINEr
   r	   
dump_nvptxprint)r   r#  r   r   r[   rI   rD  rC  rP   rG  namess              r    make_ptxzCUDABackend.make_ptx{  s$   238H8HI&&z2T[[%5%56##CxSEYEY[`a

FL5zQ 8$b);r>*:;ff*i},EsRTR^R^_ff';zl)CSPRP\P\]ff+R5<<""45#J
r"   c                    t               j                  }t        j                  ddd      5 }t        j                  ddd      5 }|j	                  |       |j                          |j                  dz   }t        j                  j                  rdd	gndg}	|j                  rg nd
g}
t        |      }t        j                  j                  rddgng }|j                  r|j                  j                  d      ng }|g|	|
d||d| |j                  d|}	 t!        j"                  |dd|       t$        j                  j'                  |j                        rt%        j(                  |j                         t$        j                  j'                  |j                        rt%        j(                  |j                         t-        |d      5 }|j/                         }d d d        t$        j                  j'                  |      rt%        j(                  |       d d d        d d d        S # t         j*                  $ r}t-        |j                        5 }|j/                         }d d d        n# 1 sw Y   nxY wt$        j                  j'                  |j                        rt%        j(                  |j                         |j0                  dk(  rd}n2|j0                  dt2        j4                  z   k(  rd}nd|j0                   }t7        | d ddj9                  |       d      d }~ww xY w# 1 sw Y   UxY w# 1 sw Y   &xY w# 1 sw Y   S xY w)NFwz.ptx)deletemodera   rz.logz.oz	-lineinfoz-suppress-debug-infoz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
rS   )r)   r/   tempfileNamedTemporaryFilewriteflushr   r
   r4  r5  ro   rb   r	   disable_ptxas_optrm   rA   r-   r   osexistsremoveCalledProcessErrorrT   rW   
returncodesignalSIGSEGVr   r   )r   r#  r   r   r[   r'   fsrcflogfbin	line_infofmadrG   disable_optptx_extra_options	ptxas_cmdelog_filelogerrorrY   r   s                        r    
make_cubinzCUDABackend.make_cubin  s     ((CO .	 SW''u3vN.	 RVJJsOJJL99t#DAFARARAdAd&<=kvjwI--2N3CD*:6D 38,,2P2P=#.VXK ?Boo 5 5c :SU !$(*.1<?PT_`d_eRfhlhqhqswILydS77>>$)),IIdii(77>>$)),IIdii($ dD! !Q!ww~~d#		$].	  .	 ^ + 00 L$))_ *"--/C* * *77>>$)),IIdii(<<3&?E\\S6>>%994E=all^LE E7 +558E :33688I3F2Gr"K L LL"! !W.	  .	  .	 ^ sz   MCM B)H4	ML9&<M"M4L6L1I6	-	L16I?;B6L11L66M9M>MM	MMc                      j                  j                        |t        j                  k(  r fd|d<    fd|d<   n|t        j                  k(  r
 fd|d<    fd|d<    fd|d	<    fd
|d<   y )Nc                 ,    j                  | |      S r&   )r   r#  r   r[   rJ   r   s     r    <lambda>z(CUDABackend.add_stages.<locals>.<lambda>  s    4>>#xQXZd3e r"   r   c                 ,    j                  | |      S r&   )r!  r|  s     r    r}  z(CUDABackend.add_stages.<locals>.<lambda>  s    DOOCSZ\f4g r"   ttgirc                 ,    j                  | |      S r&   )r$  r|  s     r    r}  z(CUDABackend.add_stages.<locals>.<lambda>  s    DNN3RY[e4f r"   c                 ,    j                  | |      S r&   )rH  r|  s     r    r}  z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8WV`/a r"   llirc                 T    j                  | |j                  j                        S r&   )rS  r   rG   r#  r   rJ   r   s     r    r}  z(CUDABackend.add_stages.<locals>.<lambda>  s#    dmmC7TXT_T_TdTd.e r"   ptxc                 T    j                  | |j                  j                        S r&   )ry  r   rG   r  s     r    r}  z(CUDABackend.add_stages.<locals>.<lambda>  s#    XwX\XcXcXhXh0i r"   r   )r   rG   r   TRITONGLUON)r   stagesrJ   r   r[   s   ` ` @r    
add_stageszCUDABackend.add_stages  se    %%gll3
x&eF6NgF7O'fF7Oaveuiwr"   c                 L    t               }| d| j                  j                   S )Nr   )r3   r   rG   )r   r2   s     r    r   zCUDABackend.hash  s&    #%!DKK,,-..r"   )r   r   r   staticmethodr   r   r   r?   r   r   r   r   r   r   r   r   r   r   r   r!  r$  rH  rS  ry  r  	functools	lru_cacher   __classcell__)r   s   @r    r   r      s    (	 ( (#$# $"y "T "#S #,
>S*_ 5 >"    H HT:x,1f	j Y/ /r"   r   ),triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r	   tritonr
   triton.runtime.errorsr   dataclassesr   r  typingr   r   r   r   typesr   rU   r   ra  rk  rf  r-   pathlibr   r$   
NvidiaToolr)   r  r3   r#   rF   rK   rQ   rZ   rb   re   r   r(   r"   r    <module>r     s-   E E 8 8  , !  - -   	   	  # #5##    iS i i$  
 
 
 T4 4
& & $'? '? '?TD/+ D/r"   