
    rhr                    `   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZmZ d dlZd dl Zd dl!m"Z" d dl#m$Z$ d d	l%m&Z& d d
l'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 erd dl5m6Z6 d dl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> dZ? e2e@d      ZA G d deB      ZC G d d      ZD G d d      ZEee&j                  e&j                  f   ZHej                   G d d             ZJej                   G d d             ZK G d  d!eK      ZL G d" d#      ZM G d$ d%      ZN G d& d'eK      ZO G d( d)eMeO      ZP G d* d+eNeO      ZQ G d, d-eMeK      ZR G d. d/eNeK      ZSej                  d2d0       ZU	 	 	 	 d3d1ZVy)4    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeget_ld_library_pathis_gpu)getArtifactLogger)
OrderedSet)
ModuleType)TritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      e Zd Zy)!NonzeroWorkspaceNotSupportedErrorN__name__
__module____qualname__     s/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/torch/_inductor/autotune_process.pyr'   r'   6       r-   r'   c                      e Zd ZdZedd       Zedd       Zedd       ZddZd Z	ddZ
ddZddd	Zddd
ZddZddZddZy)TuningProcesszF
    Class to launch and interact with a benchmarking subprocess.
    c                     t         j                  dt        j                         t        j                  j                  t                      fd}	  |        y# t        $ r Y yw xY w)z4
        Entry point for the child process.
        z3Started autotune subprocess %s. Visible devices: %sc                     	 t         j                        } | y 	  |        }t         j                  |       7# t        $ r}|}Y d }~'d }~ww xY wN)r1   recv	Exceptionsend)jobresulte	read_pipe
write_pipes      r.   workloopz,TuningProcess.process_main.<locals>.workloopJ   sX    #((3; UF ""6:6  ! Fs   : 	AA

AN)autotuning_logdebugosgetpidenvirongetr$   EOFError)r;   r<   r=   s   `` r.   process_mainzTuningProcess.process_main?   sQ    
 	AIIKJJNN/0	

	7	J 		s   A 	A('A(c                P    t        j                  | |       |j                          y r4   )pickledumpflush)objr<   s     r.   r7   zTuningProcess.send\   s    C$r-   c                ,    t        j                  |       S r4   )rG   load)r;   s    r.   r5   zTuningProcess.recva   s    {{9%%r-   c                2    || _         | j                          y r4   )devicestart)selfrN   s     r.   __init__zTuningProcess.__init__e   s    

r-   c                T   t         j                  j                  t         j                  j                  t              d      }t        j
                         \  }}t        j
                         \  }}t        j                  |d      | _        t        j                  |d      | _        t        j                         | _        | j                  j                  | j                  t        j                         t        j                  |dt        j                           dt#        |       dt#        |       g}t         j$                  j'                  dt         j(                  j                  t        j                              dt+               t,        j.                  rd	ndd
}| j0                  t#        | j0                        |t2        <   t5        j6                  |i t         j$                  |||f      | _        t        j:                  |       t        j:                  |       d| _        y)z4
        Start the benchmarking subprocess.
        z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=TORCH_CUSTOM_PYTHONPATH01)
PYTHONPATHTORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)r@   pathjoindirname__file__pipefdopenr<   r;   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerA   strrB   rC   pathsepr   r!   /profile_bandwidth_with_do_bench_using_profilingrN   r$   
subprocessPopenprocesscloserunning)rP   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmd	extra_envs           r.   rO   zTuningProcess.starti   s    RWW__X68NO$&GGI!$&GGI!!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01
 **..)2::??388+D
  #24 EE DG
	 ;;".1$++.>I*+!''+2::++%'78

 	!
!"r-   c                V    | j                   xr | j                  j                         du S )z:
        True if the subprocess is still running.
        N)rr   rp   pollrP   s    r.   alivezTuningProcess.alive   s%     ||; 1 1 3t ;;r-   c                    | j                         s| j                          t        j                  || j                         y)z8
        Push a work item to the child process.
        N)r}   rO   r1   r7   r<   )rP   reqs     r.   putzTuningProcess.put   s*     zz|JJL30r-   c                   	 | j                   j                  |      s"t        d| j                  j                         t
        j                  | j                        }t        |t              r||S # t        $ r | j                           t        $ r | j                           t        $ r< t        j                  d| j                  j                         | j                           w xY w)z
        Get a response from the child process. Raises TimeoutError on timeout;
        raises EOFError if the subprocess crashes.
        zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)rf   selectTimeoutErrorrp   pidr1   r5   r;   killrD   rq   r6   r>   	exception
isinstance)rP   timeoutr9   s      r.   rC   zTuningProcess.get   s    
	==''0"%DT\\EUEUDV#WXX"''7F fi(L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   AA2 2A7C)c                    | j                         r t        j                  d| j                         |r| j	                          yy)zC
        Signal the child process to shut down gracefully.
        N)r}   r1   r7   r<   wait)rP   r   s     r.   shutdownzTuningProcess.shutdown   s2     ::<tT__5IIK r-   c                x    | j                         r| j                  j                          | j                          y)z5
        Wait for the child process to exit.
        N)r}   rp   r   rq   r|   s    r.   r   zTuningProcess.wait   s&     ::<LL

r-   c                    | j                   j                          | j                  j                          | j                  j                          d| _        y)z"
        Close resources.
        FN)rf   rq   r;   r<   rr   r|   s    r.   rq   zTuningProcess.close   s;     	r-   c                    | j                         rDt        j                  d| j                  j                         | j                  j                          | j                          y)z6
        Send a SIGKILL to the child process.
        z)Sending SIGKILL to autotune subprocess %dN)r}   r>   errorrp   r   r   rq   r|   s    r.   r   zTuningProcess.kill   sF     ::<  ;   LL

r-   N)r;   	IO[bytes]r<   r   returnNone)rJ   r   r<   r   r   r   )r;   r   r   r   )rN   Optional[int])r   bool)r   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r)   r*   r+   __doc__staticmethodrE   r7   r5   rQ   rO   r}   r   rC   r   r   rq   r   r,   r-   r.   r1   r1   :   sq      8   & &.`<16
r-   r1   c                  J    e Zd ZdZddZed	d       ZddZd
dZ	 	 	 	 ddZ	y)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    c                V   | j                         }t        j                  d|       |D cg c]  }t        |       c}| _        t        j                         | _        | j                  D ]  }| j                  j                  |        t        t        |            | _        yc c}w )z,
        Start the child processes.
        z$Sub-process autotune device list: %s)rN   )max_workersN)get_device_listr>   r?   r1   	processesqueueQueueprocess_queuer   r   lenexecutor)rP   devicesrN   ps       r.   rQ   zTuningProcessPool.__init__   s     &&(CWM FMM6-v6M9> 	&A""1%	& +s7|D Ns   B&c                 l   t         j                  sdgS t               } t        |       }|j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r!   autotune_multi_devicer   r   device_countr$   r@   rB   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r.   r   z!TuningProcessPool.get_device_list  s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS!s1vSGSw<5(((NE%L!!	 Ts   7B1c                    | j                   j                          | j                  D ]  }|j                  d        | j                  D ]  }|j                           y)z5
        Signal all child processes to exit.
        F)r   N)r   r   r   r   )rP   r   s     r.   r   zTuningProcessPool.shutdown  sQ     	  	#AJJEJ"	# 	AFFH	r-   c                   |j                   J | j                  j                         }|j                  |j                   j                         	 |j                  t
        j                        | j                  j                  |       S # t        $ rB t        j                  d| d       t        d      cY | j                  j                  |       S t        $ rB t        j                  d| d       t        d      cY | j                  j                  |       S w xY w# | j                  j                  |       w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice ')bmreqr   rC   r   	benchmarkr!   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   r6   )rP   choicerp   s      r.   targetzTuningProcessPool.target#  s     ||'''$$((*FLL**+	,;;BB$ ""7+  	 MM1& :W W
 < ""7+  	 MM.vh 7W W
 <""7+	  ""7+s0   B	 	-D6D  ,D?D  DD   D=c           	     x    t        t        || j                  j                  | j                  |                  }|S )z>
        Benchmark each choice in a separate process.
        )dictzipr   mapr   )rP   choicesresultss      r.   r   zTuningProcessPool.benchmarkB  s/     s7DMM$5$5dkk7$KLMr-   Nr   )r   zSequence[Optional[int]])r   r   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])
r)   r*   r+   r   rQ   r   r   r   r   r   r,   r-   r.   r   r      sB    E& " "(	,>+ 
+r-   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicerN   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec           
     :   t        |t              r4|D cg c]  }| j                  |       }}t        d |D              sJ |S |}t        |t        j
                        rt	        j                  d|      }|j                         }|J |j                         }|J t        ||t        j                  j                  j                  |j                         t        j                         t        j                  j                  j                  |j#                         t        j                         t        j                  j                  j%                  |j'                         j(                  t        j                         |j+                               S c c}w )Nc              3  <   K   | ]  }t        |t                y wr4   )r   r   .0xs     r.   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>c  s     AQz!Z0A   fake)r   layout)fallback)rN   r   r   r   r   r   )r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r#   graphsizevars
size_hintsget_sizer!   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r9   noder   rN   s          r.   r   zTensorMeta.from_irnodes]  sU    gx(>E F!1!1!!4 FF FA&AAAAMdBII&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    t        | j                  | j                  | j                  | j                  | j
                        S )N)rN   r   
extra_size)r   r   r   rN   r   r   r|   s    r.   	to_tensorzTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r-   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)r)   r*   r+   __annotations__r   classmethodr   r   r,   r-   r.   r   r   T  sQ    ((++KD-!
E!
	,!
 !
F
r-   r   c                  x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZddZdd	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)BenchmarkRequesta1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                    || _         t        |t              r|g}|| _        t        t        t
        f      r)t              dkD  rt        fdD              sJ d   | _        || _	        y )Nr    c              3  d   K   | ]'  }d D ]   }t        d   |      t        ||      k(   " ) yw))rN   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r.   r   z,BenchmarkRequest.__init__.<locals>.<genexpr>  sG       Q  .q148GAt<LLLs   -0r   )
kernel_namer   r   input_tensor_metatupler   r   r   r   
extra_args)rP   r   r   r   r   s      ` r.   rQ   zBenchmarkRequest.__init__  s     ''4!2 3!2(5$-8%&* /   
 "4A!6"4$r-   c                   t         r4   NotImplementedErrorrP   outinput_tensorss      r.   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r-   c                     y r4   r,   r|   s    r.   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r-   Nr   c                   t         r4   r   rP   fnr   r   s       r.   do_benchzBenchmarkRequest.do_bench  s
     "!r-   c                   t         j                  t        j                        }|rt	        j                         }|Ft        |      dk(  sJ t        d | j                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         }	  | j                  |d|i}|r+t	        j                         z
  }t	        j                         } | j                  |g|| }|r9t	        j                         z
  }	t         j                  dt!        |       |	       | j#                          |S # t        $ r# t         j                  d       t        d      cY S w xY w)Nr   c              3  <   K   | ]  }|j                           y wr4   )r   r   s     r.   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !PA!++-!Pr   r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)r>   isEnabledForloggingDEBUGtimer   r   r   r   r   r   r'   infor   r  r?   rk   r   )
rP   r   r   r?   start_tscreate_tensor_elapser  load_elapseresbench_elapses
             r.   r   zBenchmarkRequest.benchmark  sP   
 ++GMM:yy{H ;}%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!!=:c:B ))+0Kyy{HdmmB44499;1L  HD	$ 	
+ 1 	  RS<	 s   0E )E=<E=)
r   rk   r   r   r   r   r   Iterable[Any]r   r   r   r   r   r   r   zCallable[[], None]r   r   r   r   zOptional[torch.Tensor]r   r   )	r)   r*   r+   r   rQ   r   r   r  r   r,   r-   r.   r   r     s    %% ?% @	%
 "% 
%6"*"1="	"
 '+	" %" $	"
 
" '+)$) $) 
	)r-   r   c                  N    e Zd ZdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)	_TestBenchmarkRequestz
    Supports unit testing. Defined in this file instead of the test file so the
    TuningProcess sub-process can unpickle these objects.
    Nc                J    || _         || _        || _        || _        || _        y r4   )r9   rN   sleepexccrash)rP   r9   rN   r  r  r  s         r.   rQ   z_TestBenchmarkRequest.__init__  s'     

r-   r  c               r   | j                   <t        j                  j                  t        d       t        | j                         k(  sJ | j                  rt        j                  | j                         | j                  r| j                  | j                  rt        j                  d       | j                  S )Nr    )rN   r@   rB   rC   r$   rk   r  r  r  r  ri   exitr9   r   s      r.   r   z_TestBenchmarkRequest.benchmark   sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r-   )g        NNNF)
r9   r   rN   r   r  zOptional[float]r  zOptional[Exception]r  r   r  )r)   r*   r+   r   rQ   r   r,   r-   r.   r  r    sq      $!%#'  	
 !  KO*1G	r-   r  c                  $    e Zd Zdd	 	 	 	 	 ddZy)GPUDeviceBenchmarkMixinNr  c                  t        d g ||D              }t        |      dk  s
J d|        t        d |D        d      }t        |      }t        |      dk(  rt        t	        |            }n|j                         }|j                  |      5  t        j                  |      }|j                          d d d        |S # 1 sw Y   S xY w)Nc              3     K   | ]i  }t        |t        j                        rMt        |j                  j
                        r.|j                  j                  |j                  j                   k y wr4   )r   torchTensorr   rN   typeindexr   tensors     r.   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>  sR      $
&%,,/v}}))*##/	 MM$
s   A/A1r    zCan not mix devices c              3     K   | ]9  }t        |j                  j                        r|j                  j                   ; y wr4   )r   rN   r#  r%  s     r.   r   z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>  s4      &--,,- ""s   ?Acuda)
r   r   nextr   itercurrent_devicerN   r"   benchmark_gpusynchronize)	rP   r  r   r   device_idx_setdevice_typer   
device_idxr  s	            r.   r  z GPUDeviceBenchmarkMixin.do_bench  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0 	+++B/C((*	+ 
		+ 
s   &CCr  r)   r*   r+   r  r,   r-   r.   r  r    s*    
 '+	 % $	
 
r-   r  c                  $    e Zd Zdd	 	 	 	 	 ddZy)CPUDeviceBenchmarkMixinNr  c               ,    t        j                  |      S r4   )r"   benchmark_cpur  s       r.   r  z CPUDeviceBenchmarkMixin.do_bench2  s     ((,,r-   r  r1  r,   r-   r.   r3  r3  1  s*    
 '+	- %- $	-
 
-r-   r3  c                       e Zd Z	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZd ZddZ xZS )	TritonBenchmarkRequestc                    t         |   ||||       || _        || _        || _        || _        |	| _        |
| _        || _        || _	        || _
        y r4   )superrQ   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpack)rP   r   r   r   r   r:  r;  r<  r=  r>  r?  r@  rA  rB  	__class__s                 r.   rQ   zTritonBenchmarkRequest.__init__>  s_      	&79KZX& 0$"#6 %:"$8!(
r-   c               ~   t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  }t        | j                        }d|j                  _        i }dd l}d|j                  |      j                  v rd|d<   |j                   j"                  dk(  rd}nP|j                   j"                  }	t%        |	      }
|
j'                  | j(                  j                   j*                        }t-        t        || j                        t.        j0                  j2                  j4                  j6                        r!t9        j:                  |g|||i |d|iS t9        j:                  |g|||i ||ddS )	Nz"benchmark module key: %s, path: %sFr   warmupcpustreamT)rG  benchmark_run)r   load_by_key_pathr;  r:  r>   r?   r   r   runr   r   __self__with_bandwidth_infoinspect	signature
parametersrN   r#  r   get_raw_streamr   r$  r   r!  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rP   r   r   mod
run_methodr   
warmup_argrM  rG  r/  r   s              r.   r   z"TritonBenchmarkRequest.make_run_fnY  s    **4+@+@$BRBRS0!!	
 S$"2"2377
$//*
27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 $$  	
    $$  	
  " r-   c                    t        j                  | j                  | j                        }t	        || j
                        j                          y r4   )r   rI  r;  r:  r   r   
precompile)rP   rW  s     r.   r[  z!TritonBenchmarkRequest.precompile  s9    **4+@+@$BRBRST%%&113r-   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r:  r;  r|   s    r.   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr-   )r   r   r   r   r   )r   rk   r   r   r   r   r   r  r:  rk   r;  rk   r<  r   r=  r   r>  r   r?  r   r@  r   rA  r   rB  r   r   r   r  r   rk   )r)   r*   r+   rQ   r   r[  r^  __classcell__rC  s   @r.   r7  r7  ;  s     $%%&$% ? @	
 "     !  # "   
64*41=4	4l4Ur-   r7  c                      e Zd Zy)TritonGPUBenchmarkRequestNr(   r,   r-   r.   rc  rc    r/   r-   rc  c                      e Zd Zy)TritonCPUBenchmarkRequestNr(   r,   r-   r.   re  re    r/   r-   re  c                  t     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd Z	 	 	 	 	 	 d
dZddZd ZddZ	ddZ
 xZS )CUDABenchmarkRequestae  
    A class to handle CUDA (CUTLASS) benchmark requests. This class is for
    managing the lifecycle of a CUDA kernel benchmark, including compiling
    the source code, managing workspace memory, and executing the kernel.

    Important: Instances of this class have to be serializable across
    process boundaries. Do not put CUDA Tensors in here!
    c                    t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        d| _        t        j                  | j                  d      \  | _        | _        y )Nr   F so)r9  rQ   source_codeworkspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerP   r   r   r   r   rk  rC  s         r.   rQ   zCUDABenchmarkRequest.__init__  sr     	&79KZX&#$15)-',$ "*7*=*=d>N>NPT*U't'r-   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y)z
        Precompile the CUDA source code to populate the CUDACodeCache.
        This may happen in a separate thread pool.
        Precompiling %srj  Done precompiling %sN)r>   r?   r   compilerk  r|   s    r.   r[  zCUDABenchmarkRequest.precompile  s<    
 	.5d..53T:r-   c          	        | j                          | j                          t        |      |gz   D cg c]  }t        |j	                                }}t
        j                  d| j                  | j                  | j                  | j                  || j                         t        t        j                  j                         j                        }t!        | j                  | j                        }t        d      }| j"                  dkD  rht        j$                  | j"                  dz   dz  t        j&                  |j(                        | _        t        | j*                  j	                               }t-        j.                  |g|| j                  d|| }	  |        |S c c}w # t0        $ r,}	t3        |	      fd}
| j5                          |
cY d}	~	S d}	~	ww xY w)zc
        Create a function to run the CUDA kernel with the given input and output tensors.
        zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   rN   Nc                     t               r4   )RuntimeError)err_msgs   r.   raise_runtime_errorz=CUDABenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7++r-   )ensure_dll_loadedupdate_workspace_sizer   r	   data_ptrr>   r?   r   rq  rp  rn  r   r!  r(  current_streamcuda_streamr   rl  zerosfloat64rN   rm  rU  rV  r|  rk   r   )rP   r   r   r&  args
stream_ptrrX  workspace_ptrretr:   r~  r}  s              @r.   r   z CUDABenchmarkRequest.make_run_fn  s    	 ""$:>}:MQTPU:UV*+VVMMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
W WD  	'!fG, !&&	's#    F'F, ,	G!5!GG!G!c           
        | j                   ry | j                          t        | j                  D ch c]  }|j                   c}      }t        |dz         D cg c]  }t        d        }}t        t        j                  j                         j                        }t        | j                  | j                        }t               } |g || j                  t!        |      d |  t        j                  j#                          |j$                  | _        t(        j+                  d| j&                  | j                  | j,                  | j.                  | j                  || j                         d| _         y c c}w c c}w )Nr    zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)ro  r  r   r   r   r   r	   r!  r(  r  r  r   rn  r   r   r   r   r-  valuerl  r>   r?   rq  rp  )rP   metaunique_input_count_r  r  rX  c_workspace_sizes           r.   r  z*CUDABenchmarkRequest.update_workspace_size  sW   ''  #'#9#9:4TYY:
 )..@1.D(EF1FFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$; ;Fs   E<Fc                    | j                   4t        j                  | j                  d      \  | _         | _        | _        y y )Nrj  )rn  r   rL   rk  rp  rq  r|   s    r.   r  z&CUDABenchmarkRequest.ensure_dll_loaded  s:    888E8J8J  $95DHdmT%5 r-   c                l    | j                   !| j                   j                          d | _         d | _        y r4   )rn  rq   rm  r|   s    r.   r   z#CUDABenchmarkRequest.cleanup_run_fn$  s(    88HHNNDHr-   c                T    d| j                   d| j                  d| j                  S )Nr]  z, self.source_file=z, self.hash_key=)r   rq  rp  r|   s    r.   r^  zCUDABenchmarkRequest.__str__*  s0    #$""$$8t'7'7&99JDMM;KLLr-   r   rk   r   r   r   r   r   r  rk  rk   r   r   r  r   r_  )r)   r*   r+   r   rQ   r[  r   r  r  r   r^  r`  ra  s   @r.   rg  rg    s    VV ?V @	V
 "V V 
V$;4*41=4	4l",HMr-   rg  c                  b     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZd	dZ xZS )
CppBenchmarkRequestc                f    t         |   ||||       || _        t        |      | _        d | _        y r4   )r9  rQ   rk  r   rp  rn  rs  s         r.   rQ   zCppBenchmarkRequest.__init__2  s5     	&79KZX& -6:r-   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )Nru  rF  r/  rv  )r>   r?   r   rL   rk  r|   s    r.   r[  zCppBenchmarkRequest.precompile?  s<     	.5$**>3T:r-   c               \   t        j                  | j                  d      | _        t	        |      |gz   D cg c]  }|j                          }}t        j                  d| j                  | j                  || j                         t        | j                  | j                        }t        d | j                  D              sJ t        j                  gt        |      t        t	        | j                              z   z  |_        t!        j"                  |g|| j                   S c c}w )NrF  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  P   K   | ]  }t        |t        j                           y wr4   )r   ctypesc_ulonglong)r   args     r.   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>U  s     R3:c6#5#56Rs   $&)r   rL   rk  rn  r   r  r>   r?   r   r   r   r   r  r  r   argtypesrU  rV  )rP   r   r   r&  r  rX  s         r.   r   zCppBenchmarkRequest.make_run_fnF  s     $$T%5%55I04]0Cse0KLf!LLXHHOO	
 TXXt'7'78
R$//RRRR%112ID122


   

 __
 	
! Ms   D)c                    | j                   3	 t        | j                   d      r| j                   j                          y y y )Nrq   )rn  hasattrrq   r|   s    r.   r   z"CppBenchmarkRequest.cleanup_run_fna  s9    88 txx)  *	  r-   c                     d| j                   S )Nr]  )r   r|   s    r.   r^  zCppBenchmarkRequest.__str__i  s    #$""$%%r-   r  r  r   r_  )	r)   r*   r+   rQ   r[  r   r   r^  r`  ra  s   @r.   r  r  .  so    ;; ?; @	;
 "; ; 
;;
*
1=
	
6!&r-   r  c                 X    t               } t        j                  | j                         | S r4   )r   atexitrg   r   )pools    r.   get_tuning_process_poolr  m  s    D
OODMM"Kr-   c                4    t               j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )r  r   )r   s    r.   benchmark_in_sub_processr  t  s     #$..w77r-   )r   r   r   )W
__future__r   r  r  dataclassesrU  r	  r@   rG   r   rd   rn   ri   r  r   collections.abcr   r   concurrent.futuresr   r   r   r	   r
   typingr   r   r   r   r   r   r!  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   r   torch._loggingr   torch.utils._ordered_setr   typesr    torch._inductor.select_algorithmr   ri  r!   runtime.benchmarkingr"   virtualizedr#   r$   r)   r>   r6   r'   r1   r   r   r   LayoutOrBuffer	dataclassr   r   r  r  r3  r7  rc  re  rg  r  cacher  r  r,   r-   r.   <module>r     s   "      	     
   . 1 2 2 D D  $ C .   L K , /  E  -  . "8\:		 	l l^e eP ryy"))+, 3
 3
 3
l ] ] ]@, D   F- -YU- YUx	 79O 		 79O 	LM24D LM^<&13C <&~  8'8&8r-   