
    rh                        d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlZd dlmZ d dlmZ d d	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dl m!Z!m"Z"m#Z#  G d de      Z$ ejJ                  e&      Z'e	 G d d             Z(e	 G d d             Z) e"        G d d             Z* G d de      Z+ e"        G d de+             Z, e"        G d de+             Z-d Z. e#d       d!dej^                  fd"       Z0e	 G d# d$             Z1e#d%        Z2 e"        G d& d'             Z3e,e-d(Z4 e"        G d) d*             Z5 G d+ d,      Z6y)-    N)ABCabstractmethod)deque)	dataclassfield)Enum)partial)OptionalUnion)DecodeStream)tqdm   )PretrainedConfig)GenerationConfig)PreTrainedTokenizerFast)logging)ContinuousBatchProcessorMetricsattach_tracertracedc                   ,    e Zd ZdZdZdZdZdZdZdZ	dZ
y	)
RequestStatusz5Status of a generation request through its lifecycle.pending
prefillingprefilling_splitsplit_pending_remainderdecodingfinishedfailedN)__name__
__module____qualname____doc__PENDING
PREFILLINGPREFILLING_SPLITSPLIT_PENDING_REMAINDERDECODINGFINISHEDFAILED     ~/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching.pyr   r   &   s*    ?GJ)7HHFr+   r   c                   
   e Zd ZU dZeed<    ee      Zee	   ed<    ee      Z
ee	   ed<    ee      Zee   ed<   dZee   ed<   ej                   Zeed	<    eej$                        Zeed
<    ee	      Zee	   ed<   y)GenerationOutputa  Tracks the output of a generation request.

    Attributes:
        request_id (str): The ID of the generation request.
        prompt_ids (list[int]): The IDs of the prompt tokens.
        generated_tokens (list[int]): The generated tokens.
        logprobs (list[float]): The log probabilities of the generated tokens.
        error (Optional[str]): Any error message associated with the request. When None, the request was successful.
    
request_iddefault_factory
prompt_idsgenerated_tokenslogprobsNerrorstatuscreated_time
next_token)r   r    r!   r"   str__annotations__r   listr2   intr3   r4   floatr5   r
   r   r#   r6   timer7   r8   r*   r+   r,   r.   r.   5   s     O!$7JS	7"'"=d3i=!$7Hd5k7E8C=)11FM1		:L%: %c :J:r+   r.   c                      e Zd ZU dZeed<   dZeee	      ed<   dZ
eee	      ed<    ee      Zee	   ed<    ee      Zee	   ed<    ee      Zee	   ed	<   d
Ze	ed<   ej"                  Zeed<   dZe	ed<   dZe	ed<    eej*                        Zeed<   dZee   ed<   dZee   ed<   de	fdZde	fdZede	defd       Zd Zd Z y)RequestStatezTracks the state of a generation request through its lifecycle.

    Attributes:
        status (RequestStatus): can be one of PENDING, PREFILLING, PREFILLING_SPLIT,
                                SPLIT_PENDING_REMAINDER, DECODING, FINISHED, FAILED
    r/   Nr2   full_prompt_idsr0   remaining_prompt_idsstatic_outputsallocated_blocksr   position_offsetr6      max_new_tokenseos_token_idr7   r5   r8   returnc                     | j                   S )zCGet the current length of the sequence (prompt + generated tokens).)rE   selfs    r,   current_lenzRequestState.current_lenc   s    ###r+   c                 ,    t        | j                        S )z*Get the number of tokens generated so far.)lenrC   rL   s    r,   generated_lenzRequestState.generated_leng   s    4&&''r+   token_idc                 .   | j                   t        j                  k7  ry|| j                  k(  xr | j                  dk7  }| j	                         | j
                  k\  }|r|r| j                  j                  |g       |s|rt        j                  | _         yy)zUpdate the request with a newly generated token and check for completion.

        Args:
            token_id: The token ID to add to the output sequence

        Returns:
            bool: True if the request is now complete, False otherwise
        FrH   T)	r6   r   r'   rI   rQ   rG   rC   extendr(   )rM   rR   is_eos
is_max_lens       r,   update_with_tokenzRequestState.update_with_tokenk   s     ;;-000T...J43D3D3J'')T-@-@@
 6&&z2Z'00DKr+   c                 0   d| j                    d| j                   d| j                          dt        | j                         dt        | j
                         d| j                   dt        | j                         d| j                   d	| j                   d
S )NzRequestState(
	request_id=z
,
	status=z,
	out_tokens=z,
	query_length=z, 
	remaining_tokens=z, 
	kv_length=z
	full_prompt_lenght=z,
	allocated_blocks=z,
	generated_tokens=z
))
r/   r6   rQ   rP   r2   rB   rE   rA   rD   rC   rL   s    r,   __repr__zRequestState.__repr__   s9   -doo->l4;;-Wghlhzhzh|g}  ~P  QT  UY  Ud  Ud  Qe  Pf  f}  ~A  BF  B[  B[  ~\  }]  ]m  nr  nB  nB  mC  CZ  [^  _c  _s  _s  [t  Zu  uK  LP  La  La  Kb  bx  y}  yL  yL  xM  MP  Q  	Qr+   c           	          t        | j                  | j                  | j                  | j                  g | j
                  | j                        S )z7Convert the request state to a GenerationOutput object.)r/   r2   r6   r3   r4   r5   r8   )r.   r/   rA   r6   rC   r5   r8   rL   s    r,   to_generation_outputz!RequestState.to_generation_output   s@    ++;;!00**
 	
r+   )!r   r    r!   r"   r9   r:   r2   r
   r;   r<   rA   r   rB   rC   rD   rE   r   r#   r6   rG   rI   r>   r7   r=   r5   r8   rN   rQ   r   boolrW   rY   r[   r*   r+   r,   r@   r@   K   s    O&*Jc#*+/OXd3i(/&+D&A$s)A %d ;NDI;"'"=d3i=OS)11FM1NCL#		:L%:E8C= $J$$S $(s ( # $  4Q

r+   r@   c                      e Zd Zej                  dddfdededej                  dej                  de	de
ee	eeej                  e	f   f      d	e
e	   d
dfdZede	ded
ee	   fd       Zeded
dfd       Zd
e	fdZded
ee	   fdZededee	   d
ee	   fd       Zedej.                  dej.                  de	d
eej.                  ej.                  f   fd       Zy)PagedAttentionCached   Nconfiggeneration_configdevicedtypenum_requestslayer_device_maptp_sizerJ   c           
      &   t        |dd      |j                  n|j                  | _        | j                  }||dkD  r||z  dk7  rt        d| d| d      t	        |d      r|j
                  n|j                  |j                  z  | _        |j                  | _        t        |d	d
      }	t        |dd      }
t        |dd      }t        |dd      }|	|>t        |j                  |
| j
                  | j                  | j                  |||	      \  }	}t        j                  d|	 d|
 d|        || _        |
| _        |	| _        ||	| j                  | j
                  f| _        || _        || _        g | _        g | _        t)        |j                        D ]  }|||   n|}t+        j,                  | j                  | j                   |      }t+        j,                  | j                  | j                   |      }t*        j.                  j1                  |       t*        j.                  j1                  |       | j$                  j3                  |       | j&                  j3                  |        t5        t)        |	            | _        i | _        y)a  Initialize a paged attention cache for efficient memory usage.

        Args:
            config: Model configuration
            generation_config: Generation configuration containing cache parameters
            device: Device for the cache tensors
            dtype: Data type for the cache tensors
            layer_device_map: Optional mapping of layer indices to devices
            initial_prompt_shapes: Optional sample prompts to help calculate optimal cache size
        num_key_value_headsN   r   zNumber of key value heads z+ must be divisible by tensor parallel size .head_dim
num_blocksi   
block_size    
max_memory?max_batch_tokens   )rm   rk   
num_layers	num_headsmax_memory_percentrc   rl   zUsing calculated num_blocks=z, block_size=z, max concurrent requests rc   rb   )getattrnum_attention_headsrh   
ValueErrorhasattrrk   hidden_sizenum_hidden_layerscompute_optimal_blocksrG   loggerwarningrq   rm   rl   cache_shaperc   rb   	key_cachevalue_cacherangetorchzeros_dynamomark_static_addressappendr   _free_blocks_block_tables)rM   r`   ra   rb   rc   rd   re   rf   rh   rl   rm   ru   rq   idxlayer_devicenew_layer_key_cachenew_layer_value_caches                    r,   __init__zPagedAttentionCache.__init__   s   . v4d;C &&++ 	 
 #667Q;"W,1 01D0EEpqxpyyz{   'vz:FOO@R@RV\VpVp@p 	 "(!9!9 .dC
.bA
$%6cJ"#46H#N!1!9+A!00%1122#5%	,(J( 	*:,mJ<Oijzi{|	
 !1$$/T__dmm\
-//1112 		;C4D4P+C0V\L"'++d.>.>djjYe"f$)KK0@0@

[g$h! MM--.ABMM--.CDNN!!"56##$9:		; "%
"3435r+   n_blocksr/   c                 $   t        | j                        |k  ryg }t        |      D ]+  }|j                  | j                  j	                                - || j
                  vrg | j
                  |<   | j
                  |   j                  |       |S )z*Allocates n_blocks for a given request_id.F)rP   r   r   r   popleftr   rT   )rM   r   r/   	allocated_s        r,   allocate_blocksz#PagedAttentionCache.allocate_blocks   s     t  !H,	x 	:AT..6689	: T///-/Dz*:&--i8r+   c                     || j                   v r7| j                   j                  |      }| j                  j                  |       yt        j                  d|        y)z.Frees all blocks associated with a request_id.z6Attempted to free blocks for non-existent request_id: N)r   popr   rT   r~   info)rM   r/   blocks_to_frees      r,   free_blockszPagedAttentionCache.free_blocks   sR     +++!//33J?N$$^4KKPQ[P\]^r+   c                 ,    t        | j                        S )z,Returns the number of free blocks available.)rP   r   rL   s    r,   get_num_free_blocksz'PagedAttentionCache.get_num_free_blocks  s    4$$%%r+   c                 :    | j                   j                  |g       S )z&Returns the block table for a request.)r   getrM   r/   s     r,   get_block_tablez#PagedAttentionCache.get_block_table  s    !!%%j"55r+   statelogical_indicesc           	      2   |j                   }| j                  j                  |      }|st        d|       | j                  }g }|D ]L  }||z  }||z  }	|t        |      k\  rt        d| d| d|       ||   }
|
|z  |	z   }|j                  |       N |S )a  
        Maps logical sequence indices to physical cache indices using the block table, using PyTorch.

        Args:
            request_id: The request ID.
            logical_indices: A list of logical indices.

        Returns:
            A list of physical indices.

        Raises:
            ValueError: If no block table is found for the request ID.
            IndexError: If a logical index maps to a block index that is out of bounds.
        z!No block table found for request zLogical index z maps to block index z$ which is out of bounds for request )r/   r   r   ry   rm   rP   
IndexErrorr   )rM   r   r   r/   block_tablerm   physical_indicesr   	block_idxblock_offsetphysical_block_numphysical_indexs               r,   _get_physical_indicesz)PagedAttentionCache._get_physical_indices  s      %%
((,,Z8@MNN__
" 	4Cz)I+LC,, $SE)>yk J##-,0 
 "-Y!7/*<|KN##N3	4  r+   
key_statesvalue_states	layer_idxc                 t   | j                   | j                  z  }| j                  |   j                  | j                  || j
                        }| j                  |   j                  | j                  || j
                        }	|d   |d d |d d f<   |d   |	d d |d d f<   |d d d |d d f   |	d d d |d d f   fS )Nr   )rl   rm   r   viewrh   rk   r   )
rM   r   r   r   
read_indexwrite_indexkwargstotal_slotsk_cache_flatv_cache_flats
             r,   updatezPagedAttentionCache.update4  s     oo7~~i055d6N6NP[]a]j]jk''	2778P8PR]_c_l_lm*4Q-QQ&'*6q/QQ&'D!Z23\$:WXBX5YYYr+   )r   r    r!   r   float16r   r   rb   rc   r<   r
   dictr   r9   r   r   r;   r   r   r   r   r@   r   Tensortupler   r*   r+   r,   r^   r^      s    #]]OS!%R6 R6 ,R6 	R6
 {{R6 R6 #4U3c3I-J(J#KLR6 #R6 
R6h   c   _c _d _ _&S &6# 6$s) 6 % < % $s) % X\]`Xa %  % N ZLLZ llZ 	Z 
u||U\\)	*Z Zr+   r^   c                       e Zd ZdZddedefdZedefd       Z	ede
dee   fd	       Zedefd
       Zeddedefd       Zededee
   fd       Zy)	Schedulerz
    Abstract base class for scheduling requests in the continuous batch processor.
    It is expected that cache allocation and scheduling logic will be implemented in subclasses.
    cacheretain_cache_on_finishc                 Z    i | _         i | _        t               | _        || _        || _        y N)active_requestswaiting_requestsr   waiting_requests_orderr   r   )rM   r   r   s      r,   r   zScheduler.__init__M  s+    8:9;27'#
&<#r+   r   c                      yz"Add a request to the waiting list.Nr*   )rM   r   s     r,   add_waiting_requestzScheduler.add_waiting_requestT       	r+   token_budgetrJ   c                      y r   r*   )rM   r   s     r,   schedule_batchzScheduler.schedule_batchY  s    r+   c                 Z    t        | j                        xs t        | j                        S )z2Check if there are requests ready to be processed.)rP   r   r   rL   s    r,   has_pending_requestszScheduler.has_pending_requests]  s%     4''(FC0E0E,FFr+   r/   evict_from_cachec                      y)z:Finish processing a request and free its allocated blocks.Nr*   rM   r/   r   s      r,   finish_requestzScheduler.finish_requestb  r   r+   c                 T    || j                   v r| j                   |   j                  S g S r   )r   rC   r   s     r,   !get_active_request_static_outputsz+Scheduler.get_active_request_static_outputsg  s,    ---''
3BBB	r+   NFT)r   r    r!   r"   r^   r\   r   r   r@   r   r<   r;   r   r   r   r9   r   r   r*   r+   r,   r   r   G  s    
=1 =4 =    3 43E   Gd G G     C DI  r+   r   c                       e Zd Zededefd       Z ed      dededee   fd       Z	edefd	       Z
eded
ee   fd       Zeddedefd       Zy)FIFOSchedulerr   len_next_tokensc                    |j                         }t        |j                        | j                  j                  z  |z
  }||k  st        |j                        dk(  rf||z
  dz   | j                  j                  z  dz   }| j                  j                  ||j                        }|sy|j                  j                  |       yNr   ri   FTrN   rP   rD   r   rm   r   r/   rT   rM   r   r   rN   	occupancyblocks_neededr   s          r,   _allocate_blocks_if_neededz(FIFOScheduler._allocate_blocks_if_neededp       '')../$**2G2GG+U	&3u/E/E+F!+K-	9A=$**BWBWW[\\M

22=%BRBRSI"")))4r+   prepare_request	span_namer   "request_ids_to_remove_from_waitingc                 J   |j                   t        j                  k(  r|j                  n|j                  }t        |      |k  r|j                   t        j                  k(  rJ|| j                  |j                  <   t        j                  |_         |j                  |j                         y|j                   t        j                  k(  r.t        j                  |_         |j                  |_        g |_        yy|j                   t        j                  k(  rJ|| j                  |j                  <   t        j                  |_         |j                  |j                         n2|j                   t        j                  k(  rt        j                  |_         ||d |_        |d| |_        yz6Prepare a request for processing in the current batch.Nr6   r   r&   rB   r2   rP   r#   r   r/   r$   addr%   rM   r   r   r   request_tokenss        r,   _prepare_request_for_processingz-FIFOScheduler._prepare_request_for_processing~  G    +0,,-:_:_*_E&&ejeueu 	 ~-||}4449>$$U%5%56,77266u7G7GH!F!FF,77#(#=#= -/* G ||}4449>$$U%5%56,==266u7G7GH!F!FF,==)7)FE&-m|<Er+   c                    | j                   r|j                  | j                  v rn| j                  j                  |j                        }|j                  t        |j                        d |_        |j                  |_        |j                  |_        || j                  |j                  <   | j                  j                  |j                         yr   r   r/   r   r   r2   rP   rA   rD   rE   r   r   r   rM   r   	old_states      r,   r   z!FIFOScheduler.add_waiting_request       &&5+;+;t?S?S+S,,001A1ABI$//I4M4M0N0PQE%.%?%?E"$-$=$=E!27e../##**5+;+;<r+   rJ   c                 p   
 g }g }g  j                   j                         D ]_  }|j                  t        j                  k(  r|j                  |       |j                  t        j                  k(  sO|j                  |       a  j                  D ]   }|j                   j                  |          " ||z   }t               
|D ]  } j                  ||
       t        |j                        } j                  |t        |j                              s%t         j                  j                        dk(  r nJut         dt"        ffd       } ||       ||z  }t         dt"        f
 fd       }	 |	|       |dk(  s n t%         j                  D cg c]	  }|
vs| c}       _        S c c}w )Nr   r   c                 (    j                  |        y r   r   r   scheduled_requestss    r,   _add_to_scheduled_requestsz@FIFOScheduler.schedule_batch.<locals>._add_to_scheduled_requests      "))%0r+   c                 x    | j                   }|j                  v rj                  |= j                  |       y y r   r/   r   r   r   req_idr   rM   s     r,   _remove_from_waiting_requestszCFIFOScheduler.schedule_batch.<locals>._remove_from_waiting_requests  =    ))T222--f56::6B 3r+   )r   valuesr6   r   r'   r   r&   r   r   setr   rP   r2   r   r   r   r   r@   r   rM   r   priority_statessecond_priority_statesr   r   
candidatesrequest_lenr   r   r   r   s   `         @@r,   r   zFIFOScheduler.schedule_batch  s   .057))002 	5E||}555&&u-||}DDD&--e4		5 11 	IF"))$*?*?*GH	I %'==
-0U* 	E00Fhie../K22s5++, tzz../141, 1 1 'u-K'LC\ C C *%0q 9	< ',"&"="=rOqAqVr'
# "! ss   	F3!F3r/   r   c                 x    |r8| j                   j                  |       || j                  v r| j                  |= y y y r   r   r   r   r   s      r,   r   zFIFOScheduler.finish_request  =    JJ"":.T111((4 2 r+   Nr   r   r    r!   r   r@   r<   r   r   r9   r   r   r;   r   r\   r   r*   r+   r,   r   r   n       s   '(=!=14=Z]^aZb= )=8 = = = 4"3 4"43E 4" 4"l 5 5 5 5r+   r   c                       e Zd Zededefd       Z ed      dededee   fd       Z	edefd	       Z
eded
ee   fd       Zeddedefd       Zy)PrefillFirstSchedulerr   r   c                    |j                         }t        |j                        | j                  j                  z  |z
  }||k  st        |j                        dk(  rf||z
  dz   | j                  j                  z  dz   }| j                  j                  ||j                        }|sy|j                  j                  |       yr   r   r   s          r,   r   z0PrefillFirstScheduler._allocate_blocks_if_needed  r   r+   r   r   r   r   c                 J   |j                   t        j                  k(  r|j                  n|j                  }t        |      |k  r|j                   t        j                  k(  rJ|| j                  |j                  <   t        j                  |_         |j                  |j                         y|j                   t        j                  k(  r.t        j                  |_         |j                  |_        g |_        yy|j                   t        j                  k(  rJ|| j                  |j                  <   t        j                  |_         |j                  |j                         n2|j                   t        j                  k(  rt        j                  |_         ||d |_        |d| |_        yr   r   r   s        r,   r   z5PrefillFirstScheduler._prepare_request_for_processing  r   r+   c                    | j                   r|j                  | j                  v rn| j                  j                  |j                        }|j                  t        |j                        d |_        |j                  |_        |j                  |_        || j                  |j                  <   | j                  j                  |j                         yr   r   r   s      r,   r   z)PrefillFirstScheduler.add_waiting_request  r   r+   rJ   c                 r   
 g }g }g  j                   j                         D ]`  }|j                  t        j                  k(  r|j                  |       2|j                  t        j                  k(  sP|j                  |       b  j                  D ]   }|j                   j                  |          " ||z   }t               
|D ]  } j                  ||
       t        |j                        } j                  |t        |j                              s%t         j                  j                        dk(  r nJut         dt"        ffd       } ||       ||z  }t         dt"        f
 fd       }	 |	|       |dk(  s n t%         j                  D cg c]	  }|
vs| c}       _        S c c}w )Nr   r   c                 (    j                  |        y r   r   r   s    r,   r   zHPrefillFirstScheduler.schedule_batch.<locals>._add_to_scheduled_requests:  r   r+   c                 x    | j                   }|j                  v rj                  |= j                  |       y y r   r   r   s     r,   r   zKPrefillFirstScheduler.schedule_batch.<locals>._remove_from_waiting_requestsB  r   r+   )r   r   r6   r   r&   r   r'   r   r   r   r   rP   r2   r   r   r   r   r@   r   r  s   `         @@r,   r   z$PrefillFirstScheduler.schedule_batch  s   .057))002 	5E||}DDD&&u-!7!77&--e4		5 11 	IF"))$*?*?*GH	I %'==
-0U* 	E00Fhie../K22s5++, tzz../141, 1 1 'u-K'LC\ C C *%0q 9	< ',"&"="=rOqAqVr'
# "! ss   	F4"F4r/   r   c                 x    |r8| j                   j                  |       || j                  v r| j                  |= y y y r   r  r   s      r,   r   z$PrefillFirstScheduler.finish_requestT  r  r+   Nr   r	  r*   r+   r,   r  r    r
  r+   r  c                     t         j                  j                         r}t        j                  d      } t         j                  j	                  |       j
                  }t         j                  j                  |       }t         j                  j                  |       }nt         j                  j                  j                         rt         j                  j                  j                         rWt        j                  d      } t         j                  j                         }|t         j                  j                         z
  }d}nt        j                  d      } d }d}d}| |||fS )Ncudampsr   cpu)r   r  is_availablerb   get_device_propertiestotal_memorymemory_reservedmemory_allocatedbackendsr  is_builtdriver_allocated_memoryrecommended_max_memory)rb   r  reserved_memoryallocated_memorys       r,   get_device_and_memoryr#  \  s    zz f%zz77?LL**44V< ::66v>				(	(	*u~~/A/A/J/J/Le$yy88:'%))*J*J*LL e$<2BBBr+   T
standalonerp   c                 z   t               \  }}	}
}t        |	t        ||
      z
  |z        }t        j                  g |      j                         }d|z  |z  |z  |z  }|||z  }t        ||| z  | z  dz  z        }|dk  rt        j                  d       d}t        d|      }t        || z  |z  dz   d      }||fS )	N)rc   r      r   z0you are trying to generate a bit too many tokensrn   @   ri   )	r#  r<   maxr   tensorelement_sizer~   r   min)max_num_tokensrm   rk   rt   rs   ru   rl   rc   rb   totalreservedr   available_memory
dtype_sizebytes_per_token max_possible_concurrent_requestsmax_concurrent_tokensoptimal_num_blockss                     r,   r}   r}   t  s     *?)@&FE8YEC	8$<<@RRSb.;;=J)mh.;jHO+5+G('*_~=NRSST($ (1,IJ+-($DE4~E*TXYY[]^444r+   c                   D   e Zd ZU ej                  ed<   ej                  ed<   ej                  ed<   ej                  ed<   ej                  ed<   eed<   eed<   ej                  ed<   ej                  ed	<   ej                  ed
<   eee	e   f   ed<   e
ed<   dZeed<   y)PagedAttentionArgs	input_idsattention_maskposition_idscumulative_seqlens_qcumulative_seqlens_kmax_seqlen_qmax_seqlen_kr   r   logits_indicesblock_tablesr   F	use_cacheN)r   r    r!   r   r   r:   r<   r   r9   r;   r^   rA  r\   r*   r+   r,   r7  r7    s    ||LL ,,,,&,,&LL sDI~&&Itr+   r7  c                    | dd  | d d kD  }|dd  |d d kD  }t        |j                         |j                               }| d |dz    } |d |dz    }| d   }|d   }t        j                  || j                        }t        j                  ||j                        }t        j
                  || dd  d      }	t        j
                  ||dd  d      }
|	d d d f   |
d d d f   k(  }| dd  | d d z
  dk(   | dd  z  }t        j
                  ||d      d d d f   |
k(  }t        j                  t        j                  |||	j                        d      j                         }|j                  ||z  d       |S )Nri   rH   rb   T)rightFdiagonal)
r,  sumr   arangerb   	bucketizetriuonesr\   masked_fill_)r;  r<  valid_docs_qvalid_docs_knum_valid_docstotal_qtotal_k	q_indices	k_indices	q_doc_ids	k_doc_idsdoc_mask	is_causalapply_causalcausal_masks                  r,   create_document_maskrZ    s    (+.B3B.GGL'+.B3B.GGL))+\-=-=-?@N 00D.12DE/0D.12DE"2&G"2&GW-A-H-HIIW-A-H-HII	+?+C4PI	+?+C5QID!YtQw%77H 'qr*-A#2-FF!KLOcdedfOggI??9itDQWMQZZL**UZZAQAQR]^_ddfK<+5>Or+   c                      e Zd Z	 	 d!dedededej                  dej                  dej                  de
j                  de
j                  d	ed
edefdZ ed      d        Ze e
j$                         d               ZdefdZd Zed        Zedefd       Zed        Zed        Zed        Zededefd       Zed        Zedefd       Z ed        Z!ed        Z"y )"ContinuousBatchProcessorr   r`   ra   input_queueoutput_queue
stop_eventmodel_devicemodel_dtype	scheduler	streamingmanual_evictionc                    || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        g | _        |j                  | _        t        |j                        | _        | j                          t!        j"                  | j                  j$                        | _        t)        d      | _        y)a  Initialize the continuous batch processor.

        Args:
            cache: The paged attention cache to use
            generation_config: The generation configuration
            input_queue: Queue for incoming requests
            output_queue: Queue for outgoing results
            stop_event: Event to signal processing should stop
            model_device: Device for model inputs/outputs
            model_dtype: Data type for model inputs/outputs
            streaming: Whether to stream tokens as they're generated
        Tskip_special_tokensN)r   r`   ra   r]  r^  r_  r`  ra  rb  rc  rd  requests_in_batchrq   r   metricssetup_static_tensorsr   from_pretrained_name_or_path	tokenizerr   decode_stream)rM   r   r`   ra   r]  r^  r_  r`  ra  rb  rc  rd  s               r,   r   z!ContinuousBatchProcessor.__init__  s    4 
!2&($(&"".57 !& 6 66u7M7MN!!#0@@AZAZ[)dCr+   Tr$  c                    | j                   }| j                  j                  | j                  j                  z  }t        j
                  | j                  d}|| _        t	        j                  d|ffi || _	        t	        j                  d|ffi || _
        t	        j                  dd||f| j                  | j                        | _        t	        j                  |dz   ffi || _        t	        j                  |dz   ffi || _        t	        j                  |ffi || _        t	        j                  |ffi || _        t	        j"                  |fdfi || _        d| _        d| _        t	        j"                  d|fdfi || _        y )Nrv   ri   rH   r   )rq   r   rl   rm   r   int32r`  tensor_metadatar   r8  r:  ra  r9  r;  r<  r   r   fullr?  r=  r>  
output_ids)rM   Tmax_token_budgetrq  s       r,   rj  z-ContinuousBatchProcessor.setup_static_tensors  sY   !!::004::3H3HH$)KK4;L;LM.aV??!KKAB/B#kk1&'t/?/?HYHY
 %*KKQ$LO$L!$)KKQ$LO$L! ;;t??++'7&9M_M#jj!rE_E**aVRC?Cr+   c                 J   | j                   j                          | j                  j                          | j                  j	                  t        j                  | j                        j                         | j                  j                          | j                  j                          | j                  j	                  d       | j                  j	                  d       | j                  j	                  d       d| _        d| _        | j                   j                          y)z(Reset static tensors for the next batch.rH   r   N)r8  zero_r:  r9  fill_r   finfora  r,  r;  r<  r   r   r?  r=  r>  rs  rL   s    r,   reset_static_tensorsz-ContinuousBatchProcessor.reset_static_tensors  s     	!!!%++d.>.>"?"C"CD!!'')!!'')r"b!!!"%r+   rJ   c                 &   | j                   | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  j                  | j                  ddS )z2Get model keyword arguments for the current batch.F)r8  r:  r9  r;  r<  r   r   r?  r=  r>  r@  r   rA  )r8  r:  r9  r;  r<  r   r   r?  r=  r>  r   r   rL   s    r,   get_model_kwargsz)ContinuousBatchProcessor.get_model_kwargs  s{      --"11$($=$=$($=$=++//"11 -- -- JJ44ZZ
 	
r+   c           	          d| j                    d| j                   d| j                  j                   d| j                  j                   d	| j                         j                         z   S )Nz%ContinuousBatchProcessor(input_queue=z, output_queue=z, active_requests=z, waiting_requests=))r]  r^  rb  r   r   r|  rY   rL   s    r,   rY   z!ContinuousBatchProcessor.__repr__.  s    3D4D4D3E_UYUfUfTggyz~  {I  {I  {Y  {Y  zZ  Zm  nr  n|  n|  nM  nM  mN  NO  P##%..01	
r+   c                    | j                   j                         sU	 | j                   j                         }|8| j                  j	                  |       | j                   j                         sTyy# t
        j                  $ r Y yt        $ rQ}t        j                  d| d       t               j                  d      }|| j                  ||       Y d}~d}~ww xY w)z?Pull new requests from the input queue and add to waiting list.NzError processing new request: Texc_infor   )r]  empty
get_nowaitrb  r   queueEmpty	Exceptionr~   r5   localsr   _handle_request_error)rM   r   es      r,   _get_new_requestsz*ContinuousBatchProcessor._get_new_requests4  s     ""((*9((335=2259 ""((* ;;  9=aSADQ&,hll7&;$..q%8	9s#   A1 A1 1CCACCr   c                    t         j                  |_        t        |      |_        t        |j                  t              r+| j                  j                  |j                        |_	        ng |_	        | j                  j                  |j                  |j                         | j                  j                  |j                                y)z(Handle general request processing error.N)r   r)   r6   r9   r5   
isinstancer/   rb  r   rC   ri  record_request_completionr7   r^  putr[   )rM   r5   r   s      r,   r  z.ContinuousBatchProcessor._handle_request_errorF  s     %++%j e&&,#'>>#S#STYTdTd#eE #%E ..u/A/A5CSCSTe88:;r+   c                 ^   | j                          | j                  j                         sy| j                  j	                  t        | j                  j                        t        | j                  j                               | j                  j                  | j                        | _
        | j                  sy| j                          g }g }g }g }dg}dg}g }| j                  j                  | j                         | j                  D ]R  }|j                  }	|j                  |	       |j                  }
t        |	      }||
z   }t!        t#        |            }||
d }| j$                  j'                  ||      }|| d }|j                  |       |j                  |       |j                  |       |j)                  |d   |z          |j)                  |d   |z          t        |j*                        dk(  r|j)                  |d   dz
         t-        | j.                  |      | _        t-        | j0                  |      | _        |xj                  |z  c_        U t2        j5                  dt        | j                         dt        | j                  j                         dt        | j                  j                         d|d    d	|d    d
| j$                  j7                                 | j9                  |||||||       | j                  j;                  | j$                         y)z=Prepare tensors and metadata for the next model forward pass.Nr   rH   ri   zScheduled: z, Waiting: z
, Active: z	. cum Q: z
. cum KV: z, free blocks: )r  rb  r   ri  record_queue_metricsrP   r   r   r   rq   rh  rz  record_batch_metricsr2   rT   rE   r;   r   r   r   r   rB   r)  r=  r>  r~   r   r   _build_tensorsrecord_kv_cache_memory_metrics)rM   r:  r8  r   r   r;  r<  r?  r   next_input_idspast_lengthquery_length
key_lengthcache_indexpositions_to_addread_indiceswrite_indicess                    r,   prepare_next_batchz+ContinuousBatchProcessor.prepare_next_batchU  sL    	 ~~224))#dnn.L.L*MsSWSaSaSrSrOst!%!>!>t?T?T!U%% 	!!#	
 !s !s))$*@*@A++ 	2E"--N^,//K~.L%3JuZ01K*;<8::;;E;OL(,8M 01l+}- ''(<R(@<(OP ''(<R(@:(MN5--.!3%%&:2&>&BC #D$5$5| DD #D$5$5z BD!!\1!+	2. 	#d4456k#dnnFeFeBfAggqruvz  wE  wE  wU  wU  sV  rW  W`  au  vx  ay  `z  zD  EY  Z\  E]  D^  ^m  nr  nx  nx  nL  nL  nN  mO  P	
 	  	
 	33DJJ?r+   c           	         t        t        j                  fi | j                  } ||      | j                  d d d t        |      f<    ||      | j                  d d d t        |      f<    ||      | j                  d t        |        ||      | j                  d t        |        ||      | j                  d t        |        ||      | j                  d t        |        ||      | j                  d t        |       t        j                  | j                        j                  }	| j                  j                   dk7  rt#        t        |      dz
        D ]  }
||
dz      ||
   z
  ||
dz      ||
   z
  k  r4||
dz      ||
   z
  dk\  r#||
dz      ||
dz      ||
   z
  z
  dz   }|||
   z
  }nd}t%        ||
   ||
dz            }t%        ||
   ||
dz            }t        j&                  t        j(                  | j*                  d||f   j,                  |	| j                  | j.                        |      }|| j*                  d||f<    y y )Npaged_attentionri   .rv   rE  )r	   r   r*  rq  r8  rP   r:  r   r   r;  r<  r?  ry  ra  r,  r`   _attn_implementationr   slicerJ  rr  r9  shaper`  )rM   r8  r:  r   r   r;  r<  r?  	to_tensor	min_valueirF  query_range	key_rangemasks                  r,   r  z'ContinuousBatchProcessor._build_tensors  s    ELLAD,@,@A	.7	.Bq*C	N**+4=l4K!0s<0001/8/E+3{+,-6z-B)#j/*AJK_A`!!"=C(<$=>AJK_A`!!"=C(<$=>5>~5N1c.12KK 0 0155	;;++/@@334q89 H(Q/2Fq2II*1q514H4KKL,QU36J16MMQRR -QU37KAPQE7RUijkUl7lmpqq   (*>q*AAH H#$8$;=QRSVWRW=XY!"6q"9;OPQTUPU;VW	zzJJ++Ci,GHNN!"..#00	 & DH##Ci$?@1H Ar+   c                     | j                    	 | j                   j                         d   }|S ddg}|S # t        $ r ddg}Y |S w xY w)Nr   ri   )rs  tolistr  )rM   outs     r,   _synczContinuousBatchProcessor._sync  s`    ??&oo,,.q1
 
 a&C
	  !f 
	s   3 AAtokenc                 n   | j                   rb| j                  j                  | j                  |j                  d         |_        | j                  j                  |j                                y|j                  t        j                  k(  r*| j                  j                  |j                                yy)zCSend output to the queue based on streaming mode and request state.rH   N)rc  rn  steprm  rC   r8   r^  r  r[   r6   r   r(   )rM   r   r  s      r,   _maybe_send_outputz+ContinuousBatchProcessor._maybe_send_output  s     >>#1166t~~uG[G[\^G_`E!!%"<"<">?\\]333!!%"<"<">? 4r+   c                 T   | j                         }g }t        | j                        D ]U  \  }}|j                  }t	        |j
                        dk(  r| j                  j                  |j                  |j                         t        j                  |_        || j                  |      }|g|_        |j                  |      rs| j                  j                  |j                  |j                         | j                   j#                  |j                  | j$                          |j'                  |       | j)                  ||       "|j                  t        j*                  k(  sAt        j,                  |_        X | j.                  j1                         dk(  rt3        d      y)z0Update request states based on generated tokens.r   )r   zNo more free blocksN)r  	enumeraterh  r/   rP   rB   ri  record_ttft_metricr7   r   r'   r6   r?  r2   rW   r  rb  r   rd  r   r  r%   r&   r   r   ry   )rM   
out_tokensfinished_request_idsr  r   r   r  s          r,   update_batchz%ContinuousBatchProcessor.update_batch  sY    ZZ\
!!$"8"89 	EHAu%%F5--.!3//0B0BEDTDTU,55"4#6#6q#9:$)7 **51LL::5;M;MuO_O_`NN11%2B2BZ^ZnZnVn1p(//7''u5!?!??,DD	E ::))+q0233 1r+   c                 6    | j                   j                         S )z2Check if there are any active or waiting requests.)rb  r   rL   s    r,   r   z-ContinuousBatchProcessor.has_pending_requests  s     ~~2244r+   c                     | j                   }|D ]9  }| j                  ||       | j                  j                  |j                         ; y)z&Handle errors during batch processing.N)rh  r  rb  r   r/   )rM   r5   failed_reqsreqs       r,   handle_batch_errorz+ContinuousBatchProcessor.handle_batch_error  sD     ,, 	:C&&uc2NN))#..9	:r+   c                    t        | j                  j                  j                               }|D ]9  }| j	                  ||       | j                  j                  |j                         ; t        | j                  j                  j                               D ]9  }| j                  j                  j                  |      }| j	                  ||       ; | j                  j                  j                          y)zFail all active requests with the given error.

        Args:
            error: The error to report in the failure message
        N)r;   rb  r   r   r  r   r/   r   keysr   r   clear)rM   r5   requestsr   r   s        r,   fail_all_requestsz*ContinuousBatchProcessor.fail_all_requests  s     66==?@ 	<E&&ue4NN))%*:*:;	<
 4>>::??AB 	5FNN3377?E&&ue4	5
 	--335r+   N)FF)#r   r    r!   r^   r   r   r  Queue	threadingEventr   rb   rc   r   r\   r   r   rj  no_gradrz  r7  r|  rY   r  r@   r  r  r  r  r<   r  r  r   r  r  r*   r+   r,   r\  r\    s      %/D"/D !/D ,	/D
 [[/D kk/D OO/D ll/D [[/D /D /D /Db tD D& U]]_    
"4 
&
 9 9" <, < < <@ <@| ,H ,H\   @ @S @ @ 4 4* 5d 5 5 : : 6 6r+   r\  )fifoprefill_firstc            	          e Zd ZdZ	 	 	 d(dededefdZed        Zd Z	d)d	ed
e
e   fdZd*d
e
e   fdZ	 d+dee   de
e   de
e   defdZdeee      fdZd*de
e   fdZd Zed        Zedefd       Z ed      d        Z ed      d        Z ed      defd       Zd  Z ed!      d,ded"efd#       Z ed$      d%        Zede
e   fd&       Zedefd'       Z y)-ContinuousBatchingManagerzManager for handling continuous batching of generation requests.

    This class provides the user interface for submitting generation requests,
    retrieving results, and managing the background generation thread.
    ra   rd  rc  c                    |j                         | _        ||j                  n|}|| _        t        j                  |      | _        t        j                         | _        t        j                         | _	        || _
        t        |dd      | _        d| _        d| _        t        j                         | _        d| j                  j                  _        t        |dd      | _        | j                  j'                  |      | _        t        |dd      | _        t        |d	d      | _        || _        d| _        t3        d
      | _        y)aS  Initialize the continuous batching manager.

        Args:
            model: The language model for generation
            generation_config: Configuration for generation parameters
            max_queue_size: Maximum size of the request queue (0 = unlimited)
            streaming: Whether to stream tokens as they are generated
        N)maxsizelog_prob_generationFr   	do_sampleTuse_cuda_graphprofilerf  )evalmodelra   r  r  r]  r^  r  r  r_  rc  rw   r  _generation_thread_request_counterLock_request_locktop_pr  _get_logits_processorlogit_processorr  r  rd  batch_processorr   rn  )rM   r  ra   rd  max_queue_sizerc  s         r,   r   z"ContinuousBatchingManager.__init__  s     ZZ\
7H7PE33Vg!2 ;;~>!KKM#//+"#*+<>SUZ#[ "& !&^^--1

$$* !2KF#zz??@QR%&79I4P0)UC.CG)dCr+   c                 V   | j                   0| j                   j                         rt        j                  d       yt	        j
                         | _        t        j                  | j                        | _         | j                   j                          t        j                  d       y)z'Start the background generation thread.Nz"Manager thread is already running.)targetz$Continuous batching manager started.)r  is_aliver~   r   r  r  _result_queuer  Thread_run_generation_loopstartr   rL   s    r,   r  zContinuousBatchingManager.startC  sw     "".43J3J3S3S3UNN?@"[[]"+"2"2$:S:S"T%%':;r+   c                 V    | j                   duxr | j                   j                         S )z5Check if the background generation thread is running.N)r  r  rL   s    r,   
is_runningz$ContinuousBatchingManager.is_runningO  s'    &&d2Yt7N7N7W7W7YYr+   Nblocktimeoutc                    | j                   t        j                  d       y| j                  j	                         s/| j                  j                          t        j                  d       |r| j                  |       yy)zSignal the background thread to stop.

        Args:
            block: Whether to wait for the thread to stop
            timeout: Maximum time to wait for the thread to stop
        NzManager not started.z'Stopping continuous batching manager...)r  r~   r   r_  is_setr   r   join)rM   r  r  s      r,   stopzContinuousBatchingManager.stopS  sa     ""*NN12%%'OO!KKABIIg r+   c                     | j                   i| j                   j                  |       | j                   j                         rt        j	                  d       yt        j                  d       d| _         yy)zWait for the background thread to finish.

        Args:
            timeout: Maximum time to wait for the thread to stop
        Nr  z2Generation thread did not exit after join timeout.z$Continuous Batching Manager stopped.)r  r  r  r~   r   r   )rM   r  s     r,   r  zContinuousBatchingManager.joine  s`     "".##(((9&&//1STBC*.' /r+   r8  r/   rG   rJ   c                    |9| j                   5  d| j                   }| xj                  dz  c_        ddd       || j                  j                  n|}t	        |t        |      t        |      || j                  j                        }| j                  j                  |dd       t        j                  d| d	       |S # 1 sw Y   xY w)
a/  Add a new generation request to the queue.

        Args:
            input_ids: Input token IDs to use as prompt
            request_id: Optional custom request ID (auto-generated if None)
            **kwargs: Additional generation parameters

        Returns:
            str: The request ID
        Nreq_ri   )r/   r2   rA   rG   rI   T
   r  r  zAdded request z
 to queue.)r  r  ra   rG   r@   r;   rI   r]  r  r~   debug)rM   r8  r/   rG   r   s        r,   add_requestz%ContinuousBatchingManager.add_requests  s     ## +#D$9$9#:;
%%*%+ CQBX//>>^l!I O)//<<
 	U$;~j\<=#+ +s   %CCinputsc                 ^    t        |      D ]  \  }}d| } | j                  |fd|i| ! y )N
batch_req_r/   )r  r  )rM   r  r   r  r8  r   s         r,   add_requestsz&ContinuousBatchingManager.add_requests  sB    %f- 	ELAy!!%FDYD6DVD	Er+   c                    | j                   | j                  j                         ry	 | j                  j                  d|      }t        j                  d|j                          |S # t        j                  $ r Y yw xY w)zRetrieve one result from the output queue.

        Args:
            timeout: Maximum time to wait for a result

        Returns:
            Optional[Dict]: The result data or None if timeout
        NTr  zRetrieved result for request )	r  r^  r  r   r~   r  r/   r  r  )rM   r  results      r,   
get_resultz$ContinuousBatchingManager.get_result  sz     ""*t/@/@/F/F/H	&&**w*GFLL89J9J8KLMM{{ 		s   A A* *B ?B c              #   B  K   | j                   | j                   j                         s| j                  j                         s[| j	                  d      }|| | j                   | j                   j                         r?| j                  j                         sZyyw)z.Iterate over results as they become available.Ng?r  )r  r  r^  r  r  )rM   r  s     r,   __iter__z"ContinuousBatchingManager.__iter__  s      ##/D4K4K4T4T4V^b^o^o^u^u^w__S_1F!	 ##/D4K4K4T4T4V^b^o^o^u^u^ws   A?BBBc                    t         j                  j                  | j                  j                        }|j                  t         j                  j                                t         j                  j                  |      5  | j                  |       d d d        t         j                  j                         j                  |       t         j                  j                         | _
        t         j                  j                  | j                  |      5  | j                  |       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)NrC  )stream)r   r  Streamr  rb   wait_streamcurrent_streamr  _generation_step	CUDAGraphgraph)rM   r  r  s      r,   warmupz ContinuousBatchingManager.warmup  s    ""$***;*;"<5::4467ZZv& 	3!!/2	3 	

!!#//7ZZ))+
ZZdjj8 	3!!/2	3 	3	3 	3	3 	3s   D1D=1D:=Er  c                 .   |j                         }t        j                         5  | j                  |      }| j                  r|j
                  j                  |       | j                  ||      }| j                  ||       ddd       y# 1 sw Y   yxY w)z6Perform a single generation step. This is cuda graphedN)	r|  r   r  _model_forwardr  output_probscopy__process_logit_sample)rM   r  
batch_datalogitsprobss        r,   r  z*ContinuousBatchingManager._generation_step  s}     %557
]]_ 	1((4F'',,226:''
F;ELL%0	1 	1 	1s   ABBmodel_forwardr   c                 :     | j                   di |j                  S )Nr*   )r  r	  )rM   r  s     r,   r  z(ContinuousBatchingManager._model_forward  s    tzz'J'...r+   logit_processingc                     t        | j                  d      r"| j                  j                  |d   |d          | j                  |d   |      S )Nset_continuous_batching_contextr?  r;  r8  )rz   r  r  )rM   r  r	  s      r,   r  z(ContinuousBatchingManager._process_logit  sT     4'')JK  @@+,j9O.P ##J{$;VDDr+   samplingc                    | j                   rKt        j                  j                  |d      }t	        j
                  |d   d      j                  d      }nt	        j                  |d      }|j                  j                  |       y )NrH   )dimr   ri   )num_samples)
r  nn
functionalsoftmaxr   multinomialsqueezeargmaxrs  r  )rM   r  r
  next_tokenss       r,   r  z!ContinuousBatchingManager._sample  sg    >>MM))%R)8E++E!H!DLLQOK,,u"5K""((5r+   c                    d}	 t        | j                  j                  | j                  | j                  j                  | j                  j
                  t        | j                  j                        t        | j                  dd            }d}t        | j                  d      rKt        j                  | j                  j                        }|&t        j                  d| d       t         }nt         }t#        || j                  j                  | j                  | j                  | j$                  | j&                  | j                  j                  | j                  j
                   ||| j(                        | j*                  | j(                        }|| _        d}| j&                  j/                         r|j1                         rB| j3                  ||       |rd	}| j&                  j/                         s1|j1                         rBt        j;                  d       y# t4        $ r6}t        j7                  d
| d       | j9                  ||       Y d}~Pd}~ww xY w# t        j;                  d       w xY w)z6Main processing loop running in the background thread.N_tp_size   )rd   rf   rb  zScheduler 'z ' not found. Defaulting to FIFO.TFzError in generation loop: r  zGeneration loop finished.)r^   r  r`   ra   rb   rc   rP   r]  r  rw   rz   SCHEDULER_MAPPINGr   rb  r~   r   r   r\  r^  r_  rd  rc  r  r  r   _inner_generation_loopr  r5   _handle_critical_errorr   )rM   r  paged_attention_cacherb  is_firstr  s         r,   r  z.ContinuousBatchingManager._run_generation_loop  s   ,	5$7

!!&&

!!

   !1!1!7!78

J:%! It--{;-11$2H2H2R2RS	$NN[;[#\] -I *	6%

!!&&  !!

!!

  /1E1EF$$O $3D H--/O4X4X4Z++OXF$H --/O4X4X4Z KK34	  	<LL5aS9DLI''?;;	< KK34s0   G3H 8H 	I(,II! II! !I8generation_loopr"  c           
         t         j                  j                         rt         j                  j                          |j	                          t               \  }}}}t        j                  d| d| d| d|        t         j                  j                         rP| j                  rD|r| j                  |       nAt        | d      r	 | j                          n#| j                  |       n| j                  |       t         j                  j                         rt         j                  j                          |j!                          y # t        $ r5}t        j                  d| d       |j                  |       Y d }~y d }~ww xY w)	Nz[Memory] Device: z	, Total: z, Reserved: z, Allocated: r   zModel forward pass failed: Tr  )r   r  r  synchronizer  r#  r~   r   r  r  rz   _graph_replayr  r5   r  r  r  )rM   r  r"  rb   r.  r/  r   r  s           r,   r  z0ContinuousBatchingManager._inner_generation_loop  s/   ::""$JJ""$**,-B-D*x'xy|H:Ubclbmno::""$)<)<O,w'&&( %%o6!!/2::""$JJ""$$$& ! LL#>qc!BTLR#66q9s   E	 		F+FFgraph_replayc                 8    | j                   j                          y r   )r   replayrL   s    r,   r&  z'ContinuousBatchingManager._graph_replay.  s    

r+   c                     | j                   j                          	 	 | j                  j                         }||j	                  ||       /# t
        j                  $ r Y nw xY w||j                  |       yy)z:Handle critical errors that terminate the generation loop.N)r_  r   r]  r  r  r  r  r  )rM   r5   r  req_datas       r,   r   z0ContinuousBatchingManager._handle_critical_error2  s~     		++668".#99%J  {{ 		 &--e4 's   0A A"!A"c                     | j                   st        d      | j                  &| j                  j                  j	                  |       yy)zSEvict a request from the cache. It is assumed that the request is already finished.z0Manual eviction is not enabled for this manager.N)rd  RuntimeErrorr  rb  r   r   s     r,   evict_request_from_cachez2ContinuousBatchingManager.evict_request_from_cacheE  sD     ##QRR+  **99*E ,r+   )Fr   T)FNr   )NNr   )!r   r    r!   r"   r   r\   r   r   r  r  r
   r=   r  r  r;   r<   r9   r  r  r.   r  r  r  r\  r  r  r  r  r  r  r&  r   r.  r*   r+   r,   r  r    s    !&"D ,"D 	"D "DH 	< 	<Z$ % $/HUO / gkc08V^_bVc	BE4S	? E(3C*D $ 
3 
3 10H 1 1 o&/ '/ ()E *E j!6'? 6 "6/5b '('6N 'Z^ ' )'0 n% & 5XF^=_ 5 5$ F3 F Fr+   r  c                       e Zd ZdZ	 	 	 	 ddee   dedededef
dZ	e
 ej                         	 	 dd	eee      dee   d
edeee      fd              Zy)ContinuousMixinz?Mixin class for models to add continuous batching capabilities.Nra   rd  r  rc  rJ   c                 
   t        | d      rt        | d      rt        | d      st        d      ||n| j                  }|t        d      |j                  t
        j                  d       d|_        t        | ||||      S )	a  Initialize a manager for continuous batching inference.

        Args:
            generation_config: Custom generation configuration
            max_queue_size: Maximum size of the input request queue
            streaming: Whether to stream tokens as they are generated

        Returns:
            `ContinuousBatchingManager`: The manager instance to add requests and retrieve results.
        r`   rb   rc   z;Model must have 'config', 'device', and 'dtype' attributes.z8A GenerationConfig must be provided or set in the model.zE`eos_token_id` not set in GenerationConfig. Setting to -1 (disabled).rH   )r  ra   rd  r  rc  )rz   AttributeErrorra   ry   rI   r~   r   r  )rM   ra   rd  r  rc  
gen_configs         r,   init_continuous_batchingz(ContinuousMixin.init_continuous_batchingQ  s    " tX&gdH.EWUY[bMc !^__*;*G&TMcMc
WXX""*NNbc&(J# )(+)
 	
r+   r  progress_barc                    |sg S | j                  |      }|j                          i }t        |      }	 ddlm}  |t
        g      5  t        || d| dd      5 }	 |j                  |fi | d}
|
|k  r|j                  d	      }|r|j                  }|j                  t        j                  k(  r|||<   |
dz  }
|	j                  d       t
        j                  |j                  j                   j#                  |j$                               n&|j'                         st
        j)                  d
       n|
|k  rddd       ddd       |j-                  dd       |S # 1 sw Y   &xY w# 1 sw Y   *xY w# t*        $ r$}t
        j)                  d| d       Y d}~Ud}~ww xY w# |j-                  dd       w xY w)a=  Generate sequences for a batch of prompts using continuous batching.

        Args:
            inputs: List of input token sequences (prompts)
            generation_config: Optional generation configuration
            **kwargs: Additional generation parameters

        Returns:
            `list[list[int]]`: A list containing the generated sequences (including prompt tokens
                                if not handled otherwise) for each input prompt, in the same order.
                                Returns an empty list `[]` for requests that failed.
        )ra   r   )logging_redirect_tqdmzSolving z	 requestsrequest)r.  disabledescunitri   r  z*Generation thread terminated unexpectedly.NzError during batch generation: Tr  g      @r  )r4  r  rP   tqdm.contrib.loggingr7  r~   r   r  r  r/   r6   r   r(   r   r  r  rm  decoder3   r  r5   r  r  )rM   r  ra   r5  r   managerresultsrd   r7  pbarfinished_countr  r   r  s                 r,   generate_batchzContinuousMixin.generate_batchv  s   * I //BS/T6{	2B&x0 &&!--#L>;"	 &
 (G((:6:%&N(<7!(!3!3A!3!>!%+%6%6F%}}0F0FF28 .! 3 $A"LL)@)@)J)J)Q)QRXRiRi)jk#*#5#5#7 &-Y Z % )<7&&4 LLtSL15& && &.  	OLL:1#>LNN	O LLtSL1sZ   E< E0C!E$?E0E< $E-	)E00E95E< <	F)F$F, $F))F, ,G)NFr   F)NT)r   r    r!   r"   r
   r   r\   r<   r  r4  r   r   inference_moder;   rB  r*   r+   r,   r0  r0  N  s    I 9= %#
#$45#
 #
 	#

 #
 
##
J U 9=!	9T#Y9 $$459 	9 
d3i9  9r+   r0  )7r  r  r>   abcr   r   collectionsr   dataclassesr   r   enumr   	functoolsr	   typingr
   r   r   torch.nnr  tokenizers.decodersr   r   configuration_utilsr   generation.configuration_utilsr   tokenization_utils_fastr   utils.loggingr   utils.metricsr   r   r   r   	getLoggerr   r~   r.   r@   r^   r   r   r  r#  r   r}   r7  rZ  r\  r  r  r0  r*   r+   r,   <module>rR     s       #  (   "   ,  2 = = # R R	D 	 
		8	$ ; ; ;* G
 G
 G
T mZ mZ mZ`$ $N s5I s5 s5l s5I s5 s5lC0 4 
--5 5>      < H6 H6 H6X
 *  sF sF sFl	c cr+   