
    rh,                     h    d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
 g d	Z G d
 de      ZdgZy)z
Processor class for EVOLLA.
    N)OptionalUnion   )BatchFeature)ProcessorMixin   )AutoTokenizer)aa_seqfoldseekmsac            
            e Zd ZdZddgZdgZdZdZdZd fd	Z	ddZ
	 ddefd	Z	 	 	 	 dd
eeee   ef      deeeee      ee   f      dee   dee   fdZd Zd Zd Zd Z fdZe fd       Z xZS )EvollaProcessoran  
    Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor.

    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.

    Args:
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        tokenizer (`LlamaTokenizerFast`, *optional*):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text to be generated.
    protein_tokenizer	tokenizersequence_max_lengthr	   c                     |t        d      |t        d      t        | 	  ||       d| j                  _        || _        || _        y )Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__r   	pad_tokenprotein_max_lengthtext_max_length)selfr   r   r   r   kwargs	__class__s         /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/evolla/processing_evolla.pyr   zEvollaProcessor.__init__;   sS    $JKKABB*I6#A "4.    c           
      d   g }|D ]  }|j                  d      }|j                  d      }dj                  t        ||      D cg c]&  \  }}|j                         |j	                         z   ( c}}      }	|j                  |	        | j                  j                  |dd|d      }
|
S c c}}w )Nr
   r    ptT)return_tensors
truncation
max_lengthpadding)getjoinzipupperlowerappendr   batch_encode_plus)r   proteinsr   sa_sequencesproteinr
   r   sfsa_sequence	sa_tokenss              r   process_proteinsz EvollaProcessor.process_proteinsG   s     	-G[[*F{{:.H''SQYEZ"[TQ1779qwwy#8"[\K,		- **<<$K]gk = 
	  #\s   +B,r   c                     g }|D ]1  }| j                   j                  |dd      }|j                  |       3 | j                  |dddd|      }|S )NFT)tokenizeadd_generation_promptr    longest)add_special_tokensr!   r$   r"   r#   )r   apply_chat_templater*   )r   textsr   promptsmessagespromptprompt_inputss          r   process_textzEvollaProcessor.process_textT   sw    
  	#H^^77&* 8 F
 NN6"	# $& ' 
 r   r,   messages_listr   c                    ||t        d      ||n| j                  }||n| j                  }t        |t              r|g}t        |t
        t        f      rt        |d   t
        t        f      s|g}t        |t
        t        f      rt        d |D              st        d      t        |t
        t        f      r6t        d |D              s$t        ddj                  t               d|       t        |t
        t        f      r|D ]  }t        |t
        t        f      st        d	t        |       d
      t        d |D              st        d      t        d |D              st        d |D              sst        d|        nt        dt        |       d
      | j                  ||      }| j                  ||      }t        |d   |d   |d   |d   d      S )av  This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by
        the model.

        Args:
            proteins (`Union[List[dict], dict]`):
                A list of dictionaries or a single dictionary containing the following keys:
                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
            messages_list (`Union[List[List[dict]], List[dict]]`):
                A list of lists of dictionaries or a list of dictionaries containing the following keys:
                    - `"role"` (`str`) -- The role of the message.
                    - `"content"` (`str`) -- The content of the message.
            protein_max_length (`int`, *optional*, defaults to 1024):
                The maximum length of the sequence to be generated.
            text_max_length (`int`, *optional*, defaults to 512):
                The maximum length of the text.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        z3You need to specify `messages_list` and `proteins`.r   c              3   <   K   | ]  }t        |t                y wN
isinstancedict.0ps     r   	<genexpr>z+EvollaProcessor.__call__.<locals>.<genexpr>   s     :aST:a;N:a   zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c              3   \   K   | ]$  }t        d  |j                         D               & yw)c              3   ,   K   | ]  }|t         v   y wrC   )PROTEIN_VALID_KEYS)rH   ks     r   rJ   z5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>   s     :A'':s   N)allkeysrG   s     r   rJ   z+EvollaProcessor.__call__.<locals>.<genexpr>   s&      ;
?@C:::;
s   *,z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c              3   <   K   | ]  }t        |t                y wrC   rD   rH   ms     r   rJ   z+EvollaProcessor.__call__.<locals>.<genexpr>   s     A1:a.ArK   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c              3   T   K   | ]   }t        |j                               d k7   " yw)r   N)lenrQ   rT   s     r   rJ   z+EvollaProcessor.__call__.<locals>.<genexpr>   s     <as1668})<s   &(c              3   X   K   | ]"  }t        |j                               d dhk7   $ yw)rolecontentN)setrQ   rT   s     r   rJ   z+EvollaProcessor.__call__.<locals>.<genexpr>   s+      D=>CMfi%88Ds   (*zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskr\   r]   )data)r   r   r   rE   rF   listtuplerP   r&   rN   typeanyr3   r?   r   )	r   r,   r@   r   r   r   r<   r2   text_tokenss	            r   __call__zEvollaProcessor.__call__l   s,   B }4RSS3E3Q/W[WnWn-<-H/dNbNb h% zHmdE]3J}UVGWZ^`eYf<g*OMhu.s:aX`:a7atuuhu.s ;
DL;
 8
 D99/01 2$:'  mdE]3) !(T5M:$'bcghpcqbrrs%tuuAAA$ A  <8<< DBJD A %$$,:/  XY]^kYlXmmno  ))(4FG	''G%.{%;*34D*E(5"-.>"?	
 	
r   c                 :     | j                   j                  |i |S rC   )r   batch_decoder   argsr   s      r   rh   zEvollaProcessor.batch_decode   s    *t~~**D;F;;r   c                 :     | j                   j                  |i |S rC   )r   decoderi   s      r   rl   zEvollaProcessor.decode   s    $t~~$$d5f55r   c                 :     | j                   j                  |i |S rC   )r   rh   ri   s      r   protein_batch_decodez$EvollaProcessor.protein_batch_decode   s     2t%%22DCFCCr   c                 :     | j                   j                  |i |S rC   )r   rl   ri   s      r   protein_decodezEvollaProcessor.protein_decode   s     ,t%%,,d=f==r   c                    | j                   j                  t        j                  j	                  || j
                               d| j                  v }|r| j                  j                  d      nd }|r|| j                  j                  d       t        |   |fi |}|r|| j                  j                  |d       |S )Nr   )r   save_pretrainedospathr&   protein_tokenizer_dir_name
attributesindexremover   insert)r   save_directoryr   protein_tokenizer_presentprotein_tokenizer_indexoutputsr   s         r   rr   zEvollaProcessor.save_pretrained   s    ..rww||NDLkLk/lm %84??$J!Pi$//"7"78K"Los$)@)LOO""#67').CFC$)@)LOO""#:<OPr   c                     t        |   |fi |}t        |t              r|d   }t	        j                  || j
                        }||_        |S )Nr   )	subfolder)r   from_pretrainedrE   rb   r	   ru   r   )clspretrained_model_name_or_pathr   	processorr   r   s        r   r   zEvollaProcessor.from_pretrained   sY    G+,ITVT	 i'!!I)99)S5S5S
 '8	#r   )N      )r   )r   )NNNN)__name__
__module____qualname____doc__rv   valid_kwargsprotein_tokenizer_classtokenizer_classru   r   r3   intr?   r   r   ra   rF   rf   rh   rl   rn   rp   rr   classmethodr   __classcell__)r   s   @r   r   r       s    " &{3J)*L .%O!4
/   # 4 7;GK,0)-W
5dT!123W
  d4:&6T
&B CDW
 %SM	W

 "#W
r<6D>
(  r   r   )r   rs   typingr   r   feature_extraction_utilsr   processing_utilsr   autor	   rN   r   __all__ r   r   <module>r      s@    
 " 4 ! 3 Tn Tn 
r   