
    rhd                         d Z ddlZddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZ dZ ej                  e      Zd	ed
edej(                  fdZ G d d      Z G d d      ZdgZy)z%REALM Retriever model implementation.    N)OptionalUnion)hf_hub_download)AutoTokenizer   )logging	strtoboolzblock_records.npyblock_records_pathnum_block_recordsreturnc                     dd l mc m} |j                  j	                  | d      }|j                  |d      }t        |j                  d      j                               }|S )Nr   i    )buffer_sizeT)drop_remainder   )	tensorflow.compat.v1compatv1dataTFRecordDatasetbatchnexttakeas_numpy_iterator)r
   r   tfblocks_dataset	np_records        /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.pyconvert_tfrecord_to_npr   "   s]    %%WW,,-?M^,_N#))*;D)QN^((+==?@I    c                   &    e Zd ZdZ	 	 	 	 ddZd Zy)ScaNNSearcherztNote that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included.c                     ddl m}  |||d      }|j                  |||      }|j                  |      }|j	                         | _        y)zBuild scann searcher.r   )builderdot_product)dbnum_neighborsdistance_measure)
num_leavesnum_leaves_to_searchtraining_sample_size)dimensions_per_blockN)#scann.scann_ops.py.scann_ops_pybindr#   treescore_ahbuildsearcher)	selfr%   r&   r+   r(   r)   r*   Builderr#   s	            r   __init__zScaNNSearcher.__init__/   sU     	KR}}],,!8Lcw  
 ""8L"Mr   c                     | j                   j                  |j                         j                               \  }}|j	                  d      S )Nint64)r0   search_batcheddetachcpuastype)r1   question_projectionretrieved_block_ids_s       r   r6   zScaNNSearcher.search_batchedD   s@    !%!=!=>Q>X>X>Z>^>^>`!aQ"))'22r   N)   i  d   i )__name__
__module____qualname____doc__r3   r6    r   r   r!   r!   ,   s    ~  #(*3r   r!   c                   p     e Zd ZdZ fdZddZedeee	e
j                  f      fd       Zd Zd Z xZS )	RealmRetrieverah  The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
    positions."

        Parameters:
            block_records (`np.ndarray`):
                A numpy array which contains evidence texts.
            tokenizer ([`RealmTokenizer`]):
                The tokenizer to encode retrieved texts.
    c                 >    t         |           || _        || _        y N)superr3   block_records	tokenizer)r1   rI   rJ   	__class__s      r   r3   zRealmRetriever.__init__T   s    *"r   c                    t        j                  | j                  |d      }| j                  j	                  |d   d      }g }g }	|D ]2  }
|j                  |       |	j                  |
j	                                4 | j                  ||	ddd|      }|j                  |      }|| j                  ||      |fz   S d d d |fS )Nr   )indicesaxisT)skip_special_tokens)padding
truncationreturn_special_tokens_mask
max_length)npr   rI   rJ   decodeappendconvert_to_tensorsblock_has_answer)r1   r;   question_input_ids
answer_idsrS   return_tensorsretrieved_blocksquestiontext	text_pairretrieved_blockconcat_inputsconcat_inputs_tensorss                r   __call__zRealmRetriever.__call__Y   s    774#5#5?RYZ[>>(();A)>TX(Y	/ 	7OKK!_3356	7 )TdW[hr ' 
 !. @ @ P!((
CG\F^^^$&;<<r   pretrained_model_name_or_pathc                    t         j                  j                  |      r%t         j                  j                  |t              }nt        d|t        d|}t        t         j                  j                  dd            st        d      t        j                  |d      }t        j                  |g|i |} | ||      S )N)repo_idfilenameTRUST_REMOTE_CODEFalseaz  This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially malicious. It's recommended to never unpickle data that could have come from an untrusted source, or that could have been tampered with. If you already verified the pickle data and decided to use it, you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it.T)allow_picklerC   )ospathisdirjoin_REALM_BLOCK_RECORDS_FILENAMEr   r	   environget
ValueErrorrT   loadr   from_pretrained)clsrd   init_inputskwargsr
   rI   rJ   s          r   rt   zRealmRetriever.from_pretrainedn   s    77==67!#.KMj!k!0 "5@]"ag" (;WEF=   2F!112OhR]hagh	=),,r   c                     t        j                  t        j                  j	                  |t
              | j                         | j                  j                  |       y rG   )	rT   saverk   rl   rn   ro   rI   rJ   save_pretrained)r1   save_directorys     r   rz   zRealmRetriever.save_pretrained   s8    
^-JKTM_M_`&&~6r   c           	      b   g }g }g }d}|j                   D ]T  }|j                         }|j                  | j                  j                        }	|	dz   ||	dz   d j                  | j                  j                        z   }
|j                  g        |j                  g        |D ]n  }t        |	dz   |
      D ]Z  }|d   ||   k(  s|||t        |      z    |k(  s$|d   j                  |       |d   j                  |t        |      z   dz
         \ p t        |d         dk(  r|j                  d       #|j                  d       t        |d         |kD  sGt        |d         }W t        ||      D ]0  \  }}t        |      |k  sdg|t        |      z
  z  }||z  }||z  }2 |||fS )z&check if retrieved_blocks has answers.r   r   NFT)		input_idstolistindexrJ   sep_token_idrV   rangelenzip)r1   ra   rZ   has_answers	start_posend_posmax_answersinput_idinput_id_listfirst_sep_idxsecond_sep_idxansweridx
start_pos_end_pos_paddeds                   r   rX   zRealmRetriever.block_has_answer   s   	%// 	5H$OO-M)//0K0KLM*Q.}q?P?R1S1Y1YZ^ZhZhZuZu1vvNR NN2$ F !2NC FCayM#$66(sS[/@AVK%bM005#BK..sS[/@1/DE	FF 9R=!Q&""5)""4(y}%3"%im"4K)	5. %(	7$; 	# J:,s:!>?f$
F"		#
 Iw..r   )Npt)r?   r@   rA   rB   r3   rc   classmethodr   r   strrk   PathLikert   rz   rX   __classcell__)rK   s   @r   rE   rE   I   sM    #
=* -HU3PRP[P[K[E\<] - -*7#/r   rE   )rB   rk   typingr   r   numpyrT   huggingface_hubr   transformersr   utilsr   r	   ro   
get_loggerr?   loggerr   intndarrayr   r!   rE   __all__rC   r   r   <module>r      sw    , 	 "  + & ( !4  
		H	%s s rzz 3 3:d/ d/N 
r   