
    rhO$                     Z   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ  ej6                  e      Z e ej>                               Z  e!d e D              Z"e G d d             Z# G d de      Z$ G d de      Z%y)    N)	dataclassfield)Enum)OptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   4   K   | ]  }|j                     y wN)
model_type).0confs     s/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/data/datasets/squad.py	<genexpr>r   "   s     EDOOEs   c                      e Zd ZU dZ eddddj                  e      z   i      Zee	d<    edddi      Z
ee	d	<    ed
ddi      Zee	d<    ed
ddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    eddd i      Zee	d!<    ed"dd#i      Zee	d$<   y)%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r    r"   intr#   r%   r'   r(   boolr)   r*   floatr,   r-   r/        r   r   r   %   s    (KdiiXcNd(deJ  (pqHc   Q
NC  rsJ  "/
c  #J
s  ")\ ]OT  %*)o p%T  (-v'rs(u  f&qrK  C
GS  f6k-lmGSmr<   r   c                       e Zd ZdZdZy)SplittraindevN)r0   r1   r2   r?   r@   r;   r<   r   r>   r>   h   s    E
Cr<   r>   c                       e Zd ZU dZeed<   ee   ed<   eed<   e	ed<   dej                  dddfded	ed
ee   deeef   dee	   dee   dee   fdZd Zdeeej(                  f   fdZy)SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                    || _         || _        |j                  r
t               n	t	               | _        t        |t              r
	 t        |   }|| _
        |j                  rdnd}t        j                  j                  ||n|j                  d|j                   d|j                   j"                   d|j$                   d|       }	|	dz   }
t'        |
      5  t        j                  j)                  |	      r|j*                  st-        j,                         }t/                t1        j2                  |	d      | _        | j4                  d	   | _        | j4                  j9                  d
d       | _        | j4                  j9                  dd       | _        t>        jA                  d|	 dt-        j,                         |z
         | j:                  | j<                  dt>        jC                  d|	 d       nI|t        jD                  k(  r+| j
                  jG                  |j                        | _        n*| j
                  jI                  |j                        | _        tK        | j<                  ||j$                  |jL                  |jN                  |t        jP                  k(  |jR                  |      \  | _        | _        t-        j,                         }t1        jT                  | j6                  | j:                  | j<                  d|	       t>        jA                  d|	 dt-        j,                         |z
  dd       d d d        y # t        $ r t        d      w xY w# 1 sw Y   y xY w)Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrD   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rS   rH   r"   r#   r%   is_trainingr/   return_dataset)rD   rR   rS   z!Saving features into cached file z [took z.3fz s])+rC   rF   r)   r   r   	processor
isinstancer6   r>   KeyErrorrE   ospathr4   r    value	__class__r0   r"   r   existsr(   timer   torchloadold_featuresrD   getrR   rS   loggerinfowarningr@   get_dev_examplesget_train_examplesr   r#   r%   r?   r/   save)selfrC   rH   rI   rE   rF   rJ   rK   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__w   s    	%:"/3/K/K)+QaQcdC AT{ 	"::d!ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWde 
 )72	i  -	ww~~23D<P<P		(*$)JJ/CRV$W! !% 1 1* =#0044YE $ 1 1 5 5j$ G89M8Nn]_c_h_h_jmr_r <<'4==+@NN/0D/E F& &
 599$$(NN$C$CDMM$RDM$(NN$E$Edmm$TDM.P!]]'#'#6#6#%)%:%: $ 3 LL#1	/+t| 		

!%4<<UYUbUbc(
 78L7MWUYU^U^U`chUhilTmmpqW-	 -	  A?@@A-	 -	s   	M I(M M M)c                 ,    t        | j                        S r   )lenrD   )ri   s    r   __len__zSquadDataset.__len__   s    4==!!r<   returnc                 (   | j                   |   }t        j                  |j                  t        j                        }t        j                  |j
                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }|||d}	| j                  j                  dv r|	d= | j                  j                  dv r|	j                  ||d       | j                  j                  r|	j                  d|i       | j                  rW|	j                  dt        j                   |j"                  t        j$                        | j                  j&                  z  i       | j(                  t*        j,                  k(  rrt        j                  |j.                  t        j                        }
t        j                  |j0                  t        j                        }|	j                  |
|d	       |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertrw   )xlnetrx   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rD   r_   tensorru   longrv   rw   r}   r~   r:   r   rC   r   updater)   rF   onesshapeint64r-   rE   r>   r?   start_positionend_position)ri   ifeatureru   rv   rw   r}   r~   r   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   --"LL!2!2%**E	g&<&<EJJOg&<&<EJJOLL!2!2%**E	gnnEKK@W%:%:%++N #,,
 99#PP'(99#33MM	VDEyy00>?))wIOO5;;)WZ^ZcZcZkZk)kmn99##ll7+A+ATO!LL)=)=UZZPMMMoP]^_r<   )r0   r1   r2   r3   r   r7   listr   r>   r9   r?   r   r   r8   r   r6   rn   rq   dictr_   Tensorr   r;   r<   r   rB   rB   m   s     %$=!!
K '+"'++05#'(,J(J 'J sm	J
 CJJ  (~J C=J !JX" S%,,%6 7  r<   rB   )&rY   r^   dataclassesr   r   enumr   typingr   r   r_   filelockr   torch.utils.datar	   models.auto.modeling_autor   tokenization_utilsr   utilsr   r   processors.squadr   r   r   r   
get_loggerr0   rc   r   keysMODEL_CONFIG_CLASSEStupler5   r   r>   rB   r;   r<   r   <module>r      s    
  (  "   $ M 5 6 t t 
		H	%E@EEGH E0DEE ?n ?n ?nDD 
y7 yr<   