
    rh                        d dl mZ d dlZd dlZd dlZd dlmZ 	 d dlmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZ d	d
lmZmZmZ  ej4                  e      Z G d de      Zy# e	$ r	 d dl
mZ Y Yw xY w)    )annotationsN)PreTrainedTokenizerBase)Self)nn)tqdm)Module)fullnamehttp_getimport_from_string   )TransformersTokenizerWrapperWhitespaceTokenizerWordTokenizerc                      e Zd ZU g dZded<   dZded<   	 	 d	 	 	 	 	 ddZd	 Zdd
ZddZ	dddZ
d Ze	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd e       df	 	 	 	 	 	 	 dd       Zy)WordEmbeddingstokenizer_classupdate_embeddingsmax_seq_length	list[str]config_keyszwordembedding_config.jsonstrconfig_file_nameFc                \   t         j                  j                  |        t        |t              rt        |      }nt        |t              st        d      t        |t              rt        j                  |      }t        |t        j                        rt        j                  |      }|j                         \  }}|| _        t        j                   ||      | _        | j"                  j%                  d|i       || j"                  j&                  _        || _        || _        || _        y )Nz>tokenizer must be a WordTokenizer or a HuggingFace tokenizer. weight)r   r   __init__
isinstancer   r   r   
ValueErrorlistnpasarrayndarraytorch
from_numpysizeembeddings_dimension	Embedding	emb_layerload_state_dictr   requires_grad	tokenizerr   r   )selfr+   embedding_weightsr   r   num_embeddingsr&   s          ~/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/sentence_transformers/models/WordEmbeddings.pyr   zWordEmbeddings.__init__   s     			4 i!894Y?II}5]^^'. "

+< ='4 % 0 01B C/@/E/E/G,,$8!n6JK&&2C'DE.?+"!2,    c                b    | j                  |d         }d }|j                  |||d   d       |S )N	input_idsattention_mask)token_embeddingscls_token_embeddingsr3   )r(   update)r,   featuresr4   
cls_tokenss       r/   forwardzWordEmbeddings.forward:   sB    >>(;*?@
$4(2"*+;"<	
 r0   c                6   |D cg c]  } | j                   j                  |fi |! }}|D cg c]  }t        |       }}t        |      }g }g }	|D ]I  }dg|t        |      z
  z  }
|j	                  ||
z          |	j	                  dgt        |      z  |
z          K t        j                  |t
        j                        t        j                  |	t
        j                        t        j                  |t
        j                        d}|S c c}w c c}w )Nr   r   )dtype)r2   r3   sentence_lengths)r+   tokenizelenmaxappendr#   tensorlong)r,   textskwargstexttokenized_textstokensr<   max_lenr2   attention_maskspaddingoutputs               r/   r=   zWordEmbeddings.tokenizeF   s    OTUt24>>224B6BUU6EFFCKFF&'	% 	@FcWs6{23GVg-.""A3V#4w#>?	@ iuzzB#ll?%**M %-=UZZ P
 # VFs
   $DDc                    | j                   S )N)r&   r,   s    r/   get_word_embedding_dimensionz+WordEmbeddings.get_word_embedding_dimensionZ   s    (((r0   c                    | j                  |       | j                  ||       | j                  j                  |       y )N)safe_serialization)save_configsave_torch_weightsr+   save)r,   output_pathrP   s      r/   rS   zWordEmbeddings.save]   s6    %@RSK(r0   c                \    t        | j                        | j                  | j                  dS )Nr   )r	   r+   r   r   rM   s    r/   get_config_dictzWordEmbeddings.get_config_dictb   s*    '7!%!7!7"11
 	
r0   Nc                    |||||d} | j                   dd|i|}	t        |	j                  d            }
 | j                  dd|i|}|
j	                  |      } | j
                  dd|i|} | d||d   d|	}|S )N)	subfoldertokencache_folderrevisionlocal_files_onlymodel_name_or_pathr   zemb_layer.weight)r+   r-    )load_configr   popload_dir_pathloadload_torch_weights)clsr]   rX   rY   rZ   r[   r\   rD   
hub_kwargsconfigr   tokenizer_local_pathr+   weightsmodels                  r/   rb   zWordEmbeddings.loadi   s     #(  0

 !U4FU*U,VZZ8I-JK0s00eDVeZde#(()=>	(#((]<N]R\]ai7CU;VaZ`ar0    c           
     B   t         j                  d|        t        j                  j	                  |      s?t         j                  | d       d|v sd|v rt        d|       d|z   }t        ||       d }g }g }	|j                  d      rt        j                  |dd	
      nt        |d	
      5 }
t        |
dd      }|D ]  }|j                         j                  |      }|st        |      dk(  r4|d   }|Ct        |      dz
  }|j                  d       |	j                  t        j                   |             t        |      dz
  |k7  rt         j#                  d       t        j$                  |dd  D cg c]  }t'        |       c}      }|	j                  |       |j                  |       ||dkD  st        |      |kD  s n t        j(                  |	      }	|j+                  |        | ||	|      cd d d        S c c}w # 1 sw Y   y xY w)NzRead in embeddings file z, does not exist, try to download from server/\zEmbeddings file not found: zAhttps://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/z.gzrtutf8)encodingzLoad Word Embeddings
Embeddings)descunit   r   r   PADDING_TOKENz\ERROR: A line in the embeddings file had more or less  dimensions than expected. Skip token.)r+   r-   r   )loggerinfoospathexistsr   r
   endswithgzipopenr   rstripsplitr>   r@   r    zeroserrorarrayfloatr!   	set_vocab)rd   embeddings_file_pathr   item_separatorr+   max_vocab_sizeurlr&   vocab
embeddingsfIniteratorliner   wordnumvectors                    r/   from_text_filezWordEmbeddings.from_text_file   s    	./C.DEFww~~23KK/00\]^**d6J.J #>?S>T!UVVUXllCS./#
 $,,U3 II*D6B*V<%	o ADC&<<PH  ++N;UqQx'/+.u:>(LL1%%bhh/C&DE JN)* LLv qr"C#5:"CD!!&)T"!-.12DUVdId58 J/J&j\mnK%	o %	o6 #D7%	o %	os1   3CH?H+H=HH3HHH)Fi@B )r+   z'WordTokenizer | PreTrainedTokenizerBaser   boolr   int)rC   r   )returnr   )T)rT   r   rP   r   ) NNNF)r]   r   rX   r   rY   zbool | str | NonerZ   
str | Noner[   r   r\   r   r   r   )r   r   r   r   r   r   r   z
int | None)__name__
__module____qualname__r   __annotations__r   r   r9   r=   rN   rS   rV   classmethodrb   r   r   r^   r0   r/   r   r      s   WKW7c7 #(%-:-  	-
 -6
())

  #'#'#!&  !	
 !   
 4  #(!%'%)<o!<o  <o 	<o #<o <or0   r   )
__future__r   r|   loggingrx   transformersr   typingr   ImportErrortyping_extensionsnumpyr    r#   r   r   #sentence_transformers.models.Moduler   sentence_transformers.utilr	   r
   r   r+   r   r   r   	getLoggerr   rv   r   r^   r0   r/   <module>r      so    "   	 0'     6 M M W W			8	$foV fo!  '&'s   A, ,A:9A: