
    rhIY                        d dl Z d dlZd dlmZ d dlmZ d dl mZ d dlmZm	Z	 d dl
Z
i dddd	d
ddddddddddddddddddddddddd d!d"Zd.d#efd$Zd#efd%Z G d& d'      Z G d( d)      Z G d* d+      Z G d, d-      Zy)/    N)Iterator)Fraction)Match)OptionalUnionu   œoeu   ŒOE   øo   ØO   æae   ÆAE   ßssu   ẞSSu   đdu   ĐD   ð   Ð   þth   Þu   łlu   ŁLsc                 l    fddj                  fdt        j                  d|       D              S )z
    Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
    manual mappings)
    c                     | v r| S | t         v r	t         |    S t        j                  |       dk(  ryt        j                  |       d   dv ry| S )NMn r   MSP )ADDITIONAL_DIACRITICSunicodedatacategory)charkeeps    /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/whisper/english_normalizer.pyreplace_characterz8remove_symbols_and_diacritics.<locals>.replace_character5   sY    4<K**(..!!$'4/!!$'*e3    r"   c              3   .   K   | ]  } |        y wN ).0cr+   s     r*   	<genexpr>z0remove_symbols_and_diacritics.<locals>.<genexpr>C   s     RA$Q'Rs   NFKDjoinr&   	normalize)r   r)   r+   s    `@r*   remove_symbols_and_diacriticsr7   /   s,     77R1F1Fvq1QRRRr,   c                 Z    dj                  d t        j                  d|       D              S )z[
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    r"   c              3   X   K   | ]"  }t        j                  |      d    dv rdn| $ yw)r   r#   r$   N)r&   r'   )r0   r1   s     r*   r2   z!remove_symbols.<locals>.<genexpr>J   s,     o+..q1!4=31Dos   (*NFKCr4   r   s    r*   remove_symbolsr<   F   s(     77okNcNcdjlmNnooor,   c                   *    e Zd ZddedefdZdefdZy)BasicTextNormalizerremove_diacriticssplit_lettersc                 8    |rt         nt        | _        || _        y r.   )r7   r<   cleanr@   )selfr?   r@   s      r*   __init__zBasicTextNormalizer.__init__N   s    6G2^
*r,   r   c                 n   |j                         }t        j                  dd|      }t        j                  dd|      }| j                  |      j                         }| j                  r4dj                  t        j                  d|t        j                              }t        j                  dd|      }|S )N[<\[][^>\]]*[>\]]r"   \(([^)]+?)\)r$   z\X\s+)	lowerresubrB   r@   r5   regexfindallUrC   r   s     r*   __call__zBasicTextNormalizer.__call__R   s    GGIFF'Q/FF?B*JJqM!ua9:AFF63"r,   N)FF)__name__
__module____qualname__boolrD   strrP   r/   r,   r*   r>   r>   M   s!    +$ +t +# r,   r>   c                   b     e Zd ZdZ fdZdee   dee   fdZdefdZ	defdZ
defd	Z xZS )
EnglishNumberNormalizerav  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                    t         |           h d| _        t        g dd      D ci c]  \  }}||
 c}}| _        | j                  j                         D ci c]  \  }}|dk(  rdn|dz   |df c}}| _        dd	d
dddd| j                  j                         D ci c]/  \  }}|dkD  r%|dk7  r |dk7  r||j                  d      rdndz   |df1 c}}| _        i | j                  | j                  | _	        ddddddddd| _
        | j                  j                         D ci c]  \  }}|j                  dd      |df c}}| _        | j                  j                         D ci c]  \  }}|j                  dd       |df c}}| _        i | j                  | j                  | _        d!d"d#d$d%d&d'd(d)d*d+d,d-| _        | j                  j                         D ci c]  \  }}|dz   |df c}}| _        | j                  j                         D ci c]  \  }}|dz   |df c}}| _        i | j                   | j"                  | _        h | j                  | j                  | j                  | _        d.d.d/d/d0| _        d1d1d2d2d3d3d4d4d5| _        t-        t/        | j(                  j1                               t/        | j*                  j1                               z         | _        d6d7id7d8| _        h d9| _        | j                  | j                  | j                  | j                  | j                  | j                  | j$                  | j(                  | j*                  | j4                  | j6                  fD ch c]  }|D ]  }|  c}}| _        d:d;h| _        y c c}}w c c}}w c c}}w c c}}w c c}}w c c}}w c c}}w c c}}w )<N>   r   ohzero)onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen   )startr`   sixesr   )r   r   )rn   st)   nd)   rd)   r   )   r   )zerothfirstsecondthirdfifthtwelfthrt   rv   rw   thr         (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyyiesiethd     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillion-+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsr   %)perpercent>   andpointdoubletripler[   ones)superrD   zeros	enumerater   itemsones_pluralendswithones_ordinalones_suffixedtensreplacetens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_words)rC   inamevaluemappingkey	__class__s         r*   rD   z EnglishNumberNormalizer.__init__k   s   (
 % G
4 !G
	 W[V_V_VeVeVg
GRtUtu}G$*uclB
  !
 $(99??#4D%19! t}}S1t<udmK
 G 0 0FD4E4EF 	
	 W[V_V_VeVeVgh{tUDLLe4uclBhY]YbYbYhYhYjk+$PUT\\#v6EkF 0 0FD4E4EF  $)047;>BF
 PTO_O_OeOeOg"he4#:s|#;"hRVRbRbRhRhRj#k;4D4K%$>#k $[t'>'>$[$BZBZ$[!=$))=dii=$**= 	$
  	$
  D!9!9!@!@!BCd4KcKcKjKjKlFmmnC=
 =
 

		""		""  ))((((
 
  


" $V_G

$ ik" #i#k6
s/   N0N6%4N</O4O,O$OOr   returnc              #     K   d d d}dt         fd}dt        t         t        f   ffd}t        |      dk(  ry t	        |      D ]'  \  }}|dk7  r||dz
     nd }|t        |      dz
  k7  r||dz      nd }|rd}6|d uxr t        j                  d|      }	|d   | j                  v }
|
r|dd  n|}t        j                  d|      r ||      }|t        d	      Ct        t               r)j                  d
      rt              t        |      z    |       |
r|d   n|j                  dk(  r|j                  ||| j                  vr
 |        ||       ,|| j                  v rt        xs d      dz   N|| j                  v r| j                  |   }|qt        t               s|| j                  v r?|| j                   v r|dk  rd d t        |      z   t              t        |      z   |dk  r(dz  dk(  r|z  t              t        |      z   dz  dk(  r|z  
t              t        |      z   #|| j"                  v r.| j"                  |   \  }} |t        |      |z          nt        t               s|| j                  v rS|| j                   v r"|dk  r |d d t        |      z   |z          n |t              t        |      z   |z          n|dk  rEdz  dk(  r |t        |z         |z          ng |t              t        |      z   |z          nDdz  dk(  r |t        |z         |z          n" |t              t        |      z   |z          d `|| j                   v rf| j                   |   }|t        t               rt              t        |      z   dz  dk(  r|z  t              t        |      z   || j$                  v r| j$                  |   \  }} |t        |      |z          t        t               r$ |t              t        |      z   |z          Bdz  dk(  r |t        |z         |z          e |t              t        |      z   |z          || j&                  v r| j&                  |   }|t        t               sdk(  r> |      }|||z  nd }||j                  dk(  r|j                   |       |dz  dz  }dz  }|||z  z   || j(                  v r| j(                  |   \  }} |t        |      |z          nt        t               rd |      }|||z  nd }|0|j                  dk(  r! |t        |j                        |z          nL |        |t        |      |z          n+dz  dz  }dz  }|||z  z    |t              |z          d || j*                  v r9
 |       || j                  v s|	r| j*                  |   - ||       9|| j,                  v r)| j,                  |    |       d ||       p|| j.                  v r|n| j.                  |   }t        |t0              r7||v r |t              ||   z          d} |        ||        |t              |z           ||       || j2                  v r|| j                  vr|	s
 |        ||       1|dk(  r(|| j&                  vsF
 |        ||       ^|dk(  s|dk(  r{|| j                  v s|| j                  v rG|dk(  rdnd}| j                  j5                  |d      }t        xs d      t        |      |z  z   d}ˉ
 |        ||       |dk(  r&|| j6                  v s|	st        xs d      d
z   t        d|       t        d|         |       y y w)NFr   c                 8    	 t        |       S # t        $ r Y y w xY wr.   )r   
ValueErrorr;   s    r*   to_fractionz:EnglishNumberNormalizer.process_words.<locals>.to_fraction   s#    {" s   
 	resultc                 4    t        |       } | z   } d d | S r.   )rU   )r   prefixr   s    r*   outputz5EnglishNumberNormalizer.process_words.<locals>.output   s*    [F!&EFMr,   r   rn   z^\d+(\.\d+)?$zConverting the fraction failed.r"   0
   r   r   Tr   r   r   rr   rt   r   zUnexpected token: )rU   r   intlenr   rJ   matchr   r   
isinstancer   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   )rC   r   skipr   r   r   currentprevnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr   r   s                        @@r*   process_wordsz%EnglishNumberNormalizer.process_words   s     $+/	3 		5c? 	 u:?#E* C	AJAw#$65Q<tD#$E
Q#65Q<DD"$.S288<Ld3SO t}}4J4>WQR[G"xx(*@A 679$%EFF$!%-%..2E #E
S\ 9 $Um+'1v==A%KKE2E

*$ -'Wo%DJJ&EKR(3.DII%yy)= Es+ttyy/@tyy(TBY %cr
SY 6 #E
SY 6BYrzQ #E
SY 6s{a' #E
SY 6D...#11':f= TV!344s+ttyy/@tyy(TBY$U3BZ#d)%;f%DEE$SZ#d)%;f%DEEBYrzQ$S%6%?@@$SZ#d)%;f%DEEs{a'$S%6%?@@$SZ#d)%;f%DEEDII%yy)= Es+JT2Es{a' #E
SY 6D...#11':f= TV!344s+ Uc$i!7&!@AAs{a'$S%6%?@@$SZ#d)%;f%DEED,,,!--g6
=&Es+uz#E*A*+-JTA}!); !$Um+ *"d]T1F$t|H"X
%::ED555%)%>%>w%G"
F= Z6!9::s+#E*A*+-JTA}!);$S%5%>??$Um+$S_v%=>>"d]T1F$t|H"X
%::E Uf!455D444$ -'4::%!55g>F /)D444$!55g>F -' /)DNN*$!^^G4F!&$/6>"(UfTl)B"CC#'D"(-/"(/1$SZ&%899 /)DMM)tzz)/($Um+ /)%4#3#33 ,"(-/$Wo-(Gx,?tyy(DDJJ,>'.(':!#yy}}T15 #EKR 03t9w3F F# ,"(-/$Wo-'t}}, #EKR 03 6 %'9'%CDD !#5gY!?@@GC	AJ - s   ^	bB2b?br   c                 R   g }t        j                  d|      }t        |      D ]  \  }}t        |j	                               dk(  r#|t        |      dz
  k(  r|j                  |       F|j                  |       |j                  d      d   }|| j                  v s|| j                  v r|j                  d       |j                  d        d	j                  |      }t        j                  d
d|      }t        j                  dd|      }t        j                  dd|      }|S )Nz\band\s+a\s+half\br   rn   rr   )maxsplitr   z
point fivez
and a halfr$   z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)rJ   splitr   r   stripappendrsplitr   r   r5   rK   )rC   r   resultssegmentsr   segment	last_words          r*   
preprocessz"EnglishNumberNormalizer.preprocess  s   88115#H- 	1JAw7==?#q(CMA%%w'w'#NNAN6r:	-d>N>N1NNN<0NN<0	1 HHW FF$h2FF$h2 FF17A>r,   c                     dt         fd}dt         fd}t        j                  d||      }t        j                  d||      }t        j                  dd|      }|S )Nmc                     	 | j                  d      }| j                  d      }t        | j                  d            }| | d|dS # t        $ r | j                  cY S w xY w)Nrn   rr   rt   r   02d)groupr   r   string)r  currencyintegerr   s       r*   combine_centsz:EnglishNumberNormalizer.postprocess.<locals>.combine_cents  sa     771:''!*AGGAJ"G9AeC[99  xx s   AA A A c                 t    	 dt        | j                  d             S # t        $ r | j                  cY S w xY w)Nr   rn   )r   r  r   r  )r  s    r*   extract_centsz:EnglishNumberNormalizer.postprocess.<locals>.extract_cents  s9     C
O,--  xx s    77u,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   rJ   rK   )rC   r   r  r  s       r*   postprocessz#EnglishNumberNormalizer.postprocess  s\    	 U 	 	 U 	  FFBMSTUFF.qA FF<1-r,   c                     | j                  |      }dj                  d | j                  |j                               D              }| j	                  |      }|S )Nr$   c              3   &   K   | ]	  }||  y wr.   r/   )r0   words     r*   r2   z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>  s     XdtGWTXs   )r  r5   r   r  r  rO   s     r*   rP   z EnglishNumberNormalizer.__call__  sJ    OOAHHXd&8&8&CXXQr,   )rQ   rR   rS   __doc__rD   r   rU   r   r   r  r  rP   __classcell__)r   s   @r*   rW   rW   `   sR    h-T] 49 ] # ] ~C :S 2# r,   rW   c                   "    e Zd ZdZd ZdefdZy)EnglishSpellingNormalizerz~
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    c                     || _         y r.   )r   rC   english_spelling_mappings     r*   rD   z"EnglishSpellingNormalizer.__init__  s	    /r,   r   c                 T     dj                   fd|j                         D              S )Nr$   c              3   V   K   | ]   }j                   j                  ||       " y wr.   )r   r   )r0   r  rC   s     r*   r2   z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>  s"     K((t4Ks   &))r5   r  rO   s   ` r*   rP   z"EnglishSpellingNormalizer.__call__  s    xxKKKKr,   N)rQ   rR   rS   r  rD   rU   rP   r/   r,   r*   r  r    s    0L# Lr,   r  c                       e Zd Zd ZdefdZy)EnglishTextNormalizerc                 R   d| _         i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdLdQdRdS| _        t               | _        t	        |      | _        y )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersrW   standardize_numbersr  standardize_spellingsr   s     r*   rD   zEnglishTextNormalizer.__init__  s   <6
*6
 )6
 (	6

 &6
 )6
 )6
 (6
 *6
 6
 6
 <6
 <6
 M6
 '6
" y#6
$ 	%6
& x'6
( y)6
* +6
, -6
. /6
0 16
2 
36
4 
56
6 )76
8 96
: ;6
< =6
> ?6
@ ,A6
B }C6
D 
E6
F yG6
H yI6
J #%%%%%#k6
n $;#< %>?W%X"r,   r   c                    |j                         }t        j                  dd|      }t        j                  dd|      }t        j                  | j                  d|      }t        j                  dd|      }| j                  j                         D ]  \  }}t        j                  |||      } t        j                  dd|      }t        j                  dd	|      }t        |d
      }| j                  |      }| j                  |      }t        j                  dd	|      }t        j                  dd|      }t        j                  dd|      }|S )NrF   r"   rG   z\s+''z	(\d),(\d)r  z\.([^0-9]|$)z \1u
   .%$¢€£)r)   u   [.$¢€£]([^0-9])z	([^0-9])%z\1 rH   r$   )	rI   rJ   rK   r)  r*  r   r7   r+  r,  )rC   r   patternreplacements       r*   rP   zEnglishTextNormalizer.__call__=  s   GGIFF'Q/FF?B*FF4''Q/FF7C#$(NN$8$8$: 	0 G[wQ/A	0 FF<!,FF?FA.)!,?$$Q'&&q) FF)615FF<+FF63"r,   N)rQ   rR   rS   rD   rU   rP   r/   r,   r*   r%  r%     s    :Yx# r,   r%  )r"   )rJ   r&   collections.abcr   	fractionsr   r   typingr   r   rL   r%   rU   r7   r<   r>   rW   r  r%  r/   r,   r*   <module>r4     s+    
  $   " $$ 	# 	#	
 	$ 	$ 	$ 
4 	# 	# 	# 	# 	$ 	$ 	#  	#! (SS S.pc p &O OdL LU Ur,   