
    rhP                        d dl Zd dl d dlZd dlZd Zd Zed fdZ	ddZ
ddZeZddZd	 ZeZdd
Zd Zd ZeZddZeZddZ G d d      Zd dZ G d d      ZedfdZd!dZ G d d      Zd ZeZd Ze Z!d"dZ y)#    N)*c                    t        j                  | d      } | j                  \  }}t        j                  ||fd      }t        j                  ||fd      }t	        j
                         }t        |      |_        t        |      |_        ||_	        ||_
        |j                          |j                  |t        |              |j                          ||fS )zPreturn k smallest values (and their indices) of the lines of a
    float32 arrayfloat32dtypeint64)npascontiguousarrayshapezerosfaissfloat_maxheap_array_tswig_ptridsvalnhkheapifyaddnreorderarrayr   mnIDhas          g/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/faiss/extra_wrappers.pykminr             i8E;;DAq
!Qw'A
!Qy)A		$	$	&Ba[BFa[BFBEBDJJLGGAxJJLa4K    c                    t        j                  | d      } | j                  \  }}t        j                  ||fd      }t        j                  ||fd      }t	        j
                         }t        |      |_        t        |      |_        ||_	        ||_
        |j                          |j                  |t        |              |j                          ||fS )zOreturn k largest values (and their indices) of the lines of a
    float32 arrayr   r   r   )r	   r
   r   r   r   float_minheap_array_tr   r   r   r   r   r   r   r   r   s          r   kmaxr$   +   r    r!   c                    t        j                  | d      } t        j                  |d      }| j                  \  }}|j                  \  }}||k(  sJ t        j                  ||fd      }|t        k(  r-t        ||t        |       |t        |      t        |             |S |t        k(  r| |j                  z  |dd |S t        ||t        |       |t        |      ||t        |             |S )zJcompute the whole pairwise distance matrix between two sets of
    vectorsr   r   N)
r	   r
   r   empty	METRIC_L2pairwise_L2sqrr   METRIC_INNER_PRODUCTTpairwise_extra_distances)	xqxbmetric
metric_argnqdnbd2diss	            r   pairwise_distancesr5   =   s     
		b		2B			b		2BHHEBXXFB7N7
((B89
-Cr8B<SM	 J 
'	'bddA J 	!r8B<JSM		
 Jr!   c                 t    t        j                  | d      }t        t        |      |j                  |       |S Nr   r   )r	   r&   
float_randr   sizer   seedress      r   randr=   V   s+    
((1I
&Cx}chh-Jr!   c                     t        j                  | d      }|"t        t        |      |j                  |       |S t        t        |      |j                  ||       |S Nr   r   )r	   r&   
int64_randr   r9   int64_rand_max)r   r;   vmaxr<   s       r   randintrC   \   sO    
((1G
$C|8C=#((D1 J 	x}chhd;Jr!   c                 t    t        j                  | d      }t        t        |      |j                  |       |S r7   )r	   r&   float_randnr   r9   r:   s      r   randnrF   h   s+    
((1I
&Csxx.Jr!   c                    | j                  d      } | j                  dk(  rt        | j                  t	        |             S | j
                  \  }}t        j                  |d      }t        ||t	        |       t	        |             |S )z> compute a checksum for quick-and-dirty comparisons of arrays uint8   uint64r   )	viewndimbvec_checksumr9   r   r   r	   r   bvecs_checksum)ar   r1   css       r   checksumrQ   n   sg    	wAvv{QVVXa[1177DAq	!8	$B1a!hrl3Ir!   c                 f    t        j                  | |fd      }t        | |t        |      |       |S r7   )r	   r&   rand_smooth_vectors_cr   )r   r1   r;   r<   s       r   rand_smooth_vectorsrT   z   s-    
((Aq6
+C!Qt4Jr!   c                 Z   t        j                  | d      } t        j                  |d      }| j                  d   }|j                  d   |k(  sJ | j                  d   |j                  d   }}d}t        |      D ]+  }|t	        |t        | |         |t        ||               z  }- |S )z< size of intersection between each line of two result tablesr   r   r   rI   )r	   r
   r   rangeranklist_intersection_sizer   )I1I2r   k1k2ninteris          r   eval_intersectionr^      s    			b	0B			b	0B
A88A;!XXa["((1+BF1X 6,AXbe_6 	66 Mr!   c                 d    t        | j                  d   | j                  d   t        |              y )NrI   r   )fvec_renorm_L2r   r   xs    r   normalize_L2rc      s"    1771:qwwqz8A;7r!   c           	         t        j                  | d      } |t        | j                         dz         }t        j                  |dz   d      }t        j                  | j
                  d      }t        | j
                  t        j                  | j                  d            |t        j                  |      t        j                  |      |       ||fS )a  Perform a bucket sort on a table of integers.

    Parameters
    ----------
    tab : array_like
        elements to sort, max value nbucket - 1
    nbucket : integer
        number of buckets, None if unknown
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size vmax + 1)
    perm : array_like
        perm[lims[i] : lims[i + 1]] contains the indices of bucket #i (size tab.size)
    r   r   rI   rJ   )
r	   r
   intmaxr&   r9   bucket_sort_cr   r   rK   )tabnbucketntlimsperms        r   bucket_sortrm      s    & 

s'
2Ccggi!m$88GaKw/D88CHHG,D%..(!34%u~~d';

 :r!   c           	      @   | j                   dk(  s| j                   dk(  sJ | j                  \  }}|t        | j                         dz         }t	        j
                  |dz   d      }t        ||t        j                  |       |t        j                  |      |       |S )a  Perform a bucket sort on a matrix, recording the original
    row of each element.

    Parameters
    ----------
    tab : array_like
        array of size (N, ncol) that contains the bucket ids, maximum
        value nbucket - 1.
        On output, it the elements are shuffled such that the flat array
        tab.ravel()[lims[i] : lims[i + 1]] contains the row numbers
        of each bucket entry.
    nbucket : integer
        number of buckets (the maximum value in tab should be nbucket - 1)
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size vmax + 1)
    int32r   rI   r   )	r   r   re   rf   r	   r&   matrix_bucket_sort_inplace_cr   r   )rh   ri   rj   nrowncolrk   s         r   matrix_bucket_sort_inplacers      s    , 99399#777JD$cggi!m$88GaKw/D dENN3'%

 Kr!   c                   *    e Zd ZdZddZd Zd Zd Zy)
ResultHeapz_Accumulate query results from a sliced dataset. The final result will
    be in self.D, self.I.c                 z   t        j                  ||fd      | _        t        j                  ||fd      | _        ||c| _        | _        |rt               }n
t               }||_        ||_        t        | j                        |_
        t        | j                        |_        |j                          || _        y)z
        nq: number of query vectors,
        k: number of results per query
        keep_max: keep the top-k maximum values instead of the minima
        r   r   r   N)r	   r   r   r   r0   r   r#   r   r   r   r   r   r   heaps)selfr0   r   keep_maxrw   s        r   __init__zResultHeap.__init__   s     2q'12q'3a)+E)+ETVV$	TVV$	
r!   c                 &   |j                   \  }}t        j                  |d      }t        j                  |d      }|j                   ||fk(  sJ || j                  k(  sJ | j                  j                  |t        |      t        |      |       y)z
        Add results for all heaps
        D, I should be of size (nh, nres)
        D, I do not need to be in a particular order (heap or sorted)
        r   r   r   N)r   r	   r
   r0   rw   addn_with_idsr   )rx   r   r   r0   kds        r   
add_resultzResultHeap.add_result   s     B  )4  '2ww2r("""TWW}}

  QK	r!   c           	         |j                   \  }}|t        |      k(  sJ |j                  dk(  r|j                   |j                   k(  s!|j                  dk(  r|j                   |fk(  sJ t        j                  |d      }t        j                  |d      }t        j                  |d      }|j                  dk(  rdn|}| j
                  j                  |t        |      |t        |      t        |      |       y)z
        Add results for a subset of heaps.
        D, I should hold resutls for all the subset
        as a special case, if I is 1D, then all ids are assumed to be the same
           rI   r   r   r   r   N)r   lenrL   r	   r
   rw   addn_query_subset_with_idsr   )rx   subsetr   r   nsubsetr}   	id_strides          r   add_result_subsetzResultHeap.add_result_subset  s     gg#f+%%%FFaKAGGqww.FFaKAGGv-	
.   )4  '2%%fG<1A"	

--Xf%Xa[)	
r!   c                 8    | j                   j                          y N)rw   r   )rx   s    r   finalizezResultHeap.finalize  s    

r!   NF)__name__
__module____qualname____doc__rz   r~   r   r    r!   r   ru   ru      s    (
*r!   ru   c                 l   |j                   | j                   k(  sJ | j                   \  }}}t        j                  ||f| j                        }t        j                  ||f|j                        }|rt        nt
        } ||||t        |       t        |      t        |      t        |             ||fS )z
    Merge a set of sorted knn-results obtained from different shards in a dataset
    Dall and Iall are of size (nshard, nq, k) each D[i, j] should be sorted
    returns D, I of size (nq, k) as the merged result set
    r   )r   r	   r&   r   merge_knn_results_CMaxmerge_knn_results_CMinr   )	DallIallry   nshardr   r   DnewInewfuncs	            r   merge_knn_resultsr     s     ::###::LFAq88QF$**-D88QF$**-D%-!3ID	1f
 :r!   c                       e Zd Zd Zd Zd Zy)MapInt64ToInt64c                 ,   t        t        j                  |            | _        |d| j                  z  k(  sJ d       || _        t        j
                  |dfd      | _        t        j                  | j                  t        | j                               y )Nr   zneed power of 2 capacityr   r   )
re   r	   log2log2_capacitycapacityr&   rh   r   hashtable_int64_to_int64_initr   )rx   r   s     r   rz   zMapInt64ToInt64.__init__3  ss     !231 2 222N4NN2 88XqM9++D,>,>@RSr!   c           	          |j                   \  }|j                   |fk(  sJ t        j                  | j                  t	        | j
                        |t	        |      t	        |             y r   )r   r   hashtable_int64_to_int64_addr   r   rh   )rx   keysvalsr   s       r   addzMapInt64ToInt64.add:  sR    ZZzzaT!!!** 2x~x~	/r!   c           	          |j                   \  }t        j                  |fd      }t        j                  | j
                  t        | j                        |t        |      t        |             |S r?   )r   r	   r&   r   hashtable_int64_to_int64_lookupr   r   rh   )rx   r   r   r   s       r   lookupzMapInt64ToInt64.lookupA  sW    ZZxxG,-- 2x~x~	/ r!   N)r   r   r   rz   r   r   r   r!   r   r   r   1  s    T/r!   r           c                    t        j                  | d      } t        j                  |d      }| j                  \  }}|j                  \  }}||k(  sJ t        j                  ||fd      }	t        j                  ||fd      }
|t        k(  r:t        t        |       t        |      ||||t        |
      t        |	             |
|	fS |t        k(  r:t        t        |       t        |      ||||t        |
      t        |	             |
|	fS t        t        |       t        |      ||||||t        |
      t        |	      
       |
|	fS )a  
    Compute the k nearest neighbors of a vector without constructing an index


    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where the dimension d is that same as xb
        `dtype` must be float32.
    xb : array_like
        Database vectors, shape (nb, d) where dimension d is the same as xq
        `dtype` must be float32.
    k : int
        Number of nearest neighbors.
    metric : MetricType, optional
        distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    r   r   r   )
r	   r
   r   r&   r'   	knn_L2sqrr   r)   knn_inner_productknn_extra_metrics)r,   r-   r   r.   r/   r0   r1   r2   r3   r   r   s              r   knnr   M  s.   2 
		b		2B			b		2BHHEBXXFB7N7
"a(A
"a	*ARL(2,r2q(1+x{	
  a4K 
'	'RL(2,r2q(1+x{	
 a4K 	RL(2,r2vz1QK!	
 a4Kr!   c                    | j                   \  }}|j                   \  }}||k(  sJ t        j                  ||fd      }t        j                  ||fd      }	|dk(  rt        j                         }
||
_        ||
_        t        j                  |	      |
_        t        j                  |      |
_	        t        j                  |
t        j                  |       t        j                  |      ||d       ||	fS |dk(  rlt        j                  t        j                  |       t        j                  |      ||||t        j                  |      t        j                  |	             ||	fS t        )a  
    Compute the k nearest neighbors of a set of vectors without constructing an index.

    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where d is the number of bits / 8
        `dtype` must be uint8.
    xb : array_like
        Database vectors, shape (nb, d) where d is the number of bits / 8
        `dtype` must be uint8.
    k : int
        Number of nearest neighbors.
    variant : string
        Function variant to use, either "mc" (counter) or "hc" (heap)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    ro   r   r   hcrI   mc)r   r	   r&   r   int_maxheap_array_tr   r   r   r   r   hammings_knn_hchammings_knn_mcNotImplementedError)r,   r-   r   variantr0   r1   r2   r3   r   r   heaps              r   knn_hammingr     s-   2 HHEBXXFB7N7
"a(A
"a(A$((*>>!$>>!$%..$ennR&8"q	
 a4K 
DNN2r 2BAqNN1u~~a0	
 a4K "!r!   c                   2    e Zd ZdZd Zd ZddZd	dZd Zy)
Kmeansa  Object that performs k-means clustering and manages the centroids.
    The `Kmeans` class is essentially a wrapper around the C++ `Clustering` object.

    Parameters
    ----------
    d : int
       dimension of the vectors to cluster
    k : int
       number of clusters
    gpu: bool or int, optional
       False: don't use GPU
       True: use all GPUs
       number: use this many GPUs
    progressive_dim_steps:
        use a progressive dimension clustering (with that number of steps)

    Subsequent parameters are fields of the Clustring object. The most important are:

    niter: int, optional
       clustering iterations
    nredo: int, optional
       redo clustering this many times and keep best
    verbose: bool, optional
    spherical: bool, optional
       do we want normalized centroids?
    int_centroids: bool, optional
       round centroids coordinates to integer
    seed: int, optional
       seed for the random number generator

    c                 t   || _         | j                  |       d| _        d|v rt               | _        nt               | _        |j                         D ]S  \  }}|dk(  r|dk(  s|dk(  r
t               }|| _        't        | j                  |       t        | j                  ||       U | j                          y)zd: input dimension, k: nb of centroids. Additional
         parameters are passed on the ClusteringParameters object,
         including niter=25, verbose=False, spherical = False
        Fprogressive_dim_stepsgpuTN)r1   resetr   "ProgressiveDimClusteringParameterscpClusteringParametersitemsget_num_gpusgetattrsetattr	set_index)rx   r1   r   kwargsvs        r   rz   zKmeans.__init__  s    
 

1"f,8:DG*,DGLLN 	'DAqEz9R$A #A&	' 	r!   c                    | j                   }| j                  j                  t        k(  ru| j                  j                  rt        |      | _        nt        |      | _        | j                  r1t        j                  | j                  | j                        | _        y y | j                  rt        | j                        }n
t               }|| _        y )N)ngpu)r1   r   	__class__r   	sphericalIndexFlatIPindexIndexFlatL2r   r   index_cpu_to_all_gpusGpuProgressiveDimIndexFactoryProgressiveDimIndexFactoryfac)rx   r1   r   s      r   r   zKmeans.set_index  s    FF77 44ww  (^
(^
xx"88$((S
  xx3B02DHr!   Nc                 R    |t        |      | _        d| _        d| _        d| _        y)zg prepare k-means object to perform a new clustering, possibly
        with another number of centroids N)re   r   	centroidsobjiteration_stats)rx   r   s     r   r   zKmeans.reset  s*     =VDF#r!   c                    t        j                  |d      }|j                  \  }}|| j                  k(  sJ | j                  j
                  t        k(  rt        || j                  | j                        }|D|j                  \  }}||k(  sJ t        j                  |j                         |j                         |j                  || j                  |       ng|J |J | j                  j                  rJ t!        || j                  | j                        }|j                  |t#        |      | j$                         t        j&                  |j                        }	|	j)                  | j                  |      | _        |j*                  }
t-        |
j/                               D cg c]  }|
j1                  |       }
}t        j2                  |
D cg c]  }|j4                   c}      | _        dj7                         }|
D cg c]  }|D ci c]  }|t9        ||       c} c}}| _        | j4                  j.                  dkD  r| j4                  d   S dS c c}w c c}w c c}w c c}}w )a   Perform k-means clustering.
        On output of the function call:

        - the centroids are in the centroids field of size (`k`, `d`).

        - the objective value at each iteration is in the array obj (size `niter`)

        - detailed optimization statistics are in the array iteration_stats.

        Parameters
        ----------
        x : array_like
            Training vectors, shape (n, d), `dtype` must be float32 and n should
            be larger than the number of clusters `k`.
        weights : array_like
            weight associated to each vector, shape `n`
        init_centroids : array_like
            initial set of centroids, shape (n, d)

        Returns
        -------
        final_obj: float
            final optimization objective

        r   r   z,obj time time_search imbalance_factor nsplitr   r   r   )r	   r
   r   r1   r   r   r   
Clusteringr   r   copy_array_to_vectorravelr   trainr   r   ProgressiveDimClusteringr   r   vector_float_to_arrayreshaper   rV   r9   atr   r   splitr   )rx   rb   weightsinit_centroidsr   r1   clusncr3   r   statsr]   ststat_fieldsfields                  r   r   zKmeans.train  s   4   )4ww1DFF{{77 44a1D)'--BQww**>+?+?+A4>>RJJq$**g. ?"?!)))ww((((+Atvvtww?DJJq(1+txx0//?	"**46615$$&+EJJL&9:!::88e4RVV45DJJL  
 5@@5UGB&&@ 
  $xx}}q0txx|9c9 ;4 A 
s$   (II#		I-I(&I-(I-c                 R   t        j                  |d      }| j                  J d       | j                  j	                          | j                  j                  | j                         | j                  j                  |d      \  }}|j                         |j                         fS )Nr   r   zshould train before assigningrI   )r	   r
   r   r   r   r   searchr   )rx   rb   r   r   s       r   assignzKmeans.assignH  s      )4~~)J+JJ)



t~~&zz  A&1wwy!'')##r!   r   )NN)	r   r   r   r   rz   r   r   r   r   r   r!   r   r   r     s"    @. $::x$r!   r   c                 J    t        | t        j                  j                        S r   )
isinstancecollectionsabcSequencera   s    r   is_sequencer   U  s    a1122r!   c           	         | j                   \  }}t        j                  | d      } t        |      rt        j                  |d      }|j                   |fk(  sJ t	        |j                         dz   dz        }t        j                  ||fd      }t        ||t        |      t        |       t        |      |       |S ||z  dz   dz  }t        j                  ||fd      }t        |||t        |       t        |      |       |S )a>  
    Pack a set integers (i, j) where i=0:n and j=0:M into
    n bitstrings.
    Output is an uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.

    If nbit is an integer: all entries takes nbit bits.
    If nbit is an array: entry (i, j) takes nbit[j] bits.
    ro   r         rH   )	r   r	   r
   r   re   sumr&   pack_bitstrings_cr   )rO   nbitr   M	code_sizebs         r   pack_bitstringsr   Z  s     77DAq
Qg.A4##D8zzaT!!!aA-.	HHa^73q(4.(1+x{I	G H X\a'	HHa^73!Qhqk8A;	JHr!   c           
         | j                   \  }}|t        j                  |d      }t        |      }t	        |j                         dz   dz        }||k\  sJ t        j                  ||fd      }t        ||t        |      t        |       |t        |             |S |}||z  dz   dz  }||k\  sJ t        j                  ||fd      }t        |||t        |       |t        |             |S )a  
    Unpack a set integers (i, j) where i=0:n and j=0:M from
    n bitstrings (encoded as uint8s).
    Input is an uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.

    Two forms:
    - when called with (array, M, nbit): there are M entries of size
      nbit per row
    - when called with (array, nbits): element (i, j) is encoded in
      nbits[j] bits
    ro   r   r   r   )	r   r	   r
   r   re   r   r&   unpack_bitstrings_cr   )r   
M_or_nbitsr   r   r   r   min_code_sizerO   s           r   unpack_bitstringsr  u  s     77LAy|##Jg>ITXXZ!^12M)))HHaV7+q(4.QKHQK	1 H TA!+M)))HHaV7+q$Y	=Hr!   )90  )r  N)i  )Nr   r   )r   r   )"numpyr	   faiss.loaderr   collections.abcr   r   r$   r'   r5   r=   rC   lrandrF   rQ   rT   rS   r^   rc   rm   rg   rs   rp   ru   r   r   r   r   r   r   r   r   r  r   r   r!   r   <module>r     s       $$ '0A 2 	 , 8 >  :  N= =@, 8 $ 3l0pS$ S$t3 $ 2 ( r!   