
    rh1                         d Z ddlZddlZddlZddlmZ 	 ddlZd ZddZd Z G d d	      Z G d
 de      ZddZ	 ddZ G d de      Zd Zd ZddZ	 	 ddZy# e	$ r  e
d       Y Sw xY w)zO
This contrib module contains a few routines useful to do clustering variants.
    N)
ThreadPoolz2scipy not accessible, Python k-means will not workc                       y N )argkwargss     k/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/faiss/contrib/clustering.py	print_nopr
      s        c                    | j                   d   }|j                  dd      }|rt        nt        } |d| j                    d| d|         |d       t	        j
                  ||f|dd	|}	|	j                  |        |	j                  g}
 |        |	j                  } |d
       t        j                         }|	j                  |       \  }}t        j                  ||      } |dt        j                         |z
  ddt        |       dt        |              |j                         }~	|s*t        j                   |dz         |z  |z  }|dd |dd z
  }nat        j"                  |      }||z  |d   z  }|ddxxx |dd z  ccc t%        |      |k(  sJ  |dt        |       dt        |              d}g }t        j                         }t'        |      D ]  }t)        ||         } |dt        j                         |z
  dd| d| d| d	dd       |||   z   }||| }t        j*                  ||   |k(        sJ t	        j
                  ||fi |}	| |   }|	j                  |       |
j-                  |	j                         |j-                  |	j                         ~	|}  |dt        j                         |z
  dd       t        j.                  |      |
fS )a=  
    perform 2-level clustering on a training set xt
    nc1 and nc2 are the number of clusters at each level, the final number of
    clusters is nc2. Additional arguments are passed to the Kmeans object.

    Rebalance allocates the number of sub-clusters depending on the number of
    first-level assignment.
       verboseFz2-level clustering of z nb 1st level clusters = z total zperform coarse trainingi  )nitermax_points_per_centroidzassigning the training set	minlengthzdone in z.2fz s. Sizes of clusters -Nznb 2nd-level centroids r   [z s] training sub-cluster /z nc2= Tendflushz s)shapegetprintr
   faissKmeanstrainiteration_stats	centroidstimeassignnpbincountminmaxargsortarangecumsumsumrangeintallappendvstack)xtnc1nc2	rebalanceclustering_niterargsdr   logkmr"   
centroids1t0_assign1bcoccall_nc2bc_sumi0c2c1i1subsetxtsubs                             r	   two_level_clusteringrK      s    	Ahhy%(G%	C
 
*CC5PSuUV!"		3
& $
 
B
 HHRL))*OE J$%	B2JAw	W	,B(499;#C((>s2wiqR	RSA
YYsQw#%,QR&2cr7"23,&*,ws|#7|s"""%c'l^1S\NCD 
B	B	BCj '"+a		b %%>rd!C5cURTU[]eij"R&[2bvvgfo+,,,\\!S)D)6

r112
		",, (499;#C(+,99R=/))r   c                    t        j                  |       } t        | t         j                        rt	        | j
                  j                               D ]?  }| j
                  j                  |      }|j                  |       |j                  |      }A t        | j                  |fi | d| _        yt        | t         j                        sJ | j                  t         j                  k(  sJ t!        t#        j$                  | j&                              }t)        d|       t+        ||| j&                  fi |\  }}| j,                  j                  |       | j,                  j/                  |       | j                  |       y)zJ
    Applies 2-level clustering to an index_ivf embedded in an index.
    TNz
REBALANCE=)r   downcast_index
isinstanceIndexPreTransformr.   chainsizeatr!   applytrain_ivf_index_with_2levelindex
is_trainedIndexIVFmetric_type	METRIC_L2r/   r&   sqrtnlistr   rK   	quantizeradd)rU   r3   r8   ivtr4   r#   r>   s           r	   rT   rT   _   s&   
   'E%001u{{'')* 	A"BHHRL"B	 	$EKK<t<eU^^,,,///
bggekk"
#C	,'CEELIq	OO)$	OO	"	KKOr   c                   6    e Zd ZdZd Zd Zd Zd Zd Zd	dZ	y)
DatasetAssignWrapper for a matrix that offers a function to assign the vectors
    to centroids. All other implementations offer the same interfacec                 <    t        j                  |d      | _        y Nfloat32dtype)r&   ascontiguousarrayxselfri   s     r	   __init__zDatasetAssign.__init__   s    %%ay9r   c                 4    | j                   j                  d   S )Nr   ri   r   rk   s    r	   countzDatasetAssign.count       vv||Ar   c                 4    | j                   j                  d   S Nr   rn   ro   s    r	   dimzDatasetAssign.dim   rq   r   c                      | j                   |   S r   )ri   rk   indicess     r	   
get_subsetzDatasetAssign.get_subset   s    vvgr   c                 D    t        j                  | j                  |d      S rs   )r   knnri   rk   r#   s     r	   perform_searchzDatasetAssign.perform_search   s    yyA..r   Nc                    | j                  |      \  }}|j                         }|j                         }|j                  \  }}t        j                  ||fd      }|,t        j
                  j                  ||| j                         nCt        j
                  j                  |||d d t        j                  f   | j                  z         |||fS rd   )	r|   ravelr   r&   zerosr]   rR   ri   newaxis)rk   r#   weightsDIncr9   sum_per_centroids           r	   	assign_tozDatasetAssign.assign_to   s    ""9-1GGIGGIA88RG9=?FFII&4662FFII&71bjj=+ADFF+JK!%%%r   r   )
__name__
__module____qualname____doc__rl   rp   rt   rx   r|   r   r   r   r	   ra   ra      s&    H:/&r   ra   c                       e Zd ZdZddZd Zy)DatasetAssignGPUz GPU version of the previous c                    t         j                  | |       t        j                  |j                  d         }|dk\  r/t        j
                  t        j                         ||      | _        y t        j                  |      | _        y )Nr   r   )	ra   rl   r   IndexFlatL2r   index_cpu_to_gpuStandardGpuResourcesrU   index_cpu_to_all_gpus)rk   ri   gpu_idr   rU   s        r	   rl   zDatasetAssignGPU.__init__   sh    tQ'!!!''!*-Q;//**,DJ
 44U;DJr   c                     | j                   j                          | j                   j                  |       | j                   j                  | j                  d      S rs   )rU   resetr]   searchri   r{   s     r	   r|   zDatasetAssignGPU.perform_search   s=    



y!zz  ++r   N)F)r   r   r   r   rl   r|   r   r   r	   r   r      s    '	<,r   r   c                    | j                   d   }|j                   d   }||dz  j                  d      }|3t        j                  | j	                  d      j                  d            }|d| z  |j
                  z  z
  }|j                  d      }|j                         |t        j                  |      |z  z      |j                         z   }||fS )z assignment function for xq is sparse, xb is dense
    uses a matrix multiplication. The squared norms can be provided if
    available.
    r      r   )axis)	r   r-   r&   arraypowerTargminr~   r+   )	xqxbxq_normsxb_normsnqnbd2r   r   s	            r	   sparse_assign_to_denser      s    
 
!B	!B!G==#88BHHQKOOA./
QVbdd]
"B
		q	A

1ryy}r))*X^^-==Aa4Kr   c           
         
  j                   d   }j                   d   t        j                  |d      

j                  t        j                         t        j
                  |t               dz  j                  d      
 f	d}|dk(  s
|dk(  s|k  r$t        t        |t        d|                   
fS t        |      }	|	j                  |t        d|             
fS )z
    decomposes the sparse_assign_to_dense function into blocks to avoid a
    possible memory blow up. Can be run in multithreaded mode, because scipy's
    sparse-dense matrix multiplication is single-threaded.
    r   re   rf   r   r   c           
      b  	 | | z    }
| | z    }	| | z    }4t        j                  |j                  d      j                  d            }n| | z    }t	        d      D ]H  }t        |||z    |||z          \  }}|dk(  r||d d  ||d d  1||k  }||   |z   ||<   ||   ||<   J y )Nr   r   r   )r   r   )r&   r   r   r-   r.   r   )r^   xq_blockIblockDblockxq_norms_blockjDiIimaskr   r   bbsr   qbsr   r   r   r   s            r	   handle_query_blockz9sparse_assign_to_dense_blocks.<locals>.handle_query_block   s    a!c'?1q3w1q3wXXhnnQ&7&;&;A&>?N%a!c'2Nq"c" 	(A+1q3w'!!a#g.	FB Avq	q	F{!$x!|t!$xt	(r   )r   r&   emptyfillinfonesr/   r-   listmapr.   r   )r   r   r   r   r   r   ntr   r   poolr   r   r   s   ``````    @@@r	   sparse_assign_to_dense_blocksr      s     
!B	!B
9%AFF266N	3	A!G==#( (. 
Qw"'R3YS#U1b#%678
 a4K "~#U1b#%67a4Kr   c                   *    e Zd ZdZd Zd Zd ZddZy)DatasetAssignSparserb   c                     |j                   t        j                  j                  k(  sJ || _        t        j                  |j                  d      j                  d            | _	        y )Nr   r   )
	__class__scipysparse
csr_matrixri   r&   r   r   r-   squared_normsrj   s     r	   rl   zDatasetAssignSparse.__init__   sG    {{ell55555XXaggajnnQ&78r   c                 b    t        j                  | j                  |   j                               S r   )r&   r   ri   todenserv   s     r	   rx   zDatasetAssignSparse.get_subset  s"    xxw//122r   c                 F    t        | j                  || j                        S )N)r   )r   ri   r   r{   s     r	   r|   z"DatasetAssignSparse.perform_search  s     ,FFI(:(:< 	<r   Nc                    | j                  |      \  }}|j                         }|j                         }| j                  j                  d   }|t	        j
                  |d      }t        |      }t        j                  j                  ||t	        j                  |dz         f||f      }t	        j                  || j                  z  j                               }|||fS )Nr   re   rf   r   )r   )r|   r~   ri   r   r&   r   lenr   r   
csc_matrixr+   r   r   )	rk   r#   r   r   r   nr   mr   s	            r	   r   zDatasetAssignSparse.assign_to	  s    ""9-1GGIGGIFFLLO?ggay1G^LL##a1q5)*q' $  88QZ$8$8$:;!%%%r   r   )r   r   r   r   rl   rx   r|   r   r   r   r	   r   r      s    H9
3<&r   r   c                     t        j                  |d      }t        j                  t	        |      | t        j
                  |            S )Nint64rf   )r&   rh   r   imbalance_factorr   swig_ptr)kr%   s     r	   r   r     s6    !!&8F!!#f+q%..2HIIr   c                     | j                   t        j                  k(  rydd l}t	        | |j
                        ryt        dt        |              )NFr   TzUnknown tensor type )r   r&   ndarraytorchrN   TensorNotImplementedErrortype)ri   r   s     r	   check_if_torchr      s@    {{bjj !U\\"
 4T!WI>
??r   c                    |t         j                  }|j                  \  }}d}t        |      }t        j                  | dk(        d   }t        |      dk(  ry|rddl}|j                  |d         }	nt        j                  |d         }	|	dddxx   dz  cc<   |	dddxx   dz  cc<   t        |      dkD  r| j                  d      dz
  }
d|
|
dk  <   |
|
j                         z  }
|
dkD  j                         }t        ||j                        }|j                  |||
      }t        |d| |      D ]:  \  }}||   }||	z  ||<   ||	z  ||<   | |   dz  | |<   | |xx   | |   z  cc<   |dz  }< ||d }t        |      dkD  r|S )z/ reassign centroids when some of them collapse Nr   r   g      P?r   float)rQ   p)r&   randomr   r   wherer   r   	ones_likeastyper-   r(   rQ   choicezip)hassignr#   rsr   r9   nsplitis_torchempty_centsr   facprobasnnznreplacecjscicjcs                    r	   reassign_centroidsr   )  s   	zYY??DAqFi(H((7a<(+K
;1ooil+ll9Q<(!H	H1II k
Q
(1,vz&**,z sK,,-iiFi3+ix0#6 	FB"AGIbMGIbM!"+*GBKBK72;&KaKF	 "(),) k
Q
, Mr   c           
         |j                         |j                         }}|rt        nt        }	 |	d||| ||fz         t        j
                  j                  |      }
t        d       t        j                         }|
j                  || d      }|j                  |      }t        |      }g } |	d       d}g }t        |      D ]  }t        j                         } |	ddd	
       |j                  |      \  }}} |	ddd	
       |t        j                         |z
  z  }|j                         }|r|j                         }|j                  |       t	        j                   ||       }|j#                  dd      j%                  d      }d||dk(  <   |r.ddl}|j)                  |      j+                  |j,                        }||z  }t/        |||
      }|t        j                         |z
  |t1        | |      |d} |	d||d   |d   ||d   |fz         |j                  |       |h |	d|       |rddl}|j3                  ||       t	        j2                  ||        |r||fS |S )a0  Pure python kmeans implementation. Follows the Faiss C++ version
    quite closely, but takes a DatasetAssign instead of a training data
    matrix. Also redo is not implemented.

    For the torch implementation, the centroids are tensors (possibly on GPU),
    but the indices remain numpy on CPU.
    zAClustering %d points in %dD to %d clusters, %d iterations seed %dz
preproc...F)rQ   replacez  doner   	assigningr   Tr   zcompute centroidsr   r   r   re   N)objr$   time_searchr   r   zM  Iteration %d (%.2f s, search %.2f s): objective=%g imbalance=%.3f nsplit=%dr$   r   r   zstoring centroids in)rp   rt   r   r
   r&   r   RandomStater$   r   rx   r   r.   r   r-   itemr1   r'   reshaper   r   
from_numpytodevicer   r   save)r   datar   seed
checkpointr   return_statsr   r9   r:   r   r=   permr#   r   r"   t_search_totr   r^   t0sr%   r   sumserrr   r   r   r   ss                                r	   kmeansr  Z  sa    ::<qA%	C 
$()1a'=	> ? 
		t	$B	,	B99QQ9.D%Ii(HOML
C5\ 1/iikKT...34T6		c))eeg((*C

3++f2oob!$++I6C1H""3'**4;;7C3J	#GY; YY[2%' 0F ;
 	 5ai=!1,-9 	
 	q!!&
3

9j1
I.c1/f /))r   )T   )NN)NN @  r  Nr   )r  i  NTF)r   numpyr&   r   r$   multiprocessing.poolr   scipy.sparser   ImportErrorr   r
   rK   rT   ra   r   r   r   r   r   r   r   r  r   r   r	   <module>r     s       +@	D*NF& &D,} ,($ HL-`&- &DJ
@-b CGRQ
  @	
>?@s   A A.-A.