
    rh/                        d dl Z d dlZd dlZd dlZddlmZmZmZm	Z	 ddl
mZ  G d d      Z G d de      Z ej                         Zd	d
de dfD ]#  ae j"                  j%                  t               s# n dad Z G d de      Zd Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      ZddZy)    N   )
fvecs_read
ivecs_read
bvecs_mmap
fvecs_mmap)knnc                   N    e Zd ZdZd Zd ZddZd ZddZddZ	dd	Z
d
 Zd Zy)Datasetz+ Generic abstract class for a test dataset c                 J    d| _         d| _        d| _        d| _        d| _        y)z2 the constructor should set the following fields: L2Ndmetricnqnbntselfs    i/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/faiss/contrib/datasets.py__init__zDataset.__init__   s%        c                     t               )z' return the queries as a (nq, d) array NotImplementedErrorr   s    r   get_querieszDataset.get_queries       !##r   Nc                     t               )z' return the queries as a (nt, d) array r   r   maxtrains     r   	get_trainzDataset.get_train   r   r   c                     t               )z' return the queries as a (nb, d) array r   r   s    r   get_databasezDataset.get_database"   r   r   c              #      K   | j                         }|\  }}| j                  |z  |z  | j                  |dz   z  |z  }}t        |||      D ]  }||t        ||z   |         yw)a7  returns an iterator on database vectors.
        bs is the number of vectors per batch
        split = (nsplit, rank) means the dataset is split in nsplit
        shards and we want shard number rank
        The default implementation just iterates over the full matrix
        returned by get_dataset.
        r   N)r#   r   rangemin	r   bssplitxbnsplitranki0i1j0s	            r   database_iteratorzDataset.database_iterator&   sw       46)477dQh+?6+IBB# 	+BRR"Wb)**	+s   A$A&c                     t               )z7 return the ground truth for k-nearest neighbor search r   r   ks     r   get_groundtruthzDataset.get_groundtruth4   r   r   c                     t               )z* return the ground truth for range search r   )r   threshs     r   get_groundtruth_rangezDataset.get_groundtruth_range8   r   r   c           
          d| j                    d| j                   d| j                   d| j                   d| j                   
S )Nzdataset in dimension z, with metric z
, size: Q z B z T r   r   s    r   __str__zDataset.__str__<   sD    'x~dkk] K77)3twwis477)= 	>r   c                    | j                         j                  | j                  | j                  fk(  sJ | j                  dkD  rA| j                  d      }|j                  d| j                  fk(  sJ d|j                         | j                         j                  | j                  | j                  fk(  sJ | j                  d      j                  | j                  dfk(  sJ y)z8 runs the previous and checks the sizes of the matrices r   {   )r    zshape=   )r3   N)	r   shaper   r   r   r!   r#   r   r4   )r   xts     r   check_sizeszDataset.check_sizes@   s    !''DGGTVV+<<<<77Q;-B88TVV},GBHH.GG,  "((TWWdff,====##b#)//DGGR=@@@r   N   )r   r   )__name__
__module____qualname____doc__r   r   r!   r#   r0   r4   r7   r9   r?    r   r   r
   r
      s3    5$$$+$$>Ar   r
   c                   4    e Zd ZdZddZd Zd	dZd Zd
dZy)SyntheticDatasetzOA dataset that is not completely random but still challenging to
    index
    c                    t         j                  |        ||||f\  | _        | _        | _        | _        d}||z   |z   }t        j                  j                  |      }	|	j                  ||f      }
t        j                  |
|	j                  ||            }
|
|	j                  |      dz  dz   z  }
t        j                  |
      }
|
j                  d      }
|| _        |
d | | _        |
|||z    | _        |
||z   d  | _        y )N
   )size   g?float32)r
   r   r   r   r   r   nprandomRandomStatenormaldotrandsinastyper   r>   r*   xq)r   r   r   r   r   r   seedd1nrsxs              r   r   zSyntheticDataset.__init__O   s    ,-r2rM)$'GbLYY""4(IIAr7I#FF1bggb!n% a#%&FF1IHHYCR&BrBw-BGH+r   c                     | j                   S r@   )rW   r   s    r   r   zSyntheticDataset.get_queriesa       wwr   Nc                 @    ||n| j                   }| j                  d | S r@   )r   r>   r   s     r   r!   zSyntheticDataset.get_traind   s#    '38wwy!!r   c                     | j                   S r@   )r*   r   s    r   r#   zSyntheticDataset.get_databaseh   r^   r   c                     t        | j                  | j                  || j                  dk(  rt        j
                        d   S t        j                        d   S )Nr   r   )r   rW   r*   r   faiss	METRIC_L2METRIC_INNER_PRODUCTr2   s     r   r4   z SyntheticDataset.get_groundtruthk   sW    GGTWWa#{{d2EOO
  	8=8R8R
  	r   )r   i:  r@   )d   	rC   rD   rE   rF   r   r   r!   r#   r4   rG   r   r   rI   rI   J   s     $"r   rI   z/datasets01/simsearch/041218/z7/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/z/home/z/simsearch/data/zdata/c                     | a y r@   )dataset_basedir)paths    r   set_dataset_basedirrj      s    Or   c                   2    e Zd ZdZd Zd ZddZd ZddZy)	DatasetSIFT1M_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1M)
    c                     t         j                  |        d\  | _        | _        | _        | _        t        dz   | _        y )N)rB   順 @B '  zsift1M/r
   r   r   r   r   r   rh   basedirr   s    r   r   zDatasetSIFT1M.__init__   2    ,G)$'&2r   c                 2    t        | j                  dz         S )Nzsift_query.fvecsr   rs   r   s    r   r   zDatasetSIFT1M.get_queries       $,,);;<<r   Nc                 X    ||n| j                   }t        | j                  dz         d | S )Nzsift_learn.fvecsr   r   rs   r   s     r   r!   zDatasetSIFT1M.get_train   .    '38$,,);;<YhGGr   c                 2    t        | j                  dz         S )Nzsift_base.fvecsrv   r   s    r   r#   zDatasetSIFT1M.get_database       $,,)::;;r   c                 ^    t        | j                  dz         }||dk  sJ |d d d |f   }|S )Nzsift_groundtruth.ivecsre   r   rs   r   r3   gts      r   r4   zDatasetSIFT1M.get_groundtruth   <    '??@=8O8ArrEB	r   r@   rf   rG   r   r   rl   rl      !    
3
=H<r   rl   c                 0    t        j                  | d      S )NrN   dtype)rO   ascontiguousarray)r\   s    r   sanitizer      s    33r   c                   <    e Zd ZdZd	dZd Zd
dZd
dZd ZddZ	y)DatasetBigANNz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1B)
    c                     t         j                  |        |dv sJ || _        |dz  }dd|df\  | _        | _        | _        | _        t        dz   | _        y )N)
r         rK      2   re      i    rp   rB    rq   zbigann/)	r
   r   nb_Mr   r   r   r   rh   rs   )r   r   r   s      r   r   zDatasetBigANN.__init__   sZ    AAAA	E\,/E,A)$'&2r   c                 J    t        t        | j                  dz         d d        S )Nzbigann_query.bvecs)r   r   rs   r   s    r   r   zDatasetBigANN.get_queries   s!    
4<<2F#FGJKKr   Nc                 j    ||n| j                   }t        t        | j                  dz         d |       S )Nzbigann_learn.bvecs)r   r   r   rs   r   s     r   r!   zDatasetBigANN.get_train   s3    '38
4<<2F#FG	RSSr   c                 x    t        | j                  d| j                  z  z         }||dk  sJ |d d d |f   }|S )Nzgnd/idx_%dM.ivecsre   )r   rs   r   r   s      r   r4   zDatasetBigANN.get_groundtruth   sE    ':TYY'FFG=8O8ArrEB	r   c                     | j                   dk  sJ d       t        t        | j                  dz         d | j                         S )Nre   dataset too large, use iteratorbigann_base.bvecs)r   r   r   rs   r   r   s    r   r#   zDatasetBigANN.get_database   s=    yy3A AA
4<<2E#EFxPQQr   c           	   #      K   t        | j                  dz         }|\  }}| j                  |z  |z  | j                  |dz   z  |z  }}t        |||      D ]  }t	        ||t        ||z   |              ! y w)Nr   r   )r   rs   r   r%   r   r&   r'   s	            r   r0   zDatasetBigANN.database_iterator   s     '::;46)477dQh+?6+IBB# 	5B2b#b2gr"2344	5   A5A7)r   r@   rA   
rC   rD   rE   rF   r   r   r!   r4   r#   r0   rG   r   r   r   r      s(    
3LTR5r   r   c                   <    e Zd ZdZd	dZd Zd
dZd
dZd ZddZ	y)DatasetDeep1Bzv
    See
    https://github.com/facebookresearch/faiss/tree/main/benchs#getting-deep1b
    on how to get the data
    c                     t         j                  |        dddddd}||v sJ dd|d	f\  | _        | _        | _        | _        t        d
z   | _        | j                  d|| j                     d| _        y )N100k1M10M100M1B)ro   rp   i r    ʚ;`   i]rq   zdeep1b/deepz_groundtruth.ivecs)	r
   r   r   r   r   r   rh   rs   gt_fname)r   r   
nb_to_names      r   r   zDatasetDeep1B.__init__   s|    

 Z,.	2u,D)$'&2LL*TWW-/r   c                 D    t        t        | j                  dz               S )Nzdeep1B_queries.fvecs)r   r   rs   r   s    r   r   zDatasetDeep1B.get_queries   s    
4<<2H#HIJJr   Nc                 j    ||n| j                   }t        t        | j                  dz         d |       S )Nzlearn.fvecs)r   r   r   rs   r   s     r   r!   zDatasetDeep1B.get_train   s2    '38
4<<-#?@(KLLr   c                 X    t        | j                        }||dk  sJ |d d d |f   }|S )Nre   )r   r   r   s      r   r4   zDatasetDeep1B.get_groundtruth   s6    &=8O8ArrEB	r   c                     | j                   dk  sJ d       t        t        | j                  dz         d | j                          S )Nr   r   
base.fvecs)r   r   r   rs   r   s    r   r#   zDatasetDeep1B.get_database   s>    ww%B!BB
4<<,#>?IJJr   c           	   #      K   t        | j                  dz         }|\  }}| j                  |z  |z  | j                  |dz   z  |z  }}t        |||      D ]  }t	        ||t        ||z   |              ! y w)Nr   r   )r   rs   r   r%   r   r&   r'   s	            r   r0   zDatasetDeep1B.database_iterator   s     |3446)477dQh+?6+IBB# 	5B2b#b2gr"2344	5r   )r   r@   rA   r   rG   r   r   r   r      s(    /KMK5r   r   c                   ,    e Zd ZdZddZd Zd ZddZy)	DatasetGlovezD
    Data from http://ann-benchmarks.com/glove-100-angular.hdf5
    Nc                    dd l }|rJ d       |s	t        dz   }|j                  |d      | _        d| _        d\  | _        | _        | j                  d   j                  d   | _        | j                  d   j                  d   | _	        y )	Nr   znot implementedzglove/glove-100-angular.hdf5rIP)re   r   traintest)
h5pyrh   File
glove_h5pyr   r   r   r=   r   r   )r   locdownloadr   s       r   r   zDatasetGlove.__init__  s}    ...|!$BBC))C- //'*003//&)//2r   c                 t    t        j                  | j                  d         }t        j                  |       |S )Nr   rO   arrayr   rb   normalize_L2r   rW   s     r   r   zDatasetGlove.get_queries  s,    XXdoof-.2	r   c                 t    t        j                  | j                  d         }t        j                  |       |S )Nr   r   r   r*   s     r   r#   zDatasetGlove.get_database  s,    XXdoog./2	r   c                 L    | j                   d   }||dk  sJ |d d d |f   }|S )N	neighborsre   )r   r   s      r   r4   zDatasetGlove.get_groundtruth  s6    __[)=8O8ArrEB	r   )NFr@   rC   rD   rE   rF   r   r   r#   r4   rG   r   r   r   r     s    
3

r   r   c                   *    e Zd ZdZd Zd Zd ZddZy)DatasetMusic100zO
    get dataset from
    https://github.com/stanis-morozov/ip-nsw#dataset
    c                     t         j                  |        d\  | _        | _        | _        | _        d| _        t        dz   | _        y )N)re   r   rp   rq   r   z
music-100/)	r
   r   r   r   r   r   r   rh   rs   r   s    r   r   zDatasetMusic100.__init__,  s9    ,@)$'&5r   c                 r    t        j                  | j                  dz   d      }|j                  dd      }|S )Nzquery_music100.binrN   r   r   re   rO   fromfilers   reshaper   s     r   r   zDatasetMusic100.get_queries2  s1    [[(<<INZZC 	r   c                 r    t        j                  | j                  dz   d      }|j                  dd      }|S )Nzdatabase_music100.binrN   r   r   re   r   r   s     r   r#   zDatasetMusic100.get_database7  s1    [[(??yQZZC 	r   Nc                 r    t        j                  | j                  dz         }||dk  sJ |d d d |f   }|S )Nzgt.npyre   )rO   loadrs   r   s      r   r4   zDatasetMusic100.get_groundtruth<  s?    WWT\\H,-=8O8ArrEB	r   r@   r   rG   r   r   r   r   &  s    
6

r   r   c                   2    e Zd ZdZd Zd ZddZd ZddZy)	DatasetGIST1Mrm   c                     t         j                  |        d\  | _        | _        | _        | _        t        dz   | _        y )N)i  ro   rp   rq   zgist1M/rr   r   s    r   r   zDatasetGIST1M.__init__I  rt   r   c                 2    t        | j                  dz         S )Nzgist_query.fvecsrv   r   s    r   r   zDatasetGIST1M.get_queriesN  rw   r   Nc                 X    ||n| j                   }t        | j                  dz         d | S )Nzgist_learn.fvecsry   r   s     r   r!   zDatasetGIST1M.get_trainQ  rz   r   c                 2    t        | j                  dz         S )Nzgist_base.fvecsrv   r   s    r   r#   zDatasetGIST1M.get_databaseU  r|   r   c                 ^    t        | j                  dz         }||dk  sJ |d d d |f   }|S )Nzgist_groundtruth.ivecsre   r~   r   s      r   r4   zDatasetGIST1M.get_groundtruthX  r   r   r@   rf   rG   r   r   r   r   C  r   r   r   c                    | dk(  r
t               S | dk(  r
t               S | j                  d      r!| dk(  rdnt        | dd       }t	        |      S | j                  d	      rW| d
d }|d   dk(  rdt        |dd       z  }n,|dk(  rd}n$|d   dk(  rdt        |dd       z  }n
J d|z          t        |      S | dk(  r
t               S | dk(  rt        |      S t        d| z         )z converts a string describing a dataset to a Dataset object
    Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove
    sift1Mgist1Mbigannbigann1Br      r   )r   r   rM   NMrp   r   r   r3   zdid not recognize suffix )r   z	music-100glove)r   zunknown dataset )	rl   r   
startswithintr   r   r   r   RuntimeError)datasetr   dbsizeszsufs       r   dataset_from_namer   `  s   
 (	H				H	% J.C"4F&))			F	#9s5":.Fd]F2Y#Ccr
O+F=5==5''	K	  	G	X.. -788r   )deep1MF)osnumpyrO   rb   getpassvecs_ior   r   r   r   exhaustive_searchr   r
   rI   getuserusernamerh   ri   existsrj   rl   r   r   r   r   r   r   r   rG   r   r   <module>r      s    
    D C "8A 8Av%w %\ 7?? 	(A

*+- O 
ww~~o& O
G :4%5G %5P-5G -5` 7  Fg :G :#9r   