
    bcg6                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
ddlmZmZ  ej                  e      ZddZd Zd Zd Zdad Zd	 Zdd
ZddZddZd Zd Zd Zd Zd Zd Z d Z!d Z"d Z#d Z$d Z%y)    N   )
PATH_TYPESfspathc                 ,   t        j                         }t        j                  |       	 t        j                  | d      5 }|j                          d d d        t        j                  |       y # 1 sw Y   xY w# t        j                  |       w xY w)Nzr:gz)osgetcwdchdirtarfileopen
extractall)src_filedst_dircur_dirfs       L/var/www/html/bid-api/venv/lib/python3.12/site-packages/catboost/datasets.py_extractr      sb    iikGHHW\\(F+qLLN , 	 ,+ 	s#   A< A0A< 0A95A< <Bc                     t        j                         }t        | d      5 }	 |j                  d      }|sn|j	                  |       &	 d d d        |j                         S # 1 sw Y   |j                         S xY w)Nrbi   )hashlibmd5r   readupdate	hexdigest)pathhasherr   blocks       r   	_calc_md5r      sr    [[]F	dD	QFF5MEMM% 	  	 
  
 s   (A""A:c                     	 t        j                  |        y # t        $ r# t         j                  j	                  |       s Y y w xY wN)r   makedirsOSErrorr   isdirr   s    r   _ensure_dir_existsr$   &   s:    
D ww}}T" #s    )AAc                 F   t         j                  j                  |      rt        |      |k(  ry d }t	        | t
              st	        | t              r| n| f}|D ]:  }	 t        j                  j                  j                  j                  |||        n t#        d|      t        |      }||k7  rt#        dj%                  |||            y # t        j                  j                  j                  j                  t        f$ r t        j!                  d|       Y w xY w)Nc                 0    t         j                  d|       y )Nzdownloaded %s bytes)loggerdebug)blocknumbssizes      r   
reporthookz$_cached_download.<locals>.reporthook2   s    *D1    )r,   zfailed to download from %szBmd5 sum mismatch for url {url}; expected {expected}, but got {got})urlexpectedgot)r   r   isfiler   
isinstancelisttuplesixmovesurllibrequesturlretrieveerrorURLErrorIOErrorr'   r(   RuntimeErrorformat)r.   r   dstr,   urlsudst_md5s          r   _cached_downloadrC   .   s   	ww~~cy~42 S$':c5+A3wD	:II$$00CJ0O  7>>nG#~_ffCW g . / 	/  		  &&//9 	:LL5q9	:s   6CAD D c                  ~    t         2t        j                  j                  t        j                         d      a t         S )Ncatboost_cached_datasets)_cache_pathr   r   joinr    r-   r   _get_cache_pathrI   I   s)    ggll299;0JKr-   c                 H    t        | t              sJ d       t        |       ay )Nzexpected string or pathlib.Path)r2   r   r   rF   r#   s    r   set_cache_pathrK   P   s!    dJ'J)JJ',Kr-   c                    |r(t         j                  j                  t               |      nt	        j
                         }t         j                  j                  ||      }t         j                  j                  ||      }t         j                  j                  |      rt         j                  j                  |      sft        |       t	        j                         \  }	}
t        j                  |	       	 t        | ||
       t        |
|       t        j                  |
       |st	        j                         \  }}t	        j                         \  }}t        j                  |       t        j                  |       t        j                  ||       t        j                  ||       t        j                  |       ||}}||fS # t        j                  |
       w xY wr   )r   r   rG   rI   tempfilemkdtempexistsr$   mkstempcloserC   r   removereplaceshutilrmtree)r.   r   dataset_name
train_file	test_filecachedir_path
train_path	test_pathfile_descriptor	file_pathfd_new_trainnew_train_pathfd_new_testnew_test_paths                  r   _download_datasetrc   V   sL   
 AFrww||O-|<8K[K[K]Hh
3JXy1IGGNN:&277>>)+D8$%-%5%5%7"
!	!S#y1Y)IIi '/'7'7'9$n%-%5%5%7"]




:~.


9m,h .I
y   IIi s   5G Gc                     t        | |||||      \  }}	t        j                  |||      t        j                  |	||      }}
|s*t        j                  |       t        j                  |	       |
|fS )N)headersep)rc   pdread_csvr   rR   )r.   r   rV   rW   rX   rf   re   rY   r[   r\   traintests               r   _load_dataset_pdrk   t   sg    -c3jR[]bcJ	++jSA2;;yagmpCq4E
		*
		)$;r-   c                    t        j                  ||ft         j                  d      }t        | d      5 }t	        |      D ]\  \  }}t        j
                  |t         j                  |      }|j                  |k(  sJ d|dz   ||j                  fz         |||   d d  ^ 	 d d d        dz   |k(  sJ d||dz   fz         t        j                  |      S # 1 sw Y   5xY w)NF)dtypeorderr   )rn   rf   z=got too many columns at line %d (expected %d columns, got %d)r   z.got too many lines (expected %d lines, got %d))	npzerosfloat32r   	enumerate
fromstringr+   rg   	DataFrame)	r   	row_countcolumn_countrf   datasetr   line_idxlinerows	            r   _load_numeric_only_datasetr|   }   s     hh	<2"**CPG	dD	Q'lNHd--BJJC@C88|+  V-lpx{|p|  K  MP  MU  MU  pV  .V  V+ $'GHa  + 
 a<9$r&VZcempqeqYr&rr$<<   
	s   A+CCc                  4    d} d}d\  }}}t        | ||||      S )NzLhttps://storage.mds.yandex.net/get-devtools-opensource/233854/titanic.tar.gz 9c8bc61d545c6af244a1d37494df3fc3)titanic	train.csvtest.csvrk   r.   r   rV   rW   rX   s        r   r   r      s+    
XC
,C*L'L*iClJ	JJr-   c                  4    d} d}d\  }}}t        | ||||      S )NzKhttps://storage.mds.yandex.net/get-devtools-opensource/250854/amazon.tar.gz 8fe3eec12bfd9c4c532b24a181d0aa2c)amazonr   r   r   r   s        r   r   r      s+    
WC
,C*K'L*iClJ	JJr-   c            
      <    d} d}d\  }}}t        | ||||d dd      S )NzKhttps://storage.mds.yandex.net/get-devtools-opensource/233854/msrank.tar.gz 34fee225d02419adc106581f4eb36f2e)msrank	train.tsvtest.tsv	T)re   rf   rY   r   r   s        r   r   r      s4    
WC
,C*K'L*iClJ	RV\`hlmmr-   c                  8    d} d}d\  }}}t        | ||||d       S )NzOhttps://storage.mds.yandex.net/get-devtools-opensource/250854/msrank_10k.tar.gz 79c5b67397289c4c8b367c1f34629eae)
msrank_10kr   r   )re   r   r   s        r   r   r      s.    
[C
,C*O'L*iClJ	RVWWr-   c                  8    d} d}d\  }}}t        | ||||d      S )a  
    Contains information from kaggle [1], which is made available here under the Open Database License (ODbL) [2].

    Download "rotten_tomatoes" [1] data set.

    Will return two pandas.DataFrame-s, first with train part (rotten_tomatoes.data) and second with test part
    (rotten_tomatoes.test) of the dataset.

    NOTE: This is a preprocessed version of the dataset.

    [1]: https://www.kaggle.com/rpnuser8182/rotten-tomatoes
    [2]: https://opendatacommons.org/licenses/odbl/1-0/index.html
    z@https://catboost-opensource.s3.yandex.net/rotten_tomatoes.tar.gz a07fed612805ac9e17ced0d82a96add4)rotten_tomatoes	learn.tsvr   r   rf   r   r   s        r   r   r      s0     MC
,C*T'L*iClJ	tTTr-   c                  8    d} d}d\  }}}t        | ||||d      S )Nz5https://catboost-opensource.s3.yandex.net/imdb.tar.gz 0fd62578d631ac3d71a71c3e6ced6f8b)imdbr   r   r   r   r   r   s        r   r   r      s-    
AC
,C*I'L*iClJ	tTTr-   c                  z    d} d}d\  }}}t        | ||||d      \  }}t        |ddd	      t        |d
dd	      fS )a  
    Download "epsilon" [1] data set.

    Will return two pandas.DataFrame-s, first with train part (epsilon_normalized) and second with
    test part (epsilon_normalized.t) of the dataset. Object class will be located in the first
    column of dataset.

    NOTE: This is a preprocessed version of the dataset. It was converted from libsvm format into
    tsv (CatBoost doesn't support libsvm format out of the box).

    [1]: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#epsilon
    )z.https://proxy.sandbox.yandex-team.ru/785711439zLhttps://storage.mds.yandex.net/get-devtools-opensource/250854/epsilon.tar.gz 5bbfac403ac673da7d7ee84bd532e973)epsilonr   r   TrY   i i  r   r   i rc   r|   )r@   r   rV   rW   rX   r[   r\   s          r   r   r      s`    ZD -C*L'L*i-dCzS\dhiJ	":vtF"9fdEG Gr-   c            	      :    d} d}d\  }}}t        | ||||dd      S )a  
    Dataset with monotonic constraints.
    Can be used for poisson regression.
    Has several numerical and several categorical features.
    The first column contains target values. Columns with names Cat* contain categorical features.
    Columns with names Num* contain numerical features.

    Dataset also contains several numerical features, for which monotonic constraints must hold.
    For features in columns named MonotonicNeg*, if feature value decreases, then prediction value must not decrease.
    Thus, if there are two samples x1, x2 with all features being equal except
    for a monotonic negative feature M, such that x1[M] > x2[M], then the following inequality must
    hold for predictions: f(x1) <= f(x2)
    zOhttps://storage.mds.yandex.net/get-devtools-opensource/479623/monotonic1.tar.gz 1b9d8e15bc3fd6f1498e652e7fc4f4ca)
monotonic1r   r   r   T)rf   rY   r   r   s        r   r   r      s3     \C
,C*O'L*iClJ	t[_``r-   c                  8    d} d}d\  }}}t        | ||||d      S )a  
    Dataset with monotonic constraints.
    Can be used for regression.
    The first column contains target values.
    Other columns contain contain numerical features, for which monotonic constraints must hold.

    For features in columns named MonotonicNeg*, if feature value decreases, then prediction
    value must not decrease. Thus, if there are two samples x1, x2 with all features being
    equal except for a monotonic negative feature MNeg, such that x1[MNeg] > x2[MNeg], then
    the following inequality must hold for predictions: f(x1) <= f(x2)
    For features in columns named MonotonicPos*, if feature value decreases, then prediction
    value must not increase. Thus, if there are two samples x1, x2 with all features being
    equal except for a monotonic positive feature MPos, such that x1[MPos] > x2[MPos],
    then the following inequality must hold for predictions: f(x1) >= f(x2)
    zOhttps://storage.mds.yandex.net/get-devtools-opensource/250854/monotonic2.tar.gz ce559e212cb72c156269f6f9a641baca)
monotonic2r   r   r   r   r   r   s        r   r   r      s0      \C
,C*O'L*iClJ	tTTr-   c                     d} t         t        t         t        t         t        t        t        t        t        t         t         t         t        t        d}d}d}t        j                         \  }}t	        j
                  |       t        |||       d}d}t        j                         \  }}	t	        j
                  |       t        |||	       t        j                  || dddgd	
      }
t	        j                  |       t        j                  |	| dddgddd id	      }t	        j                  |	       |
j                  |      }
|j                  |      }|
|fS )a  
    Download "Adult Data Set" [1] from UCI Machine Learning Repository.

    Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part
    (adult.test) of the dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/Adult
    )age	workclassfnlwgt	educationzeducation-numzmarital-status
occupationrelationshipracesexzcapital-gainzcapital-losszhours-per-weekznative-countryincome)z.https://proxy.sandbox.yandex-team.ru/779118052zJhttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data 5d7c39d7b8804f071cdd1f2a7c460872)z.https://proxy.sandbox.yandex-team.ru/779120000zJhttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test 35238206dfdf7f1fe215bbb874adecdcNz,\s*?python)namesre   rf   	na_valuesenginer   r   c                     | d d S )NrH   )xs    r   <lambda>zadult.<locals>.<lambda>5  s      JK  LO  MO  JPr-   )r   re   rf   r   skiprows
convertersr   )floatobjectrM   rP   r   rQ   rC   rg   rh   rR   astype)r   rn   
train_urls	train_md5fd_trainr[   	test_urlstest_md5fd_testr\   train_dftest_dfs               r   adultr   	  sA   &E
 6U&vu F6EXJ 3I#++-HjHHXZJ7XI 2H!))+GYHHWY)4{{:U4WY\X]fnoHIIj kk)57WZV[fgu}  @P  uQ  Zb  cGIIi u%HnnU#GWr-   c                  z    d} d}d\  }}}t        | ||||d      \  }}t        |ddd	      t        |d
dd	      fS )a  
    Download "higgs" [1] data set.

    Will return two pandas.DataFrame-s, first with train part and second with
    test part of the dataset. Object class will be located in the first
    column of dataset.

    [1]: https://archive.ics.uci.edu/ml/datasets/HIGGS
    zJhttps://storage.mds.yandex.net/get-devtools-opensource/250854/higgs.tar.gz ad59ba8328a9afa3837d7bf1a0e10e7b)higgsr   r   Tr   i7    r   r   i  r   )r.   r   rV   rW   rX   r[   r\   s          r   r   r   @  s^     WC
,C*J'L*i-c3jR[cghJ	":xF"9fbdCE Er-   ).)F),inferF)r   )&r   loggingnumpyrp   r   pandasrg   r
   rM   r5   rT   corer   r   	getLogger__name__r'   r   r   r$   rC   rF   rI   rK   rc   rk   r|   r   r   r   r   r   r   r   r   r   r   r   rH   r-   r   <module>r      s       	    
  $ 
		8	$/0 !<!&KKnXU(UG0a(U,4nEr-   