web-dev-qa-db-fra.com

ValueError: Unknown label type: 'continuous' (type d'étiquette inconnu : « continu »)

J'ai vu d'autres articles qui en parlent, mais aucun d'entre eux n'a pu m'aider. J'utilise Jupyter Notebook avec Python 3.6.0 sur une machine Windows x6. J'ai un grand ensemble de données, mais je n'en garde qu'une partie pour exécuter mes modèles:

Ceci est un morceau de code que j'ai utilisé:

# Data preparation: select columns, impute missing values, then split into
# training and validation sets.

# Keep only the eight columns of interest, in this order (indices 0..7).
df = loan_2.reindex(columns= ['term_clean','grade_clean', 'annual_inc', 'loan_amnt', 'int_rate','purpose_clean','installment','loan_status_clean'])
# NOTE(review): the result of this expression is not assigned, so `df` itself
# is presumably left unchanged (no forward-fill, no integer cast) -- confirm.
df.fillna(method= 'ffill').astype(int)
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
# Column-wise (axis=0) median imputation of NaN entries.
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
array = df.values
y = df['loan_status_clean'].values
imp.fit(array)
array_imp = imp.transform(array)

# The label vector is reshaped to a single row and imputed separately.
# NOTE(review): `y2` and `y_imp` are not used anywhere below in this listing.
y2= y.reshape(1,-1)
imp.fit(y2)
y_imp= imp.transform(y2)
# Features: columns 0..3 of the imputed array.
X = array_imp[:,0:4]
# NOTE(review): going by the column list above, index 4 is 'int_rate', not
# 'loan_status_clean' (index 7). If 'int_rate' holds real-valued rates, this
# target is continuous, which would explain the "Unknown label type" error
# raised by the classifiers -- confirm which column is intended.
Y = array_imp[:,4]
validation_size = 0.20
seed = 7
# NOTE(review): `model_selection` is used here, but its import appears only
# further down in this listing; it must already be in scope when this runs.
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
seed = 7
# Metric name passed to cross_val_score in the evaluation loop.
scoring = 'accuracy'

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import  BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Spot-check algorithms: build a list of (short name, estimator) pairs, each
# estimator created with its default parameters.
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('BNB', BernoulliNB()))
models.append(('RF', RandomForestClassifier()))
# NOTE(review): the label says 'GBM' but the estimator is AdaBoostClassifier.
models.append(('GBM', AdaBoostClassifier()))
models.append(('NN', MLPClassifier()))
models.append(('SVM', SVC()))

# Evaluate each model in turn with 10-fold cross-validation on the training
# split, collecting the per-fold scores and printing "name: mean (std)".
results = []
names = []
for name, model in models:
    # NOTE(review): random_state is passed without shuffle=True; per the
    # scikit-learn KFold documentation it only has an effect when shuffling,
    # and newer releases reject this combination -- confirm against the
    # installed version.
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

Lorsque j'exécute le dernier morceau de code, cette erreur apparaît:


ValueError                                Traceback (most recent call last)
<ipython-input-262-1e6860ba615b> in <module>()
      4 for name, model in models:
      5         kfold = model_selection.KFold(n_splits=10, random_state=seed)
----> 6         cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
      7         results.append(cv_results)
      8         names.append(name)

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
    138                                               train, test, verbose, None,
    139                                               fit_params)
--> 140                       for train, test in cv_iter)
    141     return np.array(scores)[:, 0]
    142 

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the Edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
    239 
    240     except Exception as e:

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1172         X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
   1173                          order="C")
-> 1174         check_classification_targets(y)
   1175         self.classes_ = np.unique(y)
   1176         n_samples, n_features = X.shape

C:\Users\dalila\Anaconda\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171             'multilabel-indicator', 'multilabel-sequences']:
--> 172         raise ValueError("Unknown label type: %r" % y_type)
    173 
    174 

ValueError: Unknown label type: 'continuous'

Précision: en général, mes données ne contiennent ni NaN ni valeurs manquantes.

7
Dalila

La solution à votre problème est d'utiliser un modèle de régression au lieu d'un modèle de classification. Donc, au lieu de ces deux lignes:

from sklearn.svm import SVC
..
..
models.append(('SVM', SVC()))

utilisez celles-ci:

from sklearn.svm import SVR
..
..
models.append(('SVM', SVR()))
13
Tamer Farrag

Le classificateur n'attend dans Y_train que des valeurs entières (étiquettes de classes). Or il reçoit des flottants, ce qui déclenche cette erreur. Si vous effectuez une régression, utilisez des régresseurs au lieu des classificateurs. Si vous avez besoin d'une classification, vérifiez Y_train. Peut-être que cette partie de votre code le transforme en flottants:

imp = Imputer(missing_values='NaN', strategy='median', axis=0)
array = df.values
imp.fit(array)
array_imp = imp.transform(array)
Y = array_imp[:,4]

essayez de la remplacer par:

# Build the raw array first: `array` must exist before Y is sliced from it.
array = df.values
# NOTE(review): going by the column list in the question, index 4 is
# 'int_rate'; the label 'loan_status_clean' is at index 7 -- confirm which
# column is intended as the target.
Y = array[:,4] # take it from not changed data
# Column-wise median imputation; fit and transform run after Y has been
# extracted, so the labels keep their original (non-imputed) values.
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(array)
array_imp = imp.transform(array)
5
malugina