Lending Club の膨大なデータフレーム(550MB)があり([1] で入手可能)、そのグレード(grade)のクラスを予測する必要があります。 Dask のデータフレームは次のとおりです。

    Unnamed: 0  Unnamed: 0.1    loan_amnt   funded_amnt funded_amnt_inv term    int_rate    installment annual_inc  issue_d ... addr_state_SD   addr_state_TN   addr_state_TX   addr_state_UT   addr_state_VA   addr_state_VT   addr_state_WA   addr_state_WI   addr_state_WV   addr_state_WY
0   41131   931434  24000   24000   24000.0 0   8.49    757.51  80000.0 2015    ... 0   0   0   0   0   0   1   0   0   0
1   41132   942549  6000    6000    6000.0  0   11.22   197.06  52000.0 2015    ... 0   0   0   0   0   0   0   0   0   0
2   41135   931619  8000    8000    8000.0  0   9.80    257.39  55000.0 2015    ... 0   0   0   0   0   0   0   0   0   0
3   41136   935204  19975   19975   19975.0 1   12.88   453.27  92000.0 2015    ... 0   0   0   0   0   0   0   0   0   0
...

したがって、これはマルチクラス分類の問題です。 しかし、Pandas を使用してデータをインポートしようとすると、フリーズしたようになります。 Dask を使用すると、次のエラーが発生します。

NotImplementedError: 'DataFrame.iloc' only supports selecting columns. It must be used like 'df.iloc[:, column_indexer]'.

それでは、そもそもこのデータセットを分類するにはどうすればよいですか?

ここに私のコードがあります

import dask.dataframe as dd

# predicting model
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# define baseline model
def baseline_model():
    """Build and compile the baseline feed-forward softmax classifier.

    Architecture, in order:
      * Dense(100, relu) with a max-norm kernel constraint of 3, then Dropout(0.2)
      * Dense(60, relu) with a max-norm kernel constraint of 3, then Dropout(0.2)
      * Dense(output_dim, softmax)

    The model is compiled with categorical cross-entropy loss, the Adam
    optimizer and the accuracy metric.

    NOTE(review): ``input_dim`` and ``output_dim`` are read as module-level
    names; the visible code never assigns them, so they must be defined
    before this function is called.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Imported locally because neither name is bound by the file's
    # top-level imports; without them the function raises NameError.
    from keras.constraints import MaxNorm
    from keras.layers import Dropout

    model = Sequential()

    # input layer
    model.add(Dense(100, input_dim=input_dim, activation='relu',
                    kernel_constraint=MaxNorm(max_value=3)))
    model.add(Dropout(0.2))

    # hidden layer
    model.add(Dense(60, activation='relu',
                    kernel_constraint=MaxNorm(max_value=3)))
    model.add(Dropout(0.2))

    # output layer: one softmax unit per class
    model.add(Dense(output_dim, activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

from dask.distributed import Client

# load dataset
# NOTE(review): `result` is not defined in the visible code; presumably it is
# the Dask dataframe previewed earlier in this post -- confirm.
# X keeps every column whose label is not 'TARGET' (boolean mask over the
# column labels).
X = result.loc[:, result.columns != 'TARGET']
# Y is the 'TARGET' column.
# NOTE(review): Y is not referenced again in the visible code; the split
# further down passes result['TARGET'] directly.
Y = result['TARGET']
import sklearn.model_selection
from sklearn.model_selection import train_test_split

from sklearn.externals import joblib
from sklearn.externals.joblib import parallel_backend

# Create a dask.distributed Client with default arguments.
client = Client()

# Everything below runs inside joblib's 'dask' parallel-backend context.
with joblib.parallel_backend('dask'):
    print("Before train test split")
    # 80/20 split with a fixed seed.
    # NOTE(review): this is the call at which the traceback quoted in this
    # post raises NotImplementedError ('DataFrame.iloc' only supports
    # selecting columns): scikit-learn indexes rows with X.iloc[indices].
    train_X, test_X, train_y, test_y = train_test_split(X,result['TARGET'], test_size = 0.2, random_state = 0)
    print("before one hot encoder 1")
    # One-hot encode the training labels.
    # NOTE(review): `pd` is not bound by the visible imports (only
    # `import pandas` appears) -- confirm it is imported elsewhere.
    train_y = pd.get_dummies(train_y)
    print("before one hot encoder 1")
    # One-hot encode the test labels.
    test_y = pd.get_dummies(test_y)
    print("Before Keras Classifier")
    # scikit-learn wrapper around baseline_model: 200 epochs, batch size 5.
    estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=1)
    # 10-fold cross-validation with shuffling; no random_state is set.
    kfold = KFold(n_splits=10, shuffle=True)
    # NOTE(review): `YDummies` is not defined anywhere in the visible code,
    # and this call uses the full X rather than the train/test split above.
    results = cross_val_score(estimator, X, YDummies, cv=kfold)
    # Report mean and standard deviation of the fold scores as percentages.
    print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

完全なエラーメッセージを次に示します。

C:\Users\antoi\AppData\Roaming\Python\Python36\site-packages\sklearn\externals\joblib\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=DeprecationWarning)
Before train test split
---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-8-7bcf8dee84cb> in <module>
      6 with joblib.parallel_backend('dask'):
      7     print("Before train test split")
----> 8     train_X, test_X, train_y, test_y = train_test_split(X,result['TARGET'], test_size = 0.2, random_state = 0)
      9     print("before one hot encoder 1")
     10     train_y = pd.get_dummies(train_y)

~\AppData\Roaming\Python\Python36\site-packages\sklearn\model_selection\_split.py in train_test_split(*arrays, **options)
   2122 
   2123     return list(chain.from_iterable((safe_indexing(a, train),
-> 2124                                      safe_indexing(a, test)) for a in arrays))
   2125 
   2126 

~\AppData\Roaming\Python\Python36\site-packages\sklearn\model_selection\_split.py in <genexpr>(.0)
   2122 
   2123     return list(chain.from_iterable((safe_indexing(a, train),
-> 2124                                      safe_indexing(a, test)) for a in arrays))
   2125 
   2126 

~\AppData\Roaming\Python\Python36\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
    206         # Pandas Dataframes and Series
    207         try:
--> 208             return X.iloc[indices]
    209         except ValueError:
    210             # Cython typed memoryviews internally used in pandas do not support

~\AppData\Roaming\Python\Python36\site-packages\dask\dataframe\indexing.py in __getitem__(self, key)
     52         )
     53         if not isinstance(key, tuple):
---> 54             raise NotImplementedError(msg)
     55 
     56         if len(key) > 2:

NotImplementedError: 'DataFrame.iloc' only supports selecting columns. It must be used like 'df.iloc[:, column_indexer]'.

今のところ思いつく唯一のアイデアは、次のようにチャンク単位で処理することです:

for chunk in pd.read_csv(<filepath>, chunksize=<your_chunksize_here>)
    do_processing()
    train_algorithm()