import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from zipfile import ZipFile
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 12)

Before running the cells below, upload your Kaggle API token (kaggle.json) to the working directory, so the download does not fail.
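
As a small optional guard (not part of the original run), the check below simply confirms that the token file is present before the Kaggle CLI is invoked; the filename kaggle.json matches the path used in the copy command that follows.

import os

# Fail early with a clear message if the API token has not been uploaded yet.
assert os.path.exists('kaggle.json'), "Upload kaggle.json before running the download cell"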

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-apr-2022
Downloading tabular-playground-series-apr-2022.zip to /content
 96% 165M/171M [00:02<00:00, 78.6MB/s]
100% 171M/171M [00:02<00:00, 64.3MB/s]
with ZipFile('/content/tabular-playground-series-apr-2022.zip', 'r') as zf:
    zf.extractall('./')

Loading the data

train = pd.read_csv('train.csv')
train.head()
sequence subject step sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 sensor_05 sensor_06 sensor_07 sensor_08 sensor_09 sensor_10 sensor_11 sensor_12
0 0 47 0 -0.196291 0.112395 1.0 0.329204 -1.004660 -0.131638 -0.127505 0.368702 -0.1 -0.963873 -0.985069 0.531893 4.751492
1 0 47 1 -0.447450 0.134454 1.0 -0.658407 0.162495 0.340314 -0.209472 -0.867176 0.2 -0.301301 0.082733 -0.231481 0.454390
2 0 47 2 0.326893 -0.694328 1.0 0.330088 0.473678 1.280479 -0.094718 0.535878 1.4 1.002168 0.449221 -0.586420 -4.736147
3 0 47 3 0.523184 0.751050 1.0 0.976991 -0.563287 -0.720269 0.793260 0.951145 -0.3 -0.995665 -0.434290 1.344650 0.429241
4 0 47 4 0.272025 1.074580 1.0 -0.136283 0.398579 0.044877 0.560109 -0.541985 -0.9 1.055636 0.812631 0.123457 -0.223359
train.info()
train.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1558080 entries, 0 to 1558079
Data columns (total 16 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   sequence   1558080 non-null  int64  
 1   subject    1558080 non-null  int64  
 2   step       1558080 non-null  int64  
 3   sensor_00  1558080 non-null  float64
 4   sensor_01  1558080 non-null  float64
 5   sensor_02  1558080 non-null  float64
 6   sensor_03  1558080 non-null  float64
 7   sensor_04  1558080 non-null  float64
 8   sensor_05  1558080 non-null  float64
 9   sensor_06  1558080 non-null  float64
 10  sensor_07  1558080 non-null  float64
 11  sensor_08  1558080 non-null  float64
 12  sensor_09  1558080 non-null  float64
 13  sensor_10  1558080 non-null  float64
 14  sensor_11  1558080 non-null  float64
 15  sensor_12  1558080 non-null  float64
dtypes: float64(13), int64(3)
memory usage: 190.2 MB
sequence subject step sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 sensor_05 sensor_06 sensor_07 sensor_08 sensor_09 sensor_10 sensor_11 sensor_12
count 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06 1.558080e+06
mean 1.298350e+04 3.316331e+02 2.950000e+01 4.365526e-04 -1.034982e-03 -2.178045e-01 -2.156555e-03 -1.828903e-03 -1.651785e-03 -4.122917e-04 -2.620665e-05 -1.298393e-04 1.365584e-03 3.315801e-04 -3.733291e-03 -1.172605e-02
std 7.496318e+03 1.958257e+02 1.731811e+01 2.658684e+00 4.404200e+00 2.298002e+00 3.934184e+00 1.683685e+00 1.590818e+00 3.345143e+00 3.243428e+00 4.501534e+00 2.592913e+00 1.917333e+00 4.532568e+00 3.911767e+01
min 0.000000e+00 0.000000e+00 0.000000e+00 -3.750634e+02 -4.345977e+02 -3.165948e+01 -4.083761e+02 -2.362601e+01 -7.498280e+01 -4.705046e+02 -4.070115e+02 -5.361000e+02 -2.703468e+02 -4.341271e+01 -4.270586e+02 -6.125494e+02
25% 6.491750e+03 1.617500e+02 1.475000e+01 -5.000000e-01 -4.831933e-01 -6.461531e-01 -4.929204e-01 -4.729928e-01 -4.786836e-01 -4.927140e-01 -5.022901e-01 -5.000000e-01 -5.151734e-01 -4.787939e-01 -4.835391e-01 -5.805627e-01
50% 1.298350e+04 3.350000e+02 2.950000e+01 -3.091190e-03 3.151261e-03 0.000000e+00 0.000000e+00 -1.589577e-03 2.991773e-03 9.107468e-04 -2.290076e-03 0.000000e+00 -1.445087e-03 -1.655822e-03 3.086420e-03 0.000000e+00
75% 1.947525e+04 5.010000e+02 4.425000e+01 4.845440e-01 4.926471e-01 3.338469e-01 4.893805e-01 4.701565e-01 5.056096e-01 4.927140e-01 4.847328e-01 5.000000e-01 5.086705e-01 4.780386e-01 4.938272e-01 5.703325e-01
max 2.596700e+04 6.710000e+02 5.900000e+01 3.358246e+02 4.495914e+02 1.666667e+00 4.366504e+02 2.487286e+01 7.791548e+01 4.425009e+02 3.312542e+02 6.301000e+02 3.679812e+02 4.186559e+01 4.480206e+02 6.305111e+02
labels = pd.read_csv('train_labels.csv')
labels.head()
sequence state
0 0 0
1 1 1
2 2 1
3 3 1
4 4 1
sns.heatmap(train.corr(), annot=True, vmin=-1, vmax=1, cmap='RdYlGn')
[Output: correlation heatmap of the train columns]
test = pd.read_csv('test.csv')
test.head()
sequence subject step sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 sensor_05 sensor_06 sensor_07 sensor_08 sensor_09 sensor_10 sensor_11 sensor_12
0 25968 684 0 2.427357 19.639706 1.0 -1.466372 -1.289973 -4.207928 2.486339 -2.493893 8.0 -1.123555 -1.673048 10.980453 0.419011
1 25968 684 1 -4.950541 -21.747899 1.0 0.983186 -0.569053 1.845924 -3.887978 1.727481 -2.9 0.395231 -0.882233 -1.871399 -0.008525
2 25968 684 2 1.136012 -10.756303 1.0 1.016814 0.964157 2.454749 0.312386 1.154198 -5.6 1.114162 1.525273 -11.584362 0.139812
3 25968 684 3 0.806028 6.504202 1.0 -0.179646 0.969221 -1.035153 -0.457195 0.254962 -2.7 -0.588873 0.608761 -4.241770 -0.462916
4 25968 684 4 1.288253 5.552521 1.0 -0.493805 -1.036124 -1.126402 2.008197 -0.730534 0.0 0.899566 -1.259615 -0.472222 -0.121483

There are no missing values in the data.

if train.isna().any().any():
    print(train.isna().sum()/train.shape[0])
else:
    print("No Missing values")
No Missing values
train_all = train.merge(labels, on='sequence')

Preparation

def time_series_train_test_split(X, y, split_col, train_size=0.8):
    # Split on whole sequences rather than individual rows, so every sequence
    # ends up entirely in either the training set or the validation set.
    split_values = X[split_col].unique()
    train_len = int(len(split_values) * train_size)

    condition_mask = X[split_col].isin(split_values[:train_len])

    X_train, y_train = X[condition_mask], y[condition_mask]
    X_valid, y_valid = X[~condition_mask], y[~condition_mask]

    return X_train, X_valid, y_train, y_valid
X = train_all.drop(['state'], axis=1)
y = train_all['state']
X_train, X_valid, y_train, y_valid = time_series_train_test_split(X, y, 'sequence')
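
As a quick sanity check (not in the original notebook), the sketch below confirms that no sequence appears in both splits:

train_seqs = set(X_train['sequence'])
valid_seqs = set(X_valid['sequence'])

# The split is done on whole sequences, so the two sets must be disjoint.
assert train_seqs.isdisjoint(valid_seqs)
print(len(train_seqs), 'train sequences,', len(valid_seqs), 'validation sequences')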

Modelling

Approach-1

I use a classical machine learning model, a random forest, directly on the per-step rows and classify each row into its state.

model = RandomForestClassifier(n_jobs=-1)
model.fit(X_train, y_train)
RandomForestClassifier(n_jobs=-1)
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)
test_pred = model.predict(test)
print(f"Train Accuarcy: {model.score(X_train, y_train)}")
print(f"Valid Accuarcy: {model.score(X_valid, y_valid)}")
Train Accuarcy: 1.0
Valid Accuarcy: 0.6185923358367033
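
Row-level accuracy is not the competition metric; the leaderboard uses ROC AUC on one prediction per sequence. Below is a minimal sketch of a comparable local check, assuming the per-row probabilities are simply averaged within each sequence (this evaluation was not part of the original run):

from sklearn.metrics import roc_auc_score

# Probability of state 1 for every validation row.
valid_proba = model.predict_proba(X_valid)[:, 1]

# Aggregate to one score and one label per sequence before computing ROC AUC.
valid_seq = pd.DataFrame({
    'sequence': X_valid['sequence'].values,
    'proba': valid_proba,
    'state': y_valid.values,
}).groupby('sequence').agg({'proba': 'mean', 'state': 'first'})

print(f"Valid ROC AUC (per sequence): {roc_auc_score(valid_seq['state'], valid_seq['proba']):.4f}")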
test_all = pd.concat([test, pd.Series(test_pred)], axis=1)
test_all.head()
sequence subject step sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 sensor_05 sensor_06 sensor_07 sensor_08 sensor_09 sensor_10 sensor_11 sensor_12 0
0 25968 684 0 2.427357 19.639706 1.0 -1.466372 -1.289973 -4.207928 2.486339 -2.493893 8.0 -1.123555 -1.673048 10.980453 0.419011 0
1 25968 684 1 -4.950541 -21.747899 1.0 0.983186 -0.569053 1.845924 -3.887978 1.727481 -2.9 0.395231 -0.882233 -1.871399 -0.008525 0
2 25968 684 2 1.136012 -10.756303 1.0 1.016814 0.964157 2.454749 0.312386 1.154198 -5.6 1.114162 1.525273 -11.584362 0.139812 0
3 25968 684 3 0.806028 6.504202 1.0 -0.179646 0.969221 -1.035153 -0.457195 0.254962 -2.7 -0.588873 0.608761 -4.241770 -0.462916 1
4 25968 684 4 1.288253 5.552521 1.0 -0.493805 -1.036124 -1.126402 2.008197 -0.730534 0.0 0.899566 -1.259615 -0.472222 -0.121483 0

For each step of a particular sequence, the model predicts whether that row is in state 0 or state 1.

But the submission must contain a single state prediction per sequence, so the per-step predictions are aggregated: the mean prediction within each sequence is thresholded at 0.5, which amounts to a majority vote.

test_actual = test_all.groupby(['sequence']).agg({0: 'mean'}).reset_index()
test_actual[0] = np.where(test_actual[0] < 0.5, 0, 1)
test_actual.head()
sequence 0
0 25968 0
1 25969 0
2 25970 0
3 25971 1
4 25972 0
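
A hypothetical alternative (not what was submitted here): average the predicted probability of state 1 within each sequence and submit that score directly; since the metric is ROC AUC, soft scores usually rank better than hard 0/1 votes.

# Probability of state 1 for every test row, then one averaged score per sequence.
test_proba = model.predict_proba(test)[:, 1]
test_soft = (
    pd.DataFrame({'sequence': test['sequence'].values, 'state': test_proba})
    .groupby('sequence', as_index=False)['state']
    .mean()
)
test_soft.head()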
submission = pd.read_csv('/content/sample_submission.csv')
submission = submission.merge(test_actual, on='sequence').drop(['state'], axis=1).rename(columns={0: 'state'})
submission.to_csv('output.csv', index=False)
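
An optional sanity check before submitting (not in the original notebook): the written file should line up with the sample submission in columns, row count, and sequence ids.

sample = pd.read_csv('/content/sample_submission.csv')
out = pd.read_csv('output.csv')

# Same columns, same number of rows, and exactly the same sequence ids.
assert list(out.columns) == ['sequence', 'state']
assert len(out) == len(sample)
assert set(out['sequence']) == set(sample['sequence'])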
!kaggle competitions submit -c tabular-playground-series-apr-2022 -f output.csv -m "RF 100"
100% 95.5k/95.5k [00:02<00:00, 44.6kB/s]
Successfully submitted to Tabular Playground Series - Apr 2022

ROC AUC: 0.5