import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from zipfile import ZipFile
from matplotlib import pyplot as plt

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (20, 12)

Before running the cell below, upload your Kaggle API token (`kaggle.json`) so that the download command doesn't fail with an authentication error.

# Place the Kaggle API token where the CLI expects it, with owner-only
# permissions (the kaggle CLI refuses world-readable credentials), then
# download the competition data.
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-aug-2022
Downloading tabular-playground-series-aug-2022.zip to /content
  0% 0.00/2.27M [00:00<?, ?B/s]
100% 2.27M/2.27M [00:00<00:00, 81.1MB/s]
# Unpack the downloaded competition archive into the working directory.
archive_path = '/content/tabular-playground-series-aug-2022.zip'
with ZipFile(archive_path, 'r') as archive:
    archive.extractall('./')

Loading the data

# Load the training set; the `id` column is a unique row key, so use it
# as the index instead of keeping it as a feature.
train = pd.read_csv(filepath_or_buffer='train.csv', index_col='id')
train.head()
product_code loading attribute_0 attribute_1 attribute_2 attribute_3 measurement_0 measurement_1 measurement_2 measurement_3 ... measurement_9 measurement_10 measurement_11 measurement_12 measurement_13 measurement_14 measurement_15 measurement_16 measurement_17 failure
id
0 A 80.10 material_7 material_8 9 5 7 8 4 18.040 ... 10.672 15.859 17.594 15.193 15.029 NaN 13.034 14.684 764.100 0
1 A 84.89 material_7 material_8 9 5 14 3 3 18.213 ... 12.448 17.947 17.915 11.755 14.732 15.425 14.395 15.631 682.057 0
2 A 82.43 material_7 material_8 9 5 12 1 5 18.057 ... 12.715 15.607 NaN 13.798 16.711 18.631 14.094 17.946 663.376 0
3 A 101.07 material_7 material_8 9 5 13 2 6 17.295 ... 12.471 16.346 18.377 10.020 15.250 15.562 16.154 17.172 826.282 0
4 A 188.06 material_7 material_8 9 5 9 2 8 19.346 ... 10.337 17.082 19.932 12.428 16.182 12.760 13.153 16.412 579.885 0

5 rows × 25 columns

# Column dtypes and non-null counts (reveals which columns have NaNs),
# followed by descriptive statistics for the numeric columns.
train.info()
train.describe()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26570 entries, 0 to 26569
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_code    26570 non-null  object 
 1   loading         26320 non-null  float64
 2   attribute_0     26570 non-null  object 
 3   attribute_1     26570 non-null  object 
 4   attribute_2     26570 non-null  int64  
 5   attribute_3     26570 non-null  int64  
 6   measurement_0   26570 non-null  int64  
 7   measurement_1   26570 non-null  int64  
 8   measurement_2   26570 non-null  int64  
 9   measurement_3   26189 non-null  float64
 10  measurement_4   26032 non-null  float64
 11  measurement_5   25894 non-null  float64
 12  measurement_6   25774 non-null  float64
 13  measurement_7   25633 non-null  float64
 14  measurement_8   25522 non-null  float64
 15  measurement_9   25343 non-null  float64
 16  measurement_10  25270 non-null  float64
 17  measurement_11  25102 non-null  float64
 18  measurement_12  24969 non-null  float64
 19  measurement_13  24796 non-null  float64
 20  measurement_14  24696 non-null  float64
 21  measurement_15  24561 non-null  float64
 22  measurement_16  24460 non-null  float64
 23  measurement_17  24286 non-null  float64
 24  failure         26570 non-null  int64  
dtypes: float64(16), int64(6), object(3)
memory usage: 5.3+ MB
loading attribute_2 attribute_3 measurement_0 measurement_1 measurement_2 measurement_3 measurement_4 measurement_5 measurement_6 ... measurement_9 measurement_10 measurement_11 measurement_12 measurement_13 measurement_14 measurement_15 measurement_16 measurement_17 failure
count 26320.000000 26570.000000 26570.000000 26570.000000 26570.000000 26570.000000 26189.000000 26032.000000 25894.000000 25774.000000 ... 25343.000000 25270.000000 25102.000000 24969.000000 24796.000000 24696.000000 24561.000000 24460.000000 24286.000000 26570.000000
mean 127.826233 6.754046 7.240459 7.415883 8.232518 6.256568 17.791528 11.731988 17.127804 17.510759 ... 11.430725 16.117711 19.172085 11.702464 15.652904 16.048444 14.995554 16.460727 701.269059 0.212608
std 39.030020 1.471852 1.456493 4.116690 4.199401 3.309109 1.001200 0.996085 0.996414 0.995980 ... 0.999137 1.405978 1.520785 1.488838 1.155247 1.491923 1.549226 1.708935 123.304161 0.409160
min 33.160000 5.000000 5.000000 0.000000 0.000000 0.000000 13.968000 8.008000 12.073000 12.715000 ... 7.537000 9.323000 12.461000 5.167000 10.890000 9.140000 9.104000 9.701000 196.787000 0.000000
25% 99.987500 6.000000 6.000000 4.000000 5.000000 4.000000 17.117000 11.051000 16.443000 16.839000 ... 10.757000 15.209000 18.170000 10.703000 14.890000 15.057000 13.957000 15.268000 618.961500 0.000000
50% 122.390000 6.000000 8.000000 7.000000 8.000000 6.000000 17.787000 11.733000 17.132000 17.516000 ... 11.430000 16.127000 19.211500 11.717000 15.628500 16.040000 14.969000 16.436000 701.024500 0.000000
75% 149.152500 8.000000 8.000000 10.000000 11.000000 8.000000 18.469000 12.410000 17.805000 18.178000 ... 12.102000 17.025000 20.207000 12.709000 16.374000 17.082000 16.018000 17.628000 784.090250 0.000000
max 385.860000 9.000000 9.000000 29.000000 29.000000 24.000000 21.499000 16.484000 21.425000 21.543000 ... 15.412000 22.479000 25.640000 17.663000 22.713000 22.303000 21.626000 24.094000 1312.794000 1.000000

8 rows × 22 columns

# Load the test set with the same `id` index convention as the train set.
test = pd.read_csv(filepath_or_buffer='test.csv', index_col='id')
test.head()
product_code loading attribute_0 attribute_1 attribute_2 attribute_3 measurement_0 measurement_1 measurement_2 measurement_3 ... measurement_8 measurement_9 measurement_10 measurement_11 measurement_12 measurement_13 measurement_14 measurement_15 measurement_16 measurement_17
id
26570 F 119.57 material_5 material_6 6 4 6 9 6 19.305 ... 18.654 10.802 15.909 18.070 13.772 13.659 16.825 13.742 17.710 634.612
26571 F 113.51 material_5 material_6 6 4 11 8 0 17.883 ... 19.368 12.032 13.998 NaN 12.473 17.468 16.708 14.776 14.102 537.037
26572 F 112.16 material_5 material_6 6 4 8 12 4 18.475 ... 17.774 11.743 17.046 18.086 10.907 13.363 15.737 17.065 16.021 658.995
26573 F 112.72 material_5 material_6 6 4 8 11 10 16.518 ... 18.948 11.790 18.165 16.163 10.933 15.501 15.667 12.620 16.111 594.301
26574 F 208.00 material_5 material_6 6 4 14 16 8 17.808 ... 19.141 12.370 14.578 17.849 11.941 16.070 16.183 13.324 17.150 801.044

5 rows × 24 columns

There are some missing values in the data.

# Report the percentage of missing values per column (if any exist).
missing_mask = train.isna()
if missing_mask.any().any():
    print(missing_mask.sum() * 100 / train.shape[0])
else:
    print("No Missing values")
product_code      0.000000
loading           0.940911
attribute_0       0.000000
attribute_1       0.000000
attribute_2       0.000000
attribute_3       0.000000
measurement_0     0.000000
measurement_1     0.000000
measurement_2     0.000000
measurement_3     1.433948
measurement_4     2.024840
measurement_5     2.544223
measurement_6     2.995860
measurement_7     3.526534
measurement_8     3.944298
measurement_9     4.617990
measurement_10    4.892736
measurement_11    5.525028
measurement_12    6.025593
measurement_13    6.676703
measurement_14    7.053067
measurement_15    7.561159
measurement_16    7.941287
measurement_17    8.596161
failure           0.000000
dtype: float64

Exploratory Data Analysis (EDA)

# Correlation heatmap over the numeric columns only. Selecting them
# explicitly makes the intent clear and keeps this working on pandas >= 2.0,
# where DataFrame.corr() raises on object columns instead of dropping them.
sns.heatmap(train.select_dtypes(include='number').corr(),
            annot=True, vmin=-1, vmax=1, cmap='RdYlGn')
<matplotlib.axes._subplots.AxesSubplot at 0x7f6bbd2f5410>
# Class balance of the target: label 0 (no failure) dominates label 1.
train_target = train['failure'].value_counts()

train_target
0    20921
1     5649
Name: failure, dtype: int64

Some features are correlated with each other.

class OneHotEncoderExt(TransformerMixin, BaseEstimator):
    """Minimal one-hot encoder for a single categorical column.

    Unlike sklearn's OneHotEncoder it accepts an explicit category list,
    which lets train and test share one category universe.
    """

    def __init__(self, categories=None):
        # If None, the categories are inferred from the data seen in fit().
        self.categories = categories

    def fit(self, y):
        """Learn the category -> one-hot vector mapping.

        Returns self, per the scikit-learn estimator convention, so calls
        can be chained (the original returned None, breaking chaining).
        """
        if self.categories is None:
            self.categories = np.unique(y)
        # Base vector [1, 0, ..., 0]; rolling it right by `pos` moves the 1
        # to position `pos`, giving every category a distinct one-hot code.
        code = [1] + [0] * (len(self.categories) - 1)
        self.encoding = {cat: np.roll(code, pos)
                         for pos, cat in enumerate(self.categories)}
        return self

    def transform(self, y):
        """Encode each value of y as its one-hot row.

        Raises KeyError on a category not seen in fit / `categories`.
        Returns a DataFrame with a fresh RangeIndex and integer columns
        0..len(categories)-1.
        """
        return pd.DataFrame([self.encoding[value] for value in y])

    def fit_transform(self, y):
        """Fit on y, then encode y."""
        return self.fit(y).transform(y)
# One-hot encode attribute_0 over the union of train/test categories so a
# category appearing only in test is still representable. Sort the union:
# plain list(set(...)) order depends on string-hash randomisation, which
# made the category -> column assignment differ between runs.
uni_values = sorted(set(train['attribute_0']) | set(test['attribute_0']))

ohe_attr0 = OneHotEncoderExt(categories=uni_values)
# fit() only infers categories when none were given, so pass the raw column.
ohe_attr0.fit(train['attribute_0'])
attr0_train = ohe_attr0.transform(train['attribute_0'])
attr0_test = ohe_attr0.transform(test['attribute_0'])
attr0_train.columns = [f'attribute_0{col}' for col in attr0_train.columns]
attr0_test.columns = [f'attribute_0{col}' for col in attr0_test.columns]
# Align indices with the source frames so the later concat matches by row.
attr0_train.index = train.index
attr0_test.index = test.index
# One-hot encode attribute_1 the same way as attribute_0: shared train/test
# category universe, sorted for run-to-run reproducibility.
uni_values = sorted(set(train['attribute_1']) | set(test['attribute_1']))

ohe_attr1 = OneHotEncoderExt(categories=uni_values)
# fit() only infers categories when none were given, so pass the raw column.
ohe_attr1.fit(train['attribute_1'])
attr1_train = ohe_attr1.transform(train['attribute_1'])
attr1_test = ohe_attr1.transform(test['attribute_1'])
attr1_train.columns = [f'attribute_1{col}' for col in attr1_train.columns]
attr1_test.columns = [f'attribute_1{col}' for col in attr1_test.columns]
# Align indices with the source frames so the later concat matches by row.
attr1_train.index = train.index
attr1_test.index = test.index
# Append the one-hot columns, then remove the raw categorical originals.
train = pd.concat([train, attr0_train, attr1_train], axis=1)
test = pd.concat([test, attr0_test, attr1_test], axis=1)
train = train.drop(columns=['attribute_0', 'attribute_1'])
test = test.drop(columns=['attribute_0', 'attribute_1'])
# product_code identifies the manufacturing lot, not a reusable feature.
drop_cols = ['product_code']
target_col = 'failure'
feature_names = [col for col in train.columns if col not in [target_col] + drop_cols]

# Median-impute missing values. Fit on train only so no test-set statistics
# leak into the imputation.
impute = SimpleImputer(strategy='median')
train_features = train.drop(columns=drop_cols + [target_col])
test_features = test.drop(columns=drop_cols)
impute.fit(train_features)
# Pass index= explicitly: SimpleImputer returns a bare array, and the
# original rebuild gave X a fresh RangeIndex while y kept train.index.
X = pd.DataFrame(data=impute.transform(train_features),
                 columns=feature_names, index=train.index)
X_test = pd.DataFrame(data=impute.transform(test_features),
                      columns=feature_names, index=test.index)
y = train[target_col]

Modelling

# Train a gradient-boosted tree classifier on all prepared features.
# NOTE(review): the printed repr below shows tuned hyperparameters
# (n_estimators=2000, learning_rate=0.02, ...) but this cell constructs a
# default LGBMClassifier() — the notebook was likely re-run after editing;
# confirm which configuration produced the submitted score.
model = LGBMClassifier()
model.fit(X, y)
LGBMClassifier(colsample_bytree=0.5, learning_rate=0.02, metric='auc',
               min_child_samples=10, n_estimators=2000, objective='binary',
               random_state=42, subsample=0.8, subsample_freq=5)
# The competition metric is ROC AUC, a ranking metric: submit the
# positive-class probability, not hard 0/1 labels. model.predict() emits
# labels, which collapses the ranking and yields ~0.5 AUC (as seen in the
# 0.50565 score below); predict_proba fixes that.
test_preds = model.predict_proba(X_test)[:, 1]

Submission

# Fill the provided submission template with our predictions and upload it
# to the competition via the Kaggle CLI.
submission = pd.read_csv('./sample_submission.csv')

submission['failure'] = test_preds
submission.to_csv('output.csv', index=False)
!kaggle competitions submit -c tabular-playground-series-aug-2022 -f output.csv -m "LGBM OneHot predict proba with median missing imputation"
100% 162k/162k [00:00<00:00, 397kB/s]
Successfully submitted to Tabular Playground Series - Aug 2022

AUC ROC is: 0.50565 — barely better than random (0.5). This is expected when hard 0/1 labels are submitted: ROC AUC is a ranking metric, so submitting `predict_proba` probabilities instead would score substantially higher.