import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from zipfile import ZipFile
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 12)
Before running the cell below, upload your Kaggle API token (kaggle.json) so the download doesn't fail with an authentication error.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-aug-2022
with ZipFile('/content/tabular-playground-series-aug-2022.zip', 'r') as zf:
    zf.extractall('./')
train = pd.read_csv('train.csv', index_col='id')
train.head()
train.info()
train.describe()
test = pd.read_csv('test.csv', index_col='id')
test.head()
There are some missing values in the data. The check below prints the percentage of missing values in each column, and a quick plot of the same breakdown follows it.
if train.isna().any().any():
    print(train.isna().sum() * 100 / train.shape[0])
else:
    print("No Missing values")
sns.heatmap(train.select_dtypes('number').corr(), annot=True, vmin=-1, vmax=1, cmap='RdYlGn')  # correlations between the numeric features only
train_target = train['failure'].value_counts()
sns.barplot(x=train_target.index, y=train_target.values)  # class balance of the target
train_target
Some features are correlated with each other.
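To make that concrete, the strongest pairwise correlations can be pulled straight out of the correlation matrix. The snippet below is a quick sketch, not part of the original notebook.
# Sketch: list the ten feature pairs with the largest absolute Pearson correlation.
corr = train.select_dtypes('number').corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
print(upper.stack().sort_values(key=np.abs, ascending=False).head(10))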
class OneHotEncoderExt(TransformerMixin, BaseEstimator):
    """Minimal one-hot encoder that can be fitted on the union of train/test categories."""

    def __init__(self, categories=None):
        self.categories = categories

    def fit(self, y):
        # If no category list was supplied, derive it from the data.
        if self.categories is None:
            self.categories = np.unique(y)
        # Build one one-hot vector per category by rolling [1, 0, ..., 0].
        code = [1] + [0] * (len(self.categories) - 1)
        self.encoding = {k: np.roll(code, v) for v, k in enumerate(self.categories)}
        return self

    def transform(self, y):
        return pd.DataFrame([self.encoding[value] for value in y])

    def fit_transform(self, y):
        return self.fit(y).transform(y)
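For comparison, the OneHotEncoder imported from scikit-learn above could do the same job. The sketch below is only illustrative and is not what this notebook uses; sparse_output requires scikit-learn >= 1.2 (older versions take sparse=False instead).
# Illustrative only: sklearn's OneHotEncoder as an alternative to the custom class above.
# handle_unknown='ignore' encodes categories unseen at fit time as all zeros.
sk_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
sk_ohe.fit(train[['attribute_0', 'attribute_1']])
encoded = pd.DataFrame(
    sk_ohe.transform(train[['attribute_0', 'attribute_1']]),
    columns=sk_ohe.get_feature_names_out(),
    index=train.index,
)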
uni_values = sorted(set(train['attribute_0'].unique().tolist() + test['attribute_0'].unique().tolist()))
ohe_attr0 = OneHotEncoderExt(categories=uni_values)
ohe_attr0.fit(train['attribute_0'])
attr0_train = ohe_attr0.transform(train['attribute_0'])
attr0_test = ohe_attr0.transform(test['attribute_0'])
attr0_train.columns = [f'attribute_0{col}' for col in attr0_train.columns]
attr0_test.columns = [f'attribute_0{col}' for col in attr0_test.columns]
attr0_train.index = train.index  # align with the train index before the concat below
attr0_test.index = test.index
uni_values = sorted(set(train['attribute_1'].unique().tolist() + test['attribute_1'].unique().tolist()))
ohe_attr1 = OneHotEncoderExt(categories=uni_values)
ohe_attr1.fit(train['attribute_1'])
attr1_train = ohe_attr1.transform(train['attribute_1'])
attr1_test = ohe_attr1.transform(test['attribute_1'])
attr1_train.columns = [f'attribute_1{col}' for col in attr1_train.columns]
attr1_test.columns = [f'attribute_1{col}' for col in attr1_test.columns]
attr1_train.index = train.index  # align with the train index before the concat below
attr1_test.index = test.index
train = pd.concat([train, attr0_train, attr1_train], axis=1)
test = pd.concat([test, attr0_test, attr1_test], axis=1)
train.drop(['attribute_0', 'attribute_1'], axis=1, inplace=True)
test.drop(['attribute_0', 'attribute_1'], axis=1, inplace=True)
drop_cols = ['product_code']
target_col = 'failure'
feature_names = [col for col in train.columns if col not in [target_col]+drop_cols]
impute = SimpleImputer(strategy='median')
impute.fit(train.drop(drop_cols+[target_col], axis=1))
X = pd.DataFrame(data=impute.transform(train.drop(drop_cols+[target_col], axis=1)), columns=feature_names)
X_test = pd.DataFrame(data=impute.transform(test.drop(drop_cols, axis=1)), columns=feature_names)
y = train[target_col]
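Before fitting on the full training set and submitting, a local estimate of the competition metric is worth having. This is a minimal sketch (not in the original notebook), assuming the X and y built above; if the test set uses product codes not present in training, a grouped split (e.g. GroupKFold on product_code) would give a more honest estimate.
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Sketch: 5-fold stratified cross-validation scored with the competition's ROC AUC metric.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(LGBMClassifier(), X, y, cv=cv, scoring='roc_auc')
print(f'CV ROC AUC: {scores.mean():.4f} +/- {scores.std():.4f}')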
model = LGBMClassifier()
model.fit(X, y)
# The metric is ROC AUC, so submit the predicted failure probability rather than the hard class label.
test_preds = model.predict_proba(X_test)[:, 1]
submission = pd.read_csv('./sample_submission.csv')
submission['failure'] = test_preds
submission.to_csv('output.csv', index=False)
!kaggle competitions submit -c tabular-playground-series-aug-2022 -f output.csv -m "LGBM OneHot predict proba with median missing imputation"
The submitted predictions scored an ROC AUC of 0.50565.