import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from xgboost import XGBClassifier
from zipfile import ZipFile
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 12)
Before running the cell below, upload your Kaggle API token (`kaggle.json`) so that the commands do not fail.
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-may-2022
with ZipFile('/content/tabular-playground-series-may-2022.zip', 'r') as zf:
zf.extractall('./')
# Load the training data, indexed by the 'id' column, and take a first look.
train = pd.read_csv('train.csv', index_col='id')
train.head()
train.info()
train.describe()

# Correlate only numeric/bool columns: the frame also holds the text column
# 'f_27', and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped such columns).
sns.heatmap(
    train.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    cmap='RdYlGn',
)

# Load the test data the same way.
test = pd.read_csv('test.csv', index_col='id')
test.head()
There are no missing values in the data.
# Report the per-column fraction of missing values, or say that there are none.
if not train.isna().values.any():
    print("No Missing values")
else:
    print(train.isna().sum() / len(train))
# Some of the feature engineering functions are from here
# https://www.kaggle.com/code/slythe/super-lightgbm-w-feature-engineering#%F0%9F%8C%9F-Feature-Engineering-%F0%9F%8C%9F
def text_engineering(df, col, n_chars=10):
    """Derive per-character and distinct-character features from a string column.

    The following columns are added to ``df`` in place:

    * ``{col}_{i}`` for ``i`` in ``range(n_chars)``: offset of the i-th
      character from ``'A'`` (``'A'`` -> 0, ``'B'`` -> 1, ...).
    * ``unique_chars``: the distinct characters of each string, sorted and
      joined, stored with the pandas ``category`` dtype.
    * ``unique_length``: the number of distinct characters, as an integer.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        col: Name of the string column. Each value must hold at least
            ``n_chars`` characters, otherwise indexing raises ``IndexError``.
        n_chars: Number of leading character positions to encode.

    Returns:
        The same ``df`` object with the new columns added.
    """
    base = ord('A')
    for pos in range(n_chars):
        df[f"{col}_{pos}"] = df[col].apply(
            lambda text, pos=pos: ord(text[pos]) - base
        )

    # Sort before joining: set iteration order for strings varies between
    # interpreter runs, which would make the category labels irreproducible.
    df["unique_chars"] = (
        df[col].apply(lambda text: "".join(sorted(set(text)))).astype("category")
    )

    # Count from the raw strings rather than from the categorical column:
    # applying a function to categorical data can yield a categorical result
    # instead of plain integers.
    df["unique_length"] = df[col].apply(lambda text: len(set(text)))
    return df
def group_central_stats(df, cols, suffix):
    """Add row-wise summary statistics computed over a group of columns.

    For every row, the values in ``cols`` are summarised and stored in new
    columns named ``<stat>_<suffix>``: sum, mean, std, min, max, median, mad
    (mean absolute deviation), range, the quantiles q01/q05/q25/q50/q75/q95/q99,
    kurt and skew.

    Args:
        df: DataFrame containing ``cols``; it is modified in place.
        cols: List of column names to aggregate across (``axis=1``).
        suffix: Text appended to every new column name.

    Returns:
        The same ``df`` object with the statistic columns added.
    """
    # Select the group once; the new columns added below never feed back in.
    block = df[cols]
    row_mean = block.mean(axis=1)
    row_min = block.min(axis=1)
    row_max = block.max(axis=1)

    df[f"sum_{suffix}"] = block.sum(axis=1)
    df[f"mean_{suffix}"] = row_mean
    df[f"std_{suffix}"] = block.std(axis=1)
    df[f"min_{suffix}"] = row_min
    df[f"max_{suffix}"] = row_max
    df[f"median_{suffix}"] = block.median(axis=1)
    # DataFrame.mad() was removed in pandas 2.0, so compute the mean absolute
    # deviation around the row mean explicitly.
    df[f"mad_{suffix}"] = block.sub(row_mean, axis=0).abs().mean(axis=1)
    df[f"range_{suffix}"] = row_max - row_min

    quantiles = (
        ("q01", 0.01),
        ("q05", 0.05),
        ("q25", 0.25),
        ("q50", 0.5),
        ("q75", 0.75),
        ("q95", 0.95),
        ("q99", 0.99),
    )
    for label, level in quantiles:
        df[f"{label}_{suffix}"] = block.quantile(q=level, axis=1)

    df[f"kurt_{suffix}"] = block.kurt(axis=1)
    df[f"skew_{suffix}"] = block.skew(axis=1)
    return df
def feature_engineering(df):
    """Apply every feature-engineering step to ``df`` and return the result.

    Character features are derived from the ``f_27`` column, then row-wise
    statistics are added for two column groups: ``f_00``..``f_06`` (suffix
    ``g1``) and ``f_19``..``f_26`` (suffix ``g2``).
    """
    df = text_engineering(df, 'f_27')

    stat_groups = (
        ("g1", [f"f_0{i}" for i in range(7)]),
        ("g2", [f"f_{i}" for i in range(19, 27)]),
    )
    for group_suffix, group_columns in stat_groups:
        df = group_central_stats(df, group_columns, group_suffix)
    return df
def reduce_memory_dataframe(df, verbose=True):
    """Shrink a DataFrame's memory footprint by downcasting numeric columns.

    Each signed-integer or float column is cast to the narrowest dtype of the
    same kind whose representable range strictly contains the column's
    minimum and maximum. Integer columns stay integers and float columns stay
    floats. Columns of any other dtype are left untouched.

    Note that the choice is based on value range only, so downcasting floats
    (e.g. to ``float16``) can reduce their precision.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the relative and absolute memory reduction.

    Returns:
        The same ``df`` object with downcast columns.
    """
    num_types = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    start_memory = df.memory_usage().sum()
    for col in df.columns:
        if df[col].dtypes not in num_types:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Decide by the actual dtype kind. The previous string test
        # (`find('int') != 1`) was true for floats as well, which sent float
        # columns down the integer path and truncated their values.
        if pd.api.types.is_integer_dtype(df[col].dtype):
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        # Candidates are ordered narrowest first; take the first that fits.
        # NaN or infinite bounds fail every comparison, leaving the column as is.
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_memory = df.memory_usage().sum()
    if verbose:
        print(f"Memory Reduced by {round((start_memory - end_memory) / start_memory, 2)} from {start_memory} to {end_memory}")
    return df
# Engineer features, then downcast dtypes, for both data sets.
train = reduce_memory_dataframe(feature_engineering(train))
test = reduce_memory_dataframe(feature_engineering(test))

# Separate the label from the features; the raw text column and its
# distinct-character category are not passed to the models.
y = train['target']
X = train.drop(columns=['target', 'f_27', 'unique_chars'])
test = test.drop(columns=['f_27', 'unique_chars'])

# Hold out 20% of the rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=88,
)
Fit a standard Random Forest ensemble on the training data.
# Random forest with 300 trees, using all available cores and verbose logging.
# Note: `model` is rebound to the LightGBM classifier further down this file.
model = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    verbose=2,
)
model.fit(X_train, y_train)
Fit a LightGBM gradient-boosted classifier, with early stopping on the validation set.
# LightGBM binary classifier evaluated on AUC.
model = lgb.LGBMClassifier(
    # Task and boosting setup.
    objective='binary',
    metric="auc",
    boosting='gbdt',
    num_iterations=5000,
    learning_rate=0.18319492258552644,
    # Tree shape.
    num_leaves=229,
    max_depth=0,
    min_child_samples=80,
    max_bins=511,
    # Regularisation.
    lambda_l1=0.00028648667113792726,
    lambda_l2=0.00026863027834978876,
    # Runtime and reproducibility.
    num_threads=-1,
    random_state=42,
)

# Stop once the validation AUC has not improved for 30 rounds.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(30)],
)
# roc_auc_score takes (y_true, y_score). Pass the true labels first and score
# with the positive-class probability rather than hard 0/1 predictions, so the
# metric reflects the ranking quality of the model.
print("Train Score", roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]))
print("Valid Score", roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1]))
test_preds = model.predict_proba(test)[:, 1]
submission = pd.read_csv('/content/sample_submission.csv')
submission['target'] = test_preds
submission.to_csv('output.csv', index=False)
!kaggle competitions submit -c tabular-playground-series-may-2022 -f output.csv -m "LGB heavy feature engineered"
The resulting ROC AUC score is 0.97941.