import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from zipfile import ZipFile
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
# Default every figure created after this point to 12x12 inches.
plt.rcParams['figure.figsize'] = (12, 12)
Before running the cell below, upload your Kaggle API token (`kaggle.json`) so that the commands do not fail.
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-apr-2022
# Unpack the downloaded competition archive into the current working directory.
# The `with` block closes the archive even if extraction fails.
with ZipFile('/content/tabular-playground-series-apr-2022.zip', 'r') as zf:
    zf.extractall('./')
# Load the training rows and take a first look at their structure.
train = pd.read_csv('train.csv')
train.head()
train.info()
train.describe()
# Load the labels file; it is joined to `train` on 'sequence' further down.
labels = pd.read_csv('train_labels.csv')
labels.head()
# Annotated pairwise-correlation heatmap of the training columns, with the
# colour scale pinned to [-1, 1].
sns.heatmap(train.corr(), annot=True, vmin=-1, vmax=1, cmap='RdYlGn')
# Load the rows the model will later predict on.
test = pd.read_csv('test.csv')
test.head()
There are no missing values in the data.
# Print the fraction of missing values per column, or confirm there are none.
if train.isna().any().any():
    print(train.isna().sum()/train.shape[0])
else:
    print("No Missing values")
# Join the labels onto the training rows by 'sequence' (pandas' default inner
# join), so every training row carries the label columns of its sequence.
train_all = train.merge(labels, on='sequence')
def time_series_train_test_split(X, y, split_col, target_col, train_size=0.8):
    """Split rows into train/validation sets without breaking up any group.

    Rows are assigned by the value of ``X[split_col]``: the groups with the
    smallest ``train_size`` fraction of distinct ids (in sorted order) go to
    the training set, all remaining groups go to the validation set.

    Args:
        X: Feature DataFrame containing ``split_col``.
        y: Targets aligned with ``X`` (same index).
        split_col: Name of the column in ``X`` that identifies a group.
        target_col: Unused; kept so existing callers keep working.
        train_size: Fraction of distinct groups placed in the training set.

    Returns:
        ``(X_train, X_valid, y_train, y_valid)``.
    """
    # Work from the ids actually present instead of assuming they are the
    # contiguous integers 0..N-1.
    ordered_ids = X[split_col].drop_duplicates().sort_values()
    n_train_groups = int(len(ordered_ids) * train_size)

    # Exactly n_train_groups groups go to train; the previous
    # range(train_len + 1) selected one group too many.
    train_mask = X[split_col].isin(ordered_ids.iloc[:n_train_groups])
    valid_mask = ~train_mask

    return X[train_mask], X[valid_mask], y[train_mask], y[valid_mask]
# Separate the target column 'state' from the remaining columns, which are all
# used as features.
X = train_all.drop(['state'], axis=1)
y = train_all['state']
# Split by 'sequence' so all rows of one sequence land on the same side.
X_train, X_valid, y_train, y_valid = time_series_train_test_split(X, y, 'sequence', 'state')
I use a classical machine-learning model (a random forest) to fit the data and classify each row into its state.
# Fit a random forest with scikit-learn's default hyper-parameters, using all
# available CPU cores.
model = RandomForestClassifier(n_jobs=-1)
model.fit(X_train, y_train)
# Predict on DataFrames rather than raw `.values`: the model was fitted with
# column names, so scikit-learn can validate them instead of warning about
# missing feature names.
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)
# Select the training columns explicitly so a different column order in `test`
# cannot silently feed features into the wrong slots (a missing column raises
# KeyError instead).
test_pred = model.predict(test[X_train.columns])
print(f"Train Accuarcy: {model.score(X_train, y_train)}")
print(f"Valid Accuarcy: {model.score(X_valid, y_valid)}")
# Append the per-row predictions to the test rows; the unnamed Series becomes
# a column labelled 0.
# NOTE(review): concat aligns on index, so this assumes `test` still has its
# default 0..n-1 index — confirm if `test` is ever filtered or re-indexed.
test_all = pd.concat([test, pd.Series(test_pred)], axis=1)
test_all.head()
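For each timestamp of a particular sequence, the model predicts whether it is in state 0 or 1.
But the test results should be in the format of a sequence number and the state predicted for that sequence, so the per-timestamp predictions are aggregated into one value per sequence (the majority vote).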
# Collapse the per-row predictions (column 0) to one value per sequence: take
# the mean vote, then threshold it. A mean of exactly 0.5 maps to state 1.
# The string 'mean' replaces the callable np.mean, which newer pandas versions
# deprecate inside agg().
test_actual = test_all.groupby(['sequence']).agg({0: 'mean'}).reset_index()
test_actual[0] = np.where(test_actual[0] < 0.5, 0, 1)
test_actual.head()

# Use the sample submission as the template for which sequences to report.
submission = pd.read_csv('/content/sample_submission.csv')
# merge/drop/rename each return a new DataFrame, so the result must be
# assigned back. Without the assignment the untouched sample submission is
# what gets written to output.csv.
submission = (
    submission.merge(test_actual, on='sequence')
    .drop(['state'], axis=1)        # discard the placeholder state column
    .rename(columns={0: 'state'})   # predicted state takes its place
)
submission.to_csv('output.csv', index=False)
# Upload output.csv to the competition; the -m text is the submission message.
# NOTE(review): "RF 100" presumably means a random forest with 100 trees — confirm.
!kaggle competitions submit -c tabular-playground-series-apr-2022 -f output.csv -m "RF 100"
ROC AUC: 0.5