import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from zipfile import ZipFile
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set matplotlib's default figure size to 20 x 12 so the plots below are large.
plt.rcParams['figure.figsize'] = (20, 12)
Before running the cell below, upload your Kaggle API token (`kaggle.json`); otherwise the commands will fail with an error.
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jun-2022
# Unpack the downloaded competition archive into the working directory.
with ZipFile('/content/tabular-playground-series-jun-2022.zip', 'r') as archive:
    archive.extractall('./')

# Load the dataset and take a first look at it: leading rows, column
# dtypes / non-null counts, and summary statistics.
data = pd.read_csv('data.csv')
data.head()
data.info()
data.describe()

# Annotated heatmap of the pairwise column correlations, with the colour
# scale pinned to the full [-1, 1] correlation range.
sns.heatmap(
    data.corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    cmap='RdYlGn',
)
Check whether there are any missing values in the data.
# Count the NaNs in each column once, then either report them as a
# percentage of the number of rows or state that nothing is missing.
na_counts = data.isna().sum()
if not na_counts.any():
    print("No Missing values")
else:
    print(na_counts*100/data.shape[0])
From the DataFrame's `info` output, we can clearly see that every column with a dtype of float64 contains missing values (< 1%), while none of the columns with an int64 dtype has missing values.
All the int64 columns are categorical columns.
# Mean-impute every float64 column of `data` in place.
null_cols = data.select_dtypes(include='float64').columns.tolist()
if null_cols:
    # A single imputer fitted on all selected columns at once: SimpleImputer
    # computes the mean independently per column, so this fills the same
    # values as fitting one imputer per column, without re-creating the
    # imputer on every iteration or assigning an (n, 1) array to a column.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data[null_cols] = imputer.fit_transform(data[null_cols])
# Load the submission template. Each 'row-col' entry has the form
# '<row label>-<column name>' and identifies one cell of `data`.
submission = pd.read_csv('/content/sample_submission.csv')

# Look up the (now imputed) value for every requested cell. Iterating the
# column directly avoids building a Series per row as iterrows() does, and
# lets tqdm know the total so it can show real progress.
values = []
for key in tqdm.tqdm(submission['row-col'], total=len(submission)):
    r, c = key.split('-')
    # .at is the scalar label-based accessor; same result as .loc[row, col].
    values.append(data.at[int(r), c])

# NOTE(review): this writes the results to a column named 'values'; confirm
# that this is the column name the competition expects in the submission.
submission['values'] = values
submission.to_csv('output.csv', index=False)
# Submit output.csv to the competition through the Kaggle CLI, using the
# message below as the submission description.
!kaggle competitions submit -c tabular-playground-series-jun-2022 -f output.csv -m "Mean Imputation(benchmark score)"
RMSE is: 1.41613