import tqdm
import numpy as np
import pandas as pd
import seaborn as sns

from zipfile import ZipFile
from matplotlib import pyplot as plt

from sklearn.impute import SimpleImputer
# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
# Use large default figures so the 81x81 correlation heatmap below is readable.
plt.rcParams['figure.figsize'] = (20, 12)

Before running the cell below, upload your Kaggle API token (`kaggle.json`) so that the download does not fail with an authentication error.

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c tabular-playground-series-jun-2022
Downloading tabular-playground-series-jun-2022.zip to /content
 93% 219M/234M [00:01<00:00, 94.2MB/s]
100% 234M/234M [00:01<00:00, 157MB/s] 
# Unpack the downloaded competition archive into the working directory.
archive_path = '/content/tabular-playground-series-jun-2022.zip'
with ZipFile(archive_path, 'r') as archive:
    archive.extractall('./')

Loading the data

# Load the competition data (1M rows x 81 columns per the info() output below)
# and display the first five rows in the notebook.
data = pd.read_csv('data.csv')
data.head()
row_id F_1_0 F_1_1 F_1_2 F_1_3 F_1_4 F_1_5 F_1_6 F_1_7 F_1_8 ... F_4_5 F_4_6 F_4_7 F_4_8 F_4_9 F_4_10 F_4_11 F_4_12 F_4_13 F_4_14
0 0 -0.354591 -0.464038 2.304115 0.734486 1.696395 0.136285 -0.518344 0.502640 -1.852504 ... 3.744152 0.794438 0.265185 -0.561809 0.196480 0.373434 6.206995 3.809505 1.236486 1.182055
1 1 1.380940 -0.499626 -0.418548 1.911725 -0.826130 -1.715371 -0.577091 -1.041486 0.596067 ... -2.895826 -0.738275 2.361818 -0.060753 0.727249 -0.271882 5.232157 -4.218259 -2.724883 -0.063775
2 2 0.256023 -1.059874 NaN 0.345678 1.513814 1.243864 -0.509648 -0.800481 -0.115945 ... 2.252834 0.472496 2.491386 0.353381 -0.260682 -0.000833 -0.116457 -2.131747 3.661499 -0.131576
3 3 -0.728420 -2.432399 -2.453602 -0.020509 0.333397 0.086049 -1.787601 0.667011 0.761564 ... 2.004600 -4.664806 -0.847211 -0.264249 0.664334 -0.557868 8.499483 -4.738799 -3.054611 0.494152
4 4 0.590212 -0.066127 0.468009 -1.096038 0.119399 -1.809710 0.466358 -0.053196 -0.580320 ... 0.976937 2.558883 3.377724 0.846891 0.696032 0.554121 -5.979714 -2.869631 3.733057 -0.722943

5 rows × 81 columns

# Summarise dtypes and non-null counts, then basic per-column statistics.
# (In a notebook only the last expression, describe(), is rendered as a table;
# info() prints directly.)
data.info()
data.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 81 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   row_id  1000000 non-null  int64  
 1   F_1_0   981603 non-null   float64
 2   F_1_1   981784 non-null   float64
 3   F_1_2   981992 non-null   float64
 4   F_1_3   981750 non-null   float64
 5   F_1_4   981678 non-null   float64
 6   F_1_5   981911 non-null   float64
 7   F_1_6   981867 non-null   float64
 8   F_1_7   981872 non-null   float64
 9   F_1_8   981838 non-null   float64
 10  F_1_9   981751 non-null   float64
 11  F_1_10  982039 non-null   float64
 12  F_1_11  981830 non-null   float64
 13  F_1_12  981797 non-null   float64
 14  F_1_13  981602 non-null   float64
 15  F_1_14  981961 non-null   float64
 16  F_2_0   1000000 non-null  int64  
 17  F_2_1   1000000 non-null  int64  
 18  F_2_2   1000000 non-null  int64  
 19  F_2_3   1000000 non-null  int64  
 20  F_2_4   1000000 non-null  int64  
 21  F_2_5   1000000 non-null  int64  
 22  F_2_6   1000000 non-null  int64  
 23  F_2_7   1000000 non-null  int64  
 24  F_2_8   1000000 non-null  int64  
 25  F_2_9   1000000 non-null  int64  
 26  F_2_10  1000000 non-null  int64  
 27  F_2_11  1000000 non-null  int64  
 28  F_2_12  1000000 non-null  int64  
 29  F_2_13  1000000 non-null  int64  
 30  F_2_14  1000000 non-null  int64  
 31  F_2_15  1000000 non-null  int64  
 32  F_2_16  1000000 non-null  int64  
 33  F_2_17  1000000 non-null  int64  
 34  F_2_18  1000000 non-null  int64  
 35  F_2_19  1000000 non-null  int64  
 36  F_2_20  1000000 non-null  int64  
 37  F_2_21  1000000 non-null  int64  
 38  F_2_22  1000000 non-null  int64  
 39  F_2_23  1000000 non-null  int64  
 40  F_2_24  1000000 non-null  int64  
 41  F_3_0   981971 non-null   float64
 42  F_3_1   981655 non-null   float64
 43  F_3_2   981944 non-null   float64
 44  F_3_3   981946 non-null   float64
 45  F_3_4   981627 non-null   float64
 46  F_3_5   981702 non-null   float64
 47  F_3_6   981808 non-null   float64
 48  F_3_7   981987 non-null   float64
 49  F_3_8   981902 non-null   float64
 50  F_3_9   981894 non-null   float64
 51  F_3_10  981800 non-null   float64
 52  F_3_11  981612 non-null   float64
 53  F_3_12  981703 non-null   float64
 54  F_3_13  981940 non-null   float64
 55  F_3_14  981861 non-null   float64
 56  F_3_15  981762 non-null   float64
 57  F_3_16  981878 non-null   float64
 58  F_3_17  981722 non-null   float64
 59  F_3_18  981911 non-null   float64
 60  F_3_19  981800 non-null   float64
 61  F_3_20  981752 non-null   float64
 62  F_3_21  981604 non-null   float64
 63  F_3_22  981823 non-null   float64
 64  F_3_23  981794 non-null   float64
 65  F_3_24  981855 non-null   float64
 66  F_4_0   981872 non-null   float64
 67  F_4_1   981836 non-null   float64
 68  F_4_2   981505 non-null   float64
 69  F_4_3   981971 non-null   float64
 70  F_4_4   982043 non-null   float64
 71  F_4_5   981937 non-null   float64
 72  F_4_6   981675 non-null   float64
 73  F_4_7   981986 non-null   float64
 74  F_4_8   981824 non-null   float64
 75  F_4_9   981735 non-null   float64
 76  F_4_10  981775 non-null   float64
 77  F_4_11  981881 non-null   float64
 78  F_4_12  981694 non-null   float64
 79  F_4_13  982005 non-null   float64
 80  F_4_14  981733 non-null   float64
dtypes: float64(55), int64(26)
memory usage: 618.0 MB
row_id F_1_0 F_1_1 F_1_2 F_1_3 F_1_4 F_1_5 F_1_6 F_1_7 F_1_8 ... F_4_5 F_4_6 F_4_7 F_4_8 F_4_9 F_4_10 F_4_11 F_4_12 F_4_13 F_4_14
count 1000000.000000 981603.000000 981784.000000 981992.000000 981750.000000 981678.000000 981911.000000 981867.000000 981872.000000 981838.000000 ... 981937.000000 981675.000000 981986.000000 981824.000000 981735.000000 981775.000000 981881.000000 981694.000000 982005.000000 981733.000000
mean 499999.500000 -0.000687 0.002091 0.000551 0.000982 0.002437 0.000635 -0.000124 -0.063857 -0.000014 ... 0.335968 0.003773 0.334433 -0.071842 -0.079854 0.038282 0.551900 0.333509 0.330047 0.037223
std 288675.278933 1.000784 0.999878 0.999825 0.999847 1.001151 0.999927 1.000427 0.725986 1.000715 ... 2.352749 2.292272 2.358757 0.777855 0.806965 0.706913 4.996632 2.383485 2.364036 0.776354
min 0.000000 -4.664447 -4.790594 -4.870645 -5.053023 -5.363454 -5.508223 -5.199170 -6.990302 -4.566593 ... -12.536923 -11.132350 -11.677862 -10.091950 -9.864310 -10.353951 -26.276588 -11.524137 -10.661978 -9.983739
25% 249999.750000 -0.674667 -0.672284 -0.673783 -0.672211 -0.672326 -0.673884 -0.674834 -0.500305 -0.674006 ... -1.266885 -1.570766 -1.224468 -0.517559 -0.576799 -0.385768 -2.791428 -1.268732 -1.298927 -0.396043
50% 499999.500000 -0.000769 0.002049 0.001386 0.000370 0.002733 0.000276 0.000814 0.000578 -0.000047 ... 0.303320 -0.071800 0.379014 0.018222 -0.027811 0.102679 0.203056 0.354137 0.295049 0.131245
75% 749999.250000 0.673181 0.675997 0.674490 0.674852 0.676588 0.675951 0.673983 0.444337 0.674295 ... 1.917695 1.521134 1.925968 0.474558 0.480100 0.529550 3.646324 1.947385 1.916931 0.574182
max 999999.000000 5.039177 5.042807 5.130025 5.462472 4.856916 4.960724 4.957977 2.527570 4.886075 ... 13.539926 11.524512 12.536151 2.607285 2.814969 2.548404 31.229363 11.341926 11.901409 2.584492

8 rows × 81 columns

# Annotated heatmap of pairwise correlations, pinned to the full [-1, 1] scale.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1, cmap='RdYlGn')
Output hidden; open in https://colab.research.google.com to view.

Check whether the data contains missing values; if it does, print the percentage of missing values per column. (As the output below shows, every float column has roughly 1.8% missing values.)

# Report the percentage of missing values per column, or confirm there are none.
na_mask = data.isna()
if na_mask.any().any():
    print(na_mask.sum() * 100 / data.shape[0])
else:
    print("No Missing values")
row_id    0.0000
F_1_0     1.8397
F_1_1     1.8216
F_1_2     1.8008
F_1_3     1.8250
           ...  
F_4_10    1.8225
F_4_11    1.8119
F_4_12    1.8306
F_4_13    1.7995
F_4_14    1.8267
Length: 81, dtype: float64

From the dataframe's info method, we can clearly see that all of the float64 columns contain missing values (roughly 1.8% each), while none of the int64 columns have any missing values.

The int64 columns are categorical features, which is consistent with them having no missing values; only the continuous (float64) columns need imputation.

# The float64 columns are exactly the ones that contain NaNs (see info() output).
null_cols = data.select_dtypes(include='float64').columns.tolist()

Imputation

Approach - 1

Using central-tendency measures (mean, median) to impute the missing data.

# Approach 1: fill every NaN in the float columns with that column's mean.
# A single SimpleImputer fitted on all null columns at once computes the same
# column-wise means as the original per-column loop, but avoids re-creating
# the imputer object and reshaping each column on every iteration.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data[null_cols] = imputer.fit_transform(data[null_cols])
submission = pd.read_csv('/content/sample_submission.csv')

# Each 'row-col' entry encodes the requested cell as "<row>-<column>",
# e.g. "12-F_1_0". Split once, vectorised, instead of per-row in Python.
parts = submission['row-col'].str.split('-', n=1, expand=True)
row_ids = parts[0].astype(int).to_numpy()
col_names = parts[1].to_numpy()

# Gather the imputed values column by column. This replaces the original
# iterrows() + scalar .loc loop, which is extremely slow on a submission
# of this size. Positional indexing is safe because data has a default
# RangeIndex (0..999999, per the info() output).
# NOTE(review): assumes requested cells live in float columns (the only ones
# with NaNs), so a float result array matches the original's inferred dtype.
values = np.empty(len(submission), dtype=float)
for name in pd.unique(col_names):
    mask = col_names == name
    values[mask] = data[name].to_numpy()[row_ids[mask]]

submission['values'] = values
submission.to_csv('output.csv', index=False)
!kaggle competitions submit -c tabular-playground-series-jun-2022 -f output.csv -m "Mean Imputation(benchmark score)"
100% 33.4M/33.4M [00:01<00:00, 20.1MB/s]
Successfully submitted to Tabular Playground Series - Jun 2022

RMSE is: 1.41613