import re
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from google.colab import drive
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) for every matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (18, 14)
# Mount Google Drive at the relative path ./mydrive.
# NOTE(review): the dataset paths used further down are absolute (/content/mydrive/...),
# so this presumably relies on the working directory being /content — confirm.
drive.mount('./mydrive')
Mounted at ./mydrive
# Stage the train/test CSVs from the mounted Drive into the working directory
# under short local names, so later cells can read them by relative path.
train_source = '/content/mydrive/MyDrive/Anurag-Project/Datasets/NSLKDD/KDDTrain_original.csv'
test_source = '/content/mydrive/MyDrive/Anurag-Project/Datasets/NSLKDD/KDDTest_original.csv'
shutil.copy(src=train_source, dst='train.csv')
shutil.copy(src=test_source, dst='test.csv')
'test.csv'
# Load the local copy of the training partition and preview its first rows.
train = pd.read_csv('train.csv')
train.head()
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate classification.
0 0 tcp ftp_data SF 491 0 0 0 0 0 ... 25 0.17 0.03 0.17 0.00 0.00 0.00 0.05 0.00 normal
1 0 udp other SF 146 0 0 0 0 0 ... 1 0.00 0.60 0.88 0.00 0.00 0.00 0.00 0.00 normal
2 0 tcp private S0 0 0 0 0 0 0 ... 26 0.10 0.05 0.00 0.00 1.00 1.00 0.00 0.00 Dos
3 0 tcp http SF 232 8153 0 0 0 0 ... 255 1.00 0.00 0.03 0.04 0.03 0.01 0.00 0.01 normal
4 0 tcp http SF 199 420 0 0 0 0 ... 255 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 normal

5 rows × 42 columns

# Print per-column dtypes and non-null counts, then show summary statistics
# (count/mean/std/min/quartiles/max) for the numeric columns.
train.info()
train.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   duration                      125973 non-null  int64  
 1    protocol_type                125973 non-null  object 
 2    service                      125973 non-null  object 
 3    flag                         125973 non-null  object 
 4    src_bytes                    125973 non-null  int64  
 5    dst_bytes                    125973 non-null  int64  
 6    land                         125973 non-null  int64  
 7    wrong_fragment               125973 non-null  int64  
 8    urgent                       125973 non-null  int64  
 9    hot                          125973 non-null  int64  
 10   num_failed_loginsì           125973 non-null  int64  
 11   logged_in                    125973 non-null  int64  
 12   num_compromised              125973 non-null  int64  
 13   root_shell                   125973 non-null  int64  
 14   su_attempted                 125973 non-null  int64  
 15   num_root                     125973 non-null  int64  
 16   num_file_creations           125973 non-null  int64  
 17   num_shells                   125973 non-null  int64  
 18   num_access_files             125973 non-null  int64  
 19   num_outbound_cmds            125973 non-null  int64  
 20   is_host_login                125973 non-null  int64  
 21   is_guest_login               125973 non-null  int64  
 22   count                        125973 non-null  int64  
 23   srv_count                    125973 non-null  int64  
 24   serror_rate                  125973 non-null  float64
 25   srv_serror_rate              125973 non-null  float64
 26   rerror_rate                  125973 non-null  float64
 27   srv_rerror_rate              125973 non-null  float64
 28   same_srv_rate                125973 non-null  float64
 29   diff_srv_rate                125973 non-null  float64
 30   srv_diff_host_rate           125973 non-null  float64
 31   dst_host_count               125973 non-null  int64  
 32   dst_host_srv_count           125973 non-null  int64  
 33   dst_host_same_srv_rate       125973 non-null  float64
 34   dst_host_diff_srv_rate       125973 non-null  float64
 35   dst_host_same_src_port_rate  125973 non-null  float64
 36   dst_host_srv_diff_host_rate  125973 non-null  float64
 37   dst_host_serror_rate         125973 non-null  float64
 38    dst_host_srv_serror_rate    125973 non-null  float64
 39   dst_host_rerror_rate         125973 non-null  float64
 40   dst_host_srv_rerror_rate     125973 non-null  float64
 41   classification.              125973 non-null  object 
dtypes: float64(15), int64(23), object(4)
memory usage: 40.4+ MB
duration src_bytes dst_bytes land wrong_fragment urgent hot num_failed_loginsì logged_in num_compromised ... dst_host_count dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
count 125973.00000 1.259730e+05 1.259730e+05 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 ... 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000 125973.000000
mean 287.14465 4.556674e+04 1.977911e+04 0.000198 0.022687 0.000111 0.204409 0.001222 0.395736 0.279250 ... 182.148945 115.653005 0.521242 0.082951 0.148379 0.032542 0.284452 0.278485 0.118832 0.120240
std 2604.51531 5.870331e+06 4.021269e+06 0.014086 0.253530 0.014366 2.149968 0.045239 0.489010 23.942042 ... 99.206213 110.702741 0.448949 0.188922 0.308997 0.112564 0.444784 0.445669 0.306557 0.319459
min 0.00000 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.00000 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 82.000000 10.000000 0.050000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.00000 4.400000e+01 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 255.000000 63.000000 0.510000 0.020000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.00000 2.760000e+02 5.160000e+02 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 ... 255.000000 255.000000 1.000000 0.070000 0.060000 0.020000 1.000000 1.000000 0.000000 0.000000
max 42908.00000 1.379964e+09 1.309937e+09 1.000000 3.000000 3.000000 77.000000 5.000000 1.000000 7479.000000 ... 255.000000 255.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 38 columns

# Load the local copy of the testing partition and preview its first rows.
test = pd.read_csv('test.csv')
test.head()
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate classification.
0 0 tcp private REJ 0 0 0 0 0 0 ... 10 0.04 0.06 0.00 0.00 0.0 0.0 1.00 1.00 Dos
1 0 tcp private REJ 0 0 0 0 0 0 ... 1 0.00 0.06 0.00 0.00 0.0 0.0 1.00 1.00 Dos
2 2 tcp ftp_data SF 12983 0 0 0 0 0 ... 86 0.61 0.04 0.61 0.02 0.0 0.0 0.00 0.00 normal
3 0 icmp eco_i SF 20 0 0 0 0 0 ... 57 1.00 0.00 1.00 0.28 0.0 0.0 0.00 0.00 Probe
4 1 tcp telnet RSTO 0 15 0 0 0 0 ... 86 0.31 0.17 0.03 0.02 0.0 0.0 0.83 0.71 Probe

5 rows × 42 columns

# Pairwise correlation heatmap of the numeric features, colour scale fixed to [-1, 1].
# The numeric columns are selected explicitly because the frame also holds
# string-typed columns: newer pandas versions raise on DataFrame.corr() for those
# instead of silently dropping them.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, vmin=-1, vmax=1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f0701152d10>
def show_missing_status(df: pd.DataFrame, df_name: str = 'Train') -> None:
    """Print a summary of the missing values in *df*.

    If any cell is missing, prints the list of affected columns followed by,
    for each of those columns, the fraction of rows that are missing
    (a value between 0 and 1). Otherwise prints a single line saying that
    *df_name* has no missing values.

    Args:
        df: Frame to inspect.
        df_name: Human-readable name used in the "no missing values" message.

    Returns:
        None. The report is written to stdout.
    """
    missing_mask = df.isna()
    has_missing = missing_mask.any()

    # Guard clause: nothing missing anywhere (also covers an empty frame).
    if not has_missing.any():
        print(f"No Missing Values present in {df_name}")
        return

    missing_cols = df.columns[has_missing]
    print(f"Missing Columns in the data are: {missing_cols.tolist()}")
    # The mean of a boolean mask is missing_count / number_of_rows, i.e. the
    # per-column missing ratio. (Dividing by the number of columns would not
    # be a ratio of anything meaningful.)
    print(missing_mask[missing_cols].mean())

# Report missing-value status for both partitions, training first.
for frame, label in ((train, 'Training Data'), (test, 'Testing Data')):
    show_missing_status(frame, df_name=label)
No Missing Values present in Training Data
No Missing Values present in Testing Data

Data Cleaning

# Normalise the header names: trim surrounding spaces/dots, then strip every
# character that is not an ASCII letter or underscore. The names are derived
# from the training frame and applied to both frames, so both are assumed to
# share the same column order.
cols = [
    re.sub('[^a-zA-Z_]', '', name.strip(' .'))
    for name in train.columns
]

# Apply the cleaned names and drop 'num_outbound_cmds' from each partition in place.
for frame in (train, test):
    frame.columns = cols
    frame.drop(columns=['num_outbound_cmds'], inplace=True)

EDA

# Per-class record counts for each partition as a two-column frame
# ('Traffic Type', 'Count'). rename_axis + reset_index(name=...) yields these
# column names on any pandas version; renaming the default 'index' /
# 'classification' columns after a bare reset_index() only works with the
# pre-2.0 value_counts() naming.
target_values_train = (
    train['classification']
    .value_counts()
    .rename_axis('Traffic Type')
    .reset_index(name='Count')
)
target_values_test = (
    test['classification']
    .value_counts()
    .rename_axis('Traffic Type')
    .reset_index(name='Count')
)

# Side-by-side pie charts of the class distribution in each partition.
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]], subplot_titles=('Training', 'Testing'))
fig.add_trace(
    go.Pie(labels=target_values_train['Traffic Type'].values, values=target_values_train['Count'].values),
    row=1, col=1
)

# Labels for the test pie must come from the test counts themselves:
# value_counts() orders classes by frequency, and that order need not match
# the training partition's.
fig.add_trace(
    go.Pie(labels=target_values_test['Traffic Type'].values, values=target_values_test['Count'].values),
    row=1, col=2
)
fig.update_layout(title_text="Distribution of Target Variable(in Train and Test Sets)")

# Tag each frame with its partition so the stacked frame can be coloured by it.
target_values_train['Partition Type'] = 'Train'
target_values_test['Partition Type'] = 'Test'

# Grouped bar chart comparing class counts between the two partitions.
target_values = pd.concat([target_values_train, target_values_test])
px.bar(target_values, x='Traffic Type', y='Count', color='Partition Type', barmode='group', text_auto=True)