import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


# Load the training split (the path is relative to the working directory)
df = pd.read_csv("data/Artisan_training.csv")


#Check the content of the dataset
df.head()


# (rows, columns) of the training frame
df.shape

(29573, 123)


# Number of distinct values in the label column
df.label.nunique()

26


# Number of samples per label
df.label.value_counts()

_other                   6985
scdn.co                  5064
taboola.com              4117
krxd.net                 2253
outbrain.com             1112
githubusercontent.com     887
giphy.com                 836
everesttech.net           715
reddit.com                700
adnxs.com                 698
ftcdn.net                 671
redd.it                   605
disqus.com                604
contextweb.com            566
twitch.tv                 486
newrelic.com              454
pinterest.com             426
fastly.net                368
chartbeat.com             303
twitchcdn.net             291
fastly-insights.com       282
ads-twitter.com           267
polyfill.io               253
vimeocdn.com              221
slack-edge.com            212
twimg.com                 197
Name: label, dtype: int64


# Bar chart of the number of samples per label
df.label.value_counts().plot.bar()

<AxesSubplot:>


# Rows that contain at least one missing value
df[df.isna().any(axis=1)]


# Summary statistics of the frame
df.describe()


def find_invariant_features(df):
    """Return the labels of the columns of ``df`` whose variance is exactly 0.

    Every invariant column label is printed as soon as it is found.  Columns
    for which a variance cannot be computed (for instance string columns)
    are reported with a ``cannot compute variance: <label>`` message and are
    never part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected one by one.

    Returns
    -------
    list
        Labels of the zero-variance columns, in column order.
    """
    invariant_features = []
    for c in df.columns:
        try:
            # bool() keeps ambiguous results (e.g. duplicated column labels,
            # where var() is a Series) inside the handled failure path.
            is_invariant = bool(df[c].var() == 0)
        except (TypeError, ValueError):
            # Only "this data has no variance" failures are expected here;
            # anything else (KeyboardInterrupt, real bugs) must propagate.
            # The f-string also works for non-string column labels.
            print(f"cannot compute variance: {c}")
            continue
        if is_invariant:
            invariant_features.append(c)
            print(c)
    return invariant_features


# Identify the zero-variance columns, then remove them from the frame
invariant_features = find_invariant_features(df)
df = df.drop(columns=invariant_features)

_c_pkts_fc
_c_pkts_unfs
_c_sack_opt
_c_syn_retx
_s_pkts_unfs
_s_port
_s_sack_opt
_s_syn_retx
_s_win_0
cannot compute variance: c_ip
cannot compute variance: label


# Labels of the columns that were removed for having zero variance
invariant_features

['_c_pkts_fc',
 '_c_pkts_unfs',
 '_c_sack_opt',
 '_c_syn_retx',
 '_s_pkts_unfs',
 '_s_port',
 '_s_sack_opt',
 '_s_syn_retx',
 '_s_win_0']


def drop_outliers_IQR(df, lower=0.01, upper=0.99):
    """Drop every row that holds an outlier in at least one numeric column.

    Despite the name, no inter-quartile range is involved: a value is an
    outlier when it lies strictly below the ``lower`` quantile or strictly
    above the ``upper`` quantile of its own column.  Non-numeric columns are
    never tested, but they are carried over to the result.  Rows that contain
    a missing value in any column are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    lower : float, default 0.01
        Quantile below which a value counts as an outlier.
    upper : float, default 0.99
        Quantile above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.
    """
    # Quantiles only exist for numeric data; restricting the comparison to
    # those columns avoids comparing strings against numeric thresholds.
    numeric = df.select_dtypes(include="number")
    low = numeric.quantile(lower)
    high = numeric.quantile(upper)

    # lt/gt with an explicit axis align the per-column thresholds with the
    # frame's columns instead of relying on implicit reindexing.
    is_outlier = numeric.lt(low, axis="columns") | numeric.gt(high, axis="columns")
    keep_row = ~is_outlier.any(axis="columns")

    # Positional boolean mask, so duplicated index labels cannot interfere
    kept = df.loc[keep_row.to_numpy()]
    return kept.dropna()


# Training frame with the outlier rows removed
df_clean = drop_outliers_IQR(df)

/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:11: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version.  Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`
  # This is added back by InteractiveShellApp.init_path()


# Bar chart of the number of samples per label after outlier removal
df_clean.label.value_counts().plot.bar()

<AxesSubplot:>


# Number of distinct labels still present after outlier removal
df_clean.label.nunique()

26


# Columns that remain in the cleaned frame
list(df_clean.columns)

['_c_ack_cnt',
 '_c_ack_cnt_p',
 '_c_appdataB',
 '_c_appdataT',
 '_c_bytes_all',
 '_c_bytes_retx',
 '_c_bytes_uniq',
 '_c_cwin_ini',
 '_c_cwin_max',
 '_c_cwin_min',
 '_c_fin_cnt',
 '_c_first',
 '_c_first_ack',
 '_c_last',
 '_c_last_handshakeT',
 '_c_msgsize1',
 '_c_msgsize_count',
 '_c_mss',
 '_c_mss_max',
 '_c_mss_min',
 '_c_pkts_all',
 '_c_pkts_data',
 '_c_pkts_data_avg',
 '_c_pkts_data_std',
 '_c_pkts_dup',
 '_c_pkts_ooo',
 '_c_pkts_push',
 '_c_pkts_reor',
 '_c_pkts_retx',
 '_c_pkts_rto',
 '_c_pkts_unk',
 '_c_pkts_unrto',
 '_c_pktsize1',
 '_c_pktsize_count',
 '_c_port',
 '_c_rst_cnt',
 '_c_rtt_avg',
 '_c_rtt_cnt',
 '_c_rtt_max',
 '_c_rtt_min',
 '_c_rtt_std',
 '_c_sack_cnt',
 '_c_seg_cnt',
 '_c_sit1',
 '_c_sit_avg',
 '_c_sit_std',
 '_c_syn_cnt',
 '_c_tm_opt',
 '_c_ttl_max',
 '_c_ttl_min',
 '_c_win_0',
 '_c_win_max',
 '_c_win_min',
 '_c_win_scl',
 '_durat',
 '_s_ack_cnt',
 '_s_ack_cnt_p',
 '_s_appdataB',
 '_s_appdataT',
 '_s_bytes_all',
 '_s_bytes_retx',
 '_s_bytes_uniq',
 '_s_cwin_ini',
 '_s_cwin_max',
 '_s_cwin_min',
 '_s_f1323_opt',
 '_s_fin_cnt',
 '_s_first',
 '_s_first_ack',
 '_s_last',
 '_s_last_handshakeT',
 '_s_msgsize1',
 '_s_msgsize_count',
 '_s_mss',
 '_s_mss_max',
 '_s_mss_min',
 '_s_pkts_all',
 '_s_pkts_data',
 '_s_pkts_data_avg',
 '_s_pkts_data_std',
 '_s_pkts_dup',
 '_s_pkts_fc',
 '_s_pkts_fs',
 '_s_pkts_ooo',
 '_s_pkts_push',
 '_s_pkts_reor',
 '_s_pkts_retx',
 '_s_pkts_rto',
 '_s_pkts_unk',
 '_s_pkts_unrto',
 '_s_pktsize1',
 '_s_pktsize_count',
 '_s_rst_cnt',
 '_s_rtt_avg',
 '_s_rtt_cnt',
 '_s_rtt_max',
 '_s_rtt_min',
 '_s_rtt_std',
 '_s_sack_cnt',
 '_s_seg_cnt',
 '_s_sit1',
 '_s_sit_avg',
 '_s_sit_std',
 '_s_syn_cnt',
 '_s_tm_opt',
 '_s_ttl_max',
 '_s_ttl_min',
 '_s_win_max',
 '_s_win_min',
 '_s_win_scl',
 '_tls_session_stat',
 'c_ip',
 'time',
 'label']


# c_ip and time are removed before the feature matrix is built
df_clean = df_clean.drop(columns=["c_ip", "time"])


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


def confusion_matrix(y_true, y_pred):
    """Count how often each true label was predicted as each label.

    Parameters
    ----------
    y_true : iterable
        Ground-truth labels.
    y_pred : iterable
        Predicted labels, paired element-wise with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        Rows are the observed true labels (index named ``y_true``), columns
        are the observed predicted labels (columns named ``y_pred``), and
        each cell is the number of samples with that pair.  Pairs that never
        occur are reported as 0.
    """
    pairs = pd.DataFrame(list(zip(y_true, y_pred)),
                         columns=['y_true', 'y_pred'])
    # One unit per sample, so that summing yields counts
    pairs['samples'] = 1
    confusion = pd.pivot_table(pairs, index='y_true',
                               columns='y_pred',
                               values='samples',
                               aggfunc='sum',
                               # absent (true, predicted) pairs are zero
                               # counts, not missing data
                               fill_value=0)
    return confusion

def plot_confusion_matrix(confusion_matrix):
    """Show a row-normalised confusion matrix as an annotated heatmap.

    Every row of ``confusion_matrix`` is divided by its own total and scaled
    to a percentage, so each cell reads as "share of this true label that
    received this prediction".  The figure is displayed and then closed;
    nothing is returned.
    """
    # Normalise by rows and express as percentages
    row_totals = confusion_matrix.sum(axis=1)
    percentages = confusion_matrix.divide(row_totals, axis=0) * 100

    fig, ax = plt.subplots(figsize=(20, 16))
    sns.heatmap(
        percentages,
        ax=ax,
        cmap='Blues',
        annot=True,
        fmt=".1f",
        vmin=0,
        vmax=100,
        cbar_kws={'label': 'Percentage'},
    )
    ax.set_xlabel('Prediction')
    ax.set_ylabel('True')

    plt.tight_layout()
    plt.show()
    plt.close()


# Hyper-parameter grid explored by the search: 3 depths x 3 forest sizes
param_grid = dict(
    max_depth=[10, 15, 20],
    n_estimators=[10, 20, 50],
)
rf = RandomForestClassifier()
# 10-fold cross-validation over every combination of the grid
grid_search = GridSearchCV(rf, param_grid, cv=10)


#Prepare the dataset for the validation process:
#every column except the target is used as a feature

train_columns = df_clean.columns.tolist()
train_columns.remove("label")
X = df_clean.loc[:, train_columns]
y = df_clean.loc[:, "label"]


#Perform a Grid search analysis

rf_models = grid_search.fit(X=X, y=y)


#What is the best estimator?

rf_models.best_estimator_

RandomForestClassifier(max_depth=20, n_estimators=50)


#How do the different folds perform?
#Prints the raw cv_results_ of the fitted search

print(rf_models.cv_results_)

{'mean_fit_time': array([0.4843389 , 0.94696097, 2.34931455, 0.61932619, 1.23678443,
       3.06452587, 0.68121929, 1.32761374, 3.33889036]), 'std_fit_time': array([0.01030868, 0.01201455, 0.02065306, 0.00825935, 0.01234856,
       0.01679352, 0.01331663, 0.01984481, 0.04146418]), 'mean_score_time': array([0.00907478, 0.01319396, 0.02681134, 0.00992603, 0.016485  ,
       0.03599961, 0.0110254 , 0.01769838, 0.04043283]), 'std_score_time': array([0.00078487, 0.00075032, 0.0011493 , 0.00047625, 0.00103498,
       0.00400181, 0.00084235, 0.00069512, 0.00515087]), 'param_max_depth': masked_array(data=[10, 10, 10, 15, 15, 15, 20, 20, 20],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[10, 20, 50, 10, 20, 50, 10, 20, 50],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 10, 'n_estimators': 10}, {'max_depth': 10, 'n_estimators': 20}, {'max_depth': 10, 'n_estimators': 50}, {'max_depth': 15, 'n_estimators': 10}, {'max_depth': 15, 'n_estimators': 20}, {'max_depth': 15, 'n_estimators': 50}, {'max_depth': 20, 'n_estimators': 10}, {'max_depth': 20, 'n_estimators': 20}, {'max_depth': 20, 'n_estimators': 50}], 'split0_test_score': array([0.78073993, 0.78182807, 0.7867247 , 0.84167573, 0.83786725,
       0.84929271, 0.82861806, 0.84439608, 0.85527748]), 'split1_test_score': array([0.78019587, 0.78618063, 0.80195865, 0.84548422, 0.85908596,
       0.87377584, 0.85092492, 0.87431991, 0.87486398]), 'split2_test_score': array([0.77366703, 0.77094668, 0.78726877, 0.85908596, 0.86071817,
       0.86833515, 0.85527748, 0.8726877 , 0.87486398]), 'split3_test_score': array([0.77245509, 0.79096353, 0.77844311, 0.83505716, 0.84757757,
       0.85465433, 0.8328797 , 0.85737616, 0.86771911]), 'split4_test_score': array([0.76320087, 0.78551987, 0.77408819, 0.82416984, 0.85084377,
       0.85737616, 0.84158955, 0.86009799, 0.85900925]), 'split5_test_score': array([0.77789875, 0.78933043, 0.78933043, 0.85900925, 0.86663038,
       0.8742515 , 0.85846489, 0.87697333, 0.89874796]), 'split6_test_score': array([0.75884594, 0.80076211, 0.80239521, 0.8328797 , 0.85737616,
       0.8726184 , 0.84975504, 0.87915079, 0.8726184 ]), 'split7_test_score': array([0.75503538, 0.77681002, 0.77789875, 0.82580294, 0.84104518,
       0.85247686, 0.82634731, 0.85247686, 0.86336418]), 'split8_test_score': array([0.77626565, 0.81110506, 0.79749592, 0.84104518, 0.86989657,
       0.86608601, 0.84050082, 0.86880784, 0.8726184 ]), 'split9_test_score': array([0.80185084, 0.78606424, 0.7849755 , 0.83886772, 0.85574306,
       0.85955362, 0.84485574, 0.86009799, 0.86663038]), 'mean_test_score': array([0.77401554, 0.78795106, 0.78805792, 0.84030777, 0.85467841,
       0.86284206, 0.84292135, 0.86463846, 0.87057131]), 'std_test_score': array([0.01260253, 0.01083779, 0.00944559, 0.01134557, 0.00986086,
       0.0088586 , 0.0104715 , 0.01091005, 0.01132409]), 'rank_test_score': array([9, 8, 7, 6, 4, 3, 5, 2, 1], dtype=int32)}


#Read the test dataset

df_test = pd.read_csv("data/Artisan_testing.csv")


#Clean the dataset from outliers
#NOTE(review): the percentile thresholds are recomputed on the test set itself
#rather than reused from the training set, and test rows are discarded --
#confirm that this is the intended evaluation protocol

df_test_clean = drop_outliers_IQR(df_test)

/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:11: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version.  Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`
  # This is added back by InteractiveShellApp.init_path()


#Prepare the dataset for the testing process
#(same feature columns, in the same order, as the training matrix)

X_test = df_test_clean[train_columns]
y_test = df_test_clean["label"]


#Train the best model found in the previous step with all the Training data
#NOTE(review): fit() is called on rf_models.best_estimator_ itself, so `model`
#presumably refers to that same estimator object -- confirm against the
#scikit-learn documentation

model = rf_models.best_estimator_.fit(X,y)


#Predict the test label
y_test_predicted = model.predict(X_test)


#Analyse the classification report
#Are there classes that are more difficult to predict than others?

print(classification_report(y_test, y_test_predicted))

                       precision    recall  f1-score   support

               _other       0.77      0.85      0.81      4355
            adnxs.com       0.90      0.79      0.84       293
      ads-twitter.com       0.96      0.78      0.86       158
        chartbeat.com       0.96      0.86      0.91       274
       contextweb.com       0.84      0.80      0.82       309
           disqus.com       0.81      0.57      0.67       362
      everesttech.net       0.97      0.83      0.89       378
  fastly-insights.com       0.97      0.76      0.85        42
           fastly.net       0.92      0.59      0.72       205
            ftcdn.net       0.99      0.95      0.97       250
            giphy.com       0.83      0.83      0.83       226
githubusercontent.com       0.90      0.81      0.85       438
             krxd.net       0.85      0.93      0.88      1399
         newrelic.com       0.86      0.74      0.80       198
         outbrain.com       0.68      0.55      0.61       650
        pinterest.com       0.88      0.63      0.74       215
          polyfill.io       0.80      0.54      0.65       123
              redd.it       0.90      0.68      0.77       264
           reddit.com       0.95      0.61      0.74       150
              scdn.co       0.91      0.98      0.94      1725
       slack-edge.com       1.00      0.26      0.41        39
          taboola.com       0.73      0.84      0.78      1979
            twimg.com       0.65      0.17      0.27        77
            twitch.tv       0.65      0.28      0.39        94
        twitchcdn.net       0.33      0.08      0.13        25
         vimeocdn.com       0.82      0.39      0.53       120

             accuracy                           0.82     14348
            macro avg       0.84      0.66      0.72     14348
         weighted avg       0.82      0.82      0.81     14348


#Extract and plot the confusion matrix
#The result gets its own name: assigning it to `confusion_matrix` would
#overwrite the function and make this cell fail when it is run a second time

confusion = confusion_matrix(y_test, y_test_predicted)
plot_confusion_matrix(confusion)


# Pair every training column with the importance the fitted forest gave it,
# then order the pairs from most to least important
RF_feature_importance = model.feature_importances_
features = X.columns.tolist()
feature_importance = dict(zip(features, RF_feature_importance))
feature_importance_sorted = dict(
    sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
)


# Importance values in ranked order
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(list(feature_importance_sorted.values()))
ax.set_xlabel("Feature")
ax.set_ylabel("Feature Importance")
plt.grid()
plt.show()
plt.close()


# Test accuracy when training on the top-i ranked features, for i = 1, 6, 11, ...
from sklearn.base import clone

Accuracy = []
NFeatures = []
ranked_features = list(feature_importance_sorted)
# The upper bound is inclusive so that the complete feature set is evaluated
# whenever it falls on the step of 5
for i in range(1, len(ranked_features) + 1, 5):
    print(i)
    subset = ranked_features[:i]
    # Fit an unfitted copy with the same hyper-parameters, so the estimator
    # held by the grid search is not refit in place on a feature subset
    model = clone(rf_models.best_estimator_).fit(X[subset], y)
    y_test_predicted_i = model.predict(X_test[subset])
    Accuracy.append(accuracy_score(y_test, y_test_predicted_i))
    NFeatures.append(i)

1
6
11
16
21
26
31
36
41
46
51
56
61
66
71
76
81
86
91
96
101
106


#Plot the trend of the Accuracy

# Do you need all the features to build a good classifier, or are a few enough?

fig, ax = plt.subplots(figsize=(8,4))
ax.plot(NFeatures,Accuracy)
ax.set_xlabel("Features")
ax.set_ylabel("Accuracy")
ax.set_xticks(NFeatures)
# Span exactly the evaluated feature counts instead of a hard-coded bound
ax.set_xlim(min(NFeatures), max(NFeatures))
plt.grid()
plt.show()
plt.close()


from sklearn.decomposition import PCA


# Fit a PCA with default settings on the training features
# NOTE(review): X is passed as-is, no standardisation step is visible here;
# PCA is scale-sensitive, so large-magnitude columns may dominate the
# components -- confirm this is intended
pca = PCA()
pca.fit(X)

PCA()


# Get the explained variance
exp_var = pca.explained_variance_ratio_
# Get the cumulative explained variance, normalised by the total
cum_exp_var = np.cumsum(exp_var)/np.sum(exp_var)


#Plot the trend of the explained variance
plt.figure(figsize=(5, 3.5))
plt.plot(np.arange(1, X.shape[1]+1), cum_exp_var, marker='o')
plt.grid()
plt.xlabel('# Principal Components')
plt.ylabel('Cumulative explained variance')
plt.xticks(np.arange(1, X.shape[1]+1, 2))
# Only the x-range 0..10 is displayed
plt.xlim(0,10)
plt.show()
plt.close()


# Fit the PCA on the training set with the decided number of components
pca = PCA(n_components=3, whiten=True)
pca.fit(X)

PCA(n_components=3, whiten=True)


#Transform the training and the test set
#(the projection was fitted on the training features only)
X_PCA = pca.transform(X)
X_test_PCA = pca.transform(X_test)


#Create a new model and predict the test label
#NOTE(review): fit() is called on rf_models.best_estimator_ itself, so the
#estimator held by the grid search is presumably refit in place on the PCA
#features -- confirm this is intended
model = rf_models.best_estimator_.fit(X_PCA,y)
y_test_predicted_PCA = model.predict(X_test_PCA)


print(classification_report(y_test, y_test_predicted_PCA))

                       precision    recall  f1-score   support

               _other       0.45      0.46      0.46      4355
            adnxs.com       0.34      0.22      0.27       293
      ads-twitter.com       0.16      0.08      0.10       158
        chartbeat.com       0.44      0.23      0.30       274
       contextweb.com       0.21      0.20      0.21       309
           disqus.com       0.12      0.05      0.07       362
      everesttech.net       0.24      0.21      0.22       378
  fastly-insights.com       0.08      0.19      0.12        42
           fastly.net       0.18      0.06      0.09       205
            ftcdn.net       0.65      0.55      0.60       250
            giphy.com       0.45      0.42      0.43       226
githubusercontent.com       0.26      0.20      0.22       438
             krxd.net       0.42      0.44      0.43      1399
         newrelic.com       0.32      0.14      0.20       198
         outbrain.com       0.16      0.09      0.12       650
        pinterest.com       0.25      0.13      0.18       215
          polyfill.io       0.14      0.06      0.08       123
              redd.it       0.44      0.35      0.39       264
           reddit.com       0.26      0.15      0.19       150
              scdn.co       0.46      0.68      0.54      1725
       slack-edge.com       0.05      0.08      0.06        39
          taboola.com       0.36      0.48      0.41      1979
            twimg.com       0.03      0.01      0.02        77
            twitch.tv       0.10      0.04      0.06        94
        twitchcdn.net       0.00      0.00      0.00        25
         vimeocdn.com       0.13      0.03      0.05       120

             accuracy                           0.39     14348
            macro avg       0.26      0.21      0.22     14348
         weighted avg       0.37      0.39      0.37     14348

	_c_ack_cnt	_c_ack_cnt_p	_c_appdataB	_c_appdataT	_c_bytes_all	_c_bytes_uniq	_c_cwin_ini	_c_cwin_max	_c_cwin_min	...	_s_ttl_max	_s_ttl_min	_s_win_max	_s_win_min	_s_win_scl	_tls_session_stat	c_ip	time	label
0	217.0	206.0	272.0	35.395	2426.0	2426.0	178.0	305.0	31.0	...	59.0	59.0	36352.0	27680.0	9.0	0.0	67.32.230.26	1.561979e+09	giphy.com
1	8.0	3.0	479.0	11.842	945.0	945.0	427.0	518.0	51.0	...	59.0	59.0	30208.0	27920.0	9.0	0.0	67.32.230.24	1.561976e+09	fastly-insights.com
2	53.0	45.0	611.0	146.000	3328.0	3328.0	517.0	710.0	38.0	...	59.0	59.0	34816.0	27920.0	9.0	1.0	67.32.76.252	1.561984e+09	_other
3	21.0	8.0	488.0	18.033	4101.0	4101.0	394.0	836.0	93.0	...	59.0	59.0	38912.0	27920.0	9.0	0.0	67.32.124.163	1.561984e+09	twitch.tv
4	9.0	5.0	0.0	0.000	641.0	641.0	517.0	517.0	31.0	...	59.0	59.0	29184.0	27680.0	9.0	1.0	67.32.230.4	1.561990e+09	krxd.net

	_c_ack_cnt	_c_ack_cnt_p	_c_appdataB	_c_appdataT	_c_bytes_all	_c_bytes_retx	_c_bytes_uniq	_c_cwin_ini	_c_cwin_max	_c_cwin_min	...	_s_syn_retx	_s_tm_opt	_s_ttl_max	_s_ttl_min	_s_win_0	_s_win_max	_s_win_min	_s_win_scl	_tls_session_stat	time
count	29573.000000	29573.000000	29573.000000	29573.000000	2.957300e+04	29573.000000	2.957300e+04	29573.000000	29573.000000	29573.000000	...	29573.0	29573.000000	29573.00000	29573.000000	29573.0	2.957300e+04	29573.000000	29573.000000	29573.000000	2.957300e+04
mean	111.895006	96.860278	424.265479	193.826524	5.147532e+03	50.361309	5.097170e+03	429.760863	944.303385	55.096270	...	0.0	0.448957	58.94096	58.891658	0.0	1.335119e+05	27812.241166	8.668278	0.723193	1.561983e+09
std	997.243526	914.683239	233.130543	1323.627062	1.589892e+05	1531.723204	1.578924e+05	138.342696	2148.354313	47.784299	...	0.0	0.497396	0.75267	1.011313	0.0	1.177282e+06	119.377259	1.695745	0.453732	1.172480e+04
min	2.000000	0.000000	0.000000	0.000000	1.550000e+02	0.000000	1.550000e+02	155.000000	155.000000	1.000000	...	0.0	0.000000	37.00000	37.000000	0.0	2.840800e+04	27648.000000	0.000000	0.000000	1.561932e+09
25%	10.000000	4.000000	295.000000	15.960000	7.590000e+02	0.000000	7.440000e+02	252.000000	517.000000	31.000000	...	0.0	0.000000	59.00000	59.000000	0.0	3.020800e+04	27680.000000	9.000000	0.000000	1.561973e+09
50%	17.000000	7.000000	569.000000	33.055000	1.140000e+03	0.000000	1.127000e+03	517.000000	517.000000	38.000000	...	0.0	0.000000	59.00000	59.000000	0.0	3.020800e+04	27920.000000	9.000000	1.000000	1.561983e+09
75%	37.000000	23.000000	611.000000	76.006000	2.041000e+03	0.000000	2.003000e+03	517.000000	707.000000	93.000000	...	0.0	1.000000	59.00000	59.000000	0.0	3.276800e+04	27920.000000	9.000000	1.000000	1.561992e+09
max	60406.000000	51434.000000	5673.000000	102908.359000	2.539156e+07	197307.000000	2.522823e+07	642.000000	167464.000000	593.000000	...	0.0	1.000000	59.00000	59.000000	0.0	1.429504e+07	27920.000000	9.000000	3.000000	1.562019e+09

1. Basic characterization

2. Data Preprocessing

3. Baseline Classification

4. Feature Selection for Classification

5. Dimensionality Reduction and Classification