import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Load the training split of the dataset into a DataFrame.
df = pd.read_csv("data/Artisan_training.csv")
# Check the content of the dataset (notebook cell: displays the first rows).
df.head()
_c_ack_cnt | _c_ack_cnt_p | _c_appdataB | _c_appdataT | _c_bytes_all | _c_bytes_retx | _c_bytes_uniq | _c_cwin_ini | _c_cwin_max | _c_cwin_min | ... | _s_ttl_max | _s_ttl_min | _s_win_0 | _s_win_max | _s_win_min | _s_win_scl | _tls_session_stat | c_ip | time | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 217.0 | 206.0 | 272.0 | 35.395 | 2426.0 | 0.0 | 2426.0 | 178.0 | 305.0 | 31.0 | ... | 59.0 | 59.0 | 0.0 | 36352.0 | 27680.0 | 9.0 | 0.0 | 67.32.230.26 | 1.561979e+09 | giphy.com |
1 | 8.0 | 3.0 | 479.0 | 11.842 | 945.0 | 0.0 | 945.0 | 427.0 | 518.0 | 51.0 | ... | 59.0 | 59.0 | 0.0 | 30208.0 | 27920.0 | 9.0 | 0.0 | 67.32.230.24 | 1.561976e+09 | fastly-insights.com |
2 | 53.0 | 45.0 | 611.0 | 146.000 | 3328.0 | 0.0 | 3328.0 | 517.0 | 710.0 | 38.0 | ... | 59.0 | 59.0 | 0.0 | 34816.0 | 27920.0 | 9.0 | 1.0 | 67.32.76.252 | 1.561984e+09 | _other |
3 | 21.0 | 8.0 | 488.0 | 18.033 | 4101.0 | 0.0 | 4101.0 | 394.0 | 836.0 | 93.0 | ... | 59.0 | 59.0 | 0.0 | 38912.0 | 27920.0 | 9.0 | 0.0 | 67.32.124.163 | 1.561984e+09 | twitch.tv |
4 | 9.0 | 5.0 | 0.0 | 0.000 | 641.0 | 0.0 | 641.0 | 517.0 | 517.0 | 31.0 | ... | 59.0 | 59.0 | 0.0 | 29184.0 | 27680.0 | 9.0 | 1.0 | 67.32.230.4 | 1.561990e+09 | krxd.net |
5 rows × 123 columns
1.1) What are the dimensions of the dataset (number of rows and columns)?
df.shape
(29573, 123)
1.2) How many classes are present in the dataset?
df.label.nunique()
26
1.3) Are the classes balanced?
df.label.value_counts()
_other 6985 scdn.co 5064 taboola.com 4117 krxd.net 2253 outbrain.com 1112 githubusercontent.com 887 giphy.com 836 everesttech.net 715 reddit.com 700 adnxs.com 698 ftcdn.net 671 redd.it 605 disqus.com 604 contextweb.com 566 twitch.tv 486 newrelic.com 454 pinterest.com 426 fastly.net 368 chartbeat.com 303 twitchcdn.net 291 fastly-insights.com 282 ads-twitter.com 267 polyfill.io 253 vimeocdn.com 221 slack-edge.com 212 twimg.com 197 Name: label, dtype: int64
df.label.value_counts().plot.bar()
<AxesSubplot:>
1.4) Are there Missing Values?
df[df.isna().any(axis=1)]
_c_ack_cnt | _c_ack_cnt_p | _c_appdataB | _c_appdataT | _c_bytes_all | _c_bytes_retx | _c_bytes_uniq | _c_cwin_ini | _c_cwin_max | _c_cwin_min | ... | _s_ttl_max | _s_ttl_min | _s_win_0 | _s_win_max | _s_win_min | _s_win_scl | _tls_session_stat | c_ip | time | label |
---|
0 rows × 123 columns
1.5) Describe the dataset
Can you identify possible useless columns?
df.describe()
_c_ack_cnt | _c_ack_cnt_p | _c_appdataB | _c_appdataT | _c_bytes_all | _c_bytes_retx | _c_bytes_uniq | _c_cwin_ini | _c_cwin_max | _c_cwin_min | ... | _s_syn_retx | _s_tm_opt | _s_ttl_max | _s_ttl_min | _s_win_0 | _s_win_max | _s_win_min | _s_win_scl | _tls_session_stat | time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 29573.000000 | 29573.000000 | 29573.000000 | 29573.000000 | 2.957300e+04 | 29573.000000 | 2.957300e+04 | 29573.000000 | 29573.000000 | 29573.000000 | ... | 29573.0 | 29573.000000 | 29573.00000 | 29573.000000 | 29573.0 | 2.957300e+04 | 29573.000000 | 29573.000000 | 29573.000000 | 2.957300e+04 |
mean | 111.895006 | 96.860278 | 424.265479 | 193.826524 | 5.147532e+03 | 50.361309 | 5.097170e+03 | 429.760863 | 944.303385 | 55.096270 | ... | 0.0 | 0.448957 | 58.94096 | 58.891658 | 0.0 | 1.335119e+05 | 27812.241166 | 8.668278 | 0.723193 | 1.561983e+09 |
std | 997.243526 | 914.683239 | 233.130543 | 1323.627062 | 1.589892e+05 | 1531.723204 | 1.578924e+05 | 138.342696 | 2148.354313 | 47.784299 | ... | 0.0 | 0.497396 | 0.75267 | 1.011313 | 0.0 | 1.177282e+06 | 119.377259 | 1.695745 | 0.453732 | 1.172480e+04 |
min | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 1.550000e+02 | 0.000000 | 1.550000e+02 | 155.000000 | 155.000000 | 1.000000 | ... | 0.0 | 0.000000 | 37.00000 | 37.000000 | 0.0 | 2.840800e+04 | 27648.000000 | 0.000000 | 0.000000 | 1.561932e+09 |
25% | 10.000000 | 4.000000 | 295.000000 | 15.960000 | 7.590000e+02 | 0.000000 | 7.440000e+02 | 252.000000 | 517.000000 | 31.000000 | ... | 0.0 | 0.000000 | 59.00000 | 59.000000 | 0.0 | 3.020800e+04 | 27680.000000 | 9.000000 | 0.000000 | 1.561973e+09 |
50% | 17.000000 | 7.000000 | 569.000000 | 33.055000 | 1.140000e+03 | 0.000000 | 1.127000e+03 | 517.000000 | 517.000000 | 38.000000 | ... | 0.0 | 0.000000 | 59.00000 | 59.000000 | 0.0 | 3.020800e+04 | 27920.000000 | 9.000000 | 1.000000 | 1.561983e+09 |
75% | 37.000000 | 23.000000 | 611.000000 | 76.006000 | 2.041000e+03 | 0.000000 | 2.003000e+03 | 517.000000 | 707.000000 | 93.000000 | ... | 0.0 | 1.000000 | 59.00000 | 59.000000 | 0.0 | 3.276800e+04 | 27920.000000 | 9.000000 | 1.000000 | 1.561992e+09 |
max | 60406.000000 | 51434.000000 | 5673.000000 | 102908.359000 | 2.539156e+07 | 197307.000000 | 2.522823e+07 | 642.000000 | 167464.000000 | 593.000000 | ... | 0.0 | 1.000000 | 59.00000 | 59.000000 | 0.0 | 1.429504e+07 | 27920.000000 | 9.000000 | 3.000000 | 1.562019e+09 |
8 rows × 121 columns
def find_invariant_features(df):
    """Return the labels of the columns of ``df`` whose variance is exactly 0.

    Each zero-variance column label is printed as it is found. Columns for
    which the variance cannot be computed (for example columns holding
    strings) are reported with a "cannot compute variance" message and are
    never part of the result.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        List of the zero-variance column labels, in column order.
    """
    invariant_features = []
    for c in df.columns:
        # Only the variance computation and its comparison are expected to
        # fail, so catch just the errors they raise instead of using a bare
        # ``except`` that would also hide KeyboardInterrupt or real bugs.
        try:
            is_invariant = bool(df[c].var() == 0)
        except (TypeError, ValueError):
            # f-string instead of ``+`` so non-string column labels
            # (e.g. integers) can be reported without raising.
            print(f"cannot compute variance: {c}")
            continue
        if is_invariant:
            invariant_features.append(c)
            print(c)
    return invariant_features
2.2) Remove the features with no variability
# Collect the zero-variance columns, then rebuild `df` without them.
invariant_features = find_invariant_features(df)
df = df.drop(columns=invariant_features)
_c_pkts_fc _c_pkts_unfs _c_sack_opt _c_syn_retx _s_pkts_unfs _s_port _s_sack_opt _s_syn_retx _s_win_0 cannot compute variance: c_ip cannot compute variance: label
invariant_features
['_c_pkts_fc', '_c_pkts_unfs', '_c_sack_opt', '_c_syn_retx', '_s_pkts_unfs', '_s_port', '_s_sack_opt', '_s_syn_retx', '_s_win_0']
2.3) Outlier removal
Remove the rows that have, for at least one feature, a value lower than the 1st percentile or greater than the 99th percentile.
def drop_outliers_IQR(df, lower=0.01, upper=0.99):
    """Drop every row that holds an outlier in at least one numeric column.

    Despite the name, no inter-quartile range is involved: a value is an
    outlier when it is strictly below the ``lower`` quantile or strictly
    above the ``upper`` quantile of its own column. By default these are the
    1st and the 99th percentile.

    Only numeric columns are tested. Comparing the whole frame (string
    columns included) against the quantile Series relies on an automatic
    re-alignment that pandas has deprecated. Non-numeric columns are kept
    in the result untouched.

    Rows containing a missing value in any column are dropped as well, and
    the dtypes of the surviving columns are preserved.

    Args:
        df: DataFrame to clean. It is not modified.
        lower: Quantile in [0, 1] below which a value is an outlier.
        upper: Quantile in [0, 1] above which a value is an outlier.

    Returns:
        A new DataFrame with the same columns as ``df`` and only the rows
        without outliers and without missing values.
    """
    numeric = df.select_dtypes(include="number")
    if numeric.shape[1] == 0:
        # Nothing can be an outlier; only the missing-value filter applies.
        return df.dropna()

    low = numeric.quantile(lower)
    high = numeric.quantile(upper)
    # axis="columns" matches each threshold to its column by label.
    is_outlier = numeric.lt(low, axis="columns") | numeric.gt(high, axis="columns")
    has_outlier = is_outlier.any(axis="columns")
    return df.loc[~has_outlier].dropna()
df_clean = drop_outliers_IQR(df)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:11: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right` # This is added back by InteractiveShellApp.init_path()
Do you still have all classes?
df_clean.label.value_counts().plot.bar()
<AxesSubplot:>
df_clean.label.nunique()
26
list(df_clean.columns)
['_c_ack_cnt', '_c_ack_cnt_p', '_c_appdataB', '_c_appdataT', '_c_bytes_all', '_c_bytes_retx', '_c_bytes_uniq', '_c_cwin_ini', '_c_cwin_max', '_c_cwin_min', '_c_fin_cnt', '_c_first', '_c_first_ack', '_c_last', '_c_last_handshakeT', '_c_msgsize1', '_c_msgsize_count', '_c_mss', '_c_mss_max', '_c_mss_min', '_c_pkts_all', '_c_pkts_data', '_c_pkts_data_avg', '_c_pkts_data_std', '_c_pkts_dup', '_c_pkts_ooo', '_c_pkts_push', '_c_pkts_reor', '_c_pkts_retx', '_c_pkts_rto', '_c_pkts_unk', '_c_pkts_unrto', '_c_pktsize1', '_c_pktsize_count', '_c_port', '_c_rst_cnt', '_c_rtt_avg', '_c_rtt_cnt', '_c_rtt_max', '_c_rtt_min', '_c_rtt_std', '_c_sack_cnt', '_c_seg_cnt', '_c_sit1', '_c_sit_avg', '_c_sit_std', '_c_syn_cnt', '_c_tm_opt', '_c_ttl_max', '_c_ttl_min', '_c_win_0', '_c_win_max', '_c_win_min', '_c_win_scl', '_durat', '_s_ack_cnt', '_s_ack_cnt_p', '_s_appdataB', '_s_appdataT', '_s_bytes_all', '_s_bytes_retx', '_s_bytes_uniq', '_s_cwin_ini', '_s_cwin_max', '_s_cwin_min', '_s_f1323_opt', '_s_fin_cnt', '_s_first', '_s_first_ack', '_s_last', '_s_last_handshakeT', '_s_msgsize1', '_s_msgsize_count', '_s_mss', '_s_mss_max', '_s_mss_min', '_s_pkts_all', '_s_pkts_data', '_s_pkts_data_avg', '_s_pkts_data_std', '_s_pkts_dup', '_s_pkts_fc', '_s_pkts_fs', '_s_pkts_ooo', '_s_pkts_push', '_s_pkts_reor', '_s_pkts_retx', '_s_pkts_rto', '_s_pkts_unk', '_s_pkts_unrto', '_s_pktsize1', '_s_pktsize_count', '_s_rst_cnt', '_s_rtt_avg', '_s_rtt_cnt', '_s_rtt_max', '_s_rtt_min', '_s_rtt_std', '_s_sack_cnt', '_s_seg_cnt', '_s_sit1', '_s_sit_avg', '_s_sit_std', '_s_syn_cnt', '_s_tm_opt', '_s_ttl_max', '_s_ttl_min', '_s_win_max', '_s_win_min', '_s_win_scl', '_tls_session_stat', 'c_ip', 'time', 'label']
# Remove the client IP and the timestamp columns from the cleaned data.
df_clean = df_clean.drop(columns=["c_ip", "time"])
Select a classifier from https://scikit-learn.org/stable/search.html?q=classifier and run a full classification pipeline.
Suggestion: use a Random Forest to keep the training time short
Warnings:
According to the model you choose you might need/not need to scale your data (https://scikit-learn.org/stable/search.html?q=scaler)
Perform cross-validation to avoid overfitting and find the best set of parameters:
Evaluate the classification metrics Classification Report/Precision/Confusion Matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
def confusion_matrix(y_true, y_pred):
    """Build a confusion matrix as a DataFrame of sample counts.

    Args:
        y_true: Iterable of true labels.
        y_pred: Iterable of predicted labels, paired element by element
            with ``y_true``.

    Returns:
        DataFrame indexed by true label (index name ``y_true``) with one
        column per predicted label (columns name ``y_pred``). Each cell
        holds the number of samples with that (true, predicted) pair;
        pairs that never occur are NaN rather than 0.
    """
    pairs = pd.DataFrame(list(zip(y_true, y_pred)),
                         columns=['y_true', 'y_pred'])
    # Every sample counts once; summing this column per cell gives the counts.
    pairs['samples'] = 1
    # The string 'sum' selects the pandas aggregation directly; passing the
    # builtin ``sum`` callable is deprecated in recent pandas versions.
    return pd.pivot_table(pairs, index='y_true',
                          columns='y_pred',
                          values='samples',
                          aggfunc='sum')
def plot_confusion_matrix(confusion_matrix):
    """Show a confusion matrix as a heat map of row percentages.

    Each row of ``confusion_matrix`` is divided by its own total and
    multiplied by 100 before plotting. The figure is displayed and then
    closed; nothing is returned.
    """
    # Normalize by rows
    row_totals = confusion_matrix.sum(1)
    percentages = confusion_matrix.divide(row_totals, axis=0) * 100

    # Visualize
    figure, axes = plt.subplots(figsize=(20, 16))
    sns.heatmap(
        percentages,
        cmap='Blues',
        annot=True,
        vmin=0,
        vmax=100,
        ax=axes,
        cbar_kws={'label': 'Percentage'},
        fmt=".1f",
    )
    axes.set_xlabel('Prediction')
    axes.set_ylabel('True')
    plt.tight_layout()
    plt.show()
    plt.close()
3.1) Choose a classifier and perform the grid search cross validation
# Hyper-parameter grid explored by the search: tree depth and number of trees.
param_grid = {
"max_depth" : [10, 15, 20],
"n_estimators" : [10,20, 50]
}
# NOTE(review): no random_state is passed, so results presumably differ
# between runs - confirm whether reproducibility is needed.
rf = RandomForestClassifier()
# Grid search over param_grid, cross-validated with cv=10.
grid_search = GridSearchCV(estimator=rf, cv=10, param_grid=param_grid)
# Prepare the dataset for the validation process:
# every column except the target "label" is used as a feature.
train_columns = list(df_clean.columns)
train_columns.remove("label")
X = df_clean[train_columns]
y = df_clean["label"]
# Perform the grid search analysis
rf_models = grid_search.fit(X, y)
#What is the best estimator?
rf_models.best_estimator_
RandomForestClassifier(max_depth=20, n_estimators=50)
# How do the different folds perform?
# cv_results_ is printed raw: it holds the per-split test scores of every
# parameter combination together with the fit/score timings.
print(rf_models.cv_results_)
{'mean_fit_time': array([0.4843389 , 0.94696097, 2.34931455, 0.61932619, 1.23678443, 3.06452587, 0.68121929, 1.32761374, 3.33889036]), 'std_fit_time': array([0.01030868, 0.01201455, 0.02065306, 0.00825935, 0.01234856, 0.01679352, 0.01331663, 0.01984481, 0.04146418]), 'mean_score_time': array([0.00907478, 0.01319396, 0.02681134, 0.00992603, 0.016485 , 0.03599961, 0.0110254 , 0.01769838, 0.04043283]), 'std_score_time': array([0.00078487, 0.00075032, 0.0011493 , 0.00047625, 0.00103498, 0.00400181, 0.00084235, 0.00069512, 0.00515087]), 'param_max_depth': masked_array(data=[10, 10, 10, 15, 15, 15, 20, 20, 20], mask=[False, False, False, False, False, False, False, False, False], fill_value='?', dtype=object), 'param_n_estimators': masked_array(data=[10, 20, 50, 10, 20, 50, 10, 20, 50], mask=[False, False, False, False, False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'max_depth': 10, 'n_estimators': 10}, {'max_depth': 10, 'n_estimators': 20}, {'max_depth': 10, 'n_estimators': 50}, {'max_depth': 15, 'n_estimators': 10}, {'max_depth': 15, 'n_estimators': 20}, {'max_depth': 15, 'n_estimators': 50}, {'max_depth': 20, 'n_estimators': 10}, {'max_depth': 20, 'n_estimators': 20}, {'max_depth': 20, 'n_estimators': 50}], 'split0_test_score': array([0.78073993, 0.78182807, 0.7867247 , 0.84167573, 0.83786725, 0.84929271, 0.82861806, 0.84439608, 0.85527748]), 'split1_test_score': array([0.78019587, 0.78618063, 0.80195865, 0.84548422, 0.85908596, 0.87377584, 0.85092492, 0.87431991, 0.87486398]), 'split2_test_score': array([0.77366703, 0.77094668, 0.78726877, 0.85908596, 0.86071817, 0.86833515, 0.85527748, 0.8726877 , 0.87486398]), 'split3_test_score': array([0.77245509, 0.79096353, 0.77844311, 0.83505716, 0.84757757, 0.85465433, 0.8328797 , 0.85737616, 0.86771911]), 'split4_test_score': array([0.76320087, 0.78551987, 0.77408819, 0.82416984, 0.85084377, 0.85737616, 0.84158955, 0.86009799, 0.85900925]), 'split5_test_score': array([0.77789875, 0.78933043, 
0.78933043, 0.85900925, 0.86663038, 0.8742515 , 0.85846489, 0.87697333, 0.89874796]), 'split6_test_score': array([0.75884594, 0.80076211, 0.80239521, 0.8328797 , 0.85737616, 0.8726184 , 0.84975504, 0.87915079, 0.8726184 ]), 'split7_test_score': array([0.75503538, 0.77681002, 0.77789875, 0.82580294, 0.84104518, 0.85247686, 0.82634731, 0.85247686, 0.86336418]), 'split8_test_score': array([0.77626565, 0.81110506, 0.79749592, 0.84104518, 0.86989657, 0.86608601, 0.84050082, 0.86880784, 0.8726184 ]), 'split9_test_score': array([0.80185084, 0.78606424, 0.7849755 , 0.83886772, 0.85574306, 0.85955362, 0.84485574, 0.86009799, 0.86663038]), 'mean_test_score': array([0.77401554, 0.78795106, 0.78805792, 0.84030777, 0.85467841, 0.86284206, 0.84292135, 0.86463846, 0.87057131]), 'std_test_score': array([0.01260253, 0.01083779, 0.00944559, 0.01134557, 0.00986086, 0.0088586 , 0.0104715 , 0.01091005, 0.01132409]), 'rank_test_score': array([9, 8, 7, 6, 4, 3, 5, 2, 1], dtype=int32)}
3.2) Predict the label on the test set
REMINDER: Preprocess the testing data too!
# Read the test dataset
df_test = pd.read_csv("data/Artisan_testing.csv")
# Clean the dataset from outliers
# NOTE(review): the percentile thresholds are recomputed on the test set
# itself rather than reused from the training set - confirm this is intended.
df_test_clean = drop_outliers_IQR(df_test)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:11: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right` # This is added back by InteractiveShellApp.init_path()
# Prepare the dataset for the testing process:
# select the same feature columns that were used for training.
X_test = df_test_clean[train_columns]
y_test = df_test_clean["label"]
# Train the best model found in the previous step with all the training data
# NOTE(review): this fits rf_models.best_estimator_ itself, not a copy, so
# `model` and the grid search's estimator are presumably the same object -
# confirm before refitting it elsewhere.
model = rf_models.best_estimator_.fit(X,y)
# Predict the test labels
y_test_predicted = model.predict(X_test)
3.3) Check the testing performance with respect to the validation performance
# Analyse the classification report
# Are there classes that are more difficult to predict than others?
print(classification_report(y_test, y_test_predicted))
precision recall f1-score support _other 0.77 0.85 0.81 4355 adnxs.com 0.90 0.79 0.84 293 ads-twitter.com 0.96 0.78 0.86 158 chartbeat.com 0.96 0.86 0.91 274 contextweb.com 0.84 0.80 0.82 309 disqus.com 0.81 0.57 0.67 362 everesttech.net 0.97 0.83 0.89 378 fastly-insights.com 0.97 0.76 0.85 42 fastly.net 0.92 0.59 0.72 205 ftcdn.net 0.99 0.95 0.97 250 giphy.com 0.83 0.83 0.83 226 githubusercontent.com 0.90 0.81 0.85 438 krxd.net 0.85 0.93 0.88 1399 newrelic.com 0.86 0.74 0.80 198 outbrain.com 0.68 0.55 0.61 650 pinterest.com 0.88 0.63 0.74 215 polyfill.io 0.80 0.54 0.65 123 redd.it 0.90 0.68 0.77 264 reddit.com 0.95 0.61 0.74 150 scdn.co 0.91 0.98 0.94 1725 slack-edge.com 1.00 0.26 0.41 39 taboola.com 0.73 0.84 0.78 1979 twimg.com 0.65 0.17 0.27 77 twitch.tv 0.65 0.28 0.39 94 twitchcdn.net 0.33 0.08 0.13 25 vimeocdn.com 0.82 0.39 0.53 120 accuracy 0.82 14348 macro avg 0.84 0.66 0.72 14348 weighted avg 0.82 0.82 0.81 14348
# Extract and plot the confusion matrix.
# The result is stored under its own name: assigning it to `confusion_matrix`
# would replace the helper function with a DataFrame, and running this cell a
# second time would then fail.
confusion = confusion_matrix(y_test, y_test_predicted)
plot_confusion_matrix(confusion)
Now we will reduce the number of features to identify the best subset for our classification task
# Importance assigned to each training column by the fitted model.
RF_feature_importance = model.feature_importances_
features = list(X.columns)
# Map feature name -> importance, then order from most to least important.
feature_importance = dict(zip(features, RF_feature_importance))
feature_importance_sorted = dict(
    sorted(feature_importance.items(), key=lambda pair: pair[1], reverse=True)
)
# Plot the importances in decreasing order.
fig, ax = plt.subplots(figsize=(5,4))
ax.plot(feature_importance_sorted.values())
ax.set_xlabel("Feature")
ax.set_ylabel("Feature Importance")
plt.grid()
plt.show()
plt.close()
4.3) Train the model using the training set with an increasing number of features
Start from a subset containing only the most important feature, then increase the number of features by 5 each time, i.e., 1 feature, 6 features, 11 features, etc.
Check how the accuracy changes in the test set
from sklearn.base import clone

# Test accuracy obtained with the N most important features, for growing N.
Accuracy = []
NFeatures = []
# Feature names from most to least important, built once instead of on
# every iteration.
ranked_features = list(feature_importance_sorted)
# Feature counts evaluated: 1, 6, 11, ... (steps of 5).
# NOTE(review): the stop is exclusive, so the complete feature set itself is
# never evaluated - confirm this is intended.
for i in range(1, len(ranked_features), 5):
    print(i)
    subset = ranked_features[:i]
    # Fit an unfitted copy with the tuned hyper-parameters, so the estimator
    # held by the grid search is not overwritten by each subset fit.
    model = clone(rf_models.best_estimator_).fit(X[subset], y)
    y_test_predicted_i = model.predict(X_test[subset])
    Accuracy.append(accuracy_score(y_test, y_test_predicted_i))
    NFeatures.append(i)
1 6 11 16 21 26 31 36 41 46 51 56 61 66 71 76 81 86 91 96 101 106
# Plot the trend of the accuracy.
# Do you need all the features to create a good classifier, or are a few enough?
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(NFeatures,Accuracy)
ax.set_xlabel("Features")
ax.set_ylabel("Accuracy")
ax.set_xticks(NFeatures)
# Take the x-range from the evaluated feature counts instead of hard-coding
# it, so the axis stays correct if the number of features changes.
if NFeatures:
    ax.set_xlim(NFeatures[0], NFeatures[-1])
plt.grid()
plt.show()
plt.close()
Now we will experience how to use PCA to reduce the data dimensionality.
PCA is useful because it does not use the labels, only the dataset features.
from sklearn.decomposition import PCA
5.1) Use the training data to extract the PCA: fit the PCA on the training dataset
Evaluate the explained variance.
How many components do you need to explain most of the dataset variance?
# Fit a PCA with the default number of components on the training features.
# NOTE(review): X is passed in unscaled, while the task text above warns that
# some models need scaled data - consider whether a scaler should come first.
pca = PCA()
pca.fit(X)
PCA()
# Get the explained variance ratio of each component
exp_var = pca.explained_variance_ratio_
# Get the cumulative explained variance, normalised by the total
cum_exp_var = np.cumsum(exp_var)/np.sum(exp_var)
# Plot the trend of the explained variance
plt.figure(figsize=(5, 3.5))
plt.plot(np.arange(1, X.shape[1]+1), cum_exp_var, marker='o')
plt.grid()
plt.xlabel('# Principal Components')
plt.ylabel('Cumulative explained variance')
plt.xticks(np.arange(1, X.shape[1]+1, 2))
# Only the x-range 0-10 is shown, i.e. the first components
plt.xlim(0,10)
plt.show()
plt.close()
5.2) Use the given number of components to:
Compute the Accuracy on the test set
# Fit the PCA on the training set with the chosen number of components (3),
# replacing the full PCA fitted earlier.
pca = PCA(n_components=3, whiten=True)
pca.fit(X)
PCA(n_components=3, whiten=True)
from sklearn.base import clone

# Transform the training and the test set into the PCA space
X_PCA = pca.transform(X)
X_test_PCA = pca.transform(X_test)
# Create a new model and predict the test labels. The model is an unfitted
# copy with the tuned hyper-parameters, so the estimator held by the grid
# search is not refitted in place on the PCA features.
model = clone(rf_models.best_estimator_).fit(X_PCA, y)
y_test_predicted_PCA = model.predict(X_test_PCA)
5.3) Analyze the classification report.
Does it perform better or worse with respect to the original features?
print(classification_report(y_test, y_test_predicted_PCA))
precision recall f1-score support _other 0.45 0.46 0.46 4355 adnxs.com 0.34 0.22 0.27 293 ads-twitter.com 0.16 0.08 0.10 158 chartbeat.com 0.44 0.23 0.30 274 contextweb.com 0.21 0.20 0.21 309 disqus.com 0.12 0.05 0.07 362 everesttech.net 0.24 0.21 0.22 378 fastly-insights.com 0.08 0.19 0.12 42 fastly.net 0.18 0.06 0.09 205 ftcdn.net 0.65 0.55 0.60 250 giphy.com 0.45 0.42 0.43 226 githubusercontent.com 0.26 0.20 0.22 438 krxd.net 0.42 0.44 0.43 1399 newrelic.com 0.32 0.14 0.20 198 outbrain.com 0.16 0.09 0.12 650 pinterest.com 0.25 0.13 0.18 215 polyfill.io 0.14 0.06 0.08 123 redd.it 0.44 0.35 0.39 264 reddit.com 0.26 0.15 0.19 150 scdn.co 0.46 0.68 0.54 1725 slack-edge.com 0.05 0.08 0.06 39 taboola.com 0.36 0.48 0.41 1979 twimg.com 0.03 0.01 0.02 77 twitch.tv 0.10 0.04 0.06 94 twitchcdn.net 0.00 0.00 0.00 25 vimeocdn.com 0.13 0.03 0.05 120 accuracy 0.39 14348 macro avg 0.26 0.21 0.22 14348 weighted avg 0.37 0.39 0.37 14348