forked from divya711yadhav/PhisingwebsitesDetection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_extractor.py
90 lines (73 loc) · 3.17 KB
/
feature_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
import rf_model
class feature_extractor:
    """Derive lexical phishing-detection features from a single URL.

    The instance stores one URL; ``extract`` splits it into protocol,
    domain name and address, computes five categorical features from
    those pieces and hands the resulting one-row DataFrame to
    ``rf_model.predictor``.
    """

    # Length boundaries used by ``long_url``: shorter than the first is
    # category 0, up to and including the second is category 2, longer is 1.
    _SHORT_URL_LIMIT = 54
    _LONG_URL_LIMIT = 75

    def __init__(self, url: str):
        """Store the URL that ``extract`` will analyse."""
        self.input_url = url

    def long_url(self, l):
        """Categorise a URL by its length.

        Returns 0 when the text is shorter than 54 characters, 2 when it is
        between 54 and 75 characters inclusive, and 1 when it is longer.
        The argument is converted with ``str`` first.
        """
        length = len(str(l))
        if length < self._SHORT_URL_LIMIT:
            return 0
        if length <= self._LONG_URL_LIMIT:
            return 2
        return 1

    def have_at_symbol(self, l):
        """Return 1 if the text contains an ``@`` character, otherwise 0."""
        return 1 if "@" in str(l) else 0

    def redirection(self, l):
        """Return 1 if the text contains ``//``, otherwise 0.

        ``extract`` applies this to the part of the URL that follows the
        protocol separator, so the ``//`` of the scheme itself is not counted.
        """
        return 1 if "//" in str(l) else 0

    def prefix_suffix_seperation(self, l):
        """Return 1 if the text contains a ``-`` character, otherwise 0."""
        return 1 if "-" in str(l) else 0

    def sub_domains(self, l):
        """Categorise a domain name by how many dots it contains.

        Returns 0 for fewer than three dots, 2 for exactly three and 1 for
        more than three. The argument is converted with ``str`` first.
        """
        dots = str(l).count(".")
        if dots < 3:
            return 0
        if dots == 3:
            return 2
        return 1

    def _build_features(self):
        """Build the one-row feature DataFrame for ``self.input_url``.

        Columns, in order: ``protocol``, ``domain_name``, ``address``,
        ``long_url``, ``having_@_symbol``, ``redirection_//_symbol``,
        ``prefix_suffix_seperation`` and ``sub_domains``.

        ``protocol`` is None when the URL has no ``://`` separator and
        ``address`` is None when nothing follows the domain name; both cases
        used to raise before any feature was computed.
        """
        url = str(self.input_url)

        # NOTE(review): the split is on *every* "://", so for a URL that embeds
        # a second scheme only the text between the first two separators is
        # analysed. This mirrors the earlier pandas-based split so the values
        # handed to the model stay the same - confirm this is intended.
        segments = url.split("://")
        if len(segments) > 1:
            protocol, remainder = segments[0], segments[1]
        else:
            # No scheme present: treat the whole input as host + path.
            protocol, remainder = None, url

        # Split at the first "/" only: everything after it is the address.
        domain_name, slash, path = remainder.partition("/")
        address = path if slash else None

        row = {
            "protocol": protocol,
            "domain_name": domain_name,
            "address": address,
            "long_url": self.long_url(url),
            "having_@_symbol": self.have_at_symbol(url),
            "redirection_//_symbol": self.redirection(remainder),
            "prefix_suffix_seperation": self.prefix_suffix_seperation(domain_name),
            "sub_domains": self.sub_domains(domain_name),
        }
        return pd.DataFrame([row])

    def extract(self):
        """Compute the URL features and return ``rf_model.predictor``'s result.

        The return value is whatever ``rf_model.predictor`` returns for the
        one-row feature DataFrame; its type is not defined in this module.
        """
        return rf_model.predictor(self._build_features())