
Commit 884a80e

Initial commit.

1 parent 28210ba commit 884a80e

7 files changed: +68570 −0 lines changed

flaskrestapi.py (+106)

@@ -0,0 +1,106 @@
#!/usr/bin/env python
"""
This is the Flask REST API that processes a URL and outputs the prediction for it.
"""
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
import label_data
import flask
import json

# Initialize our Flask application and the Keras model.
app = flask.Flask(__name__)


global graph
graph = tf.get_default_graph()
model_pre = 'bi-lstmchar256256128.h5'
model = load_model(model_pre)

def prepare_url(url):

    # Gather the labeled URLs used to fit the tokenizer.
    urlz = label_data.main()

    samples = []
    labels = []
    for k, v in urlz.items():
        samples.append(k)
        labels.append(v)

    #print(len(samples))
    #print(len(labels))

    maxlen = 128
    max_words = 20000

    # Fit a character-level tokenizer on the training samples, then encode the incoming URL.
    tokenizer = Tokenizer(num_words=max_words, char_level=True)
    tokenizer.fit_on_texts(samples)
    sequences = tokenizer.texts_to_sequences(url)
    word_index = tokenizer.word_index
    #print('Found %s unique tokens.' % len(word_index))

    # Pad the encoded URL to the fixed input length expected by the model.
    url_prepped = pad_sequences(sequences, maxlen=maxlen)
    return url_prepped

@app.route("/predict", methods=["POST"])
def predict():

    # Initialize the dictionary for the response.
    data = {"success": False}

    # Check if POST request.
    if flask.request.method == "POST":
        # Grab and process the incoming json.
        incoming = flask.request.get_json()
        urlz = []
        url = incoming["url"]

        urlz.append(url)
        print(url)

        # Process and prepare the URL.
        url_prepped = prepare_url(urlz)

        # Classify the URL and make the prediction.
        with graph.as_default():
            prediction = model.predict(url_prepped)
            print(prediction)

        data["predictions"] = []

        if prediction > 0.50:
            result = "URL is probably malicious."
        else:
            result = "URL is probably NOT malicious."

        # Check for base URL. Accuracy is not as great.
        # Assumes the URL includes a scheme (e.g. "http://"), so split[1] is host + path.
        split = url.split("//")
        print(split[0])
        split2 = split[1]
        if "/" not in split2:
            result = "Base URLs cannot be accurately determined."

        # Process prediction probability into something human-friendly.
        prediction = float(prediction)
        prediction = prediction * 100

        if result == "Base URLs cannot be accurately determined.":
            r = {"result": result, "url": url}
        else:
            r = {"result": result, "malicious percentage": prediction, "url": url}
        data["predictions"].append(r)

        # Show that the request was a success.
        data["success"] = True

    # Return the data as a JSON response.
    return flask.jsonify(data)

# Start the server.
if __name__ == "__main__":
    print("Starting the server and loading the model...")
    app.run(host='0.0.0.0', port=45000)
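For reference, a minimal client sketch (not part of the commit) for exercising the /predict endpoint, assuming the server above is running locally on port 45000 as set in app.run; the example URL and the use of the requests library are assumptions:

#!/usr/bin/env python
# Minimal sketch of a client for the /predict endpoint defined above.
# Assumes the server is reachable at localhost:45000 and that the
# requests library is installed; the URL being scored is hypothetical.
import requests

payload = {"url": "http://example.com/login.php"}  # key matches incoming["url"] in predict()
resp = requests.post("http://localhost:45000/predict", json=payload)

# The endpoint returns JSON of the form:
# {"success": true, "predictions": [{"result": "...", "malicious percentage": 12.3, "url": "..."}]}
print(resp.json())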

label_data.py (+28)

@@ -0,0 +1,28 @@
#!/usr/bin/env python
"""
This file gathers data to be used for pre-processing in training and prediction.
"""
import pandas as pd

def main():

    blacklist = 'phishing_database.csv'
    whitelist = 'whitelist.txt'

    urls = {}

    blacklist = pd.read_csv(blacklist)

    # Assign 1 for malicious and 0 for non-malicious for supervised learning.
    for url in blacklist['url']:
        urls[url] = 1

    with open(whitelist, 'r') as f:
        lines = f.read().splitlines()
        for url in lines:
            urls[url] = 0

    return urls

if __name__ == "__main__":
    main()
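For context, a small sketch (not part of the commit) of how label_data.main() is consumed and what the two input files are assumed to look like; the sample rows below are hypothetical:

# phishing_database.csv is assumed to be a CSV with a 'url' column, e.g.:
#   url
#   http://malicious.example/phish
#
# whitelist.txt is assumed to hold one benign URL per line, e.g.:
#   http://benign.example/index.html

import label_data

urls = label_data.main()
# urls maps each URL string to its label: 1 for blacklisted, 0 for whitelisted.
print(list(urls.items())[:3])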
