Skip to content

Commit a5703c4

Browse files
committed
ML with Linear Regression
Former-commit-id: aea9705
1 parent ef2aaf6 commit a5703c4

File tree

1 file changed

+271
-0
lines changed

1 file changed

+271
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Machine Learning - Linear Regression"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 2,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"from pyspark.sql import SparkSession\n",
17+
"\n",
18+
"from pyspark.ml.feature import VectorAssembler\n",
19+
"from pyspark.ml.regression import LinearRegression"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 3,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"spark = SparkSession.builder.getOrCreate()"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 4,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"data_path = '../Data'\n",
38+
"file_path = data_path + '/utilization.csv'"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 5,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"df = spark.read.format('csv').options(header=False, inferSchema=True).load(file_path)\n",
48+
"\n",
49+
"# rename columns\n",
50+
"df = df.withColumnRenamed('_c0', 'event_datetime')\\\n",
51+
" .withColumnRenamed('_c1', 'server_id')\\\n",
52+
" .withColumnRenamed('_c2', 'cpu_utilization')\\\n",
53+
" .withColumnRenamed('_c3', 'free_memory')\\\n",
54+
" .withColumnRenamed('_c4', 'session_count')"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 6,
60+
"metadata": {},
61+
"outputs": [
62+
{
63+
"name": "stdout",
64+
"output_type": "stream",
65+
"text": [
66+
"+-------------------+---------+---------------+-----------+-------------+\n",
67+
"| event_datetime|server_id|cpu_utilization|free_memory|session_count|\n",
68+
"+-------------------+---------+---------------+-----------+-------------+\n",
69+
"|03/05/2019 08:06:14| 100| 0.57| 0.51| 47|\n",
70+
"|03/05/2019 08:11:14| 100| 0.47| 0.62| 43|\n",
71+
"|03/05/2019 08:16:14| 100| 0.56| 0.57| 62|\n",
72+
"|03/05/2019 08:21:14| 100| 0.57| 0.56| 50|\n",
73+
"|03/05/2019 08:26:14| 100| 0.35| 0.46| 43|\n",
74+
"+-------------------+---------+---------------+-----------+-------------+\n",
75+
"only showing top 5 rows\n",
76+
"\n"
77+
]
78+
}
79+
],
80+
"source": [
81+
"df.show(5)"
82+
]
83+
},
84+
{
85+
"cell_type": "markdown",
86+
"metadata": {},
87+
"source": [
88+
"---------"
89+
]
90+
},
91+
{
92+
"cell_type": "markdown",
93+
"metadata": {},
94+
"source": [
95+
"# Vectorizing Features, transform using VectorAssembler"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": 7,
101+
"metadata": {},
102+
"outputs": [],
103+
"source": [
104+
"vector_assembler = VectorAssembler(inputCols=['cpu_utilization'], outputCol='features')"
105+
]
106+
},
107+
{
108+
"cell_type": "code",
109+
"execution_count": 8,
110+
"metadata": {},
111+
"outputs": [],
112+
"source": [
113+
"df_vutil = vector_assembler.transform(df)"
114+
]
115+
},
116+
{
117+
"cell_type": "code",
118+
"execution_count": 9,
119+
"metadata": {},
120+
"outputs": [
121+
{
122+
"name": "stdout",
123+
"output_type": "stream",
124+
"text": [
125+
"+-------------------+---------+---------------+-----------+-------------+--------+\n",
126+
"| event_datetime|server_id|cpu_utilization|free_memory|session_count|features|\n",
127+
"+-------------------+---------+---------------+-----------+-------------+--------+\n",
128+
"|03/05/2019 08:06:14| 100| 0.57| 0.51| 47| [0.57]|\n",
129+
"|03/05/2019 08:11:14| 100| 0.47| 0.62| 43| [0.47]|\n",
130+
"|03/05/2019 08:16:14| 100| 0.56| 0.57| 62| [0.56]|\n",
131+
"|03/05/2019 08:21:14| 100| 0.57| 0.56| 50| [0.57]|\n",
132+
"|03/05/2019 08:26:14| 100| 0.35| 0.46| 43| [0.35]|\n",
133+
"+-------------------+---------+---------------+-----------+-------------+--------+\n",
134+
"only showing top 5 rows\n",
135+
"\n"
136+
]
137+
}
138+
],
139+
"source": [
140+
"df_vutil.show(5)"
141+
]
142+
},
143+
{
144+
"cell_type": "markdown",
145+
"metadata": {},
146+
"source": [
147+
"# Model Creation and Fitting"
148+
]
149+
},
150+
{
151+
"cell_type": "code",
152+
"execution_count": 22,
153+
"metadata": {},
154+
"outputs": [],
155+
"source": [
156+
"linear_regression = LinearRegression(featuresCol='features', labelCol='session_count')"
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": 23,
162+
"metadata": {},
163+
"outputs": [],
164+
"source": [
165+
"lr_model = linear_regression.fit(df_vutil)"
166+
]
167+
},
168+
{
169+
"cell_type": "markdown",
170+
"metadata": {},
171+
"source": [
172+
"# Coefficient and Y Intercept"
173+
]
174+
},
175+
{
176+
"cell_type": "code",
177+
"execution_count": 24,
178+
"metadata": {},
179+
"outputs": [
180+
{
181+
"data": {
182+
"text/plain": [
183+
"DenseVector([47.024])"
184+
]
185+
},
186+
"execution_count": 24,
187+
"metadata": {},
188+
"output_type": "execute_result"
189+
}
190+
],
191+
"source": [
192+
"lr_model.coefficients"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": 25,
198+
"metadata": {},
199+
"outputs": [
200+
{
201+
"data": {
202+
"text/plain": [
203+
"40.41695103550495"
204+
]
205+
},
206+
"execution_count": 25,
207+
"metadata": {},
208+
"output_type": "execute_result"
209+
}
210+
],
211+
"source": [
212+
"lr_model.intercept"
213+
]
214+
},
215+
{
216+
"cell_type": "markdown",
217+
"metadata": {},
218+
"source": [
219+
"# Checking RMSE (root mean squared error) on the training data"
220+
]
221+
},
222+
{
223+
"cell_type": "code",
224+
"execution_count": 26,
225+
"metadata": {},
226+
"outputs": [
227+
{
228+
"data": {
229+
"text/plain": [
230+
"12.837990225931527"
231+
]
232+
},
233+
"execution_count": 26,
234+
"metadata": {},
235+
"output_type": "execute_result"
236+
}
237+
],
238+
"source": [
239+
"lr_model.summary.rootMeanSquaredError"
240+
]
241+
},
242+
{
243+
"cell_type": "markdown",
244+
"metadata": {},
245+
"source": [
246+
"--------"
247+
]
248+
}
249+
],
250+
"metadata": {
251+
"kernelspec": {
252+
"display_name": "venv-datascience",
253+
"language": "python",
254+
"name": "venv-datascience"
255+
},
256+
"language_info": {
257+
"codemirror_mode": {
258+
"name": "ipython",
259+
"version": 3
260+
},
261+
"file_extension": ".py",
262+
"mimetype": "text/x-python",
263+
"name": "python",
264+
"nbconvert_exporter": "python",
265+
"pygments_lexer": "ipython3",
266+
"version": "3.8.5"
267+
}
268+
},
269+
"nbformat": 4,
270+
"nbformat_minor": 4
271+
}

0 commit comments

Comments
 (0)