Skip to content

Commit ef2aaf6

Browse files
committed
ML with KMeans Clustering
Former-commit-id: 85c2133
1 parent c4faf2a commit ef2aaf6

File tree

1 file changed

+254
-0
lines changed

1 file changed

+254
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Machine Learning - Clustering"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"from pyspark.sql import SparkSession\n",
17+
"\n",
18+
"from pyspark.ml.linalg import Vectors\n",
19+
"from pyspark.ml.feature import VectorAssembler\n",
20+
"from pyspark.ml.clustering import KMeans"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 2,
26+
"metadata": {},
27+
"outputs": [],
28+
"source": [
29+
"spark = SparkSession.builder.getOrCreate()"
30+
]
31+
},
32+
{
33+
"cell_type": "code",
34+
"execution_count": 3,
35+
"metadata": {},
36+
"outputs": [],
37+
"source": [
38+
"data_path = '../Data'\n",
39+
"file_path = data_path + '/utilization.csv'"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": 4,
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"df = spark.read.format('csv').options(header=False, inferSchema=True).load(file_path)\n",
49+
"\n",
50+
"# rename columns\n",
51+
"df = df.withColumnRenamed('_c0', 'event_datetime')\\\n",
52+
" .withColumnRenamed('_c1', 'server_id')\\\n",
53+
" .withColumnRenamed('_c2', 'cpu_utilization')\\\n",
54+
" .withColumnRenamed('_c3', 'free_memory')\\\n",
55+
" .withColumnRenamed('_c4', 'session_count')"
56+
]
57+
},
58+
{
59+
"cell_type": "code",
60+
"execution_count": 5,
61+
"metadata": {},
62+
"outputs": [
63+
{
64+
"name": "stdout",
65+
"output_type": "stream",
66+
"text": [
67+
"+-------------------+---------+---------------+-----------+-------------+\n",
68+
"| event_datetime|server_id|cpu_utilization|free_memory|session_count|\n",
69+
"+-------------------+---------+---------------+-----------+-------------+\n",
70+
"|03/05/2019 08:06:14| 100| 0.57| 0.51| 47|\n",
71+
"|03/05/2019 08:11:14| 100| 0.47| 0.62| 43|\n",
72+
"|03/05/2019 08:16:14| 100| 0.56| 0.57| 62|\n",
73+
"|03/05/2019 08:21:14| 100| 0.57| 0.56| 50|\n",
74+
"|03/05/2019 08:26:14| 100| 0.35| 0.46| 43|\n",
75+
"+-------------------+---------+---------------+-----------+-------------+\n",
76+
"only showing top 5 rows\n",
77+
"\n"
78+
]
79+
}
80+
],
81+
"source": [
82+
"df.show(5)"
83+
]
84+
},
85+
{
86+
"cell_type": "markdown",
87+
"metadata": {},
88+
"source": [
89+
"---------"
90+
]
91+
},
92+
{
93+
"cell_type": "markdown",
94+
"metadata": {},
95+
"source": [
96+
"# Vectorizing Features, transform using VectorAssembler\n",
97+
"- Spark ML algorithms expect their inputs in vector format."
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 6,
103+
"metadata": {},
104+
"outputs": [],
105+
"source": [
106+
"vector_assembler = VectorAssembler(inputCols=['cpu_utilization', 'free_memory', 'session_count'], outputCol='features')"
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": 8,
112+
"metadata": {},
113+
"outputs": [],
114+
"source": [
115+
"# transform by passing the original dataframe\n",
116+
"vcluster_df = vector_assembler.transform(df)"
117+
]
118+
},
119+
{
120+
"cell_type": "code",
121+
"execution_count": 9,
122+
"metadata": {},
123+
"outputs": [
124+
{
125+
"name": "stdout",
126+
"output_type": "stream",
127+
"text": [
128+
"+-------------------+---------+---------------+-----------+-------------+----------------+\n",
129+
"| event_datetime|server_id|cpu_utilization|free_memory|session_count| features|\n",
130+
"+-------------------+---------+---------------+-----------+-------------+----------------+\n",
131+
"|03/05/2019 08:06:14| 100| 0.57| 0.51| 47|[0.57,0.51,47.0]|\n",
132+
"|03/05/2019 08:11:14| 100| 0.47| 0.62| 43|[0.47,0.62,43.0]|\n",
133+
"|03/05/2019 08:16:14| 100| 0.56| 0.57| 62|[0.56,0.57,62.0]|\n",
134+
"|03/05/2019 08:21:14| 100| 0.57| 0.56| 50|[0.57,0.56,50.0]|\n",
135+
"|03/05/2019 08:26:14| 100| 0.35| 0.46| 43|[0.35,0.46,43.0]|\n",
136+
"+-------------------+---------+---------------+-----------+-------------+----------------+\n",
137+
"only showing top 5 rows\n",
138+
"\n"
139+
]
140+
}
141+
],
142+
"source": [
143+
"vcluster_df.show(5)"
144+
]
145+
},
146+
{
147+
"cell_type": "markdown",
148+
"metadata": {},
149+
"source": [
150+
"We can see that there is a new column 'features' in the newly created DataFrame."
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": 11,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"# type(vcluster_df)"
160+
]
161+
},
162+
{
163+
"cell_type": "markdown",
164+
"metadata": {},
165+
"source": [
166+
"# Model Creation"
167+
]
168+
},
169+
{
170+
"cell_type": "code",
171+
"execution_count": 13,
172+
"metadata": {},
173+
"outputs": [],
174+
"source": [
175+
"kmeans = KMeans().setK(3) # set clusters as 3\n",
176+
"kmeans = kmeans.setSeed(1) # set random seed as 1"
177+
]
178+
},
179+
{
180+
"cell_type": "code",
181+
"execution_count": 14,
182+
"metadata": {},
183+
"outputs": [],
184+
"source": [
185+
"k_model = kmeans.fit(vcluster_df) #fit the model with transformed df"
186+
]
187+
},
188+
{
189+
"cell_type": "markdown",
190+
"metadata": {},
191+
"source": [
192+
"# Get Cluster Centers"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": 16,
198+
"metadata": {},
199+
"outputs": [
200+
{
201+
"data": {
202+
"text/plain": [
203+
"[array([ 0.71542187, 0.28469012, 87.5516823 ]),\n",
204+
" array([ 0.51439668, 0.48445202, 50.49452021]),\n",
205+
" array([ 0.62129573, 0.37851014, 69.19070448])]"
206+
]
207+
},
208+
"execution_count": 16,
209+
"metadata": {},
210+
"output_type": "execute_result"
211+
}
212+
],
213+
"source": [
214+
"k_model.clusterCenters() # get the cluster centers"
215+
]
216+
},
217+
{
218+
"cell_type": "markdown",
219+
"metadata": {},
220+
"source": [
221+
"We can see that there are 3 clusters (as we have defined).\n",
222+
"- each cluster is represented by 3 feature values ('cpu_utilization', 'free_memory', 'session_count')"
223+
]
224+
},
225+
{
226+
"cell_type": "markdown",
227+
"metadata": {},
228+
"source": [
229+
"--------"
230+
]
231+
}
232+
],
233+
"metadata": {
234+
"kernelspec": {
235+
"display_name": "venv-datascience",
236+
"language": "python",
237+
"name": "venv-datascience"
238+
},
239+
"language_info": {
240+
"codemirror_mode": {
241+
"name": "ipython",
242+
"version": 3
243+
},
244+
"file_extension": ".py",
245+
"mimetype": "text/x-python",
246+
"name": "python",
247+
"nbconvert_exporter": "python",
248+
"pygments_lexer": "ipython3",
249+
"version": "3.8.5"
250+
}
251+
},
252+
"nbformat": 4,
253+
"nbformat_minor": 4
254+
}

0 commit comments

Comments
 (0)