ML with KMeans Clustering

ptyadana · ptyadana · commit ef2aaf65e2c2 · 2021-01-14T11:42:26.000+08:00
Former-commit-id: 85c2133
diff --git a/Spark - Spark SQL and Data Frames/03.Data Analysis with Spark/05.03.Machine Learning - Clustering.ipynb b/Spark - Spark SQL and Data Frames/03.Data Analysis with Spark/05.03.Machine Learning - Clustering.ipynb
@@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Machine Learning - Clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "from pyspark.ml.linalg import Vectors\n",
+    "from pyspark.ml.feature import VectorAssembler\n",
+    "from pyspark.ml.clustering import KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the existing SparkSession if there is one; otherwise create a new one.\n",
+    "session_builder = SparkSession.builder\n",
+    "spark = session_builder.getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relative path of the data folder, and of the utilization CSV inside it.\n",
+    "data_path = '../Data'\n",
+    "file_path = f'{data_path}/utilization.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The file is read with header=False, so the loaded columns are named _c0.._c4.\n",
+    "# toDF() then assigns the descriptive names below, in positional order.\n",
+    "column_names = ['event_datetime', 'server_id', 'cpu_utilization', 'free_memory', 'session_count']\n",
+    "\n",
+    "csv_reader = spark.read.format('csv').options(header=False, inferSchema=True)\n",
+    "raw_df = csv_reader.load(file_path)\n",
+    "df = raw_df.toDF(*column_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-------------------+---------+---------------+-----------+-------------+\n",
+      "|     event_datetime|server_id|cpu_utilization|free_memory|session_count|\n",
+      "+-------------------+---------+---------------+-----------+-------------+\n",
+      "|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|\n",
+      "|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|\n",
+      "|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|\n",
+      "|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|\n",
+      "|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|\n",
+      "+-------------------+---------+---------------+-----------+-------------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.show(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---------"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Vectorizing Features: transform using VectorAssembler\n",
+    "- Spark ML algorithms expect their inputs in vector format."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Columns that are combined into a single vector column named 'features'.\n",
+    "feature_columns = ['cpu_utilization', 'free_memory', 'session_count']\n",
+    "\n",
+    "vector_assembler = VectorAssembler(\n",
+    "    inputCols=feature_columns,\n",
+    "    outputCol='features',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run the assembler over the source DataFrame; the result keeps the original\n",
+    "# columns and gains the assembled vector column.\n",
+    "vcluster_df = vector_assembler.transform(dataset=df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-------------------+---------+---------------+-----------+-------------+----------------+\n",
+      "|     event_datetime|server_id|cpu_utilization|free_memory|session_count|        features|\n",
+      "+-------------------+---------+---------------+-----------+-------------+----------------+\n",
+      "|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|[0.57,0.51,47.0]|\n",
+      "|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|[0.47,0.62,43.0]|\n",
+      "|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|[0.56,0.57,62.0]|\n",
+      "|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|[0.57,0.56,50.0]|\n",
+      "|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|[0.35,0.46,43.0]|\n",
+      "+-------------------+---------+---------------+-----------+-------------+----------------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "vcluster_df.show(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see that there is a new column 'features' in the newly created DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# type(vcluster_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model Creation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# K-Means estimator configured for 3 clusters, with the random seed fixed to 1.\n",
+    "kmeans = KMeans(k=3, seed=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit the estimator on the DataFrame that contains the assembled feature vectors.\n",
+    "k_model = kmeans.fit(dataset=vcluster_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Get Cluster Centers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[array([ 0.71542187,  0.28469012, 87.5516823 ]),\n",
+       " array([ 0.51439668,  0.48445202, 50.49452021]),\n",
+       " array([ 0.62129573,  0.37851014, 69.19070448])]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Retrieve the center of each cluster and display the list as the cell result.\n",
+    "cluster_centers = k_model.clusterCenters()\n",
+    "cluster_centers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
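+    "We can see that there are 3 clusters (as we have defined).\n",
+    "- Each cluster center is represented by 3 feature values ('cpu_utilization', 'free_memory', 'session_count').\n",
+    "- Note: the features were not scaled before clustering, and the centers differ mainly in 'session_count' (about 50, 69 and 88), whose values are far larger than those of the other two features (all below 1). Consider scaling the features (e.g. with `StandardScaler`) before fitting the model."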
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "--------"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv-datascience",
+   "language": "python",
+   "name": "venv-datascience"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}