From: rtrimana
Date: Thu, 23 Aug 2018 17:26:31 +0000 (-0700)
Subject: Adding Python ML examples of clustering data points.
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=pingpong.git;a=commitdiff_plain;h=6d39a43c1dd95bb2f2c9064b4a820bab4597972f

Adding Python ML examples of clustering data points.
---
Review notes (text between "---" and the first "diff --git" is not part of
the commit message):
 - Main.java: switches the active experiment configuration from the Arlo
   camera (11) to the D-Link plug (1) and sets deviceIp to 192.168.1.246,
   which the inline comments identify as the phone.
 - python_ml/clustering.py and python_ml/silhouette.py are identical copies.
   As originally posted they could not run: the trailing ''' block that
   disables the plotting code was never closed, and cluster_labels was used
   while the KMeans fit that defines it was commented out. Both files now
   fit KMeans inside the loop and close the ''' block; plotting remains
   disabled. The file length (113 lines) is unchanged, so the hunk headers
   are still valid; the "index" blob ids refer to the originally posted
   content.
 - python_ml/dlink_clustering.py is unchanged.

diff --git a/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/Main.java b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/Main.java
index 75eff77..72b14e0 100644
--- a/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/Main.java
+++ b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/Main.java
@@ -38,10 +38,10 @@ public class Main {
         //String path = "/Users/varmarken/temp/UCI IoT Project/experiments"; // Janus
 
         // 1) D-Link July 26 experiment
-//        final String inputPcapFile = path + "/2018-07/dlink/dlink.wlan1.local.pcap";
-//        final String outputPcapFile = path + "/2018-07/dlink/dlink-processed.pcap";
-//        final String triggerTimesFile = path + "/2018-07/dlink/dlink-july-26-2018.timestamps";
-//        final String deviceIp = "192.168.1.199"; // .246 == phone; .199 == dlink plug?
+        final String inputPcapFile = path + "/2018-07/dlink/dlink.wlan1.local.pcap";
+        final String outputPcapFile = path + "/2018-07/dlink/dlink-processed.pcap";
+        final String triggerTimesFile = path + "/2018-07/dlink/dlink-july-26-2018.timestamps";
+        final String deviceIp = "192.168.1.246"; // .246 == phone; .199 == dlink plug?
 
         // 2) TP-Link July 25 experiment
 //        final String inputPcapFile = path + "/2018-07/tplink/tplink.wlan1.local.pcap";
@@ -106,10 +106,10 @@ public class Main {
 //        final String deviceIp = "192.168.1.246"; // .246 == phone; .235 == camera
 
         // 11) Arlo Camera August 10 experiment
-        final String inputPcapFile = path + "/2018-08/arlo-camera/arlo-camera.wlan1.local.pcap";
-        final String outputPcapFile = path + "/2018-08/arlo-camera/arlo-camera-processed.pcap";
-        final String triggerTimesFile = path + "/2018-08/arlo-camera/arlo-camera-aug-10-2018.timestamps";
-        final String deviceIp = "192.168.1.140"; // .246 == phone; .140 == camera
+//        final String inputPcapFile = path + "/2018-08/arlo-camera/arlo-camera.wlan1.local.pcap";
+//        final String outputPcapFile = path + "/2018-08/arlo-camera/arlo-camera-processed.pcap";
+//        final String triggerTimesFile = path + "/2018-08/arlo-camera/arlo-camera-aug-10-2018.timestamps";
+//        final String deviceIp = "192.168.1.140"; // .246 == phone; .140 == camera
 
         // 12) Blossom sprinkler August 13 experiment
 //        final String inputPcapFile = path + "/2018-08/blossom/blossom.wlan1.local.pcap";
@@ -121,7 +121,7 @@ public class Main {
 //        final String inputPcapFile = path + "/2018-08/dlink-siren/dlink-siren.wlan1.local.pcap";
 //        final String outputPcapFile = path + "/2018-08/dlink-siren/dlink-siren-processed.pcap";
 //        final String triggerTimesFile = path + "/2018-08/dlink-siren/dlink-siren-aug-14-2018.timestamps";
-//        final String deviceIp = "192.168.1.183"; // .246 == phone; .183 == siren
+//        final String deviceIp = "192.168.1.246"; // .246 == phone; .183 == siren
 
         // 14) Nest thermostat August 15 experiment
 //        final String inputPcapFile = path + "/2018-08/nest/nest.wlan1.local.pcap";
diff --git a/python_ml/clustering.py b/python_ml/clustering.py
new file mode 100644
index 0000000..bf8c1eb
--- /dev/null
+++ b/python_ml/clustering.py
@@ -0,0 +1,113 @@
+from __future__ import print_function
+
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_samples, silhouette_score
+
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+import numpy as np
+
+print(__doc__)
+
+# Generating the sample data from make_blobs
+# This particular setting has one distinct cluster and 3 clusters placed close
+# together.
+'''X, y = make_blobs(n_samples=500,
+                  n_features=2,
+                  centers=4,
+                  cluster_std=1,
+                  center_box=(-10.0, 10.0),
+                  shuffle=True,
+                  random_state=1)  # For reproducibility'''
+
+X = np.array([[132, 192], [117, 960], [117, 962], [1343, 0], [117, 1116], [117, 1117], [117, 1118], [117, 1119], [1015, 0], [117, 966]])
+
+range_n_clusters = [2, 3, 4, 5, 6]
+
+for n_clusters in range_n_clusters:
+    # Create a subplot with 1 row and 2 columns
+#    fig, (ax1, ax2) = plt.subplots(1, 2)
+#    fig.set_size_inches(18, 7)
+
+    # The 1st subplot is the silhouette plot
+    # The silhouette coefficient can range from -1, 1 but in this example all
+    # lie within [-0.1, 1]
+#    ax1.set_xlim([-0.1, 1])
+    # The (n_clusters+1)*10 is for inserting blank space between silhouette
+    # plots of individual clusters, to demarcate them clearly.
+#    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
+
+    # Initialize the clusterer with n_clusters value and a random generator
+    # seed of 20 for reproducibility.
+    clusterer = KMeans(n_clusters=n_clusters, random_state=20)
+    cluster_labels = clusterer.fit_predict(X)
+
+    # The silhouette_score gives the average value for all the samples.
+    # This gives a perspective into the density and separation of the formed
+    # clusters
+    silhouette_avg = silhouette_score(X, cluster_labels)
+    print("For n_clusters =", n_clusters,
+          "The average silhouette_score is :", silhouette_avg)
+
+    # Compute the silhouette scores for each sample
+    sample_silhouette_values = silhouette_samples(X, cluster_labels)
+
+'''    y_lower = 10
+    for i in range(n_clusters):
+        # Aggregate the silhouette scores for samples belonging to
+        # cluster i, and sort them
+        ith_cluster_silhouette_values = \
+            sample_silhouette_values[cluster_labels == i]
+
+        ith_cluster_silhouette_values.sort()
+
+        size_cluster_i = ith_cluster_silhouette_values.shape[0]
+        y_upper = y_lower + size_cluster_i
+
+        color = cm.nipy_spectral(float(i) / n_clusters)
+        ax1.fill_betweenx(np.arange(y_lower, y_upper),
+                          0, ith_cluster_silhouette_values,
+                          facecolor=color, edgecolor=color, alpha=0.7)
+
+        # Label the silhouette plots with their cluster numbers at the middle
+        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+        # Compute the new y_lower for next plot
+        y_lower = y_upper + 10  # 10 for the 0 samples
+
+    ax1.set_title("The silhouette plot for the various clusters.")
+    ax1.set_xlabel("The silhouette coefficient values")
+    ax1.set_ylabel("Cluster label")
+
+    # The vertical line for average silhouette score of all the values
+    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
+
+    ax1.set_yticks([])  # Clear the yaxis labels / ticks
+    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+    # 2nd Plot showing the actual clusters formed
+    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
+    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
+                c=colors, edgecolor='k')
+
+    # Labeling the clusters
+    centers = clusterer.cluster_centers_
+    # Draw white circles at cluster centers
+    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
+                c="white", alpha=1, s=200, edgecolor='k')
+
+    for i, c in enumerate(centers):
+        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
+                    s=50, edgecolor='k')
+
+    ax2.set_title("The visualization of the clustered data.")
+    ax2.set_xlabel("Feature space for the 1st feature")
+    ax2.set_ylabel("Feature space for the 2nd feature")
+
+    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
+                  "with n_clusters = %d" % n_clusters),
+                 fontsize=14, fontweight='bold')
+
+    plt.show()
+'''
diff --git a/python_ml/dlink_clustering.py b/python_ml/dlink_clustering.py
new file mode 100644
index 0000000..badd5b1
--- /dev/null
+++ b/python_ml/dlink_clustering.py
@@ -0,0 +1,6 @@
+from sklearn.cluster import KMeans
+import numpy as np
+X = np.array([[132, 192], [117, 960], [117, 962], [1343, 0], [117, 1109], [117, 1110], [117, 1111], [117, 1116], [117, 1117], [117, 1118], [117, 1119], [1015, 0], [117, 966]])
+kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
+print(kmeans.labels_)
+print(kmeans.labels_.tolist().count(3))
diff --git a/python_ml/silhouette.py b/python_ml/silhouette.py
new file mode 100644
index 0000000..bf8c1eb
--- /dev/null
+++ b/python_ml/silhouette.py
@@ -0,0 +1,113 @@
+from __future__ import print_function
+
+from sklearn.datasets import make_blobs
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_samples, silhouette_score
+
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+import numpy as np
+
+print(__doc__)
+
+# Generating the sample data from make_blobs
+# This particular setting has one distinct cluster and 3 clusters placed close
+# together.
+'''X, y = make_blobs(n_samples=500,
+                  n_features=2,
+                  centers=4,
+                  cluster_std=1,
+                  center_box=(-10.0, 10.0),
+                  shuffle=True,
+                  random_state=1)  # For reproducibility'''
+
+X = np.array([[132, 192], [117, 960], [117, 962], [1343, 0], [117, 1116], [117, 1117], [117, 1118], [117, 1119], [1015, 0], [117, 966]])
+
+range_n_clusters = [2, 3, 4, 5, 6]
+
+for n_clusters in range_n_clusters:
+    # Create a subplot with 1 row and 2 columns
+#    fig, (ax1, ax2) = plt.subplots(1, 2)
+#    fig.set_size_inches(18, 7)
+
+    # The 1st subplot is the silhouette plot
+    # The silhouette coefficient can range from -1, 1 but in this example all
+    # lie within [-0.1, 1]
+#    ax1.set_xlim([-0.1, 1])
+    # The (n_clusters+1)*10 is for inserting blank space between silhouette
+    # plots of individual clusters, to demarcate them clearly.
+#    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
+
+    # Initialize the clusterer with n_clusters value and a random generator
+    # seed of 20 for reproducibility.
+    clusterer = KMeans(n_clusters=n_clusters, random_state=20)
+    cluster_labels = clusterer.fit_predict(X)
+
+    # The silhouette_score gives the average value for all the samples.
+    # This gives a perspective into the density and separation of the formed
+    # clusters
+    silhouette_avg = silhouette_score(X, cluster_labels)
+    print("For n_clusters =", n_clusters,
+          "The average silhouette_score is :", silhouette_avg)
+
+    # Compute the silhouette scores for each sample
+    sample_silhouette_values = silhouette_samples(X, cluster_labels)
+
+'''    y_lower = 10
+    for i in range(n_clusters):
+        # Aggregate the silhouette scores for samples belonging to
+        # cluster i, and sort them
+        ith_cluster_silhouette_values = \
+            sample_silhouette_values[cluster_labels == i]
+
+        ith_cluster_silhouette_values.sort()
+
+        size_cluster_i = ith_cluster_silhouette_values.shape[0]
+        y_upper = y_lower + size_cluster_i
+
+        color = cm.nipy_spectral(float(i) / n_clusters)
+        ax1.fill_betweenx(np.arange(y_lower, y_upper),
+                          0, ith_cluster_silhouette_values,
+                          facecolor=color, edgecolor=color, alpha=0.7)
+
+        # Label the silhouette plots with their cluster numbers at the middle
+        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+        # Compute the new y_lower for next plot
+        y_lower = y_upper + 10  # 10 for the 0 samples
+
+    ax1.set_title("The silhouette plot for the various clusters.")
+    ax1.set_xlabel("The silhouette coefficient values")
+    ax1.set_ylabel("Cluster label")
+
+    # The vertical line for average silhouette score of all the values
+    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
+
+    ax1.set_yticks([])  # Clear the yaxis labels / ticks
+    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+    # 2nd Plot showing the actual clusters formed
+    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
+    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
+                c=colors, edgecolor='k')
+
+    # Labeling the clusters
+    centers = clusterer.cluster_centers_
+    # Draw white circles at cluster centers
+    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
+                c="white", alpha=1, s=200, edgecolor='k')
+
+    for i, c in enumerate(centers):
+        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
+                    s=50, edgecolor='k')
+
+    ax2.set_title("The visualization of the clustered data.")
+    ax2.set_xlabel("Feature space for the 1st feature")
+    ax2.set_ylabel("Feature space for the 2nd feature")
+
+    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
+                  "with n_clusters = %d" % n_clusters),
+                 fontsize=14, fontweight='bold')
+
+    plt.show()
+'''