PROWAREtech
.NET: Machine Learning, Unsupervised Learning or Clustering, K-mean / Silhouette Clustering Library
Unsupervised learning is what we humans do all the time, and it refers to algorithms that learn patterns from unlabeled data. Supervised learning is a machine learning paradigm for problems where the available data consists of labeled examples, meaning that each data point contains features (covariates) and an associated label. The goal of supervised learning algorithms is learning a function that maps feature vectors (inputs) to labels (output), based on example input-output pairs.
K-mean Clustering & Silhouette
K-mean clustering is a method of vector quantization that aims to partition x number of observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.
The first center points, called centroids, are randomly chosen and then the data is re-evaluated to determine the new centroid positions and so on until the centroid positions do not change.
Silhouette refers to a method of interpretation and validation of consistency within clusters of data. The technique provides a succinct graphical representation of how well each object has been classified.
Visit the playground for related.
TODO: Integrate with Nvidia CUDA for a boost in processing power!
This code is also available in C++, and it runs about twice as fast.
Download all this code and more: KMEANCLUSTERING.zip
Example Usage:
// Program.cs
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;
class Globals
{
public List<KMean.Data> dataList;
public int K;
public string csvFile;
public int colX;
public int? colY, colZ, colW;
public Thread thread;
}
namespace ConsoleApp1
{
class Program
{
static void thread(object? o)
{
Globals g = (Globals)o;
using(var csv = KMean.Csv.OpenFile(g.csvFile, ',', true))
{
g.dataList.Add(KMean.Clustering.Model(1, g.K, csv, g.colX, g.colY, g.colZ, g.colW));
}
}
static void Main(string[] args)
{
bool multithreaded = true;
string csvFile = "data.csv";
var reader = new StreamReader(csvFile);
int? columnCount = reader.ReadLine()?.Split(',').Length ?? 0;
reader.Dispose();
Console.WriteLine("Calculating two columns of data.");
{
List<Globals> globalsList = new List<Globals>();
List<KMean.Data> dataList = new List<KMean.Data>();
var start = new DateTime();
for (int colX = 0; colX < columnCount; colX++)
{
for (int colY = colX; colY < columnCount; colY++)
{
if (colX == colY)
continue;
for (int K = 2; K < 12; K++)
{
var globals = new Globals() { dataList = dataList, K = K, csvFile = csvFile, colX = colX, colY = colY, thread = new Thread(thread) { IsBackground = true } };
globalsList.Add(globals);
if (multithreaded)
globals.thread.Start(globals);
else
thread(globals);
}
}
}
while (globalsList.Where(j => j.thread.IsAlive == true).Count() > 0)
Thread.Sleep(1000);
Console.WriteLine((DateTime.Now - start).Seconds);
Console.WriteLine("Determining best K scores for each dataset.");
var sortedDataList = dataList.OrderBy(d => d.columnY).OrderBy(d => d.columnX);
int cX = -1, cY = -1;
var tmpList = new List<KMean.Data>();
foreach (var item in sortedDataList)
{
tmpList.Add(item);
if (item.columnX == cX && item.columnY == cY)
continue;
cX = item.columnX;
cY = item.columnY ?? -1;
var bestData = tmpList.OrderByDescending(x => x.KScore).First();
tmpList.Clear();
Console.WriteLine($"Best K for columns [{bestData.columnX},{bestData.columnY}] is {bestData.K}, KScore={bestData.KScore}");
}
}
Console.WriteLine("Calculating three columns of data.");
{
List<Globals> globalsList = new List<Globals>();
List<KMean.Data> dataList = new List<KMean.Data>();
for (int colX = 0; colX < columnCount; colX++)
{
for (int colY = colX; colY < columnCount; colY++)
{
for (int colZ = colY; colZ < columnCount; colZ++)
{
if (colZ == colY && colY == colX && colX == colZ)
continue;
for (int K = 2; K < 12; K++)
{
var globals = new Globals() { dataList = dataList, K = K, csvFile = csvFile, colX = colX, colY = colY, colZ = colZ, thread = new Thread(thread) { IsBackground = true } };
globalsList.Add(globals);
if (multithreaded)
globals.thread.Start(globals);
else
thread(globals);
}
}
}
}
while (globalsList.Where(j => j.thread.IsAlive == true).Count() > 0)
Thread.Sleep(1000);
Console.WriteLine("Determining best K scores for each dataset.");
var sortedDataList = dataList.OrderBy(d => d.columnZ).OrderBy(d => d.columnY).OrderBy(d => d.columnX);
int cX = -1, cY = -1, cZ = -1;
var tmpList = new List<KMean.Data>();
foreach (var item in sortedDataList)
{
tmpList.Add(item);
if (item.columnX == cX && item.columnY == cY && item.columnZ == cZ)
continue;
cX = item.columnX;
cY = item.columnY ?? -1;
cZ = item.columnZ ?? -1;
var bestData = tmpList.OrderByDescending(x => x.KScore).First();
tmpList.Clear();
Console.WriteLine($"Best K for columns [{bestData.columnX},{bestData.columnY},{bestData.columnZ}] is {bestData.K}, KScore={bestData.KScore}");
}
}
}
}
}
// CvsParser.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace KMean
{
/// <summary>
/// Must create a Csv object for each thread!!!
/// </summary>
public class Csv : IDisposable
{
public bool FirstRowColumnNames { get; }
public static Csv OpenFile(string file, char delimiter, bool firstRowColumNames)
{
return new Csv(new StreamReader(file, new FileStreamOptions { Access = FileAccess.Read, Mode = FileMode.Open, Share = FileShare.ReadWrite }), file, delimiter, firstRowColumNames);
}
public static Csv OpenString(string csv, char delimiter, bool firstRowColumNames)
{
return new Csv(new StringReader(csv), csv, delimiter, firstRowColumNames);
}
public string[]? GetLine()
{
var line = reader.ReadLine();
if (line == null || line.Length == 0)
{
Dispose();
if (file != null)
reader = new StreamReader(file);
else if (csv != null)
reader = new StringReader(csv);
return null;
}
return CsvParser.Split(line);
}
public void Dispose()
{
reader.Dispose();
}
private Csv(StreamReader reader, string file, char delimiter, bool firstRowColumnNames)
{
this.file = file;
FirstRowColumnNames = firstRowColumnNames;
this.reader = reader;
CsvParser = new Regex($"{delimiter}(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))");
}
private Csv(StringReader reader, string csv, char delimiter, bool firstRowColumnNames)
{
this.csv = csv;
FirstRowColumnNames = firstRowColumnNames;
this.reader = reader;
CsvParser = new Regex($"{delimiter}(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))");
}
private readonly string? file, csv;
private readonly Regex CsvParser;
private TextReader reader;
}
}
// KMeanClustering.cs
using System;
using System.Collections.Generic;
using System.Linq;
namespace KMean
{
public class Silhouette
{
// Silhouette Formula: S(i) = [b(i) - a(i)] / MAX(a(i), b(i))
// a(i) = the average of the distance between point i and all other data points in its cluster
// b(i) = the average of the distance between point i and all other data points outside its cluster
public static float CalculateKScore(Data data)
{
double a_total = 0;
double b_total = 0;
int a_count = 0;
int b_count = 0;
int count = data.points.Count, K = data.K;
for (int j = 0; j < count; j++)
{
for (int i = 0; i < K; i++)
{
if (data.points[j].cid == i)
{
a_count++;
a_total += Euclidean.FindDistanceBetweenPoints(data.points[j], data.centroids[i]);
}
else if (data.points[j].cid == FindNearestNeighboringClusterId(data.centroids, data.centroids[i]))
{
b_count++;
b_total += Euclidean.FindDistanceBetweenPoints(data.points[j], data.centroids[i]);
}
}
}
float a, b;
if (a_count == 0 && b_count == 0)
return float.MinValue;
else if (a_count == 0)
{
a = 0;
b = (float)(b_total / b_count);
}
else if (b_count == 0)
{
a = (float)(a_total / a_count);
b = 0;
}
else
{
a = (float)(a_total / a_count);
b = (float)(b_total / b_count);
}
float max = Math.Max(a, b);
if (max == 0)
return float.MinValue;
return (b - a) / Math.Max(a, b);
}
#region Private_Decl
private static int FindNearestNeighboringClusterId(Point[] centroids, Point centroid)
{
int id = -1;
float min = float.MaxValue;
int count = centroids.Length;
for (int i = 0; i < count; i++)
{
float dist = Euclidean.FindDistanceBetweenPoints(centroids[i], centroid);
if (dist > 0 && dist <= min)
{
id = i;
min = dist;
}
}
return id;
}
#endregion
}
public class Point
{
public float x { get; set; }
public float y { get; set; } // should be zero if not using a 2nd column of data
public float z { get; set; } // should be zero if not using a 3rd column of data
public float w { get; set; } // should be zero if not using a 4rd column of data
public Point(float x, float y, float z, float w)
{
this.x = x;
this.y = y;
this.z = z;
this.w = w;
}
}
public class ClusteredPoint : Point
{
/// <summary>
/// cid is the Cluster Identifier
/// </summary>
public int? cid { get; set; }
public ClusteredPoint(float x, float y, float z, float w) : base(x, y, z, w) { }
}
public class Data
{
public int K { get { return centroids.Length; } }
private float? score;
public float KScore
{
get
{
if (score == null)
score = Silhouette.CalculateKScore(this);
return score ?? float.MinValue;
}
}
public int columnX { get; }
public int? columnY { get; }
public int? columnZ { get; }
public int? columnW { get; }
public string? labelX { get; }
public string? labelY { get; }
public string? labelZ { get; }
public string? labelW { get; }
public Point[] centroids { get; }
// this.points and Centroid.distances are parallel
public List<ClusteredPoint> points { get; }
public Data(int K, Csv csvData, int colX, int? colY, int? colZ, int? colW)
{
strings = new Dictionary<string, float>();
columnX = colX;
columnY = colY;
columnZ = colZ;
columnW = colW;
centroids = new Point[K];
points = new List<ClusteredPoint>();
bool headings = csvData.FirstRowColumnNames;
for (var line = csvData.GetLine(); line != null; line = csvData.GetLine())
{
if (colX < line.Length && (colY == null || colY < line.Length) && (colZ == null || colZ < line.Length) && (colW == null || colW < line.Length))
{
if (headings)
{
headings = false;
labelX = line[colX];
labelY = colY == null ? null : line[colY ?? 0];
labelZ = colZ == null ? null : line[colZ ?? 0];
labelW = colW == null ? null : line[colW ?? 0];
}
else
{
var pt = new ClusteredPoint(GetFieldDataValue(line, colX) ?? 0, GetFieldDataValue(line, colY) ?? 0, GetFieldDataValue(line, colZ) ?? 0, GetFieldDataValue(line, colW) ?? 0);
points.Add(pt);
}
}
}
}
public Dictionary<string, float> strings { get; }
private float? GetFieldDataValue(string[] line, int? col)
{
if (col == null)
return null;
string s = line[col ?? 0].Trim();
if (float.TryParse(s, out float f))
return f;
else // get an arbitrary number for the string
{
if (strings.ContainsKey(s))
return strings[s];
f = strings.Count;
strings.Add(s, f);
return f;
}
}
public string ToJson()
{
return System.Text.Json.JsonSerializer.Serialize(this);
}
public static Data? FromJson(string json)
{
return System.Text.Json.JsonSerializer.Deserialize<Data>(json);
}
}
public class Euclidean
{
// Euclidean distance formula between three points: distance = square_root((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1))
public static float FindDistanceBetweenPoints(Point p1, Point p2)
{
return (float)Math.Sqrt((p2.x - p1.x) * (p2.x - p1.x) + (p2.y - p1.y) * (p2.y - p1.y) + (p2.z - p1.z) * (p2.z - p1.z) + (p2.w - p1.w) * (p2.w - p1.w));
}
public static double FindDistanceBetweenPointsSquared(Point p1, Point p2)
{
return ((double)p2.x - (double)p1.x) * ((double)p2.x - (double)p1.x) +
((double)p2.y - (double)p1.y) * ((double)p2.y - (double)p1.y) +
((double)p2.z - (double)p1.z) * ((double)p2.z - (double)p1.z) +
((double)p2.w - (double)p1.w) * ((double)p2.w - (double)p1.w);
}
}
public class Clustering
{
public static Data Model(int attemptsToImproveKScore, int K, Csv csvData, int dataColumnX, int? dataColumnY, int? dataColumnZ = null, int? dataColumnW = null)
{
var attempts = new Data[attemptsToImproveKScore];
for (int i = 0; i < attemptsToImproveKScore; i++)
attempts[i] = RunModel(K, csvData, dataColumnX, dataColumnY, dataColumnZ, dataColumnW);
return attempts.OrderByDescending(x => x.KScore).First();
}
#region Private_Decl
private static Data RunModel(int K, Csv csvData, int dataColumnX, int? dataColumnY, int? dataColumnZ, int? dataColumnW)
{
var data = new Data(K, csvData, dataColumnX, dataColumnY, dataColumnZ, dataColumnW);
int count = data.points.Count;
if (K > count / 2)
throw new Exception($"There are not enough data for K={K}.") { Source = "KMean.Clustering" };
if (K < 2)
throw new Exception($"K={K} is invalid. K must be 2 or greater.") { Source = "KMean.Clustering" };
var chosen_indexes_to_centroid = new List<int>();
for (int i = 0; i < K; i++)
{
var pt = data.points[FindUnchosenRandomIndex(chosen_indexes_to_centroid, count)]; // pick random points initially
data.centroids[i] = new Point(pt.x, pt.y, pt.z, pt.w);
}
var temp_points = new Point[K];
while (true)
{
foreach (ClusteredPoint point in data.points)
point.cid = GetNearestCentroidIndex(point, data.centroids);
CopyPointsArray(data.centroids, temp_points);
RecenterCentroids(data);
if (PointArraysAreEqual(temp_points, data.centroids))
break;
}
return data;
}
private static int FindUnchosenRandomIndex(List<int> chosen_indexes_to_centroid, int count)
{
var rand = new Random();
int index = rand.Next(count);
while (true)
{
while (chosen_indexes_to_centroid.Where(i => i == index).Count() > 0)
index++;
if (index < count)
break;
index = rand.Next(count);
}
chosen_indexes_to_centroid.Add(index);
return index;
}
private static int GetNearestCentroidIndex(Point pt, Point[] centroids)
{
double d, min_dist = double.MaxValue;
int centroid_count = centroids.Length;
int min_index = centroid_count;
for (int i = 0; i < centroid_count; i++)
{
d = Euclidean.FindDistanceBetweenPointsSquared(pt, centroids[i]);
if (d < min_dist)
{
min_dist = d;
min_index = i;
}
}
return min_index;
}
private static void CopyPointsArray(Point[] src, Point[] dst)
{
for (int i = 0; i < src.Length; i++)
{
if (dst[i] == null)
dst[i] = new Point(src[i].x, src[i].y, src[i].z, src[i].w);
else
{
dst[i].x = src[i].x;
dst[i].y = src[i].y;
dst[i].z = src[i].z;
dst[i].w = src[i].w;
}
}
}
private static bool PointArraysAreEqual(Point[] points1, Point[] points2)
{
int count = points1.Length;
for(int i = 0; i < points1.Length && count > 0; i++)
{
for(int j = 0; j < points2.Length && count > 0; j++)
{
if (points2[j].x == points1[i].x && points2[j].y == points1[i].y && points2[j].z == points1[i].z && points2[j].w == points1[i].w)
count--;
}
}
return count == 0;
}
private static void RecenterCentroids(Data data)
{
int K = data.K;
for (int i = 0; i < K; i++)
{
int c = 0;
double x_total = 0;
double y_total = 0;
double z_total = 0;
double w_total = 0;
foreach (var point in data.points)
{
if (point.cid == i)
{
c++;
x_total += point.x;
y_total += point.y;
z_total += point.z;
w_total += point.w;
}
}
if (c == 0)
data.centroids[i].x = data.centroids[i].y = data.centroids[i].z = data.centroids[i].w = 0;
else
{
data.centroids[i].x = (float)(x_total / c);
data.centroids[i].y = (float)(y_total / c);
data.centroids[i].z = (float)(z_total / c);
data.centroids[i].w = (float)(w_total / c);
}
}
}
#endregion
}
}