PROWAREtech
C++: Machine Learning, Unsupervised Learning or Clustering, K-mean / Silhouette Clustering Library
Unsupervised learning is what we humans do all the time, and it refers to algorithms that learn patterns from unlabeled data. Supervised learning is a machine learning paradigm for problems where the available data consists of labeled examples, meaning that each data point contains features (covariates) and an associated label. The goal of supervised learning algorithms is learning a function that maps feature vectors (inputs) to labels (output), based on example input-output pairs.
See this article for a supervised machine learning library written in C++.
K-Means Clustering & Silhouette
K-means clustering is a method of vector quantization that aims to partition x number of observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.
The first center points, called centroids, are randomly chosen and then the data is re-evaluated to determine the new centroid positions and so on until the centroid positions do not change.
Silhouette refers to a method of interpretation and validation of consistency within clusters of data. The technique provides a succinct graphical representation of how well each object has been classified.
This code can use pure pseudorandom centroid selection or it can use k-means++ centroid selection. It also has the option to remove data outliers.
Visit the playground for related.
TODO: Integrate with Nvidia CUDA for a boost in processing power!
This code is also available in C#, but it does take about twice as long to run.
Download all this code and more: KMEANCLUSTERING.zip
Example Usage
// main.cpp
#include "kmean.h"
int main()
{
int K = 14; // the K-MAX value to set and the K that was used is returned in this variable
float KScore;
unsigned int count;
KMean::ClusteredPoint* points;
float remove_outliers_iqr = 1.5F; // remove outliers
bool use_kmeanspp = true;
bool multithreaded = true;
// can also copy an array of the data points (with cluster information) to be used for analysis
KMean::FindBestKFromFile(remove_outliers_iqr, use_kmeanspp, multithreaded, "data.csv", 5, false, 0, 1, -1, -1, &K, &KScore, &count, (void**)&points);
for(unsigned int i = 0; i < count; i++)
{
int cluster_id = points[i].cid;
float x = points[i].x;
float y = points[i].y;
float z = points[i].z;
float w = points[i].w;
}
KMean::FreeMemory(points);
return 0;
}
Library Source Code
// kmean.h
#include <thread>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <time.h>
#include <limits>
#include <cmath>
#include <string>
#include <vector>
#include "csvparser.h"
#ifndef KMEAN_H
#define KMEAN_H
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) < (Y)) ? (Y) : (X))
#define MIN_FLOAT (-std::numeric_limits<float>::max())
#define MAX_FLOAT (std::numeric_limits<float>::max())
namespace KMean
{
const int KMIN = 2;
struct Point
{
float x, y, z, w; // should be zero if not using the data
Point()
{
x = y = z = w = 0;
}
Point(Point* pt)
{
x = pt->x;
y = pt->y;
z = pt->z;
w = pt->w;
}
Point(float x, float y, float z, float w)
{
this->x = x;
this->y = y;
this->z = z;
this->w = w;
}
};
struct ClusteredPoint : public Point
{
int cid;
ClusteredPoint() : cid(-1)
{
x = y = z = w = 0;
}
ClusteredPoint(Point* pt) : cid(-1)
{
x = pt->x;
y = pt->y;
z = pt->z;
w = pt->w;
}
ClusteredPoint(float x, float y, float z, float w) : cid(-1)
{
this->x = x;
this->y = y;
this->z = z;
this->w = w;
}
};
struct DistancePoint : public Point
{
float distance;
DistancePoint(Point* pt, float distance) : distance(distance)
{
x = pt->x;
y = pt->y;
z = pt->z;
w = pt->w;
}
};
namespace Euclidean
{
// Euclidean distance formula between four points: distance = square_root((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1) + (w2 - w1) * (w2 - w1))
float FindDistanceBetweenPoints(const Point* const p1, const Point* const p2)
{
return (float)sqrt((p2->x - p1->x) * (p2->x - p1->x) + (p2->y - p1->y) * (p2->y - p1->y) + (p2->z - p1->z) * (p2->z - p1->z) + (p2->w - p1->w) * (p2->w - p1->w));
}
double FindDistanceBetweenPointsSquared(const Point* const p1, const Point* const p2)
{
return ((double)p2->x - (double)p1->x) * ((double)p2->x - (double)p1->x) + ((double)p2->y - (double)p1->y) * ((double)p2->y - (double)p1->y) + ((double)p2->z - (double)p1->z) * ((double)p2->z - (double)p1->z) + ((double)p2->w - (double)p1->w) * ((double)p2->w - (double)p1->w);
}
}
namespace Sorting
{
void qsort_distance_points(DistancePoint** const arr, const std::size_t size)
{
if (size <= 30)
{
// insertion sort algorithm
for (std::size_t i = 1; i < size; i++) {
DistancePoint* temp = arr[i];
std::size_t j = i;
while (j > 0 && temp->distance < arr[j - 1]->distance) {
arr[j] = arr[j - 1];
j--;
}
arr[j] = temp;
}
}
else
{
// quick sort algorithm
double pivot = arr[size / 2]->distance;
DistancePoint** left = arr;
DistancePoint** right = arr + size - 1;
while (true) {
while (left <= right && (*left)->distance < pivot) left++;
while (left <= right && (*right)->distance > pivot) right--;
if (left > right)
break;
DistancePoint* temp = *left;
*left++ = *right;
*right-- = temp;
}
qsort_distance_points(arr, right - arr + 1);
qsort_distance_points(left, arr + size - left);
}
}
}
class Silhouette
{
private:
static int FindNearestNeighboringClusterId(std::vector<Point*>& centroids, Point* centroid)
{
int id = -1;
float min = std::numeric_limits<float>::max();
std::size_t count = centroids.size();
for (int i = 0; i < count; i++)
{
float dist = Euclidean::FindDistanceBetweenPoints(centroids[i], centroid);
if (dist > 0 && dist <= min)
{
id = i;
min = dist;
}
}
return id;
}
public:
// Silhouette Formula: S(i) = [b(i) - a(i)] / MAX(a(i), b(i))
// a(i) = the average of the distance between point i and all other data points in its cluster
// b(i) = the average of the distance between point i and all other data points outside its cluster
static float CalculateKScore(std::vector<ClusteredPoint*>& points, std::vector<Point*>& centroids)
{
const std::size_t cluster_count = centroids.size();
double a_total = 0;
double b_total = 0;
int a_count = 0;
int b_count = 0;
const std::size_t count = points.size();
for (std::size_t cid = 0; cid < cluster_count; cid++)
{
for (int j = 0; j < count; j++)
{
if (points[j]->cid == cid)
{
a_count++;
a_total += Euclidean::FindDistanceBetweenPoints(points[j], centroids[cid]);
}
else if (points[j]->cid == FindNearestNeighboringClusterId(centroids, centroids[cid]))
{
b_count++;
b_total += Euclidean::FindDistanceBetweenPoints(points[j], centroids[cid]);
}
}
}
float a, b;
if (a_count == 0)
a = 0;
else
a = a_total / a_count;
if (b_count == 0)
b = 0;
else
b = b_total / b_count;
float max = MAX(a, b);
if (max == 0)
return MIN_FLOAT;
return (b - a) / max;
}
};
class RawData
{
private:
const int columnX, columnY, columnZ, columnW;
public:
std::string labelX, labelY, labelZ, labelW;
std::vector<Point*> points;
RawData(const int colX, const int colY, const int colZ, const int colW) : columnX(colX), columnY(colY), columnZ(colZ), columnW(colW) {}
bool Initialize(const float iqrMultiplier, CSVParser::CSVNumericData& parsedCSVData)
{
std::vector<std::string> columnNames = parsedCSVData.column_names();
if (columnX < columnNames.size())
labelX = columnNames[columnX];
if (columnY >= 0 && columnY < columnNames.size())
labelY = columnNames[columnY];
if (columnZ >= 0 && columnZ < columnNames.size())
labelZ = columnNames[columnZ];
if (columnW >= 0 && columnW < columnNames.size())
labelW = columnNames[columnW];
std::size_t col_count;
for (int row = 0; row < parsedCSVData.row_count(); row++)
{
col_count = parsedCSVData[row].size();
if (columnX < col_count && (columnY < 0 || columnY < col_count) && (columnZ < 0 || columnZ < col_count) && (columnW < 0 || columnW < col_count))
{
Point* pt = new Point(
parsedCSVData[row][columnX],
columnY < 0 ? 0 : parsedCSVData[row][columnY],
columnZ < 0 ? 0 : parsedCSVData[row][columnZ],
columnW < 0 ? 0 : parsedCSVData[row][columnW]);
points.push_back(pt);
}
}
std::size_t count = points.size();
if (count && iqrMultiplier > 0.0F) // use interquartile range to find outliers
{
DistancePoint** dp_array = new DistancePoint * [count];
// find average distance of every point from each other point
for (std::size_t i = 0; i < count; i++)
{
double total_distance = 0;
for (std::size_t j = 0; j < count; j++)
if (i != j)
total_distance += Euclidean::FindDistanceBetweenPoints(points[j], points[i]);
dp_array[i] = new DistancePoint(points[i], total_distance / count);
}
Sorting::qsort_distance_points(dp_array, count);
DistancePoint* Q1 = dp_array[count / 4];
DistancePoint* Q3 = dp_array[count / 2 + count / 4 + (count & 1)];
double IQR = Q3->distance - Q1->distance;
double upper_fence = Q3->distance + iqrMultiplier * IQR;
double lower_fence = Q1->distance - iqrMultiplier * IQR;
std::size_t len = count;
for (std::size_t i = 0; i < len; )
{
if (dp_array[i]->distance < lower_fence || dp_array[i]->distance > upper_fence)
{
delete dp_array[i];
dp_array[i] = nullptr;
len--;
for (std::size_t j = i; j < len; j++)
dp_array[j] = dp_array[j + 1];
}
else
i++;
}
for (Point* pt : points)
delete pt;
points.clear();
for (std::size_t i = 0; i < len; i++)
{
points.push_back(new Point(dp_array[i]));
delete dp_array[i];
}
delete[] dp_array;
count = len;
}
return count > 0;
}
~RawData()
{
for (Point* p : points)
{
if (p)
delete p;
}
}
};
class Data
{
private:
float score;
double GetNearestCentroidDistance(Point* pt, std::vector<Point*>& centroids)
{
double d, min_dist = std::numeric_limits<float>::max();
std::size_t centroid_count = centroids.size();
for (std::size_t i = 0; i < centroid_count; i++)
{
d = Euclidean::FindDistanceBetweenPointsSquared(pt, centroids[i]);
if (d < min_dist)
min_dist = d;
}
return min_dist;
}
void kMeanspp()
{
std::size_t point_count = points.size();
centroids.push_back(new Point(points[rand() % point_count]));
double* distances = new double[point_count];
double sum = 0;
for (std::size_t index = 1; index < K; index++)
{
for (std::size_t i = 0; i < point_count; i++)
sum += (distances[i] = GetNearestCentroidDistance(points[i], centroids));
sum *= rand() / (float)RAND_MAX;
for (std::size_t i = 0; i < point_count; i++)
{
sum -= distances[i];
if (sum < 0)
{
centroids.push_back(new Point(points[i]));
break;
}
}
}
delete[] distances;
}
static bool IsChosen(const std::size_t* const chosen_indexes_to_centroid, const int K, const std::size_t count, const std::size_t index)
{
for (int i = 0; i < K; i++)
{
if (chosen_indexes_to_centroid[i] == count)
break;
if (chosen_indexes_to_centroid[i] == index)
return true;
}
return false;
}
static std::size_t FindUnchosenRandomIndex(const std::size_t* const chosen_indexes_to_centroid, const int K, const std::size_t count)
{
std::size_t index = rand() % count;
while (true)
{
while (IsChosen(chosen_indexes_to_centroid, K, count, index))
index++;
if (index < count)
break;
index = rand() % count;
}
return index;
}
public:
const int K;
std::vector<Point*> centroids;
std::vector<ClusteredPoint*> points;
float KScore()
{
if (score == MIN_FLOAT)
score = Silhouette::CalculateKScore(points, centroids);
return score;
}
Data(const int K) : score(MIN_FLOAT), K(K) {}
bool Initialize(const bool use_kmeanspp, RawData* raw_data)
{
bool ret = false;
if (raw_data && raw_data->points.size())
{
std::size_t count = raw_data->points.size();
for (std::size_t i = 0; i < count; i++)
points.push_back(new ClusteredPoint(raw_data->points[i]));
if (use_kmeanspp) // use k-means++ method to find initial centroids
{
kMeanspp();
}
else // use total random method to choose centroids
{
std::size_t* const chosen_indexes_to_centroid = new std::size_t[K];
for (int i = 0; i < K; i++)
chosen_indexes_to_centroid[i] = count;
for (int i = 0; i < K; i++)
{
chosen_indexes_to_centroid[i] = FindUnchosenRandomIndex(chosen_indexes_to_centroid, K, count);
centroids.push_back(new Point(points[chosen_indexes_to_centroid[i]]));
}
delete[] chosen_indexes_to_centroid;
}
ret = true;
}
return ret;
}
~Data()
{
for (Point* c : centroids)
{
if (c)
delete c;
}
for (ClusteredPoint* p : points)
{
if (p)
delete p;
}
}
};
class Clustering
{
public:
static std::vector<Data*> Model(const bool use_kmeanspp, const int K, RawData* raw_data)
{
std::vector<Data*> dataArray;
for (int i = 0; i < K; i++)
{
Data* data = new Data(K);
if (!data->Initialize(use_kmeanspp, raw_data))
delete data;
else
{
std::vector<Point*> temp_points;
while (true)
{
for (ClusteredPoint* point : data->points)
point->cid = GetNearestCentroidIndex(point, data->centroids);
CopyPointArrays(data->centroids, temp_points);
RecenterCentroids(data);
if (PointArraysAreEqual(data->centroids, temp_points))
break;
}
dataArray.push_back(data);
}
}
return dataArray;
}
static void FindBestModelAndDelete(std::vector<std::vector<KMean::Data*>>& models, int* pK, float* pKScore, unsigned int* data_point_count, void** data_points)
{
Data* best = nullptr;
float kscore = MIN_FLOAT;
for (std::size_t i = 0; i < models.size(); i++)
{
for (std::size_t j = 0; j < models[i].size(); j++)
if (models[i][j]->KScore() > kscore)
{
kscore = models[i][j]->KScore();
best = models[i][j];
}
}
if (best)
{
if (pKScore)
*pKScore = best->KScore();
if (pK)
*pK = best->K;
if (data_point_count)
{
std::size_t count = best->points.size();
(*data_point_count) = (unsigned int)count;
if (data_points)
{
ClusteredPoint* points = new ClusteredPoint[count];
for (std::size_t i = 0; i < count; i++)
{
points[i].cid = best->points[i]->cid;
points[i].x = best->points[i]->x;
points[i].y = best->points[i]->y;
points[i].z = best->points[i]->z;
points[i].w = best->points[i]->w;
}
*data_points = points;
}
}
}
else
{
if (pKScore)
*pKScore = MIN_FLOAT;
if (pK)
*pK = 0;
if (data_point_count)
{
(*data_point_count) = 0;
if (data_points)
*data_points = nullptr;
}
}
for (std::size_t i = 0; i < models.size(); i++)
{
for (std::size_t j = 0; j < models[i].size(); j++)
delete models[i][j];
}
}
private:
static std::size_t GetNearestCentroidIndex(Point* pt, std::vector<Point*>& centroids)
{
double d, min_dist = std::numeric_limits<float>::max();
std::size_t centroid_count = centroids.size();
std::size_t min_index = centroid_count;
for (std::size_t i = 0; i < centroid_count; i++)
{
d = Euclidean::FindDistanceBetweenPointsSquared(pt, centroids[i]);
if (d < min_dist)
{
min_dist = d;
min_index = i;
}
}
return min_index;
}
static void CopyPointArrays(std::vector<Point*>& src, std::vector<Point*>& dst)
{
if (dst.size() < src.size())
{
for (std::size_t i = 0; i < src.size(); i++)
{
Point* pt = new Point(src[i]);
pt->x = src[i]->x;
pt->y = src[i]->y;
pt->z = src[i]->z;
pt->w = src[i]->w;
dst.push_back(pt);
}
}
else
{
for (std::size_t i = 0; i < src.size(); i++)
{
dst[i]->x = src[i]->x;
dst[i]->y = src[i]->y;
dst[i]->z = src[i]->z;
dst[i]->w = src[i]->w;
}
}
}
static bool PointArraysAreEqual(std::vector<Point*>& points1, std::vector<Point*>& points2)
{
std::size_t count = points1.size();
for (std::size_t i = 0; i < points1.size() && count > 0; i++)
{
for (std::size_t j = 0; j < points2.size() && count > 0; j++)
{
if (points2[j]->x == points1[i]->x && points2[j]->y == points1[i]->y && points2[j]->z == points1[i]->z && points2[j]->w == points1[i]->w)
count--;
}
}
return count == 0;
}
static void RecenterCentroids(Data* data)
{
const int K = data->K;
for (int i = 0; i < K; i++)
{
int c = 0;
double x_total = 0;
double y_total = 0;
double z_total = 0;
double w_total = 0;
for (ClusteredPoint* point : data->points)
{
if (point->cid == i)
{
c++;
x_total += point->x;
y_total += point->y;
z_total += point->z;
w_total += point->w;
}
}
if (c == 0)
data->centroids[i]->x = data->centroids[i]->y = data->centroids[i]->z = data->centroids[i]->w = 0;
else
{
data->centroids[i]->x = x_total / c;
data->centroids[i]->y = y_total / c;
data->centroids[i]->z = z_total / c;
data->centroids[i]->w = w_total / c;
}
}
}
};
struct thread_params
{
int active;
const bool use_kmeanspp;
const int K;
std::vector<Data*> data;
RawData* raw_data;
std::thread* pthread;
thread_params(const bool use_kmeanspp, const int K, RawData* raw_data) : active(true), use_kmeanspp(use_kmeanspp), K(K), raw_data(raw_data), pthread(nullptr) {}
};
static void thread(thread_params* params)
{
params->data = Clustering::Model(params->use_kmeanspp, params->K, params->raw_data);
params->active = false;
}
static void FindBestK(CSVParser::CSVNumericData& csv_data, const float iqrMultiplier, const bool use_kmeanspp, const bool multithreaded, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY, const int columnZ, const int columnW, int* const pK, float* const pKScore, unsigned int* const data_point_count, void** const data_points)
{
if (csv_data.row_count() > 0)
{
const int KMAX = pK ? (*pK > 24 ? 24 : *pK) : 14;
std::vector<std::vector<KMean::Data*>> modelData;
RawData* raw_data = new RawData(columnX, columnY, columnZ, columnW);
if (raw_data->Initialize(iqrMultiplier, csv_data))
{
if (multithreaded)
{
std::vector<thread_params*> threads;
for (int K = KMean::KMIN; K <= KMAX; K++)
{
for (int i = 0; i < (attempts_to_improve_kscore ? attempts_to_improve_kscore : 1); i++)
{
thread_params* params = new thread_params(use_kmeanspp, K, raw_data);
params->pthread = new std::thread(thread, params);
threads.push_back(params);
}
}
while (true)
{
int active_count = 0;
for (std::size_t i = 0; i < threads.size(); i++)
active_count += threads[i]->active;
if (!active_count)
break;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
for (std::size_t i = 0; i < threads.size(); i++)
if (threads[i]->pthread->joinable())
threads[i]->pthread->join();
std::this_thread::sleep_for(std::chrono::seconds(1));
for (std::size_t i = 0; i < threads.size(); i++)
{
modelData.push_back(threads[i]->data);
delete threads[i]->pthread;
delete threads[i];
}
}
else
{
for (int K = KMean::KMIN; K <= KMAX; K++)
for (int i = 0; i < (attempts_to_improve_kscore ? attempts_to_improve_kscore : 1); i++)
modelData.push_back(Clustering::Model(use_kmeanspp, K, raw_data));
}
}
delete raw_data;
Clustering::FindBestModelAndDelete(modelData, pK, pKScore, data_point_count, data_points);
}
}
void FindBestKFromFile(const float iqrMultiplier, const bool use_kmeanspp, const bool multithreaded, const char* filename, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY = -1, const int columnZ = -1, const int columnW = -1, int* const pK = nullptr, float* const pKScore = nullptr, unsigned int* const data_point_count = nullptr, void** const data_points = nullptr)
{
srand(time(0)); // initialize rand()
std::fstream file;
file.open(filename, std::ios::in);
if (file.is_open())
{
CSVParser::CSVNumericData csv_data(file, first_row_column_names);
FindBestK(csv_data, iqrMultiplier, use_kmeanspp, multithreaded, attempts_to_improve_kscore, first_row_column_names, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
}
void FindBestKFromCSV(const float iqrMultiplier, const bool use_kmeanspp, const bool multithreaded, const char* csv_string, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY = -1, const int columnZ = -1, const int columnW = -1, int* const pK = nullptr, float* const pKScore = nullptr, unsigned int* const data_point_count = nullptr, void** const data_points = nullptr)
{
srand(time(0)); // initialize rand()
std::istringstream ss(csv_string);
CSVParser::CSVNumericData csv_data(ss, first_row_column_names);
FindBestK(csv_data, iqrMultiplier, use_kmeanspp, multithreaded, attempts_to_improve_kscore, first_row_column_names, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
void FreeMemory(void* memory_ptr)
{
delete[](ClusteredPoint*)memory_ptr;
}
}
#endif
Download all this code plus various supporting files for Linux, Windows and C#: KMEANCLUSTERING.zip