PROWAREtech

articles » current » c-plus-plus » csv-parser

C++: CSV File Parser

An example parser for files of comma-separated values (CSV), written in C++.

This code defines a CSV parser implementation in C++ that specifically handles numeric data conversion. The main class CSVNumericData reads CSV files with support for quoted fields (including embedded quotes) and can treat the first row as column headers if specified.

The parser's key feature is its ability to convert all input data to floating-point numbers. For actual numeric values, it performs direct conversion. For non-numeric strings, it maintains a hash table that assigns unique numeric values to each unique string, effectively creating a numeric encoding of categorical data.

The implementation includes a state machine for properly parsing CSV fields (handling quotes and delimiters), and stores the processed data in a 2D vector of floats. The class provides access to the parsed data through array-style indexing, along with methods to retrieve the row count and column names. The parser is designed to handle Excel-style CSV files and includes robust number validation through the isNumber method.


// csvparser.h

#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>
#include "hashtable.h"

#ifndef CSVPARSER_H

#define CSVPARSER_H

namespace CSVParser
{
	/// Scanner states used while splitting one CSV line into fields.
	enum CSVState
	{
		UnquotedField, ///< outside any quotes
		QuotedField,   ///< inside a "quoted" field
		QuotedQuote    ///< a '"' was just read inside a quoted field (closing quote, or first half of "")
	};

	/// Reads a CSV stream and stores every cell as a float.
	///
	/// Cells that look like decimal numbers are converted directly. Every other
	/// cell is treated as a category: each distinct string receives the next
	/// unused integer code (0, 1, 2, ...) in the order the strings are first met
	/// while walking the data column by column, top to bottom.
	class CSVNumericData
	{
	private:
		bool firstRowHeadings;                          // true when the first non-empty line was requested as column names
		std::vector<std::vector<float>> table;          // converted data, one inner vector per data row
		std::vector<std::string> col_names;             // header fields, empty when no header row was requested
		std::unordered_map<std::string, float> strings; // category string -> numeric code (only used while loading)

		/// std::isdigit requires a value representable as unsigned char.
		static bool isDigit(const char c)
		{
			return std::isdigit(static_cast<unsigned char>(c)) != 0;
		}

		/// Returns true when s is: optional leading blanks, optional sign, digits
		/// with at most one '.', (at least one digit), an optional exponent
		/// (e/E, optional sign, at least one digit), then only trailing blanks,
		/// '\r' or '\n'. Empty and all-blank strings are not numbers.
		static bool isNumber(const std::string& s)
		{
			const std::size_t len = s.size();
			std::size_t pos = s.find_first_not_of(' ');
			if (pos == std::string::npos) // empty, or nothing but blanks
				return false;

			if (s[pos] == '+' || s[pos] == '-')
				++pos; // skip the sign if present

			// significand: digits with at most one decimal point
			std::size_t digits = 0;
			bool seen_point = false;
			for (; pos < len; ++pos)
			{
				if (isDigit(s[pos]))
					++digits;
				else if (s[pos] == '.' && !seen_point)
					seen_point = true;
				else
					break;
			}
			if (digits == 0)
				return false;

			// optional exponent; it must carry at least one digit so that the
			// later conversion always consumes the whole literal
			if (pos < len && (s[pos] == 'e' || s[pos] == 'E'))
			{
				++pos;
				if (pos < len && (s[pos] == '+' || s[pos] == '-'))
					++pos;
				std::size_t exp_digits = 0;
				for (; pos < len && isDigit(s[pos]); ++pos)
					++exp_digits;
				if (exp_digits == 0)
					return false;
			}

			// skip the trailing whitespace
			while (pos < len && (s[pos] == ' ' || s[pos] == '\r' || s[pos] == '\n'))
				++pos;

			return pos == len; // anything left over means "not a number"
		}

		/// Converts one cell to its float value (number) or category code (string).
		float GetDataValue(const std::string& d)
		{
			if (isNumber(d))
			{
				// strtof never throws; a magnitude too large for float yields
				// +/-HUGE_VALF instead of the exception std::stof would raise.
				return std::strtof(d.c_str(), nullptr);
			}
			const auto known = strings.find(d);
			if (known != strings.end())
				return known->second;
			const float code = static_cast<float>(strings.size()); // next unused code
			strings.emplace(d, code);
			return code;
		}

		/// Converts every cell of one already-split row, left to right.
		std::vector<float> ConvertToFloatRow(const std::vector<std::string>& row)
		{
			std::vector<float> float_row;
			float_row.reserve(row.size());
			for (const std::string& cell : row)
				float_row.push_back(GetDataValue(cell));
			return float_row;
		}

		/// Splits one line into fields. Commas separate fields, double quotes
		/// enclose fields that may contain commas, and "" inside a quoted field
		/// stands for one literal quote. A character that follows a closing
		/// quote and is neither ',' nor '"' is discarded.
		static std::vector<std::string> ReadCSVRow(const std::string& row)
		{
			CSVState state = CSVState::UnquotedField;
			std::vector<std::string> fields(1); // always at least one (possibly empty) field
			for (const char c : row)
			{
				if (state == CSVState::UnquotedField)
				{
					if (c == ',')
						fields.push_back(std::string()); // end of field
					else if (c == '"')
						state = CSVState::QuotedField;
					else
						fields.back().push_back(c);
				}
				else if (state == CSVState::QuotedField)
				{
					if (c == '"')
						state = CSVState::QuotedQuote;
					else
						fields.back().push_back(c);
				}
				else // CSVState::QuotedQuote
				{
					if (c == ',')
					{
						// comma right after the closing quote
						fields.push_back(std::string());
						state = CSVState::UnquotedField;
					}
					else if (c == '"')
					{
						// "" -> "
						fields.back().push_back('"');
						state = CSVState::QuotedField;
					}
					else
					{
						state = CSVState::UnquotedField; // end of quote, character dropped
					}
				}
			}
			return fields;
		}

		/// Read CSV file, Excel dialect. Accept "quoted fields ""with quotes"""
		///
		/// Empty lines are skipped. When firstRowColumnNames is true the first
		/// non-empty line becomes the column names. Cells are converted column
		/// by column so that category codes are handed out per column, top to
		/// bottom. The widest row decides how many columns are visited; shorter
		/// rows simply end up with fewer values. An input without data rows
		/// produces an empty table.
		void ReadCSV(std::istream& in, bool firstRowColumnNames)
		{
			std::vector<std::vector<std::string>> raw_table;
			std::string line;
			while (std::getline(in, line))
			{
				if (line.empty())
					continue;

				if (firstRowColumnNames)
				{
					firstRowColumnNames = false;
					col_names = ReadCSVRow(line);
				}
				else
					raw_table.push_back(ReadCSVRow(line));
			}

			const std::size_t n_rows = raw_table.size();
			std::size_t n_cols = 0;
			for (std::size_t r = 0; r < n_rows; ++r)
				if (raw_table[r].size() > n_cols)
					n_cols = raw_table[r].size();

			table.assign(n_rows, std::vector<float>());
			for (std::size_t c = 0; c < n_cols; ++c)
				for (std::size_t r = 0; r < n_rows; ++r)
					if (c < raw_table[r].size())
						table[r].push_back(GetDataValue(raw_table[r][c]));

			strings.clear(); // the code map is only needed while loading
		}

	public:
		/// Parses the whole stream immediately.
		/// @param csv_string          stream positioned at the start of the CSV text
		/// @param firstRowColumnNames true to treat the first non-empty line as column names
		CSVNumericData(std::istream& csv_string, const bool firstRowColumnNames)
			: firstRowHeadings(firstRowColumnNames)
		{
			ReadCSV(csv_string, firstRowColumnNames);
		}

		/// Returns a copy of data row `index` (no bounds check).
		std::vector<float> operator[](const std::size_t index) const
		{
			return table[index];
		}

		/// Number of data rows (the header row, if any, is not counted).
		std::size_t row_count() const
		{
			return table.size();
		}

		/// Column names read from the header row; empty when none was requested.
		std::vector<std::string> column_names() const
		{
			return col_names;
		}
	};

}

#endif

And the required hashtable code:


// hashtable.h

#include <iostream>
#include <iomanip>
#include <string>

#ifndef HASHTABLE_H

#define HASHTABLE_H

// Forward declaration of the Hashtable class template that is defined later in this header.
template <class TKey, class TValue> class Hashtable;

#define TABLE_SIZE 1000003 // use a large prime number

// Hashes a string key to a bucket index in [0, TABLE_SIZE).
// Declared inline because this header defines the function: without it, every
// translation unit that includes the header would emit its own definition.
inline unsigned int hash_func(const std::string& key)
{
	unsigned int h = 0;
	unsigned int o = 31415;
	const unsigned int t = 27183;
	for (const char c : key)
	{
		// go through unsigned char so the result does not depend on whether
		// plain char is signed on the target platform
		h = (o * h + static_cast<unsigned char>(c)) % TABLE_SIZE;
		o = o * t % (TABLE_SIZE - 1);
	}
	return h;
}

// Hashes an integer key to a bucket index in [0, TABLE_SIZE).
// TABLE_SIZE is not of the form 2^k - 1, so masking with it (key & TABLE_SIZE)
// can only ever produce a small subset of the buckets; the remainder spreads
// keys over the whole table. The result is still below the allocated size.
inline unsigned int hash_func(const unsigned int key)
{
	return key % TABLE_SIZE;
}

template <class TKey, class TValue> class HashtableItem;
template <class TKey, class TValue> class HashtableDataMembers;

// Storage record behind a Hashtable: the bucket array plus bookkeeping.
// The destructor declared here releases the array, so the object must not be
// copied member-wise (two copies would free the same memory); copying is
// therefore disabled.
template <class TKey, class TValue> class HashtableDataMembers
{
public:
	HashtableItem<TKey, TValue>** table;         // bucket array; each slot is the head of a chain or nullptr
	HashtableItem<TKey, TValue>* cur_table_item; // item cursor; nullptr when unset
	unsigned int table_size, count;              // number of buckets, number of stored entries
	HashtableDataMembers();
	~HashtableDataMembers();
	HashtableDataMembers(const HashtableDataMembers&) = delete;
	HashtableDataMembers& operator=(const HashtableDataMembers&) = delete;
};

// Hash table mapping TKey to TValue. Lookups hand back a pointer to the stored
// HashtableItem so the caller can read the key/value or test for absence.
template <typename TKey, typename TValue>
class Hashtable
{
public:
	// construction, copying, destruction
	Hashtable();
	Hashtable(const Hashtable& obj);
	~Hashtable();
	const Hashtable<TKey, TValue>& operator=(const Hashtable<TKey, TValue>& obj);

	// modification
	bool Add(const TKey& key, const TValue& value); // inserts a new entry; false means the key was already present
	void Remove(const TKey& key);                   // drops the entry stored under key
	void Clear();                                   // drops every entry

	// queries
	HashtableItem<TKey, TValue>* operator[](const TKey& key) const; // item stored under key, or nullptr
	unsigned int Count();                                           // number of entries currently stored

private:
	HashtableDataMembers<TKey, TValue>* dm; // all state lives in this separately allocated record
};

// One key/value entry of a Hashtable, linked to the next entry of the same
// bucket through pnext. Clients only ever see pointers to items; creating and
// destroying them is reserved to the friends listed below.
template <class TKey, class TValue> class HashtableItem
{
private:
	HashtableItem<TKey, TValue>* pnext; // next entry in the same bucket, or nullptr
	TKey key;
	TValue value;

	// keep these private to prevent the client from creating this object
	// The default constructor initializes every member: the destructor follows
	// pnext, so it must never hold an indeterminate value.
	HashtableItem() : pnext(nullptr), key(), value() {}
	HashtableItem(const TKey& key, const TValue& value);
	~HashtableItem();

public:
	const TKey& Key() const;                      // stored key
	const TValue& Value() const;                  // stored value
	const TValue& operator=(const TValue& value); // overwrite the stored value

	// some friend functions that can access the private data
	friend bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value);
	friend HashtableDataMembers<TKey, TValue>::HashtableDataMembers();
	friend HashtableDataMembers<TKey, TValue>::~HashtableDataMembers();
	friend void Hashtable<TKey, TValue>::Remove(const TKey& key);
	friend const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj);
	friend HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const;
	friend void Hashtable<TKey, TValue>::Clear();
};





// ##################### class HashtableDataMembers ###################
// Creates an empty table of TABLE_SIZE + 1 buckets.
template <class TKey, class TValue> HashtableDataMembers<TKey, TValue>::HashtableDataMembers()
	: table(nullptr),
	  cur_table_item(nullptr),
	  table_size(TABLE_SIZE + 1), // add one to the TABLE_SIZE because the int hash_func can actually return TABLE_SIZE
	  count(0)
{
	// The trailing () value-initializes the array, so every bucket already
	// starts as nullptr and no separate clearing pass is required.
	table = new HashtableItem<TKey, TValue>* [table_size]();
}
// Releases every stored item and then the bucket array itself.
template <class TKey, class TValue> HashtableDataMembers<TKey, TValue>::~HashtableDataMembers()
{
	for (unsigned int bucket = 0; bucket != table_size; ++bucket)
	{
		// Deleting the head item also releases the items linked behind it
		// (see ~HashtableItem); delete on an empty bucket is a no-op.
		HashtableItem<TKey, TValue>* head = table[bucket];
		table[bucket] = nullptr;
		delete head;
	}
	delete[] table;
}




// ##################### class Hashtable ###########################
// Creates an empty hash table backed by a freshly allocated data record.
template <class TKey, class TValue> Hashtable<TKey, TValue>::Hashtable()
	: dm(new HashtableDataMembers<TKey, TValue>())
{
}
// Copy constructor: starts from an empty table of its own, then lets the
// assignment operator duplicate the entries of obj.
template <class TKey, class TValue> Hashtable<TKey, TValue>::Hashtable(const Hashtable& obj)
	: dm(new HashtableDataMembers<TKey, TValue>())
{
	*this = obj;
}
// Destroys the data record (and with it all entries) owned by this table.
template <class TKey, class TValue> Hashtable<TKey, TValue>::~Hashtable()
{
	HashtableDataMembers<TKey, TValue>* const owned = dm;
	dm = nullptr;
	delete owned;
}
// Replaces the contents of this table with a deep copy of obj.
// Every chain is duplicated node by node in its original order, the entry
// count is carried over, and the item cursor is mapped onto the matching
// copied node (or cleared when obj has none). Self-assignment is a no-op.
template <class TKey, class TValue> const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj)
{
	if (this == &obj)
		return (*this);

	Clear();
	dm->cur_table_item = nullptr; // the old cursor pointed at a node that Clear() just freed

	if (dm->table_size != obj.dm->table_size)
	{
		// allocate first: if this throws, the current (empty) array is still intact
		HashtableItem<TKey, TValue>** resized = new HashtableItem<TKey, TValue>* [obj.dm->table_size]();
		delete[] dm->table;
		dm->table = resized;
		dm->table_size = obj.dm->table_size;
	}

	for (unsigned int i = 0; i < obj.dm->table_size; i++)
	{
		HashtableItem<TKey, TValue>** tail = &dm->table[i]; // slot that receives the next copied node
		for (const HashtableItem<TKey, TValue>* src = obj.dm->table[i]; src; src = src->pnext)
		{
			*tail = new HashtableItem<TKey, TValue>(src->key, src->value);
			if (obj.dm->cur_table_item == src)
				dm->cur_table_item = *tail;
			tail = &(*tail)->pnext;
		}
	}

	dm->count = obj.dm->count; // Clear() reset it to zero; the copy holds as many entries as obj
	return (*this);
}
// Inserts key/value at the end of the key's bucket chain.
// Returns false, leaving the table untouched, when the key is already stored.
template <class TKey, class TValue> bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value)
{
	// Walk the chain through its link slots so that, on a miss, `link`
	// addresses the nullptr slot where the new node has to be attached.
	HashtableItem<TKey, TValue>** link = &dm->table[hash_func(key)];
	while ((*link) && ((*link)->key != key))
		link = &(*link)->pnext;

	if (*link)
		return false; // key already present

	*link = new HashtableItem<TKey, TValue>(key, value);
	dm->count++;
	return true;
}
// Looks up key and returns a pointer to its item, or nullptr when the key is
// not stored.
template <class TKey, class TValue> HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const
{
	HashtableItem<TKey, TValue>* node = dm->table[hash_func(key)];
	while (node && (node->key != key))
		node = node->pnext;
	return node;
}
// Removes the entry stored under key. Does nothing when the key is absent;
// in particular the entry count is only reduced when an entry was removed.
template <class TKey, class TValue> void Hashtable<TKey, TValue>::Remove(const TKey& key)
{
	HashtableItem<TKey, TValue>** link = &dm->table[hash_func(key)];
	while ((*link) && ((*link)->key != key))
		link = &(*link)->pnext;

	if (!(*link))
		return; // key not stored: nothing to remove, count stays as it is

	HashtableItem<TKey, TValue>* doomed = *link;
	*link = doomed->pnext;   // unlink from the chain
	doomed->pnext = nullptr; // detach so the item's destructor frees only this node
	if (dm->cur_table_item == doomed)
		dm->cur_table_item = nullptr; // do not keep a cursor to freed memory
	delete doomed;
	dm->count--;
}
// Removes every entry; the bucket array itself is kept for reuse.
template <class TKey, class TValue> void Hashtable<TKey, TValue>::Clear()
{
	for (unsigned int i = 0; i < dm->table_size; i++)
	{
		if (dm->table[i])
		{
			delete dm->table[i]; // the item destructor also frees the rest of the chain
			dm->table[i] = nullptr;
		}
	}
	dm->cur_table_item = nullptr; // every item is gone, so the cursor must not keep pointing at one
	dm->count = 0;
}
// Reports the entry counter kept in the table's data record.
template <class TKey, class TValue> unsigned int Hashtable<TKey, TValue>::Count()
{
	const unsigned int entries = dm->count;
	return entries;
}




// ##################### class HashtableItem ###########################
// Builds an unlinked item holding copies of xKey and xValue.
// The members are copy-constructed in the initializer list (in declaration
// order) instead of being default-constructed and then assigned, so key and
// value types without a default constructor work as well.
template <class TKey, class TValue> HashtableItem<TKey, TValue>::HashtableItem(const TKey& xKey, const TValue& xValue)
	: pnext(nullptr), key(xKey), value(xValue)
{
}
// Destroys this item together with every item linked behind it.
// The chain is released with a loop rather than by letting each destructor
// delete its successor, so the stack depth stays constant however long the
// chain is.
template <class TKey, class TValue> HashtableItem<TKey, TValue>::~HashtableItem()
{
	HashtableItem<TKey, TValue>* node = this->pnext;
	this->pnext = nullptr; // clean-up memory
	while (node)
	{
		HashtableItem<TKey, TValue>* following = node->pnext;
		node->pnext = nullptr; // detach first so deleting node frees only node
		delete node;
		node = following;
	}
}
// Read-only access to the key this item is stored under.
template <class TKey, class TValue> const TKey& HashtableItem<TKey, TValue>::Key() const
{
	const TKey& stored_key = key;
	return stored_key;
}
// Read-only access to the value held by this item.
template <class TKey, class TValue> const TValue& HashtableItem<TKey, TValue>::Value() const
{
	const TValue& stored_value = value;
	return stored_value;
}
// Overwrites the stored value and returns a reference to the stored copy.
// Returning the member rather than the argument keeps the reference valid for
// as long as the item lives, even when the argument was a temporary.
template <class TKey, class TValue> const TValue& HashtableItem<TKey, TValue>::operator=(const TValue& value)
{
	this->value = value;
	return this->value;
}

#endif

Comment

PROWAREtech

C++: CSV File Parser

.

..

algorithms

data-structures

procedures

C/C++: About

C/C++: Draw ASCII Diamond

C++: Copy Constructor

C++: CSV File Parser

C++: Hello World

C++: How to Implement std::cin and std::cout

C++: Machine Learning, Unsupervised Learning or Clustering, K-mean / Silhouette Clustering Library

C++: Neural Network, Supervised Deep Machine Learning Example

C++: Polymorphism

C++: Template Class Tutorial

PROWAREtech

PROWAREtech