PROWAREtech
C++: CSV File Parser
This code defines a CSV parser in C++ that converts every field it reads into a floating-point number. The main class, CSVNumericData, reads CSV input with support for quoted fields (including embedded quotes) and can treat the first row as column headers if specified.
The parser's key feature is its ability to convert all input data to floating-point numbers. For actual numeric values, it performs direct conversion. For non-numeric strings, it maintains a hash table that assigns unique numeric values to each unique string, effectively creating a numeric encoding of categorical data.
The implementation includes a state machine for properly parsing CSV fields (handling quotes and delimiters), and stores the processed data in a 2D vector of floats. The class provides access to the parsed data through array-style indexing, along with methods to retrieve the row count and column names. The parser is designed to handle Excel-style CSV files and validates numeric fields with the isNumber method before converting them.
// csvparser.h
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>
#include "hashtable.h"
#ifndef CSVPARSER_H
#define CSVPARSER_H
/// CSV loading utilities that expose every cell of a table as a float.
namespace CSVParser
{
    /// Scanner states used while splitting one CSV line into fields.
    enum CSVState
    {
        UnquotedField, ///< Outside quotes: ',' ends the field, '"' opens a quoted section.
        QuotedField,   ///< Inside quotes: everything up to the next '"' is literal.
        QuotedQuote    ///< A '"' was read inside quotes: an escaped quote ("") or the closing quote.
    };

    /// Reads a CSV stream (Excel dialect) and stores every cell as a float.
    ///
    /// Numeric fields are converted directly. Every distinct non-numeric field
    /// is replaced by a code 0, 1, 2, ... assigned in order of first appearance
    /// while the table is walked column by column (all rows of column 0, then
    /// all rows of column 1, ...). The codes are shared by all columns.
    class CSVNumericData
    {
    private:
        bool firstRowHeadings;                          // whether the first non-empty line was treated as headings
        std::vector<std::vector<float>> table;          // parsed rows, one inner vector per data line
        std::vector<std::string> col_names;             // heading names (empty when no heading row was requested)
        std::unordered_map<std::string, float> strings; // non-numeric text -> assigned code (only used while loading)

        /// std::isdigit requires a value representable as unsigned char.
        static bool isDigit(const char c)
        {
            return std::isdigit(static_cast<unsigned char>(c)) != 0;
        }

        /// Returns true when @p s is a complete decimal number:
        /// optional leading spaces, optional sign, digits with at most one '.',
        /// (at least one digit), optional exponent `e|E [sign] digits`, followed
        /// only by trailing ' ', '\r' or '\n'.
        /// Incomplete forms such as "e5", "1e" or "1e5-3" are rejected so that
        /// they are encoded as text instead of failing during conversion.
        static bool isNumber(const std::string& s)
        {
            const std::size_t len = s.size();
            std::size_t pos = s.find_first_not_of(' ');
            if (pos == std::string::npos)
                return false; // empty or made of spaces only

            if (s[pos] == '+' || s[pos] == '-')
                ++pos; // optional sign of the significand

            std::size_t digits = 0;
            bool seen_point = false;
            for (; pos < len; ++pos)
            {
                if (isDigit(s[pos]))
                    ++digits;
                else if (s[pos] == '.' && !seen_point)
                    seen_point = true;
                else
                    break;
            }
            if (digits == 0)
                return false; // a sign or a lone '.' is not a number

            if (pos < len && (s[pos] == 'e' || s[pos] == 'E'))
            {
                ++pos;
                if (pos < len && (s[pos] == '+' || s[pos] == '-'))
                    ++pos; // optional sign of the exponent
                std::size_t exp_digits = 0;
                for (; pos < len && isDigit(s[pos]); ++pos)
                    ++exp_digits;
                if (exp_digits == 0)
                    return false; // "1e" / "1e+" have no exponent value
            }

            // skip the trailing whitespace
            while (pos < len && (s[pos] == ' ' || s[pos] == '\r' || s[pos] == '\n'))
                ++pos;
            return pos == len; // nothing else may follow
        }

        /// Converts one field to its float value.
        /// Numbers are parsed with std::strtof, which does not throw: values too
        /// large for a float come back as +/-HUGE_VALF. Any other text is looked
        /// up in (or added to) the string table and its code is returned.
        float GetDataValue(const std::string& d)
        {
            if (isNumber(d))
                return std::strtof(d.c_str(), nullptr);

            const auto known = strings.find(d);
            if (known != strings.end())
                return known->second;

            // Codes are stored as float, so they are exact only up to 2^24 distinct strings.
            const float code = static_cast<float>(strings.size());
            strings.emplace(d, code);
            return code;
        }

        /// Converts every field of @p row with GetDataValue, in order.
        std::vector<float> ConvertToFloatRow(const std::vector<std::string>& row)
        {
            std::vector<float> float_row;
            float_row.reserve(row.size());
            for (const std::string& field : row)
                float_row.push_back(GetDataValue(field));
            return float_row;
        }

        /// Splits one line into fields. Accepts "quoted fields ""with quotes""".
        /// The result always holds at least one (possibly empty) field.
        static std::vector<std::string> ReadCSVRow(const std::string& row)
        {
            CSVState state = CSVState::UnquotedField;
            std::vector<std::string> fields(1);
            for (const char c : row)
            {
                switch (state)
                {
                case CSVState::UnquotedField:
                    if (c == ',')
                        fields.emplace_back(); // end of field
                    else if (c == '"')
                        state = CSVState::QuotedField;
                    else
                        fields.back().push_back(c);
                    break;
                case CSVState::QuotedField:
                    if (c == '"')
                        state = CSVState::QuotedQuote;
                    else
                        fields.back().push_back(c);
                    break;
                case CSVState::QuotedQuote:
                    if (c == ',')
                    {
                        // , after closing quote
                        fields.emplace_back();
                        state = CSVState::UnquotedField;
                    }
                    else if (c == '"')
                    {
                        // "" -> "
                        fields.back().push_back('"');
                        state = CSVState::QuotedField;
                    }
                    else
                    {
                        // end of quote; the character that follows the closing quote is dropped
                        state = CSVState::UnquotedField;
                    }
                    break;
                }
            }
            return fields;
        }

        /// Read CSV file, Excel dialect. Accept "quoted fields ""with quotes"""
        ///
        /// Empty lines are skipped and one trailing '\r' per line (CRLF input) is
        /// removed. When @p firstRowColumnNames is true the first non-empty line
        /// fills col_names instead of the table. Rows may have different lengths;
        /// each row keeps exactly the fields it had. Input without data rows
        /// yields an empty table.
        void ReadCSV(std::istream& in, bool firstRowColumnNames)
        {
            std::vector<std::vector<std::string>> raw_table;
            std::string row;
            while (std::getline(in, row))
            {
                if (!row.empty() && row.back() == '\r')
                    row.pop_back();
                if (row.empty())
                    continue;
                if (firstRowColumnNames)
                {
                    firstRowColumnNames = false;
                    col_names = ReadCSVRow(row);
                }
                else
                    raw_table.push_back(ReadCSVRow(row));
            }

            std::size_t column_count = 0;
            for (const std::vector<std::string>& fields : raw_table)
                column_count = std::max(column_count, fields.size());

            table.assign(raw_table.size(), std::vector<float>());

            // Walk column by column so that string codes are handed out in
            // column-major order of first appearance.
            for (std::size_t col = 0; col < column_count; ++col)
            {
                for (std::size_t r = 0; r < raw_table.size(); ++r)
                {
                    if (col < raw_table[r].size())
                        table[r].push_back(GetDataValue(raw_table[r][col]));
                }
            }
            strings.clear(); // the string table is only needed while loading
        }

    public:
        /// Parses the whole of @p csv_string.
        /// @param csv_string            stream positioned at the start of the CSV text
        /// @param firstRowColumnNames   true when the first non-empty line holds column names
        CSVNumericData(std::istream& csv_string, const bool firstRowColumnNames)
            : firstRowHeadings(firstRowColumnNames)
        {
            ReadCSV(csv_string, firstRowColumnNames);
        }

        /// Returns a copy of data row @p index. @p index is not range checked.
        std::vector<float> operator[](const std::size_t index) const
        {
            return table[index];
        }

        /// Number of data rows (the heading row is not counted).
        std::size_t row_count() const
        {
            return table.size();
        }

        /// Copy of the column names; empty when no heading row was read.
        std::vector<std::string> column_names() const
        {
            return col_names;
        }
    };
}
#endif
The header hashtable.h, which csvparser.h includes, is listed below:
// hashtable.h
#include <iostream>
#include <iomanip>
#include <string>
#ifndef HASHTABLE_H
#define HASHTABLE_H
template <class TKey, class TValue> class Hashtable;
#define TABLE_SIZE 1000003 // use a large prime number

/// Hashes a string key to a bucket index in [0, TABLE_SIZE).
/// Only the characters before the first NUL contribute. Characters are read
/// as unsigned char so the result does not depend on the platform's char
/// signedness. Declared inline because this header may be included from
/// several translation units.
inline unsigned int hash_func(const std::string& key)
{
    unsigned int h = 0;
    unsigned int o = 31415; // multiplier, re-scrambled after every character
    const unsigned int t = 27183;
    for (const char* k = key.c_str(); *k; ++k)
    {
        h = (o * h + static_cast<unsigned char>(*k)) % TABLE_SIZE;
        o = o * t % (TABLE_SIZE - 1);
    }
    return h;
}

/// Hashes an integer key to a bucket index in [0, TABLE_SIZE).
/// A modulo is used rather than a bit mask: TABLE_SIZE is a prime, not a
/// power of two minus one, so masking with it would reach only a small
/// subset of the buckets.
inline unsigned int hash_func(const unsigned int key)
{
    return key % TABLE_SIZE;
}
template <class TKey, class TValue> class HashtableItem;
template <class TKey, class TValue> class HashtableDataMembers;

/// Storage shared by the Hashtable implementation: the bucket array plus
/// bookkeeping. The object owns `table`; its destructor releases it, so
/// copying is disabled to prevent two objects from freeing the same array.
template <class TKey, class TValue> class HashtableDataMembers
{
public:
    HashtableItem<TKey, TValue>** table;         // array of `table_size` chain heads (null when the bucket is empty)
    HashtableItem<TKey, TValue>* cur_table_item; // item marker carried over by Hashtable::operator=
    unsigned int table_size, count;              // number of buckets / number of stored entries

    HashtableDataMembers();
    ~HashtableDataMembers();

    // Owning raw pointer: a member-wise copy would lead to a double delete.
    HashtableDataMembers(const HashtableDataMembers&) = delete;
    HashtableDataMembers& operator=(const HashtableDataMembers&) = delete;
};
// Chained hash table mapping TKey to TValue.
// Keys are placed into buckets with hash_func(key) and compared with operator!=.
template <class TKey, class TValue> class Hashtable
{
private:
// heap-allocated state (bucket array, sizes, entry count); created in the constructors, deleted in the destructor
HashtableDataMembers<TKey, TValue>* dm;
public:
// creates an empty table
Hashtable();
// creates an empty table, then copies obj into it through operator=
Hashtable(const Hashtable& obj);
~Hashtable();
bool Add(const TKey& key, const TValue& value); // Add a new entry, returns false when the key already exists
// clears this table and fills it with newly allocated copies of obj's items; self-assignment is a no-op
const Hashtable<TKey, TValue>& operator=(const Hashtable<TKey, TValue>& obj);
// returns the item stored under key, or nullptr when there is none
HashtableItem<TKey, TValue>* operator[](const TKey& key) const;
void Remove(const TKey& key); // removes one table entry
void Clear(); // removes all the table entries
unsigned int Count(); // returns the count of entries
};
// One key/value entry of a Hashtable; entries that share a bucket are linked through pnext.
// Clients receive pointers to items from Hashtable::operator[] and may read the key and
// read or overwrite the value; creation and destruction are reserved for the friends below.
template <class TKey, class TValue> class HashtableItem
{
private:
HashtableItem<TKey, TValue>* pnext; // next item in the same bucket, or nullptr
TKey key;
TValue value;
// keep these private to prevent the client from creating this object
HashtableItem() {} // note: leaves pnext uninitialised
HashtableItem(const TKey& key, const TValue& value);
~HashtableItem(); // also deletes the item linked through pnext
public:
const TKey& Key() const;
const TValue& Value() const;
// overwrites the stored value
const TValue& operator=(const TValue& value);
// some friend functions that can access the private data
friend bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value);
friend HashtableDataMembers<TKey, TValue>::HashtableDataMembers();
friend HashtableDataMembers<TKey, TValue>::~HashtableDataMembers();
friend void Hashtable<TKey, TValue>::Remove(const TKey& key);
friend const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj);
friend HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const;
friend void Hashtable<TKey, TValue>::Clear();
};
// ##################### class HashtableDataMembers ###################
/// Creates the bucket array with every bucket empty and a zero entry count.
template <class TKey, class TValue>
HashtableDataMembers<TKey, TValue>::HashtableDataMembers()
    : table(nullptr),
      cur_table_item(nullptr),
      table_size(TABLE_SIZE + 1), // one slot more than TABLE_SIZE, so index TABLE_SIZE itself is valid too
      count(0)
{
    // The trailing () value-initialises the array, i.e. every chain head is
    // already nullptr; no separate clearing pass is needed.
    table = new HashtableItem<TKey, TValue>*[table_size]();
}
/// Frees every stored item and then the bucket array itself.
template <typename TKey, typename TValue>
HashtableDataMembers<TKey, TValue>::~HashtableDataMembers()
{
    HashtableItem<TKey, TValue>** const last = table + table_size;
    for (HashtableItem<TKey, TValue>** bucket = table; bucket != last; ++bucket)
    {
        // Deleting the head frees the rest of the chain as well (the item
        // destructor deletes its successor); deleting nullptr does nothing.
        delete *bucket;
        *bucket = nullptr; // clean-up the memory
    }
    delete[] table;
}
// ##################### class Hashtable ###########################
/// Builds an empty table with freshly allocated internal state.
template <typename TKey, typename TValue>
Hashtable<TKey, TValue>::Hashtable()
    : dm(new HashtableDataMembers<TKey, TValue>())
{
}
/// Copy constructor: starts from an empty table and lets the assignment
/// operator duplicate the contents of `obj`.
template <typename TKey, typename TValue>
Hashtable<TKey, TValue>::Hashtable(const Hashtable& obj)
    : dm(new HashtableDataMembers<TKey, TValue>())
{
    *this = obj;
}
/// Releases the internal state (and with it every stored item).
template <typename TKey, typename TValue>
Hashtable<TKey, TValue>::~Hashtable()
{
    HashtableDataMembers<TKey, TValue>* const owned = dm;
    dm = nullptr;
    delete owned;
}
/// Replaces the contents of this table with copies of the items of `obj`.
/// Self-assignment is a no-op. Items are copied bucket by bucket, keeping the
/// order inside each chain; the entry count follows the items actually copied.
/// @return *this
template <class TKey, class TValue>
const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj)
{
    if (this == &obj)
        return *this;

    Clear();
    dm->cur_table_item = nullptr; // whatever it referred to was just freed

    if (dm->table_size != obj.dm->table_size)
    {
        delete[] dm->table; // every bucket is empty after Clear()
        dm->table_size = obj.dm->table_size;
        // () value-initialises the new array: all chain heads are nullptr
        dm->table = new HashtableItem<TKey, TValue>*[dm->table_size]();
    }

    for (unsigned int i = 0; i < obj.dm->table_size; i++)
    {
        HashtableItem<TKey, TValue>** ppnode = &dm->table[i];
        for (const HashtableItem<TKey, TValue>* src = obj.dm->table[i]; src; src = src->pnext)
        {
            (*ppnode) = new HashtableItem<TKey, TValue>(src->key, src->value);
            dm->count++; // keep Count() in step with the items actually present
            if (obj.dm->cur_table_item == src)
                dm->cur_table_item = *ppnode;
            ppnode = &(*ppnode)->pnext;
        }
    }
    return (*this);
}
/// Inserts (key, value) at the end of the key's bucket chain.
/// @return false, leaving the table untouched, when the key is already stored; true otherwise.
template <typename TKey, typename TValue>
bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value)
{
    // `link` always addresses the pointer that would have to point at a new node.
    HashtableItem<TKey, TValue>** link = &dm->table[hash_func(key)];
    while (*link)
    {
        if (!((*link)->key != key))
            return false; // then found
        link = &(*link)->pnext;
    }
    *link = new HashtableItem<TKey, TValue>(key, value);
    dm->count++;
    return true;
}
/// Looks the key up in its bucket chain.
/// @return pointer to the stored item, or nullptr when the key is not present.
template <typename TKey, typename TValue>
HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const
{
    HashtableItem<TKey, TValue>* item = dm->table[hash_func(key)];
    while (item && (item->key != key))
        item = item->pnext;
    return item;
}
/// Removes the entry stored under `key`. Does nothing (and leaves the entry
/// count unchanged) when the key is not present.
template <class TKey, class TValue>
void Hashtable<TKey, TValue>::Remove(const TKey& key)
{
    HashtableItem<TKey, TValue>** ppnode = &dm->table[hash_func(key)];
    while ((*ppnode) && ((*ppnode)->key != key))
        ppnode = &(*ppnode)->pnext;

    if (!*ppnode)
        return; // key not stored: nothing to unlink, nothing to count down

    HashtableItem<TKey, TValue>* const victim = *ppnode;
    (*ppnode) = victim->pnext;
    victim->pnext = nullptr; // detach first: the item destructor deletes whatever pnext points to
    if (dm->cur_table_item == victim)
        dm->cur_table_item = nullptr; // do not keep a pointer to freed memory
    delete victim;
    dm->count--;
}
/// Removes all the table entries and resets the entry count to zero.
/// The bucket array itself is kept.
template <class TKey, class TValue>
void Hashtable<TKey, TValue>::Clear()
{
    for (unsigned int i = 0; i < dm->table_size; i++)
    {
        // Deleting the chain head frees the whole chain (the item destructor
        // deletes its successor); deleting nullptr does nothing.
        delete dm->table[i];
        dm->table[i] = nullptr;
    }
    dm->cur_table_item = nullptr; // every item it could refer to is gone
    dm->count = 0;
}
/// @return the number of entries currently recorded for the table.
template <typename TKey, typename TValue>
unsigned int Hashtable<TKey, TValue>::Count()
{
    const unsigned int entries = dm->count;
    return entries;
}
// ##################### class HashtableItem ###########################
/// Creates an unlinked item holding copies of the given key and value.
/// Members are copy-constructed directly (in declaration order) instead of
/// being default-constructed and then assigned.
template <class TKey, class TValue>
HashtableItem<TKey, TValue>::HashtableItem(const TKey& xKey, const TValue& xValue)
    : pnext(nullptr), key(xKey), value(xValue)
{
}
/// Destroys this item together with every item linked after it.
template <typename TKey, typename TValue>
HashtableItem<TKey, TValue>::~HashtableItem()
{
    // delete on nullptr is a no-op, so the end of the chain needs no special case
    delete pnext;
    pnext = nullptr; // clean-up memory
}
/// @return read-only access to the key of this item.
template <typename TKey, typename TValue>
const TKey& HashtableItem<TKey, TValue>::Key() const
{
    return key;
}
/// @return read-only access to the value stored in this item.
template <typename TKey, typename TValue>
const TValue& HashtableItem<TKey, TValue>::Value() const
{
    return value;
}
/// Overwrites the stored value.
/// @return the reference that was passed in (not the stored copy).
template <typename TKey, typename TValue>
const TValue& HashtableItem<TKey, TValue>::operator=(const TValue& value)
{
    // the parameter shadows the member, so the member is reached through *this
    (*this).value = value;
    return value;
}
#endif