我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
当前回答
不管怎样,下面是我的实现。它处理wstring输入,但是可以很容易地调整为string。它不处理字段中的换行符(因为我的应用程序也不这样做,但添加它的支持并不太难),它不符合RFC中的“\r\n”行尾(假设您使用std::getline),但它确实正确地处理空格修剪和双引号(希望如此)。
using namespace std;
// trim whitespaces around field or double-quotes, remove double-quotes and replace escaped double-quotes (double double-quotes)
wstring trimquote(const wstring& str, const wstring& whitespace, const wchar_t quotChar)
{
wstring ws;
wstring::size_type strBegin = str.find_first_not_of(whitespace);
if (strBegin == wstring::npos)
return L"";
wstring::size_type strEnd = str.find_last_not_of(whitespace);
wstring::size_type strRange = strEnd - strBegin + 1;
if((str[strBegin] == quotChar) && (str[strEnd] == quotChar))
{
ws = str.substr(strBegin+1, strRange-2);
strBegin = 0;
while((strEnd = ws.find(quotChar, strBegin)) != wstring::npos)
{
ws.erase(strEnd, 1);
strBegin = strEnd+1;
}
}
else
ws = str.substr(strBegin, strRange);
return ws;
}
pair<unsigned, unsigned> nextCSVQuotePair(const wstring& line, const wchar_t quotChar, unsigned ofs = 0)
{
pair<unsigned, unsigned> r;
r.first = line.find(quotChar, ofs);
r.second = wstring::npos;
if(r.first != wstring::npos)
{
r.second = r.first;
while(((r.second = line.find(quotChar, r.second+1)) != wstring::npos)
&& (line[r.second+1] == quotChar)) // WARNING: assumes null-terminated string such that line[r.second+1] always exist
r.second++;
}
return r;
}
unsigned parseLine(vector<wstring>& fields, const wstring& line)
{
unsigned ofs, ofs0, np;
const wchar_t delim = L',';
const wstring whitespace = L" \t\xa0\x3000\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x202f\x205f";
const wchar_t quotChar = L'\"';
pair<unsigned, unsigned> quot;
fields.clear();
ofs = ofs0 = 0;
quot = nextCSVQuotePair(line, quotChar);
while((np = line.find(delim, ofs)) != wstring::npos)
{
if((np > quot.first) && (np < quot.second))
{ // skip delimiter inside quoted field
ofs = quot.second+1;
quot = nextCSVQuotePair(line, quotChar, ofs);
continue;
}
fields.push_back( trimquote(line.substr(ofs0, np-ofs0), whitespace, quotChar) );
ofs = ofs0 = np+1;
}
fields.push_back( trimquote(line.substr(ofs0), whitespace, quotChar) );
return fields.size();
}
其他回答
如果您正在使用Visual Studio / MFC,下面的解决方案可能会使您的工作更轻松。它支持Unicode和MBCS,有注释,除了CString之外没有其他依赖项,对我来说工作得很好。它不支持在带引号的字符串中嵌入换行符,但我不在乎,只要它在这种情况下不崩溃,它不会崩溃。
总体策略是,将带引号的字符串和空字符串作为特殊情况处理,其余使用Tokenize。对于带引号的字符串,策略是找到真正的结束引号,跟踪是否遇到了连续的引号对。如果是,则使用Replace将成对转换为单个。毫无疑问,有更有效的方法,但在我的案例中,性能还不够重要,不足以证明进一步优化的合理性。
class CParseCSV {
public:
// Construction
CParseCSV(const CString& sLine);
// Attributes
bool GetString(CString& sDest);
protected:
CString m_sLine; // line to extract tokens from
int m_nLen; // line length in characters
int m_iPos; // index of current position
};
CParseCSV::CParseCSV(const CString& sLine) : m_sLine(sLine)
{
m_nLen = m_sLine.GetLength();
m_iPos = 0;
}
bool CParseCSV::GetString(CString& sDest)
{
if (m_iPos < 0 || m_iPos > m_nLen) // if position out of range
return false;
if (m_iPos == m_nLen) { // if at end of string
sDest.Empty(); // return empty token
m_iPos = -1; // really done now
return true;
}
if (m_sLine[m_iPos] == '\"') { // if current char is double quote
m_iPos++; // advance to next char
int iTokenStart = m_iPos;
bool bHasEmbeddedQuotes = false;
while (m_iPos < m_nLen) { // while more chars to parse
if (m_sLine[m_iPos] == '\"') { // if current char is double quote
// if next char exists and is also double quote
if (m_iPos < m_nLen - 1 && m_sLine[m_iPos + 1] == '\"') {
// found pair of consecutive double quotes
bHasEmbeddedQuotes = true; // request conversion
m_iPos++; // skip first quote in pair
} else // next char doesn't exist or is normal
break; // found closing quote; exit loop
}
m_iPos++; // advance to next char
}
sDest = m_sLine.Mid(iTokenStart, m_iPos - iTokenStart);
if (bHasEmbeddedQuotes) // if string contains embedded quote pairs
sDest.Replace(_T("\"\""), _T("\"")); // convert pairs to singles
m_iPos += 2; // skip closing quote and trailing delimiter if any
} else if (m_sLine[m_iPos] == ',') { // else if char is comma
sDest.Empty(); // return empty token
m_iPos++; // advance to next char
} else { // else get next comma-delimited token
sDest = m_sLine.Tokenize(_T(","), m_iPos);
}
return true;
}
// calling code should look something like this:
CStdioFile fIn(pszPath, CFile::modeRead);
CString sLine, sToken;
while (fIn.ReadString(sLine)) { // for each line of input file
if (!sLine.IsEmpty()) { // ignore blank lines
CParseCSV csv(sLine);
while (csv.GetString(sToken)) {
// do something with sToken here
}
}
}
如果你不关心转义逗号和换行符, 并且你不能在引号中嵌入逗号和换行符(如果你不能转义那么…) 那么它只有大约三行代码(好的14 ->,但它只有15读取整个文件)。
std::vector<std::string> getNextLineAndSplitIntoTokens(std::istream& str)
{
std::vector<std::string> result;
std::string line;
std::getline(str,line);
std::stringstream lineStream(line);
std::string cell;
while(std::getline(lineStream,cell, ','))
{
result.push_back(cell);
}
// This checks for a trailing comma with no data after it.
if (!lineStream && cell.empty())
{
// If there was a trailing comma then add an empty element.
result.push_back("");
}
return result;
}
我只需要创建一个表示一行的类。 然后流到该对象:
#include <iterator>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
class CSVRow
{
public:
std::string_view operator[](std::size_t index) const
{
return std::string_view(&m_line[m_data[index] + 1], m_data[index + 1] - (m_data[index] + 1));
}
std::size_t size() const
{
return m_data.size() - 1;
}
void readNextRow(std::istream& str)
{
std::getline(str, m_line);
m_data.clear();
m_data.emplace_back(-1);
std::string::size_type pos = 0;
while((pos = m_line.find(',', pos)) != std::string::npos)
{
m_data.emplace_back(pos);
++pos;
}
// This checks for a trailing comma with no data after it.
pos = m_line.size();
m_data.emplace_back(pos);
}
private:
std::string m_line;
std::vector<int> m_data;
};
std::istream& operator>>(std::istream& str, CSVRow& data)
{
data.readNextRow(str);
return str;
}
int main()
{
std::ifstream file("plop.csv");
CSVRow row;
while(file >> row)
{
std::cout << "4th Element(" << row[3] << ")\n";
}
}
但只要做一点工作,我们就可以在技术上创建一个迭代器:
class CSVIterator
{
public:
typedef std::input_iterator_tag iterator_category;
typedef CSVRow value_type;
typedef std::size_t difference_type;
typedef CSVRow* pointer;
typedef CSVRow& reference;
CSVIterator(std::istream& str) :m_str(str.good()?&str:nullptr) { ++(*this); }
CSVIterator() :m_str(nullptr) {}
// Pre Increment
CSVIterator& operator++() {if (m_str) { if (!((*m_str) >> m_row)){m_str = nullptr;}}return *this;}
// Post increment
CSVIterator operator++(int) {CSVIterator tmp(*this);++(*this);return tmp;}
CSVRow const& operator*() const {return m_row;}
CSVRow const* operator->() const {return &m_row;}
bool operator==(CSVIterator const& rhs) {return ((this == &rhs) || ((this->m_str == nullptr) && (rhs.m_str == nullptr)));}
bool operator!=(CSVIterator const& rhs) {return !((*this) == rhs);}
private:
std::istream* m_str;
CSVRow m_row;
};
int main()
{
std::ifstream file("plop.csv");
for(CSVIterator loop(file); loop != CSVIterator(); ++loop)
{
std::cout << "4th Element(" << (*loop)[3] << ")\n";
}
}
现在我们已经到了2020年,让我们添加一个CSVRange对象:
class CSVRange
{
std::istream& stream;
public:
CSVRange(std::istream& str)
: stream(str)
{}
CSVIterator begin() const {return CSVIterator{stream};}
CSVIterator end() const {return CSVIterator{};}
};
int main()
{
std::ifstream file("plop.csv");
for(auto& row: CSVRange(file))
{
std::cout << "4th Element(" << row[3] << ")\n";
}
}
您需要做的第一件事是确保文件存在。来完成 这你只需要尝试打开文件流的路径。在你 打开文件流使用stream.fail()查看它是否如预期的那样工作, 与否。
bool fileExists(string fileName)
{
ifstream test;
test.open(fileName.c_str());
if (test.fail())
{
test.close();
return false;
}
else
{
test.close();
return true;
}
}
您还必须验证所提供的文件是正确的文件类型。 要做到这一点,您需要查看提供的文件路径直到 您可以找到文件扩展名。一旦你有了文件扩展名,请确保 它是一个。csv文件。
bool verifyExtension(string filename)
{
int period = 0;
for (unsigned int i = 0; i < filename.length(); i++)
{
if (filename[i] == '.')
period = i;
}
string extension;
for (unsigned int i = period; i < filename.length(); i++)
extension += filename[i];
if (extension == ".csv")
return true;
else
return false;
}
此函数将返回稍后在错误消息中使用的文件扩展名。
string getExtension(string filename)
{
int period = 0;
for (unsigned int i = 0; i < filename.length(); i++)
{
if (filename[i] == '.')
period = i;
}
string extension;
if (period != 0)
{
for (unsigned int i = period; i < filename.length(); i++)
extension += filename[i];
}
else
extension = "NO FILE";
return extension;
}
这个函数实际上会调用上面创建的错误检查,然后解析文件。
void parseFile(string fileName)
{
if (fileExists(fileName) && verifyExtension(fileName))
{
ifstream fs;
fs.open(fileName.c_str());
string fileCommand;
while (fs.good())
{
string temp;
getline(fs, fileCommand, '\n');
for (unsigned int i = 0; i < fileCommand.length(); i++)
{
if (fileCommand[i] != ',')
temp += fileCommand[i];
else
temp += " ";
}
if (temp != "\0")
{
// Place your code here to run the file.
}
}
fs.close();
}
else if (!fileExists(fileName))
{
cout << "Error: The provided file does not exist: " << fileName << endl;
if (!verifyExtension(fileName))
{
if (getExtension(fileName) != "NO FILE")
cout << "\tCheck the file extension." << endl;
else
cout << "\tThere is no file in the provided path." << endl;
}
}
else if (!verifyExtension(fileName))
{
if (getExtension(fileName) != "NO FILE")
cout << "Incorrect file extension provided: " << getExtension(fileName) << endl;
else
cout << "There is no file in the following path: " << fileName << endl;
}
}
如果你确实关心正确解析CSV,这将做它…相对较慢,因为它一次只处理一个字符。
void ParseCSV(const string& csvSource, vector<vector<string> >& lines)
{
bool inQuote(false);
bool newLine(false);
string field;
lines.clear();
vector<string> line;
string::const_iterator aChar = csvSource.begin();
while (aChar != csvSource.end())
{
switch (*aChar)
{
case '"':
newLine = false;
inQuote = !inQuote;
break;
case ',':
newLine = false;
if (inQuote == true)
{
field += *aChar;
}
else
{
line.push_back(field);
field.clear();
}
break;
case '\n':
case '\r':
if (inQuote == true)
{
field += *aChar;
}
else
{
if (newLine == false)
{
line.push_back(field);
lines.push_back(line);
field.clear();
line.clear();
newLine = true;
}
}
break;
default:
newLine = false;
field.push_back(*aChar);
break;
}
aChar++;
}
if (field.size())
line.push_back(field);
if (line.size())
lines.push_back(line);
}
该解决方案检测这4种情况
完整的课程在
https://github.com/pedro-vicente/csv-parser
1,field 2,field 3,
1,field 2,"field 3 quoted, with separator",
1,field 2,"field 3
with newline",
1,field 2,"field 3
with newline and separator,",
它一个字符一个字符地读取文件,每次读取一行到一个向量(字符串),因此适合于非常大的文件。
使用
迭代直到返回空行(文件结束)。行是一个向量,其中每个条目都是一个CSV列。
read_csv_t csv;
csv.open("../test.csv");
std::vector<std::string> row;
while (true)
{
row = csv.read_row();
if (row.size() == 0)
{
break;
}
}
类声明
class read_csv_t
{
public:
read_csv_t();
int open(const std::string &file_name);
std::vector<std::string> read_row();
private:
std::ifstream m_ifs;
};
实现
std::vector<std::string> read_csv_t::read_row()
{
bool quote_mode = false;
std::vector<std::string> row;
std::string column;
char c;
while (m_ifs.get(c))
{
switch (c)
{
/////////////////////////////////////////////////////////////////////////////////////////////////////
//separator ',' detected.
//in quote mode add character to column
//push column if not in quote mode
/////////////////////////////////////////////////////////////////////////////////////////////////////
case ',':
if (quote_mode == true)
{
column += c;
}
else
{
row.push_back(column);
column.clear();
}
break;
/////////////////////////////////////////////////////////////////////////////////////////////////////
//quote '"' detected.
//toggle quote mode
/////////////////////////////////////////////////////////////////////////////////////////////////////
case '"':
quote_mode = !quote_mode;
break;
/////////////////////////////////////////////////////////////////////////////////////////////////////
//line end detected
//in quote mode add character to column
//return row if not in quote mode
/////////////////////////////////////////////////////////////////////////////////////////////////////
case '\n':
case '\r':
if (quote_mode == true)
{
column += c;
}
else
{
return row;
}
break;
/////////////////////////////////////////////////////////////////////////////////////////////////////
//default, add character to column
/////////////////////////////////////////////////////////////////////////////////////////////////////
default:
column += c;
break;
}
}
//return empty vector if end of file detected
m_ifs.close();
std::vector<std::string> v;
return v;
}