我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。
我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。
当前回答
下面是Unicode CSV解析器的另一个实现(使用wchar_t)。我写了一部分,乔纳森·莱弗勒写了剩下的部分。
注意:此解析器旨在尽可能地复制Excel的行为,特别是在导入损坏或格式错误的CSV文件时。
这是最初的问题-用多行字段和转义双引号解析CSV文件
这是作为SSCCE(简短,自包含,正确示例)的代码。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
wchar_t *dst = buffer;
wchar_t *end = buffer + buflen - 1;
const wchar_t *src = fld_s;
if (*src == L'"')
{
const wchar_t *p = src + 1;
while (p < fld_e && dst < end)
{
if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
{
*dst++ = p[0];
p += 2;
}
else if (p[0] == L'"')
{
p++;
break;
}
else
*dst++ = *p++;
}
src = p;
}
while (src < fld_e && dst < end)
*dst++ = *src++;
if (dst >= end)
return 0;
*dst = L'\0';
return(buffer);
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wchar_t buffer[1024];
wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
其他回答
下面是Unicode CSV解析器的另一个实现(使用wchar_t)。我写了一部分,乔纳森·莱弗勒写了剩下的部分。
注意:此解析器旨在尽可能地复制Excel的行为,特别是在导入损坏或格式错误的CSV文件时。
这是最初的问题-用多行字段和转义双引号解析CSV文件
这是作为SSCCE(简短,自包含,正确示例)的代码。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
wchar_t *dst = buffer;
wchar_t *end = buffer + buflen - 1;
const wchar_t *src = fld_s;
if (*src == L'"')
{
const wchar_t *p = src + 1;
while (p < fld_e && dst < end)
{
if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
{
*dst++ = p[0];
p += 2;
}
else if (p[0] == L'"')
{
p++;
break;
}
else
*dst++ = *p++;
}
src = p;
}
while (src < fld_e && dst < end)
*dst++ = *src++;
if (dst >= end)
return 0;
*dst = L'\0';
return(buffer);
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wchar_t buffer[1024];
wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
下面是读取矩阵的代码,注意你在matlab中也有一个csvwrite函数
void loadFromCSV( const std::string& filename )
{
std::ifstream file( filename.c_str() );
std::vector< std::vector<std::string> > matrix;
std::vector<std::string> row;
std::string line;
std::string cell;
while( file )
{
std::getline(file,line);
std::stringstream lineStream(line);
row.clear();
while( std::getline( lineStream, cell, ',' ) )
row.push_back( cell );
if( !row.empty() )
matrix.push_back( row );
}
for( int i=0; i<int(matrix.size()); i++ )
{
for( int j=0; j<int(matrix[i].size()); j++ )
std::cout << matrix[i][j] << " ";
std::cout << std::endl;
}
}
使用Spirit来解析csv并不过分。Spirit非常适合微解析任务。例如,使用Spirit 2.1,它就像:
bool r = phrase_parse(first, last,
// Begin grammar
(
double_ % ','
)
,
// End grammar
space, v);
向量v被值填满了。在刚刚与Boost 1.41一起发布的新的Spirit 2.1文档中,有一系列教程涉及到这一点。
本教程从简单到复杂。CSV解析器呈现在中间的某个位置,并涉及使用Spirit的各种技术。生成的代码与手写代码一样紧凑。检查生成的汇编程序!
c++ String工具箱库(StrTk)有一个令牌网格类,它允许你从文本文件、字符串或字符缓冲区加载数据,并以行-列的方式解析/处理它们。
您可以指定行分隔符和列分隔符,或者只使用默认值。
void foo()
{
std::string data = "1,2,3,4,5\n"
"0,2,4,6,8\n"
"1,3,5,7,9\n";
strtk::token_grid grid(data,data.size(),",");
for(std::size_t i = 0; i < grid.row_count(); ++i)
{
strtk::token_grid::row_type r = grid.row(i);
for(std::size_t j = 0; j < r.size(); ++j)
{
std::cout << r.get<int>(j) << "\t";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
更多的例子可以在这里找到
您还可以看看Qt库的功能。
它有正则表达式支持,QString类有很好的方法,例如split()返回QStringList,通过使用提供的分隔符分割原始字符串获得的字符串列表。应该足以为csv文件..
要获得具有给定标题名的列,我使用以下方法:c++继承Qt问题qstring