我需要在c++中加载和使用CSV文件数据。在这一点上,它实际上只是一个以逗号分隔的解析器(即不用担心转义新行和逗号)。主要需要的是逐行解析器,它将在每次调用方法时为下一行返回一个向量。

我发现这篇文章看起来很有前途: http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp

我从未使用过Boost's Spirit,但我愿意尝试一下。但前提是我忽略了一个更直接的解决方案。


当前回答

不好意思,但是为了隐藏几行代码,这似乎是非常复杂的语法。

为什么不这样呢:

/**

  Read line from a CSV file

  @param[in] fp file pointer to open file
  @param[in] vls reference to vector of strings to hold next line

  */
void readCSV( FILE *fp, std::vector<std::string>& vls )
{
    vls.clear();
    if( ! fp )
        return;
    char buf[10000];
    if( ! fgets( buf,999,fp) )
        return;
    std::string s = buf;
    int p,q;
    q = -1;
    // loop over columns
    while( 1 ) {
        p = q;
        q = s.find_first_of(",\n",p+1);
        if( q == -1 ) 
            break;
        vls.push_back( s.substr(p+1,q-p-1) );
    }
}

int _tmain(int argc, _TCHAR* argv[])
{
    std::vector<std::string> vls;
    FILE * fp = fopen( argv[1], "r" );
    if( ! fp )
        return 1;
    readCSV( fp, vls );
    readCSV( fp, vls );
    readCSV( fp, vls );
    std::cout << "row 3, col 4 is " << vls[3].c_str() << "\n";

    return 0;
}

其他回答

另一个CSV I/O库可以在这里找到:

http://code.google.com/p/fast-cpp-csv-parser/

#include "csv.h"

int main(){
  io::CSVReader<3> in("ram.csv");
  in.read_header(io::ignore_extra_column, "vendor", "size", "speed");
  std::string vendor; int size; double speed;
  while(in.read_row(vendor, size, speed)){
    // do stuff with the data
  }
}

不管怎样,下面是我的实现。它处理wstring输入,但是可以很容易地调整为string。它不处理字段中的换行符(因为我的应用程序也不这样做,但添加它的支持并不太难),它不符合RFC中的“\r\n”行尾(假设您使用std::getline),但它确实正确地处理空格修剪和双引号(希望如此)。

using namespace std;

// trim whitespaces around field or double-quotes, remove double-quotes and replace escaped double-quotes (double double-quotes)
wstring trimquote(const wstring& str, const wstring& whitespace, const wchar_t quotChar)
{
    wstring ws;
    wstring::size_type strBegin = str.find_first_not_of(whitespace);
    if (strBegin == wstring::npos)
        return L"";

    wstring::size_type strEnd = str.find_last_not_of(whitespace);
    wstring::size_type strRange = strEnd - strBegin + 1;

    if((str[strBegin] == quotChar) && (str[strEnd] == quotChar))
    {
        ws = str.substr(strBegin+1, strRange-2);
        strBegin = 0;
        while((strEnd = ws.find(quotChar, strBegin)) != wstring::npos)
        {
            ws.erase(strEnd, 1);
            strBegin = strEnd+1;
        }

    }
    else
        ws = str.substr(strBegin, strRange);
    return ws;
}

pair<unsigned, unsigned> nextCSVQuotePair(const wstring& line, const wchar_t quotChar, unsigned ofs = 0)
{
    pair<unsigned, unsigned> r;
    r.first = line.find(quotChar, ofs);
    r.second = wstring::npos;
    if(r.first != wstring::npos)
    {
        r.second = r.first;
        while(((r.second = line.find(quotChar, r.second+1)) != wstring::npos)
            && (line[r.second+1] == quotChar)) // WARNING: assumes null-terminated string such that line[r.second+1] always exist
            r.second++;

    }
    return r;
}

unsigned parseLine(vector<wstring>& fields, const wstring& line)
{
    unsigned ofs, ofs0, np;
    const wchar_t delim = L',';
    const wstring whitespace = L" \t\xa0\x3000\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x202f\x205f";
    const wchar_t quotChar = L'\"';
    pair<unsigned, unsigned> quot;

    fields.clear();

    ofs = ofs0 = 0;
    quot = nextCSVQuotePair(line, quotChar);
    while((np = line.find(delim, ofs)) != wstring::npos)
    {
        if((np > quot.first) && (np < quot.second))
        { // skip delimiter inside quoted field
            ofs = quot.second+1;
            quot = nextCSVQuotePair(line, quotChar, ofs);
            continue;
        }
        fields.push_back( trimquote(line.substr(ofs0, np-ofs0), whitespace, quotChar) );
        ofs = ofs0 = np+1;
    }
    fields.push_back( trimquote(line.substr(ofs0), whitespace, quotChar) );

    return fields.size();
}

如果你不关心转义逗号和换行符, 并且你不能在引号中嵌入逗号和换行符(如果你不能转义那么…) 那么它只有大约三行代码(好的14 ->,但它只有15读取整个文件)。

std::vector<std::string> getNextLineAndSplitIntoTokens(std::istream& str)
{
    std::vector<std::string>   result;
    std::string                line;
    std::getline(str,line);

    std::stringstream          lineStream(line);
    std::string                cell;

    while(std::getline(lineStream,cell, ','))
    {
        result.push_back(cell);
    }
    // This checks for a trailing comma with no data after it.
    if (!lineStream && cell.empty())
    {
        // If there was a trailing comma then add an empty element.
        result.push_back("");
    }
    return result;
}

我只需要创建一个表示一行的类。 然后流到该对象:

#include <iterator>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>

class CSVRow
{
    public:
        std::string_view operator[](std::size_t index) const
        {
            return std::string_view(&m_line[m_data[index] + 1], m_data[index + 1] -  (m_data[index] + 1));
        }
        std::size_t size() const
        {
            return m_data.size() - 1;
        }
        void readNextRow(std::istream& str)
        {
            std::getline(str, m_line);

            m_data.clear();
            m_data.emplace_back(-1);
            std::string::size_type pos = 0;
            while((pos = m_line.find(',', pos)) != std::string::npos)
            {
                m_data.emplace_back(pos);
                ++pos;
            }
            // This checks for a trailing comma with no data after it.
            pos   = m_line.size();
            m_data.emplace_back(pos);
        }
    private:
        std::string         m_line;
        std::vector<int>    m_data;
};

std::istream& operator>>(std::istream& str, CSVRow& data)
{
    data.readNextRow(str);
    return str;
}   
int main()
{
    std::ifstream       file("plop.csv");

    CSVRow              row;
    while(file >> row)
    {
        std::cout << "4th Element(" << row[3] << ")\n";
    }
}

但只要做一点工作,我们就可以在技术上创建一个迭代器:

class CSVIterator
{   
    public:
        typedef std::input_iterator_tag     iterator_category;
        typedef CSVRow                      value_type;
        typedef std::size_t                 difference_type;
        typedef CSVRow*                     pointer;
        typedef CSVRow&                     reference;

        CSVIterator(std::istream& str)  :m_str(str.good()?&str:nullptr) { ++(*this); }
        CSVIterator()                   :m_str(nullptr) {}

        // Pre Increment
        CSVIterator& operator++()               {if (m_str) { if (!((*m_str) >> m_row)){m_str = nullptr;}}return *this;}
        // Post increment
        CSVIterator operator++(int)             {CSVIterator    tmp(*this);++(*this);return tmp;}
        CSVRow const& operator*()   const       {return m_row;}
        CSVRow const* operator->()  const       {return &m_row;}

        bool operator==(CSVIterator const& rhs) {return ((this == &rhs) || ((this->m_str == nullptr) && (rhs.m_str == nullptr)));}
        bool operator!=(CSVIterator const& rhs) {return !((*this) == rhs);}
    private:
        std::istream*       m_str;
        CSVRow              m_row;
};


int main()
{
    std::ifstream       file("plop.csv");

    for(CSVIterator loop(file); loop != CSVIterator(); ++loop)
    {
        std::cout << "4th Element(" << (*loop)[3] << ")\n";
    }
}

现在我们已经到了2020年,让我们添加一个CSVRange对象:

class CSVRange
{
    std::istream&   stream;
    public:
        CSVRange(std::istream& str)
            : stream(str)
        {}
        CSVIterator begin() const {return CSVIterator{stream};}
        CSVIterator end()   const {return CSVIterator{};}
};

int main()
{
    std::ifstream       file("plop.csv");

    for(auto& row: CSVRange(file))
    {
        std::cout << "4th Element(" << row[3] << ")\n";
    }
}

CSV文件是由行组成的文本文件,每一行都由逗号分隔的令牌组成。虽然在解析时你应该知道一些事情:

(0)文件用“CP_ACP”编码页编码。您应该使用相同的编码页来解码文件内容。

(1) CSV丢失了“复合单元格”信息(比如rowspan > 1),所以当它被读回excel时,复合单元格信息丢失。

(2)单元格文本可以在头部和尾部用""" "进行引用,文字引用char将变成双引号。因此,结束匹配的引号字符必须是一个引号字符,而不是后面跟着另一个引号字符。例如,如果一个单元格有逗号,它必须在csv中被引用,因为逗号在csv中有意义。

(3)当单元格内容有多行时,它将在CSV中被引用,在这种情况下,解析器必须继续读取CSV文件中的下几行,直到获得与第一个引用字符匹配的结束引号字符,确保当前逻辑行读取完成后再解析该行的令牌。

例如:在CSV文件中,以下3个物理行是由3个令牌组成的逻辑行:

    --+----------
    1 |a,"b-first part
    2 |b-second part
    3 |b-third part",c
    --+----------

另一种快速简单的方法是使用Boost。I / O:融合

#include <iostream>
#include <sstream>

#include <boost/fusion/adapted/boost_tuple.hpp>
#include <boost/fusion/sequence/io.hpp>

namespace fusion = boost::fusion;

struct CsvString
{
    std::string value;

    // Stop reading a string once a CSV delimeter is encountered.
    friend std::istream& operator>>(std::istream& s, CsvString& v) {
        v.value.clear();
        for(;;) {
            auto c = s.peek();
            if(std::istream::traits_type::eof() == c || ',' == c || '\n' == c)
                break;
            v.value.push_back(c);
            s.get();
        }
        return s;
    }

    friend std::ostream& operator<<(std::ostream& s, CsvString const& v) {
        return s << v.value;
    }
};

int main() {
    std::stringstream input("abc,123,true,3.14\n"
                            "def,456,false,2.718\n");

    typedef boost::tuple<CsvString, int, bool, double> CsvRow;

    using fusion::operator<<;
    std::cout << std::boolalpha;

    using fusion::operator>>;
    input >> std::boolalpha;
    input >> fusion::tuple_open("") >> fusion::tuple_close("\n") >> fusion::tuple_delimiter(',');

    for(CsvRow row; input >> row;)
        std::cout << row << '\n';
}

输出:

(abc 123 true 3.14)
(def 456 false 2.718)