Java有一个方便的分割方法:

String str = "The quick brown fox";
String[] results = str.split(" ");

在c++中有简单的方法来做到这一点吗?


当前回答

If the maximum length of the input string to be tokenized is known, one can exploit this and implement a very fast version. I am sketching the basic idea below, which was inspired by both strtok() and the "suffix array"-data structure described Jon Bentley's "Programming Perls" 2nd edition, chapter 15. The C++ class in this case only gives some organization and convenience of use. The implementation shown can be easily extended for removing leading and trailing whitespace characters in the tokens.

基本上,可以将分隔符替换为以字符串结束的'\0'字符,并设置指向修改后字符串中的标记的指针。在极端情况下,当字符串仅由分隔符组成时,将得到字符串长度加1个空标记。复制要修改的字符串是可行的。

头文件:

class TextLineSplitter
{
public:

    TextLineSplitter( const size_t max_line_len );

    ~TextLineSplitter();

    void            SplitLine( const char *line,
                               const char sep_char = ',',
                             );

    inline size_t   NumTokens( void ) const
    {
        return mNumTokens;
    }

    const char *    GetToken( const size_t token_idx ) const
    {
        assert( token_idx < mNumTokens );
        return mTokens[ token_idx ];
    }

private:
    const size_t    mStorageSize;

    char           *mBuff;
    char          **mTokens;
    size_t          mNumTokens;

    inline void     ResetContent( void )
    {
        memset( mBuff, 0, mStorageSize );
        // mark all items as empty:
        memset( mTokens, 0, mStorageSize * sizeof( char* ) );
        // reset counter for found items:
        mNumTokens = 0L;
    }
};

Implementattion文件:

TextLineSplitter::TextLineSplitter( const size_t max_line_len ):
    mStorageSize ( max_line_len + 1L )
{
    // allocate memory
    mBuff   = new char  [ mStorageSize ];
    mTokens = new char* [ mStorageSize ];

    ResetContent();
}

TextLineSplitter::~TextLineSplitter()
{
    delete [] mBuff;
    delete [] mTokens;
}


void TextLineSplitter::SplitLine( const char *line,
                                  const char sep_char   /* = ',' */,
                                )
{
    assert( sep_char != '\0' );

    ResetContent();
    strncpy( mBuff, line, mMaxLineLen );

    size_t idx       = 0L; // running index for characters

    do
    {
        assert( idx < mStorageSize );

        const char chr = line[ idx ]; // retrieve current character

        if( mTokens[ mNumTokens ] == NULL )
        {
            mTokens[ mNumTokens ] = &mBuff[ idx ];
        } // if

        if( chr == sep_char || chr == '\0' )
        { // item or line finished
            // overwrite separator with a 0-terminating character:
            mBuff[ idx ] = '\0';
            // count-up items:
            mNumTokens ++;
        } // if

    } while( line[ idx++ ] );
}

使用的场景是:

// create an instance capable of splitting strings up to 1000 chars long:
TextLineSplitter spl( 1000 );
spl.SplitLine( "Item1,,Item2,Item3" );
for( size_t i = 0; i < spl.NumTokens(); i++ )
{
    printf( "%s\n", spl.GetToken( i ) );
}

输出:

Item1

Item2
Item3

其他回答

这里有许多过于复杂的建议。试试这个简单的std::string解决方案:

using namespace std;

string someText = ...

string::size_type tokenOff = 0, sepOff = tokenOff;
while (sepOff != string::npos)
{
    sepOff = someText.find(' ', sepOff);
    string::size_type tokenLen = (sepOff == string::npos) ? sepOff : sepOff++ - tokenOff;
    string token = someText.substr(tokenOff, tokenLen);
    if (!token.empty())
        /* do something with token */;
    tokenOff = sepOff;
}

您可以使用流、迭代器和复制算法来相当直接地做到这一点。

#include <string>
#include <vector>
#include <iostream>
#include <istream>
#include <ostream>
#include <iterator>
#include <sstream>
#include <algorithm>

int main()
{
  std::string str = "The quick brown fox";

  // construct a stream from the string
  std::stringstream strstr(str);

  // use stream iterators to copy the stream to the vector as whitespace separated strings
  std::istream_iterator<std::string> it(strstr);
  std::istream_iterator<std::string> end;
  std::vector<std::string> results(it, end);

  // send the vector to stdout.
  std::ostream_iterator<std::string> oit(std::cout);
  std::copy(results.begin(), results.end(), oit);
}

我认为这就是字符串流上的>>操作符的用途:

string word; sin >> word;

使用strtok。在我看来,没有必要围绕标记化构建类,除非strtok不能提供您所需要的东西。可能不会,但在用C和c++编写各种解析代码的15年多时间里,我一直在使用strtok。这里有一个例子

char myString[] = "The quick brown fox";
char *p = strtok(myString, " ");
while (p) {
    printf ("Token: %s\n", p);
    p = strtok(NULL, " ");
}

一些注意事项(可能不适合您的需要)。该字符串在该过程中被“销毁”,这意味着EOS字符内联放置在分隔符点中。正确的用法可能需要创建字符串的非const版本。还可以在解析过程中更改分隔符列表。

在我看来,上面的代码比为它单独编写一个类要简单得多,也更容易使用。对我来说,这是语言提供的功能之一,而且它做得很好,很干净。这只是一个“基于C”的解决方案。它很合适,很简单,而且你不需要写很多额外的代码:-)

我以前只用标准库做了一个lexer/tokenizer。代码如下:

#include <iostream>
#include <string>
#include <vector>
#include <sstream>

using namespace std;

string seps(string& s) {
    if (!s.size()) return "";
    stringstream ss;
    ss << s[0];
    for (int i = 1; i < s.size(); i++) {
        ss << '|' << s[i];
    }
    return ss.str();
}

void Tokenize(string& str, vector<string>& tokens, const string& delimiters = " ")
{
    seps(str);

    // Skip delimiters at beginning.
    string::size_type lastPos = str.find_first_not_of(delimiters, 0);
    // Find first "non-delimiter".
    string::size_type pos = str.find_first_of(delimiters, lastPos);

    while (string::npos != pos || string::npos != lastPos)
    {
        // Found a token, add it to the vector.
        tokens.push_back(str.substr(lastPos, pos - lastPos));
        // Skip delimiters.  Note the "not_of"
        lastPos = str.find_first_not_of(delimiters, pos);
        // Find next "non-delimiter"
        pos = str.find_first_of(delimiters, lastPos);
    }
}

int main(int argc, char *argv[])
{
    vector<string> t;
    string s = "Tokens for everyone!";

    Tokenize(s, t, "|");

    for (auto c : t)
        cout << c << endl;

    system("pause");

    return 0;
}