我一直认为std::vector是“作为数组实现的”,等等等等。今天我去测试了一下,结果似乎不是这样:

以下是一些测试结果:

UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds

这大约要慢3 - 4倍!这并不能证明“向量可能会慢几纳秒”的评论是正确的。

我使用的代码是:

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
{
    public:
        TestTimer(const std::string & name) : name(name),
            start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
        {
        }

        ~TestTimer()
        {
            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;
        }

    private:
        std::string name;
        boost::posix_time::ptime start;
};

struct Pixel
{
    Pixel()
    {
    }

    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector()
{
    TestTimer t("UseVector");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

void UseVectorPushBack()
{
    TestTimer t("UseVectorPushBack");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));
    }
}

void UseArray()
{
    TestTimer t("UseArray");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }

        free(pixels);
    }
}

int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}

我做错了吗?还是我刚刚打破了这个性能神话?

我使用Visual Studio 2005中的发布模式。


在Visual c++中,#define _SECURE_SCL 0将UseVector减少了一半(减少到4秒)。在我看来,这真的是件大事。


当前回答

顺便说一下,你在使用vector的类中看到的减速也发生在标准类型中,比如int。这是一个多线程代码:

#include <iostream>
#include <cstdio>
#include <map>
#include <string>
#include <typeinfo>
#include <vector>
#include <pthread.h>
#include <sstream>
#include <fstream>
using namespace std;

//pthread_mutex_t map_mutex=PTHREAD_MUTEX_INITIALIZER;

long long num=500000000;
int procs=1;

struct iterate
{
    int id;
    int num;
    void * member;
    iterate(int a, int b, void *c) : id(a), num(b), member(c) {}
};

//fill out viterate and piterate
void * viterate(void * input)
{
    printf("am in viterate\n");
    iterate * info=static_cast<iterate *> (input);
    // reproduce member type
    vector<int> test= *static_cast<vector<int>*> (info->member);
    for (int i=info->id; i<test.size(); i+=info->num)
    {
        //printf("am in viterate loop\n");
        test[i];
    }
    pthread_exit(NULL);
}

void * piterate(void * input)
{
    printf("am in piterate\n");
    iterate * info=static_cast<iterate *> (input);;
    int * test=static_cast<int *> (info->member);
    for (int i=info->id; i<num; i+=info->num) {
        //printf("am in piterate loop\n");
        test[i];
    }
    pthread_exit(NULL);
}

int main()
{
    cout<<"producing vector of size "<<num<<endl;
    vector<int> vtest(num);
    cout<<"produced  a vector of size "<<vtest.size()<<endl;
    pthread_t thread[procs];

    iterate** it=new iterate*[procs];
    int ans;
    void *status;

    cout<<"begining to thread through the vector\n";
    for (int i=0; i<procs; i++) {
        it[i]=new iterate(i, procs, (void *) &vtest);
    //  ans=pthread_create(&thread[i],NULL,viterate, (void *) it[i]);
    }
    for (int i=0; i<procs; i++) {
        pthread_join(thread[i], &status);
    }
    cout<<"end of threading through the vector";
    //reuse the iterate structures

    cout<<"producing a pointer with size "<<num<<endl;
    int * pint=new int[num];
    cout<<"produced a pointer with size "<<num<<endl;

    cout<<"begining to thread through the pointer\n";
    for (int i=0; i<procs; i++) {
        it[i]->member=&pint;
        ans=pthread_create(&thread[i], NULL, piterate, (void*) it[i]);
    }
    for (int i=0; i<procs; i++) {
        pthread_join(thread[i], &status);
    }
    cout<<"end of threading through the pointer\n";

    //delete structure array for iterate
    for (int i=0; i<procs; i++) {
        delete it[i];
    }
    delete [] it;

    //delete pointer
    delete [] pint;

    cout<<"end of the program"<<endl;
    return 0;
}

代码中的行为表明vector的实例化是代码中最长的部分。一旦你通过瓶颈。其余的代码运行得非常快。无论在多少个线程上运行,这都是正确的。

顺便说一下,忽略那些疯狂的包含数。我一直在使用这段代码来测试一个项目的东西,所以包含的数量不断增长。

其他回答

试试这个:

void UseVectorCtor()
{
    TestTimer t("UseConstructor");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels(dimension * dimension, Pixel(255, 0, 0));
    }
}

我得到了和数组几乎完全一样的性能。

The thing about vector is that it's a much more general tool than an array. And that means you have to consider how you use it. It can be used in a lot of different ways, providing functionality that an array doesn't even have. And if you use it "wrong" for your purpose, you incur a lot of overhead, but if you use it correctly, it is usually basically a zero-overhead data structure. In this case, the problem is that you separately initialized the vector (causing all elements to have their default ctor called), and then overwriting each element individually with the correct value. That is much harder for the compiler to optimize away than when you do the same thing with an array. Which is why the vector provides a constructor which lets you do exactly that: initialize N elements with value X.

当你使用它时,向量和数组一样快。

所以,你还没有打破性能神话。但是你已经证明了只有当你最优地使用向量时它才成立,这也是一个很好的观点。:)

好的一面是,它确实是最简单的用法,但却是最快的。如果您将我的代码片段(一行)与John Kugelman的答案进行对比,其中包含大量的调整和优化,但仍然不能完全消除性能差异,很明显,vector的设计非常巧妙。你不必费尽周折才能得到等于数组的速度。相反,您必须使用最简单的解决方案。

GNU's STL (and others), given vector<T>(n), default constructs a prototypal object T() - the compiler will optimise away the empty constructor - but then a copy of whatever garbage happened to be in the memory addresses now reserved for the object is taken by the STL's __uninitialized_fill_n_aux, which loops populating copies of that object as the default values in the vector. So, "my" STL is not looping constructing, but constructing then loop/copying. It's counter intuitive, but I should have remembered as I commented on a recent stackoverflow question about this very point: the construct/copy can be more efficient for reference counted objects etc..

So:

vector<T> x(n);

or

vector<T> x;
x.resize(n);

是-在许多STL实现中-类似于:

T temp;
for (int i = 0; i < n; ++i)
    x[i] = temp;

The issue being that the current generation of compiler optimisers don't seem to work from the insight that temp is uninitialised garbage, and fail to optimise out the loop and default copy constructor invocations. You could credibly argue that compilers absolutely shouldn't optimise this away, as a programmer writing the above has a reasonable expectation that all the objects will be identical after the loop, even if garbage (usual caveats about 'identical'/operator== vs memcmp/operator= etc apply). The compiler can't be expected to have any extra insight into the larger context of std::vector<> or the later usage of the data that would suggest this optimisation safe.

这可以与更明显的直接实现形成对比:

for (int i = 0; i < n; ++i)
    x[i] = T();

我们可以期待一个编译器优化。

为了更明确地解释vector行为的这一方面,可以考虑:

std::vector<big_reference_counted_object> x(10000);

显然,如果我们创建10000个独立对象,而不是创建10000个引用相同数据的对象,这是一个很大的区别。有一种合理的观点认为,保护普通c++用户不意外地做一些如此昂贵的事情的好处超过了现实世界中难以优化的拷贝构造的非常小的成本。

原始答案(供参考/理解评论): 没有机会。Vector和数组一样快,至少如果你合理地保留空间. ...

我做了一些长期以来一直想做的广泛测试。不妨分享一下。

这是我的双启动机i7-3770, 16GB Ram, x86_64, Windows 8.1和Ubuntu 16.04。更多信息和结论,备注如下。测试了MSVS 2017和g++(在Windows和Linux上)。

测试程序

#include <iostream>
#include <chrono>
//#include <algorithm>
#include <array>
#include <locale>
#include <vector>
#include <queue>
#include <deque>

// Note: total size of array must not exceed 0x7fffffff B = 2,147,483,647B
//  which means that largest int array size is 536,870,911
// Also image size cannot be larger than 80,000,000B
constexpr int long g_size = 100000;
int g_A[g_size];


int main()
{
    std::locale loc("");
    std::cout.imbue(loc);
    constexpr int long size = 100000;  // largest array stack size

    // stack allocated c array
    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
    int A[size];
    for (int i = 0; i < size; i++)
        A[i] = i;

    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "c-style stack array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "c-style stack array size=" << sizeof(A) << "B\n\n";

    // global stack c array
    start = std::chrono::steady_clock::now();
    for (int i = 0; i < g_size; i++)
        g_A[i] = i;

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "global c-style stack array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "global c-style stack array size=" << sizeof(g_A) << "B\n\n";

    // raw c array heap array
    start = std::chrono::steady_clock::now();
    int* AA = new int[size];    // bad_alloc() if it goes higher than 1,000,000,000
    for (int i = 0; i < size; i++)
        AA[i] = i;

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "c-style heap array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "c-style heap array size=" << sizeof(AA) << "B\n\n";
    delete[] AA;

    // std::array<>
    start = std::chrono::steady_clock::now();
    std::array<int, size> AAA;
    for (int i = 0; i < size; i++)
        AAA[i] = i;
    //std::sort(AAA.begin(), AAA.end());

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::array size=" << sizeof(AAA) << "B\n\n";

    // std::vector<>
    start = std::chrono::steady_clock::now();
    std::vector<int> v;
    for (int i = 0; i < size; i++)
        v.push_back(i);
    //std::sort(v.begin(), v.end());

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::vector duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::vector size=" << v.size() * sizeof(v.back()) << "B\n\n";

    // std::deque<>
    start = std::chrono::steady_clock::now();
    std::deque<int> dq;
    for (int i = 0; i < size; i++)
        dq.push_back(i);
    //std::sort(dq.begin(), dq.end());

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::deque duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::deque size=" << dq.size() * sizeof(dq.back()) << "B\n\n";

    // std::queue<>
    start = std::chrono::steady_clock::now();
    std::queue<int> q;
    for (int i = 0; i < size; i++)
        q.push(i);

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::queue duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::queue size=" << q.size() * sizeof(q.front()) << "B\n\n";
}

结果

//////////////////////////////////////////////////////////////////////////////////////////
// with MSVS 2017:
// >> cl /std:c++14 /Wall -O2 array_bench.cpp
//
// c-style stack array duration=0.15ms
// c-style stack array size=400,000B
//
// global c-style stack array duration=0.130ms
// global c-style stack array size=400,000B
//
// c-style heap array duration=0.90ms
// c-style heap array size=4B
//
// std::array duration=0.20ms
// std::array size=400,000B
//
// std::vector duration=0.544ms
// std::vector size=400,000B
//
// std::deque duration=1.375ms
// std::deque size=400,000B
//
// std::queue duration=1.491ms
// std::queue size=400,000B
//
//////////////////////////////////////////////////////////////////////////////////////////
//
// with g++ version:
//      - (tdm64-1) 5.1.0 on Windows
//      - (Ubuntu 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609 on Ubuntu 16.04
// >> g++ -std=c++14 -Wall -march=native -O2 array_bench.cpp -o array_bench
//
// c-style stack array duration=0ms
// c-style stack array size=400,000B
//
// global c-style stack array duration=0.124ms
// global c-style stack array size=400,000B
//
// c-style heap array duration=0.648ms
// c-style heap array size=8B
//
// std::array duration=1ms
// std::array size=400,000B
//
// std::vector duration=0.402ms
// std::vector size=400,000B
//
// std::deque duration=0.234ms
// std::deque size=400,000B
//
// std::queue duration=0.304ms
// std::queue size=400,000
//
//////////////////////////////////////////////////////////////////////////////////////////

笔记

平均10次组装。 我最初也使用std::sort()执行测试(您可以看到它被注释掉了),但后来删除了它们,因为没有显著的相对差异。

我的结论和评论

notice how global c-style array takes almost as much time as the heap c-style array Out of all tests I noticed a remarkable stability in std::array's time variations between consecutive runs, while others especially std:: data structs varied wildly in comparison O3 optimization didn't show any noteworthy time differences Removing optimization on Windows cl (no -O2) and on g++ (Win/Linux no -O2, no -march=native) increases times SIGNIFICANTLY. Particularly for std::data structs. Overall higher times on MSVS than g++, but std::array and c-style arrays faster on Windows without optimization g++ produces faster code than microsoft's compiler (apparently it runs faster even on Windows).

判决

当然,这是用于优化构建的代码。既然问题是关于std::vector,那么是的,它是!比普通数组(优化/未优化)慢。但是当您进行基准测试时,您自然希望生成优化的代码。

对我来说,这个节目的明星是std::array。

使用正确的选项,向量和数组可以生成相同的asm。在这些情况下,它们的速度当然是一样的,因为无论哪种方式都可以得到相同的可执行文件。

使用以下方法:

g++ -O3 Time.cpp -I <MyBoost> . cfg . / a.o ut UseArray完成用时2.196秒 UseVector在4.412秒内完成 UseVectorPushBack在8.017秒内完成 全程用时14.626秒

数组的速度是向量的两倍。

但在更详细地查看代码后,这是预期的;当你遍历向量两次,只遍历数组一次时。注意:当你调整vector的size()时,你不仅是在分配内存,而且还在遍历vector并调用每个成员的构造函数。

稍微重新排列代码,使vector只初始化每个对象一次:

 std::vector<Pixel>  pixels(dimensions * dimensions, Pixel(255,0,0));

现在再做一次同样的计时:

g++ -O3 Time.cpp -I <MyBoost> . cfg . / a.o ut UseVector在2.216秒内完成

vector现在的性能只比数组差一点点。在我看来,这种差异是微不足道的,可能是由一大堆与测试无关的事情造成的。

我也会考虑到,你没有正确初始化/销毁像素对象在UseArrray()方法的构造函数/析构函数都没有被调用(这可能不是这个简单的类的问题,但任何稍微复杂(即指针或指针成员)将导致问题。