我相信有一种方法可以找到长度为n的O(n)无序数组中第k大的元素。也可能是期望O(n)之类的。我们该怎么做呢?


当前回答

这种方法怎么样

保持一个长度为k的缓冲区和一个tmp_max,得到tmp_max为O(k)并执行n次因此类似于O(kn)

是这样还是我漏掉了什么?

虽然它没有击败快速选择的平均情况和中值统计方法的最坏情况,但它非常容易理解和实现。

其他回答

还有一种算法,比快速选择算法性能更好。它叫做弗洛伊德-铆钉(FR)算法。

原文:https://doi.org/10.1145/360680.360694

下载版本:http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.309.7108&rep=rep1&type=pdf

维基百科文章https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm

我尝试在c++中实现快速选择和FR算法。我还将它们与标准c++库实现std::nth_element(基本上是quickselect和heapselect的introselect混合)进行了比较。结果是快速选择和nth_element的平均运行,而FR算法的平均运行约。速度是它们的两倍。

我用于FR算法的示例代码:

template <typename T>
T FRselect(std::vector<T>& data, const size_t& n)
{
    if (n == 0)
        return *(std::min_element(data.begin(), data.end()));
    else if (n == data.size() - 1)
        return *(std::max_element(data.begin(), data.end()));
    else
        return _FRselect(data, 0, data.size() - 1, n);
}

template <typename T>
T _FRselect(std::vector<T>& data, const size_t& left, const size_t& right, const size_t& n)
{
    size_t leftIdx = left;
    size_t rightIdx = right;

    while (rightIdx > leftIdx)
    {
        if (rightIdx - leftIdx > 600)
        {
            size_t range = rightIdx - leftIdx + 1;
            long long i = n - (long long)leftIdx + 1;
            long long z = log(range);
            long long s = 0.5 * exp(2 * z / 3);
            long long sd = 0.5 * sqrt(z * s * (range - s) / range) * sgn(i - (long long)range / 2);

            size_t newLeft = fmax(leftIdx, n - i * s / range + sd);
            size_t newRight = fmin(rightIdx, n + (range - i) * s / range + sd);

            _FRselect(data, newLeft, newRight, n);
        }
        T t = data[n];
        size_t i = leftIdx;
        size_t j = rightIdx;
        // arrange pivot and right index
        std::swap(data[leftIdx], data[n]);
        if (data[rightIdx] > t)
            std::swap(data[rightIdx], data[leftIdx]);

        while (i < j)
        {
            std::swap(data[i], data[j]);
            ++i; --j;
            while (data[i] < t) ++i;
            while (data[j] > t) --j;
        }

        if (data[leftIdx] == t)
            std::swap(data[leftIdx], data[j]);
        else
        {
            ++j;
            std::swap(data[j], data[rightIdx]);
        }
        // adjust left and right towards the boundaries of the subset
        // containing the (k - left + 1)th smallest element
        if (j <= n)
            leftIdx = j + 1;
        if (n <= j)
            rightIdx = j - 1;
    }

    return data[leftIdx];
}

template <typename T>
int sgn(T val) {
    return (T(0) < val) - (val < T(0));
}

下面是eladv建议的算法的实现(我也把随机pivot的实现放在这里):

public class Median {

    public static void main(String[] s) {

        int[] test = {4,18,20,3,7,13,5,8,2,1,15,17,25,30,16};
        System.out.println(selectK(test,8));

        /*
        int n = 100000000;
        int[] test = new int[n];
        for(int i=0; i<test.length; i++)
            test[i] = (int)(Math.random()*test.length);

        long start = System.currentTimeMillis();
        random_selectK(test, test.length/2);
        long end = System.currentTimeMillis();
        System.out.println(end - start);
        */
    }

    public static int random_selectK(int[] a, int k) {
        if(a.length <= 1)
            return a[0];

        int r = (int)(Math.random() * a.length);
        int p = a[r];

        int small = 0, equal = 0, big = 0;
        for(int i=0; i<a.length; i++) {
            if(a[i] < p) small++;
            else if(a[i] == p) equal++;
            else if(a[i] > p) big++;
        }

        if(k <= small) {
            int[] temp = new int[small];
            for(int i=0, j=0; i<a.length; i++)
                if(a[i] < p)
                    temp[j++] = a[i];
            return random_selectK(temp, k);
        }

        else if (k <= small+equal)
            return p;

        else {
            int[] temp = new int[big];
            for(int i=0, j=0; i<a.length; i++)
                if(a[i] > p)
                    temp[j++] = a[i];
            return random_selectK(temp,k-small-equal);
        }
    }

    public static int selectK(int[] a, int k) {
        if(a.length <= 5) {
            Arrays.sort(a);
            return a[k-1];
        }

        int p = median_of_medians(a);

        int small = 0, equal = 0, big = 0;
        for(int i=0; i<a.length; i++) {
            if(a[i] < p) small++;
            else if(a[i] == p) equal++;
            else if(a[i] > p) big++;
        }

        if(k <= small) {
            int[] temp = new int[small];
            for(int i=0, j=0; i<a.length; i++)
                if(a[i] < p)
                    temp[j++] = a[i];
            return selectK(temp, k);
        }

        else if (k <= small+equal)
            return p;

        else {
            int[] temp = new int[big];
            for(int i=0, j=0; i<a.length; i++)
                if(a[i] > p)
                    temp[j++] = a[i];
            return selectK(temp,k-small-equal);
        }
    }

    private static int median_of_medians(int[] a) {
        int[] b = new int[a.length/5];
        int[] temp = new int[5];
        for(int i=0; i<b.length; i++) {
            for(int j=0; j<5; j++)
                temp[j] = a[5*i + j];
            Arrays.sort(temp);
            b[i] = temp[2];
        }

        return selectK(b, b.length/2 + 1);
    }
}

在线性时间内找到数组的中值,然后使用与快速排序完全相同的划分程序将数组分为两部分,中值左边的值小于(<)中值,右边的值大于(>)中值,这也可以在线性时间内完成,现在,找到数组中第k个元素所在的部分, 现在递归式变成: T(n) = T(n/2) + cn 得到O (n) /。

中位数中位数算法的解释可以在这里找到n中第k大的整数: http://cs.indstate.edu/~spitla/presentation.pdf

c++中的实现如下:

#include <iostream>
#include <vector>
#include <algorithm>
using namespace std;

int findMedian(vector<int> vec){
//    Find median of a vector
    int median;
    size_t size = vec.size();
    median = vec[(size/2)];
    return median;
}

int findMedianOfMedians(vector<vector<int> > values){
    vector<int> medians;

    for (int i = 0; i < values.size(); i++) {
        int m = findMedian(values[i]);
        medians.push_back(m);
    }

    return findMedian(medians);
}

void selectionByMedianOfMedians(const vector<int> values, int k){
//    Divide the list into n/5 lists of 5 elements each
    vector<vector<int> > vec2D;

    int count = 0;
    while (count != values.size()) {
        int countRow = 0;
        vector<int> row;

        while ((countRow < 5) && (count < values.size())) {
            row.push_back(values[count]);
            count++;
            countRow++;
        }
        vec2D.push_back(row);
    }

    cout<<endl<<endl<<"Printing 2D vector : "<<endl;
    for (int i = 0; i < vec2D.size(); i++) {
        for (int j = 0; j < vec2D[i].size(); j++) {
            cout<<vec2D[i][j]<<" ";
        }
        cout<<endl;
    }
    cout<<endl;

//    Calculating a new pivot for making splits
    int m = findMedianOfMedians(vec2D);
    cout<<"Median of medians is : "<<m<<endl;

//    Partition the list into unique elements larger than 'm' (call this sublist L1) and
//    those smaller them 'm' (call this sublist L2)
    vector<int> L1, L2;

    for (int i = 0; i < vec2D.size(); i++) {
        for (int j = 0; j < vec2D[i].size(); j++) {
            if (vec2D[i][j] > m) {
                L1.push_back(vec2D[i][j]);
            }else if (vec2D[i][j] < m){
                L2.push_back(vec2D[i][j]);
            }
        }
    }

//    Checking the splits as per the new pivot 'm'
    cout<<endl<<"Printing L1 : "<<endl;
    for (int i = 0; i < L1.size(); i++) {
        cout<<L1[i]<<" ";
    }

    cout<<endl<<endl<<"Printing L2 : "<<endl;
    for (int i = 0; i < L2.size(); i++) {
        cout<<L2[i]<<" ";
    }

//    Recursive calls
    if ((k - 1) == L1.size()) {
        cout<<endl<<endl<<"Answer :"<<m;
    }else if (k <= L1.size()) {
        return selectionByMedianOfMedians(L1, k);
    }else if (k > (L1.size() + 1)){
        return selectionByMedianOfMedians(L2, k-((int)L1.size())-1);
    }

}

int main()
{
    int values[] = {2, 3, 5, 4, 1, 12, 11, 13, 16, 7, 8, 6, 10, 9, 17, 15, 19, 20, 18, 23, 21, 22, 25, 24, 14};

    vector<int> vec(values, values + 25);

    cout<<"The given array is : "<<endl;
    for (int i = 0; i < vec.size(); i++) {
        cout<<vec[i]<<" ";
    }

    selectionByMedianOfMedians(vec, 8);

    return 0;
}

Haskell的解决方案:

kthElem index list = sort list !! index

withShape ~[]     []     = []
withShape ~(x:xs) (y:ys) = x : withShape xs ys

sort []     = []
sort (x:xs) = (sort ls `withShape` ls) ++ [x] ++ (sort rs `withShape` rs)
  where
   ls = filter (<  x)
   rs = filter (>= x)

这通过使用withShape方法来实现中值解的中值,从而发现分区的大小,而无需实际计算分区大小。