#29279 (Perl) No.206 数の積集合を求めるクエリ

提出ソース

結果

問題	No.206 数の積集合を求めるクエリ
コンテスト
ユーザー	anta
提出日時	2015-05-27 22:58:10
言語	Perl (5.42.0) コンパイル: `perl -cw _filename_` 実行: `perl -X _filename_`
結果	AC
実行時間	146 ms / 7,000 ms
コード長	9,361 bytes
記録	記録タグの例: 初AC ショートコード 純ショートコード 純主流ショートコード 最速実行時間
コンパイル時間	1,829 ms
コンパイル使用メモリ	112,012 KB
実行使用メモリ	6,784 KB
最終ジャッジ日時	2024-07-06 11:54:13
合計ジャッジ時間	6,583 ms
ジャッジサーバーID （参考情報）	judge1 / judge3
	外部呼び出し有り

このコードへのチャレンジ
（要ログイン）

ファイルパターン	結果
sample	AC * 3
other	AC * 28

権限があれば一括ダウンロードができます

コンパイルメッセージ

Main.pl syntax OK

ソースコード

raw source code

# Run the compiled C++ solution from the current directory. $exe_name is
# already set at this point because the BEGIN block below executes during
# Perl's compile phase, before this statement runs.
system './' . $exe_name;

# Compile-phase bootstrap: writes the embedded C++ program to tmp.cpp and
# builds it with g++.
# NOTE(review): presumably placed in BEGIN so that the build happens during
# the judge's `perl -cw` syntax-check step - confirm.
BEGIN {
    # Executable name depends on the operating system reported by $^O.
    $exe_name = $^O eq 'MSWin32' ? 'a.exe' : 'a.out';
    # Nothing to do if the executable already exists.
    return if -e $exe_name;
    # NOTE(review): the result of open is not checked.
    open my $fh, '>', 'tmp.cpp';
    print $fh <<'CODE';
#line 8
#include <string>
#include <vector>
#include <algorithm>
#include <numeric>
#include <set>
#include <map>
#include <queue>
#include <iostream>
#include <sstream>
#include <cstdio>
#include <cmath>
#include <ctime>
#include <cstring>
#include <cctype>
#include <limits>
#include <functional>

#include <cstdint>

#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>

#include <nmmintrin.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

#ifdef _DEBUG
#undef assert
#include "C:\Dropbox\backup\implements\Util\MyAssert.hpp"
#define assert my_assert
#else
#undef assert
#define assert(x) 
#endif

#define rep(i,n) for(int (i)=0;(i)<(int)(n);++(i))
#define rer(i,l,u) for(int (i)=(int)(l);(i)<=(int)(u);++(i))
#define reu(i,l,u) for(int (i)=(int)(l);(i)<(int)(u);++(i))
#if defined(_MSC_VER) || __cplusplus > 199711L
#define aut(r,v) auto r = (v)
#else
#define aut(r,v) __typeof(v) r = (v)
#endif
#define each(it,o) for(aut(it, (o).begin()); it != (o).end(); ++ it)
#define all(o) (o).begin(), (o).end()
#define pb(x) push_back(x)
#define mp(x,y) make_pair((x),(y))
#define mset(m,v) memset(m,v,sizeof(m))
#define INF 0x3f3f3f3f
#define INFL 0x3f3f3f3f3f3f3f3fLL
using namespace std;
typedef vector<int> vi; typedef pair<int,int> pii; typedef vector<pair<int,int> > vpii; typedef long long ll;
// Lowers x to y when y compares strictly less than x; otherwise leaves x untouched.
template<typename T, typename U>
inline void amin(T &x, U y) {
    if(!(y < x))
        return;
    x = y;
}
// Raises x to y when x compares strictly less than y; otherwise leaves x untouched.
template<typename T, typename U>
inline void amax(T &x, U y) {
    if(!(x < y))
        return;
    x = y;
}

#ifdef _MSC_VER
#define alignas(x) __declspec(align(x))
#endif

// Generic array primitives over an element type R_.
// R_ only needs the operators each routine actually uses (assignment,
// value-initialisation, unary minus, +=, -=, *=, binary * and -, ==).
// All routines take explicit lengths and do nothing when the length is <= 0.
template<typename R_>
struct IntOpDefault {
    typedef R_ R;

    // res[k] = p[k] for k in [0, n).
    static void copy(R *res, const R *p, int n) {
        int k = 0;
        while(k < n) {
            res[k] = p[k];
            ++ k;
        }
    }

    // p[k] = R() for k in [0, n).
    static void fill_zero(R *p, int n) {
        int k = 0;
        while(k < n) {
            p[k] = R();
            ++ k;
        }
    }

    // res[k] = -p[k] for k in [0, n).
    static void negate_all(R *res, const R *p, int n) {
        int k = 0;
        while(k < n) {
            res[k] = -p[k];
            ++ k;
        }
    }

    // Quadratic-time product of p (pn terms) and q (qn terms).
    // The first pn + qn - 1 entries of res are cleared, then accumulated into.
    static void convolute_schoolbook(R *res, const R *p, int pn, const R *q, int qn) {
        const int out_len = pn + qn - 1;
        fill_zero(res, out_len);
        for(int a = 0; a < pn; ++ a) {
            R *row = res + a;
            for(int b = 0; b < qn; ++ b)
                row[b] += p[a] * q[b];
        }
    }

    // res[k] += p[k] for k in [0, n).
    static void add(R *res, const R *p, int n) {
        int k = 0;
        while(k < n) {
            res[k] += p[k];
            ++ k;
        }
    }

    // res[k] -= p[k] for k in [0, n).
    static void subtract(R *res, const R *p, int n) {
        int k = 0;
        while(k < n) {
            res[k] -= p[k];
            ++ k;
        }
    }

    // Multiplicative inverse of x by the iteration  cur <- cur * (2 - cur * x),
    // starting from cur = x and stopping once cur * x equals 1.
    // The loop only terminates if that product ever becomes exactly 1.
    static R inverse(R x) {
        const R two = R(2), one = R(1);
        R cur = x;
        for(;;) {
            const R prod = cur * x;
            cur *= two - prod;
            if(prod == one)
                return cur;
        }
    }

    // p[k] *= scalar for k in [0, n).
    static void multiply_scalar(R *p, int n, R scalar) {
        int k = 0;
        while(k < n) {
            p[k] *= scalar;
            ++ k;
        }
    }
};

// Value wrapper around one __m128i treated as four 32-bit lanes.
// Lane arithmetic maps directly onto the corresponding _mm_*_epi32 intrinsic;
// the slli/srli family shifts the whole register by a byte count.
struct u32x4 {
    __m128i v;

    // All four lanes zero.
    u32x4() { v = _mm_setzero_si128(); }
    // Implicit wrap of a raw register value.
    u32x4(const __m128i &v_) { v = v_; }

    // Every lane set to x.
    static u32x4 set1(uint32_t x) {
        const __m128i broadcast = _mm_set1_epi32(x);
        return broadcast;
    }
    // Unaligned 16-byte load starting at p.
    template<typename T> static u32x4 loadu(const T *p) {
        const __m128i *src = reinterpret_cast<const __m128i*>(p);
        return _mm_loadu_si128(src);
    }
    // Unaligned 16-byte store starting at p.
    template<typename T> void storeu(T *p) const {
        __m128i *dst = reinterpret_cast<__m128i*>(p);
        _mm_storeu_si128(dst, v);
    }

    // Lane-wise 32-bit product (low 32 bits of each product).
    u32x4 operator*(const u32x4 &that) const {
        return _mm_mullo_epi32(v, that.v);
    }
    // Lane-wise 32-bit sum.
    u32x4 operator+(const u32x4 &that) const {
        return _mm_add_epi32(v, that.v);
    }
    // Lane-wise 32-bit difference.
    u32x4 operator-(const u32x4 &that) const {
        return _mm_sub_epi32(v, that.v);
    }
    // In-place lane-wise sum.
    u32x4 &operator+=(const u32x4 &that) {
        v = _mm_add_epi32(v, that.v);
        return *this;
    }

    // Whole-register left shift by s bytes, filling with zeros.
    template<int s> u32x4 slli() const {
        return _mm_slli_si128(v, s);
    }
    // Left shifts by one, two and three 32-bit lanes respectively.
    u32x4 slli4() const { return u32x4(_mm_slli_si128(v, 4)); }
    u32x4 slli8() const { return u32x4(_mm_slli_si128(v, 8)); }
    u32x4 slli12() const { return u32x4(_mm_slli_si128(v, 12)); }

    // Whole-register right shift by s bytes, filling with zeros.
    template<int s> u32x4 srli() const {
        return _mm_srli_si128(v, s);
    }

    // Right shifts by one, two and three 32-bit lanes respectively.
    u32x4 srli4() const { return u32x4(_mm_srli_si128(v, 4)); }
    u32x4 srli8() const { return u32x4(_mm_srli_si128(v, 8)); }
    u32x4 srli12() const { return u32x4(_mm_srli_si128(v, 12)); }
};

template<int PN_of_4, int QN_of_4>
inline void add_template(uint32_t *res, const uint32_t *p, const uint32_t *q) {
    static_assert(PN_of_4 >= QN_of_4, "PN_of_4 >= QN_of_4");
    for(int i = 0; i < QN_of_4; ++ i) {
        u32x4 sum = u32x4::loadu(p + i * 4) + u32x4::loadu(q + i * 4);
        sum.storeu(res + i * 4);
    }
    for(int i = QN_of_4 * 4; i < PN_of_4 * 4; ++ i)
        res[i] = p[i];
}
template<int N_of_4>
inline void add_template(uint32_t *p, const uint32_t *q) {
    for(int i = 0; i < N_of_4; ++ i) {
        u32x4 sum = u32x4::loadu(p + i * 4) + u32x4::loadu(q + i * 4);
        sum.storeu(p + i * 4);
    }
}

template<int PN_of_4, int QN_of_4>
inline void subtract_template(uint32_t *res, const uint32_t *p, const uint32_t *q) {
    static_assert(PN_of_4 >= QN_of_4, "PN_of_4 >= QN_of_4");
    for(int i = 0; i < QN_of_4; ++ i) {
        u32x4 diff = u32x4::loadu(p + i * 4) - u32x4::loadu(q + i * 4);
        diff.storeu(res + i * 4);
    }
    for(int i = QN_of_4 * 4; i < PN_of_4 * 4; ++ i)
        res[i] = p[i];
}
template<int N_of_4>
inline void subtract_template(uint32_t *p, const uint32_t *q) {
    for(int i = 0; i < N_of_4; ++ i) {
        u32x4 diff = u32x4::loadu(p + i * 4) - u32x4::loadu(q + i * 4);
        diff.storeu(p + i * 4);
    }
}

// Wrapper around a single uint32_t with the arithmetic operators forwarded to
// the underlying unsigned value, plus static convolution routines that work
// directly on raw uint32_t arrays whose lengths are compile-time multiples of 4.
// The base class is IntOpDefault instantiated with IntOp32 itself, so its
// generic array helpers operate on IntOp32 elements.
struct IntOp32 : IntOpDefault<IntOp32> {
    uint32_t x;
    IntOp32(): x(0) { }
    explicit IntOp32(uint32_t x_): x(x_) { }

    IntOp32 &operator+=(const IntOp32 &that) { x += that.x; return *this; }
    IntOp32 &operator-=(const IntOp32 &that) { x -= that.x; return *this; }
    IntOp32 &operator*=(const IntOp32 &that) { x *= that.x; return *this; }

    IntOp32 operator+(const IntOp32 &that) const { return IntOp32(x + that.x); }
    IntOp32 operator-(const IntOp32 &that) const { return IntOp32(x - that.x); }
    IntOp32 operator*(const IntOp32 &that) const { return IntOp32(x * that.x); }
    // Negation written as bitwise complement plus one.
    IntOp32 operator-() const { return IntOp32(~x + 1); }

    bool operator==(const IntOp32 &that) const { return x == that.x; }

    // Quadratic-time convolution of p (PN_of_4 * 4 elements) with q
    // (QN_of_4 * 4 elements), processed in groups of four lanes.
    // Writes (PN_of_4 + QN_of_4) * 4 elements to res.
    template<int PN_of_4, int QN_of_4>
    static void convolute_schoolbook_template(uint32_t *res, const uint32_t *p, const uint32_t *q) {
        // One accumulator vector per output group of four; default-constructed to zero.
        u32x4 sum[PN_of_4 + QN_of_4];

        for(int i = 0; i < PN_of_4; ++ i) {
            // Broadcast each of the four p coefficients of group i to all lanes.
            u32x4 x0 = u32x4::set1(p[i * 4 + 0]);
            u32x4 x1 = u32x4::set1(p[i * 4 + 1]);
            u32x4 x2 = u32x4::set1(p[i * 4 + 2]);
            u32x4 x3 = u32x4::set1(p[i * 4 + 3]);

            for(int j = 0; j < QN_of_4; ++ j) {
                u32x4 y = u32x4::loadu(q + j * 4);
                // zk lane l holds p[4i+k] * q[4j+l], which belongs at output
                // index 4(i+j) + k + l.
                u32x4 z0 = x0 * y;
                u32x4 z1 = x1 * y;
                u32x4 z2 = x2 * y;
                u32x4 z3 = x3 * y;

                // Products with k + l <= 3 stay in output group i+j: shifting
                // zk up by k lanes puts each product in lane k + l.
                sum[i + j + 0] += (z0 + z1.slli4()) + (z2.slli8() + z3.slli12());
                // Products with k + l >= 4 spill into group i+j+1. The inner
                // sum lines them up one lane too high (its lane 0 is not
                // needed), and the final one-lane right shift drops that lane
                // and moves the rest into place.
                sum[i + j + 1] += (z1.srli8() + z2.srli4() + z3).srli4();
            }
        }

        for(int i = 0; i < PN_of_4 + QN_of_4; ++ i)
            sum[i].storeu(res + i * 4);
    }

    // Minimum size, in groups of four elements, that both operands must reach
    // for the Karatsuba overload to be selected.
    enum { KARATSUBA_THRESHOLD_OF_4 = 4 };

// True when both operand sizes (in groups of four) reach the threshold above.
#define ENABLE_KARATSUBA(PNo4, QNo4) \
    ((PNo4) >= KARATSUBA_THRESHOLD_OF_4 && (QNo4) >= KARATSUBA_THRESHOLD_OF_4)

    // Karatsuba overload, selected when ENABLE_KARATSUBA holds.
    // p has PNo4 * 4 elements, q has QNo4 * 4 elements, and res receives
    // (PNo4 + QNo4) * 4 elements. Both inputs are split at LOo4 * 4 elements
    // into a low part and a high part.
    // The temporaries t0, t1 and r1 are automatic arrays, so stack usage at
    // each recursion level is proportional to LOo4.
    template<int PNo4, int QNo4>
    static typename enable_if<ENABLE_KARATSUBA(PNo4,QNo4)>::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) {
        // LOo4: size of the low part (half of PNo4, rounded up);
        // HPo4 / HQo4: sizes of the high parts of p and q.
        enum { LOo4 = (PNo4 + 1) / 2, HPo4 = PNo4 - LOo4, HQo4 = QNo4 - LOo4 };
        static_assert(0 < LOo4 && 0 < HQo4 && HPo4 <= LOo4 && HQo4 <= LOo4, "parameters");
        uint32_t t0[LOo4 * 4], t1[LOo4 * 4], r1[LOo4 * 4 * 2];
        // r0 (low * low) and rinf (high * high) are written straight into their
        // final, non-overlapping positions inside res.
        uint32_t * const r0 = res, * const rinf = res + LOo4 * 4 * 2;
        // t0 = p_low + p_high, t1 = q_low + q_high.
        add_template<LOo4, HPo4>(t0, p, p + LOo4 * 4);
        add_template<LOo4, HQo4>(t1, q, q + LOo4 * 4);
        // r1 = t0 * t1, r0 = p_low * q_low, rinf = p_high * q_high.
        convolute_template<LOo4, LOo4>(r1, t0, t1);
        convolute_template<LOo4, LOo4>(r0, p, q);
        convolute_template<HPo4, HQo4>(rinf, p + LOo4 * 4, q + LOo4 * 4);
        // r1 -= r0 + rinf leaves the cross term, which is then added into res
        // at an offset of LOo4 * 4 elements.
        subtract_template<LOo4 * 2>(r1, r0);
        subtract_template<HPo4 + HQo4>(r1, rinf);
        add_template<LOo4 * 2>(res + LOo4 * 4, r1);
    }

    // Fallback overload, selected when ENABLE_KARATSUBA does not hold:
    // forwards to the schoolbook routine.
    template<int PNo4, int QNo4>
    static typename enable_if<!ENABLE_KARATSUBA(PNo4,QNo4)>::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) {
        return convolute_schoolbook_template<PNo4,QNo4>(res, p, q);
    }

#undef ENABLE_KARATSUBA
};

// Number of coefficients in each input array. The convolution in main() is
// instantiated for exactly this size, so it has to stay a multiple of 4.
enum { POLY_LEN = 100000 };

// p and q are 0/1 indicator arrays filled by main(); res receives their
// convolution (POLY_LEN * 2 elements are written).
alignas(16) uint32_t p[POLY_LEN], q[POLY_LEN], res[POLY_LEN * 2];

// Reports malformed input on stderr and returns the failure exit status.
static int bad_input(const char *what) {
    fprintf(stderr, "invalid input: %s\n", what);
    return 1;
}

// Reads L, M, N, then L values a and M values b (1-based), then Q.
// Marks p[N - a] and q[b - 1], convolves the two arrays, and for each
// i in [0, Q) prints res[N - 1 - i], i.e. the number of marked pairs (a, b)
// with a - b == i.
// Returns 0 on success and 1 if the input cannot be parsed or a value would
// index outside the fixed-size arrays.
int main() {
    int L, M, N;
    if(scanf("%d%d%d", &L, &M, &N) != 3)
        return bad_input("expected L M N");

    for(int i = 0; i < L; ++ i) {
        int a;
        if(scanf("%d", &a) != 1)
            return bad_input("missing element of the first set");
        // Same slot as the 0-based form N-1-(a-1); computed in 64 bits so that
        // extreme values cannot overflow before the range check.
        const long long slot = static_cast<long long>(N) - a;
        if(slot < 0 || slot >= POLY_LEN)
            return bad_input("element of the first set out of range");
        p[slot] = 1;
    }

    for(int i = 0; i < M; ++ i) {
        int b;
        if(scanf("%d", &b) != 1)
            return bad_input("missing element of the second set");
        const long long slot = static_cast<long long>(b) - 1;
        if(slot < 0 || slot >= POLY_LEN)
            return bad_input("element of the second set out of range");
        q[slot] = 1;
    }

    IntOp32::convolute_template<POLY_LEN / 4, POLY_LEN / 4>(res, p, q);

    int Q;
    if(scanf("%d", &Q) != 1)
        return bad_input("expected Q");

    for(int i = 0; i < Q; ++ i) {
        const long long slot = static_cast<long long>(N) - 1 - i;
        // A slot outside res corresponds to a coefficient the convolution
        // never produces, so the count there is zero.
        const int ans = (0 <= slot && slot < POLY_LEN * 2)
            ? static_cast<int>(res[slot])
            : 0;
        printf("%d\n", ans);
    }
    return 0;
}
CODE
    # Build tmp.cpp into $exe_name; compiler diagnostics are redirected to
    # my_compile.log.
    system "g++ -m64 -O2 -lm -mavx -std=c++11 tmp.cpp -o $exe_name 2> my_compile.log";
    # $? holds the status of the system call above; nonzero means the build failed.
    if($? != 0) {
        # Relay the captured compiler output to STDERR before aborting.
        # NOTE(review): the result of open is not checked.
        open(my $fh, '<', 'my_compile.log');
        while(<$fh>) { print STDERR $_; }
        die 'compile error';
    }
}

yukicoder

結果

コンパイルメッセージ

ソースコード