system './' . $exe_name; BEGIN { $source_name = 'my_tmp.cpp'; $exe_name = $^O eq 'MSWin32' ? 'my_a.exe' : 'my_a.out'; return if -e $exe_name; open my $fh, '>', $source_name; print $fh <<'CODE'; #line 9 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NDEBUG #undef NDEBUG #endif #include #include #if defined(_MSC_VER) #include #endif #ifdef _DEBUG #undef assert #include "C:\Dropbox\backup\implements\Util\MyAssert.hpp" #define assert my_assert #else #undef assert #define assert(x) #endif #define rep(i,n) for(int (i)=0;(i)<(int)(n);++(i)) #define rer(i,l,u) for(int (i)=(int)(l);(i)<=(int)(u);++(i)) #define reu(i,l,u) for(int (i)=(int)(l);(i)<(int)(u);++(i)) #if defined(_MSC_VER) || __cplusplus > 199711L #define aut(r,v) auto r = (v) #else #define aut(r,v) __typeof(v) r = (v) #endif #define each(it,o) for(aut(it, (o).begin()); it != (o).end(); ++ it) #define all(o) (o).begin(), (o).end() #define pb(x) push_back(x) #define mp(x,y) make_pair((x),(y)) #define mset(m,v) memset(m,v,sizeof(m)) #define INF 0x3f3f3f3f #define INFL 0x3f3f3f3f3f3f3f3fLL using namespace std; typedef vector vi; typedef pair pii; typedef vector > vpii; typedef long long ll; template inline void amin(T &x, U y) { if(y < x) x = y; } template inline void amax(T &x, U y) { if(x < y) x = y; } #ifdef _MSC_VER #define alignas(x) __declspec(align(x)) #endif struct u32x4 { __m128i v; u32x4(): v(_mm_setzero_si128()) { } u32x4(const __m128i &v_): v(v_) { } __m128i get() const { return v; } static u32x4 set1(uint32_t x) { return u32x4(_mm_set1_epi32(x)); } template static u32x4 loadu(const T *p) { return u32x4(_mm_loadu_si128(reinterpret_cast(p))); } template void storeu(T *p) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); } u32x4 operator*(const u32x4 &that) const { return u32x4(_mm_mullo_epi32(v, that.v)); } u32x4 operator+(const u32x4 &that) const { return u32x4(_mm_add_epi32(v, that.v)); } u32x4 operator-(const u32x4 &that) const { return u32x4(_mm_sub_epi32(v, that.v)); } u32x4 &operator+=(const u32x4 &that) { return *this = *this + that; } template u32x4 slli() const { return u32x4(_mm_slli_si128(v, s)); } u32x4 slli4() const { return slli<4>(); } u32x4 slli8() const { return slli<8>(); } u32x4 slli12() const { return slli<12>(); } template u32x4 srli() const { return u32x4(_mm_srli_si128(v, s)); } u32x4 srli4() const { return srli<4>(); } u32x4 srli8() const { return srli<8>(); } u32x4 srli12() const { return srli<12>(); } }; struct IntOp32 { typedef IntOp32 R; uint32_t x; IntOp32(): x(0) { } explicit IntOp32(uint32_t x_): x(x_) { } IntOp32 &operator+=(const IntOp32 &that) { x += that.x; return *this; } IntOp32 &operator-=(const IntOp32 &that) { x -= that.x; return *this; } IntOp32 &operator*=(const IntOp32 &that) { x *= that.x; return *this; } IntOp32 operator+(const IntOp32 &that) const { return IntOp32(x + that.x); } IntOp32 operator-(const IntOp32 &that) const { return IntOp32(x - that.x); } IntOp32 operator*(const IntOp32 &that) const { return IntOp32(x * that.x); } IntOp32 operator-() const { return IntOp32(~x + 1); } IntOp32 operator>>(int s) const { return IntOp32(x >> s); } IntOp32 operator<<(int s) const { return IntOp32(x << s); } bool operator==(const IntOp32 &that) const { return x == that.x; } //resは (PN_4 + QN_4) * 4 のサイズを書き込む template static void convolute_schoolbook_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { u32x4 sum[PN_4 + QN_4]; for(int i = 0; i < PN_4; ++ i) { u32x4 x0 = u32x4::set1(p[i * 4 + 0]); u32x4 x1 = u32x4::set1(p[i * 4 + 1]); u32x4 x2 = u32x4::set1(p[i * 4 + 2]); u32x4 x3 = u32x4::set1(p[i * 4 + 3]); for(int j = 0; j < QN_4; ++ j) { u32x4 y = u32x4::loadu(q + j * 4); u32x4 z0 = x0 * y; u32x4 z1 = x1 * y; u32x4 z2 = x2 * y; u32x4 z3 = x3 * y; sum[i + j + 0] += (z0 + z1.slli4()) + (z2.slli8() + z3.slli12()); sum[i + j + 1] += (z1.srli8() + z2.srli4() + z3).srli4(); } } for(int i = 0; i < PN_4 + QN_4; ++ i) sum[i].storeu(res + i * 4); } typedef u32x4 Vec; static const int V = 4; enum { KARATSUBA_THRESHOLD_4 = 4 }; #define ENABLE_KARATSUBA(PN_4, QN_4) \ ((PN_4) >= KARATSUBA_THRESHOLD_4 && (QN_4) >= KARATSUBA_THRESHOLD_4) template static typename enable_if::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { enum { LO_4 = (PN_4 + 1) / 2, HP_4 = PN_4 - LO_4, HQ_4 = QN_4 - LO_4 }; static_assert(0 < LO_4 && 0 < HQ_4 && HP_4 <= LO_4 && HQ_4 <= LO_4, "parameters"); uint32_t t0[LO_4 * 4], t1[LO_4 * 4], r1[LO_4 * 4 * 2]; uint32_t * const r0 = res, * const rinf = res + LO_4 * 4 * 2; add_template(t0, p, p + LO_4 * 4); add_template(t1, q, q + LO_4 * 4); convolute_template(r1, t0, t1); convolute_template(r0, p, q); convolute_template(rinf, p + LO_4 * 4, q + LO_4 * 4); subtract_template(r1, r0); subtract_template(r1, rinf); add_template(res + LO_4 * 4, r1); } template static typename enable_if::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { return convolute_schoolbook_template(res, p, q); } template static void convolute_template(R *res, const R *p, const R *q) { convolute_template((uint32_t*)res, (const uint32_t*)p, (const uint32_t*)q); } #undef ENABLE_KARATSUBA template static void add_template(uint32_t *p, const uint32_t *q) { for(int i = 0; i < N / V; ++ i) { Vec sum = u32x4::loadu(p + i * V) + Vec::loadu(q + i * V); sum.storeu(p + i * V); } for(int i = N / V * V; i < N; ++ i) p[i] += q[i]; } template static void add_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { static_assert(PN >= QN, "PN >= QN"); for(int i = 0; i < QN / V; ++ i) { Vec sum = Vec::loadu(p + i * V) + Vec::loadu(q + i * V); sum.storeu(res + i * V); } for(int i = QN / V * V; i < QN; ++ i) res[i] = p[i] + q[i]; for(int i = QN; i < PN; ++ i) res[i] = p[i]; } template static void subtract_template(uint32_t *p, const uint32_t *q) { for(int i = 0; i < N / V; ++ i) { Vec diff = Vec::loadu(p + i * V) - Vec::loadu(q + i * V); diff.storeu(p + i * V); } for(int i = N / V * V; i < N; ++ i) p[i] -= q[i]; } template static void subtract_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { static_assert(PN >= QN, "PN >= QN"); for(int i = 0; i < QN / V; ++ i) { Vec diff = Vec::loadu(p + i * V) - Vec::loadu(q + i * V); diff.storeu(res + i * V); } for(int i = QN / V * V; i < QN; ++ i) res[i] = p[i] - q[i]; for(int i = QN; i < PN; ++ i) res[i] = p[i]; } static void add(R *p, const R *q, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) { Vec sum = Vec::loadu(p + i) + Vec::loadu(q + i); sum.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] += q[i]; } static void subtract(R *p, const R *q, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) { Vec diff = Vec::loadu(p + i) - Vec::loadu(q + i); diff.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] -= q[i]; } static void negate(R *p, const R *q, int n) { int n_t = n / V * V; Vec zero = Vec(); for(int i = 0; i < n_t; i += V) { Vec neg = zero - Vec::loadu(q + i); neg.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] = -q[i]; } static void sumdiff(R *x0, R *x1, R *tmp, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) { Vec a = Vec::loadu(x0 + i), b = Vec::loadu(tmp + i); (a + b).storeu(x0 + i); (a - b).storeu(x1 + i); } for(int i = n_t; i < n; ++ i) { R a = x0[i], b = tmp[i]; x0[i] = a + b; x1[i] = a - b; } } static void swap_range(R *p, R *q, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) { Vec a = Vec::loadu(p + i), b = Vec::loadu(q + i); a.storeu(q + i); b.storeu(p + i); } for(int i = n_t; i < n; ++ i) swap(p[i], q[i]); } static void copy(R *res, const R *p, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) Vec::loadu(p + i).storeu(res + i); for(int i = n_t; i < n; ++ i) res[i] = p[i]; } static void fill_zero(R *p, int n) { int n_t = n / V * V; Vec zero; for(int i = 0; i < n_t; i += V) zero.storeu(p + i); for(int i = n_t; i < n; ++ i) p[i].x = 0; } template static void bitshift_right_template(R *p, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) Vec(_mm_srli_epi32(Vec::loadu(p + i).get(), k)).storeu(p + i); for(int i = n_t; i < n; ++ i) p[i].x >>= k; } }; struct IntPolynomial { typedef IntOp32 R; static const int MaxK = 15; template static typename enable_if::type schonhage_strassen_template(R *res, const R *X, const R *Y); template static typename enable_if::type schonhage_strassen_template(R *res, const R *X, const R *Y) { return; } static void schonhage_strassen_decompose(R *x, const R *X, int K, int n, int N, int m, R *tmp, bool negacyclic); static void schonhage_strassen_compose(R *res, const R *x, int K, int n, int N, int m, R *tmp, bool negacyclic); static void negacyclic_shift(R *res, const R *x, int n, int shift); static void fft(bool inv, int k, R *x, int n, R *tmp); }; //上位の何ビットか(再帰でのkの合計)は0となる。 //RがZ/(2^32)だったらZ/(2^{32-k})で計算されるということ。 template typename enable_if::type IntPolynomial::schonhage_strassen_template(R *res, const R *X, const R *Y) { static_assert(0 < k && k <= MaxK, "param"); static_assert(0 < N && Xn <= N && Yn <= N, "param"); enum { K = 1 << k, K_2 = K / 2 }; static_assert(N % K == 0, "param"); enum { m = N / K }; //X,YをK個に分割するときの1つのサイズ //(K | (!negacyclic ? 2n : n))であり、2m-1 <= n である最小の数 enum { nn = !negacyclic ? ((m * 2 - 2) / K_2 + 1) * K_2 : ((m * 2 - 2) / K + 1) * K }; //P = R[X] / (X^n + 1) として一つのPの要素として扱うサイズ enum { n = (((nn - 1) >> next_k) + 1) << next_k }; enum { n_4 = (n + 3) / 4 }; static_assert(n < N, "n < N"); enum { order = n * 2 }; //X ∈ P の multiplicative order static_assert(order % K == 0, "order"); enum { nK = n * K }; R *tmp_buffer = new R[nK * 2 + n]; R *x = tmp_buffer, *y = x + nK, *tmp = y + nK; schonhage_strassen_decompose(x, X, K, n, Xn, m, tmp, negacyclic); schonhage_strassen_decompose(y, Y, K, n, Yn, m, tmp, negacyclic); fft(false, k, x, n, tmp); fft(false, k, y, n, tmp); if(next_k == 0) { alignas(16) R tmp_x[n_4 * 4], tmp_y[n_4 * 4], tmp_res[n_4 * 4 * 2]; for(int i = 0; i < nK; i += n) { R::copy(tmp_x, x + i, n); R::fill_zero(tmp_x + n, n_4 * 4 - n); R::copy(tmp_y, y + i, n); R::fill_zero(tmp_y + n, n_4 * 4 - n); R::convolute_template(tmp_res, tmp_x, tmp_y); R::copy(x + i, tmp_res, n); R::subtract(x + i, tmp_res + n, n - 1); } }else { for(int i = 0; i < nK; i += n) { schonhage_strassen_template(x + i, x + i, y + i); } } fft(true, k, x, n, tmp); R::bitshift_right_template(x, nK); schonhage_strassen_compose(res, x, K, n, N, m, tmp, negacyclic); delete[] tmp_buffer; } void IntPolynomial::schonhage_strassen_decompose(R *x, const R *X, int K, int n, int N, int m, R *tmp, bool negacyclic) { assert((N + m - 1) / m <= K); if(!negacyclic) { R::fill_zero(x, n * K); for(int i = 0, j = 0; ; i += n, j += m) { int size = j + m <= N ? m : N - j; R::copy(x + i, X + j, size); if(size < m) break; } }else { assert(n % K == 0); R::fill_zero(x, n * K); int omega = n / K, shift = 0; for(int i = 0, j = 0; ; i += n, j += m) { int size = j + m <= N ? m : N - j; R::copy(tmp, X + j, size); R::fill_zero(tmp + size, n - size); negacyclic_shift(x + i, tmp, n, shift); if(size < m) break; shift += omega; } } } void IntPolynomial::schonhage_strassen_compose(R *res, const R *x, int K, int n, int N, int m, R *tmp, bool negacyclic) { assert(N >= n); int nK = n * K; R::fill_zero(res, N); if(!negacyclic) { for(int i = 0, j = 0; i < nK; i += n) { if(j + n <= N) { R::add(res + j, x + i, n); }else { int s = N - j; R::add(res + j, x + i, s); R::add(res, x + i + s, n - s); } if((j += m) >= N) j -= N; } }else { assert(n % K == 0); bool sign = false; int omega = n / K, shift = n * 2; for(int i = 0, j = 0; i < nK; i += n) { negacyclic_shift(tmp, x + i, n, shift); if(j + n <= N) { if(!sign) R::add(res + j, tmp, n); else R::subtract(res + j, tmp, n); }else { int s = N - j; if(!sign) { R::add(res + j, tmp, s); R::subtract(res, tmp + s, n - s); }else { R::subtract(res + j, tmp, s); R::add(res, tmp + s, n - s); } } if((j += m) >= N) { sign = !sign; j -= N; } shift -= omega; } } } void IntPolynomial::negacyclic_shift(R *res, const R *x, int n, int shift) { assert(0 <= shift && shift <= n * 2); if(shift < n) { R::negate(res, x + (n - shift), shift); R::copy(res + shift, x, n - shift); }else { int s = shift - n; R::copy(res, x + (n - s), s); R::negate(res + s, x, n - s); } } void IntPolynomial::fft(bool inv, int k, R *x, int n, R *tmp) { assert(k > 0); int order = n * 2; int K = 1 << k, K_2 = K / 2, nK = n * K; for(int i = 1, j = 0; i < K - 1; ++ i) { for(int h = K_2; h > (j ^= h); h >>= 1) ; if(i < j) R::swap_range(x + i * n, x + j * n, n); } for(int r = 0; r < nK; r += n * 2) { R *x0 = x + r, *x1 = x0 + n; R::sumdiff(x0, x1, x1, n); } int init_order = !inv ? 0 : order; for(int l = 2; l <= k; ++ l) { int L = 1 << l, nL_2 = n << (l-1), nL = n << l; int ww = !inv ? (order >> l) : -(order >> l); for(int r = 0; r < nK; r += nL) { int w = init_order; for(int j = 0; j < nL_2; j += n) { R *x0 = x + r + j, *x1 = x0 + nL_2; negacyclic_shift(tmp, x1, n, w); R::sumdiff(x0, x1, tmp, n); w += ww; } } } } alignas(16) uint32_t p[100000], q[100000], res[200000]; int main() { int L, M, N; scanf("%d%d%d", &L, &M, &N); rep(i, L) { int a; scanf("%d", &a), -- a; p[N-1-a] = 1; } rep(i, M) { int b; scanf("%d", &b), -- b; q[b] = 1; } using R = IntPolynomial::R; IntPolynomial::schonhage_strassen_template<100000,100000,200704,10,5,false>((R*)res, (const R*)p, (const R*)q); int Q; scanf("%d", &Q); rep(i, Q) { int ans = res[N-1-i]; printf("%d\n", ans); } return 0; } CODE system "g++ -m64 -O2 -lm -march=native -std=c++11 $source_name -o $exe_name 2> my_compile.log"; if($? != 0) { open(my $fh, '<', 'my_compile.log'); while(<$fh>) { print STDERR $_; } die 'compile error'; } }