system './' . $exe_name; BEGIN { $exe_name = $^O eq 'MSWin32' ? 'a.exe' : 'a.out'; return if -e $exe_name; open my $fh, '>', 'tmp.cpp'; print $fh <<'CODE'; #line 8 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NDEBUG #undef NDEBUG #endif #include #include #if defined(_MSC_VER) #include #endif #ifdef _DEBUG #undef assert #include "C:\Dropbox\backup\implements\Util\MyAssert.hpp" #define assert my_assert #else #undef assert #define assert(x) #endif #define rep(i,n) for(int (i)=0;(i)<(int)(n);++(i)) #define rer(i,l,u) for(int (i)=(int)(l);(i)<=(int)(u);++(i)) #define reu(i,l,u) for(int (i)=(int)(l);(i)<(int)(u);++(i)) #if defined(_MSC_VER) || __cplusplus > 199711L #define aut(r,v) auto r = (v) #else #define aut(r,v) __typeof(v) r = (v) #endif #define each(it,o) for(aut(it, (o).begin()); it != (o).end(); ++ it) #define all(o) (o).begin(), (o).end() #define pb(x) push_back(x) #define mp(x,y) make_pair((x),(y)) #define mset(m,v) memset(m,v,sizeof(m)) #define INF 0x3f3f3f3f #define INFL 0x3f3f3f3f3f3f3f3fLL using namespace std; typedef vector vi; typedef pair pii; typedef vector > vpii; typedef long long ll; template inline void amin(T &x, U y) { if(y < x) x = y; } template inline void amax(T &x, U y) { if(x < y) x = y; } #ifdef _MSC_VER #define alignas(x) __declspec(align(x)) #endif template struct IntOpDefault { typedef R_ R; static void copy(R *res, const R *p, int n) { for(int i = 0; i < n; ++ i) res[i] = p[i]; } static void fill_zero(R *p, int n) { for(int i = 0; i < n; ++ i) p[i] = R(); } static R inverse(R x) { R i = x, p, TWO = R(2), ONE = R(1); do { p = i * x; i *= TWO - p; }while(!(p == ONE)); return i; } static void swap_range(R *p, R *q, int n) { using std::swap; for(int i = 0; i < n; ++ i) swap(p[i], q[i]); } static void three_point_fft(int p, R *x0, R *x1, R *x2, R *t1, R *t2) { //x0' = x0 + t1 + t2 //x1' = x0 + X^p t1 + X^{2p} t2 //x2' = x0 + X^{2p} t1 + X^p t2 // //X^p t = -tb + (ta - tb) X^p //X^{2p} t = (xb - xa) + -xa X^p // //x0' = (x0a + t1a + t2a) + (x0b + t1b + t2b) X^p //x1' = (x0a - t1b - t2a + t2b) + (x0b + t1a - t1b - t2a) X^p //x2' = (x0a - t1a + t1b - t2b) + (x0b - t1a + t2a - t2b) X^p for(int a = 0, b = p; a < p; ++ a, ++ b) { R x0a = x0[a], t1a = t1[a], t2a = t2[a]; R x0b = x0[b], t1b = t1[b], t2b = t2[b]; x0[a] = x0a + t1a + t2a; x0[b] = x0b + t1b + t2b; x1[a] = x0a - t1b - t2a + t2b; x1[b] = x0b + t1a - t1b - t2a; x2[a] = x0a - t1a + t1b - t2b; x2[b] = x0b - t1a + t2a - t2b; } } }; struct u32x4 { __m128i v; u32x4(): v(_mm_setzero_si128()) { } u32x4(const __m128i &v_): v(v_) { } static u32x4 set1(uint32_t x) { return u32x4(_mm_set1_epi32(x)); } template static u32x4 loadu(const T *p) { return u32x4(_mm_loadu_si128(reinterpret_cast(p))); } template void storeu(T *p) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); } u32x4 operator*(const u32x4 &that) const { return u32x4(_mm_mullo_epi32(v, that.v)); } u32x4 operator+(const u32x4 &that) const { return u32x4(_mm_add_epi32(v, that.v)); } u32x4 operator-(const u32x4 &that) const { return u32x4(_mm_sub_epi32(v, that.v)); } u32x4 &operator+=(const u32x4 &that) { return *this = *this + that; } template u32x4 slli() const { return u32x4(_mm_slli_si128(v, s)); } u32x4 slli4() const { return slli<4>(); } u32x4 slli8() const { return slli<8>(); } u32x4 slli12() const { return slli<12>(); } template u32x4 srli() const { return u32x4(_mm_srli_si128(v, s)); } u32x4 srli4() const { return srli<4>(); } u32x4 srli8() const { return srli<8>(); } u32x4 srli12() const { return srli<12>(); } }; struct IntOp32 : IntOpDefault { uint32_t x; IntOp32(): x(0) { } explicit IntOp32(uint32_t x_): x(x_) { } IntOp32 &operator+=(const IntOp32 &that) { x += that.x; return *this; } IntOp32 &operator-=(const IntOp32 &that) { x -= that.x; return *this; } IntOp32 &operator*=(const IntOp32 &that) { x *= that.x; return *this; } IntOp32 operator+(const IntOp32 &that) const { return IntOp32(x + that.x); } IntOp32 operator-(const IntOp32 &that) const { return IntOp32(x - that.x); } IntOp32 operator*(const IntOp32 &that) const { return IntOp32(x * that.x); } IntOp32 operator-() const { return IntOp32(~x + 1); } bool operator==(const IntOp32 &that) const { return x == that.x; } //resは (PN_4 + QN_4) * 4 のサイズを書き込む template static void convolute_schoolbook_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { u32x4 sum[PN_4 + QN_4]; for(int i = 0; i < PN_4; ++ i) { u32x4 x0 = u32x4::set1(p[i * 4 + 0]); u32x4 x1 = u32x4::set1(p[i * 4 + 1]); u32x4 x2 = u32x4::set1(p[i * 4 + 2]); u32x4 x3 = u32x4::set1(p[i * 4 + 3]); for(int j = 0; j < QN_4; ++ j) { u32x4 y = u32x4::loadu(q + j * 4); u32x4 z0 = x0 * y; u32x4 z1 = x1 * y; u32x4 z2 = x2 * y; u32x4 z3 = x3 * y; sum[i + j + 0] += (z0 + z1.slli4()) + (z2.slli8() + z3.slli12()); sum[i + j + 1] += (z1.srli8() + z2.srli4() + z3).srli4(); } } for(int i = 0; i < PN_4 + QN_4; ++ i) sum[i].storeu(res + i * 4); } typedef u32x4 Vec; static const int V = 4; enum { KARATSUBA_THRESHOLD_4 = 4 }; #define ENABLE_KARATSUBA(PN_4, QN_4) \ ((PN_4) >= KARATSUBA_THRESHOLD_4 && (QN_4) >= KARATSUBA_THRESHOLD_4) template static typename enable_if::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { enum { LO_4 = (PN_4 + 1) / 2, HP_4 = PN_4 - LO_4, HQ_4 = QN_4 - LO_4 }; static_assert(0 < LO_4 && 0 < HQ_4 && HP_4 <= LO_4 && HQ_4 <= LO_4, "parameters"); uint32_t t0[LO_4 * 4], t1[LO_4 * 4], r1[LO_4 * 4 * 2]; uint32_t * const r0 = res, * const rinf = res + LO_4 * 4 * 2; add_template(t0, p, p + LO_4 * 4); add_template(t1, q, q + LO_4 * 4); convolute_template(r1, t0, t1); convolute_template(r0, p, q); convolute_template(rinf, p + LO_4 * 4, q + LO_4 * 4); subtract_template(r1, r0); subtract_template(r1, rinf); add_template(res + LO_4 * 4, r1); } template static typename enable_if::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { return convolute_schoolbook_template(res, p, q); } template static void convolute_template(R *res, const R *p, const R *q) { convolute_template((uint32_t*)res, (const uint32_t*)p, (const uint32_t*)q); } #undef ENABLE_KARATSUBA template static void add_template(uint32_t *p, const uint32_t *q) { for(int i = 0; i < N / V; ++ i) { Vec sum = u32x4::loadu(p + i * V) + Vec::loadu(q + i * V); sum.storeu(p + i * V); } for(int i = N / V * V; i < N; ++ i) p[i] += q[i]; } template static void add_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { static_assert(PN >= QN, "PN >= QN"); for(int i = 0; i < QN / V; ++ i) { Vec sum = Vec::loadu(p + i * V) + Vec::loadu(q + i * V); sum.storeu(res + i * V); } for(int i = QN / V * V; i < QN; ++ i) res[i] = p[i] + q[i]; for(int i = QN; i < PN; ++ i) res[i] = p[i]; } template static void subtract_template(uint32_t *p, const uint32_t *q) { for(int i = 0; i < N / V; ++ i) { Vec diff = Vec::loadu(p + i * V) - Vec::loadu(q + i * V); diff.storeu(p + i * V); } for(int i = N / V * V; i < N; ++ i) p[i] -= q[i]; } template static void subtract_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { static_assert(PN >= QN, "PN >= QN"); for(int i = 0; i < QN / V; ++ i) { Vec diff = Vec::loadu(p + i * V) - Vec::loadu(q + i * V); diff.storeu(res + i * V); } for(int i = QN / V * V; i < QN; ++ i) res[i] = p[i] - q[i]; for(int i = QN; i < PN; ++ i) res[i] = p[i]; } static void add(R *p, const R *q, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) { Vec sum = Vec::loadu(p + i) + Vec::loadu(q + i); sum.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] += q[i]; } static void subtract(R *p, const R *q, int n) { int n_t = n / V * V; for(int i = 0; i < n_t; i += V) { Vec diff = Vec::loadu(p + i) - Vec::loadu(q + i); diff.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] -= q[i]; } static void negate_all(R *p, const R *q, int n) { int n_t = n / V * V; Vec zero = Vec(); for(int i = 0; i < n_t; i += V) { Vec neg = zero - Vec::loadu(q + i); neg.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] = -q[i]; } static void three_point_fft(int p, R *x0, R *x1, R *x2, R *t1, R *t2) { int p_t = p / V * V; for(int a = 0, b = p; a < p_t; a += V, b += V) { Vec x0a = Vec::loadu(x0 + a), t1a = Vec::loadu(t1 + a), t2a = Vec::loadu(t2 + a); Vec x0b = Vec::loadu(x0 + b), t1b = Vec::loadu(t1 + b), t2b = Vec::loadu(t2 + b); (x0a + t1a + t2a).storeu(x0 + a); (x0b + t1b + t2b).storeu(x0 + b); (x0a - t1b - t2a + t2b).storeu(x1 + a); (x0b + t1a - t1b - t2a).storeu(x1 + b); (x0a - t1a + t1b - t2b).storeu(x2 + a); (x0b - t1a + t2a - t2b).storeu(x2 + b); } for(int a = p_t, b = p + p_t; a < p; ++ a, ++ b) { R x0a = x0[a], t1a = t1[a], t2a = t2[a]; R x0b = x0[b], t1b = t1[b], t2b = t2[b]; x0[a] = x0a + t1a + t2a; x0[b] = x0b + t1b + t2b; x1[a] = x0a - t1b - t2a + t2b; x1[b] = x0b + t1a - t1b - t2a; x2[a] = x0a - t1a + t1b - t2b; x2[b] = x0b - t1a + t2a - t2b; } } static void multiply_scalar(R *p, int n, R scalar) { int n_t = n / V * V; Vec x = Vec::set1(scalar.x); for(int i = 0; i < n_t; i += V) { Vec prod = Vec::loadu(p + i) * x; prod.storeu(p + i); } for(int i = n_t; i < n; ++ i) p[i] *= scalar; } }; template struct power_of_three_template { enum { val = power_of_three_template::val * 3 }; }; template<> struct power_of_three_template<0> { enum { val = 1 }; }; struct IntPolynomial { typedef IntOp32 R; static const int MaxK = 10; static int power_of_three(int k) { assert(0 <= k && k <= MaxK); static const int table[MaxK+1] = { 1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049 }; return table[k]; } template static void schonhage_strassen_template(R *res, const R *X, const R *Y); static void schonhage_strassen_decompose(R *x, const R *X, int K, int n, int N, int m); static void schonhage_strassen_compose(R *res, const R *x, int K, int n, int N, int m); static void base3_modulo(R *x, int p); static void base3_shift(R *res, const R *x, int p, int shift, R *tmp); static void base3_fft(bool inv, int k, R *x, int p, R *tmp_buffer); }; template void IntPolynomial::schonhage_strassen_template(R *res, const R *X, const R *Y) { static_assert(0 < k && k <= MaxK, "param"); static_assert(0 < N && Xn <= N && Yn <= N, "param"); enum { K = power_of_three_template::val, K_3 = K / 3 }; static_assert(N % K == 0, "param"); enum { m = N / K }; //X,YをK個に分割するときの1つのサイズ enum { p = ((m - 1) / K_3 + 1) * K_3 }; //(K | 3p)であり、2m <= 2p である最小の数 enum { n = p * 2, n_4 = (n + 3) / 4, }; //一つのPの要素として扱うサイズ static_assert((int)n < (int)N, "n < N"); enum { order = p * 3 }; static_assert(order % K == 0, "order"); enum { omega = order / K }; //K^th root of unity enum { inv_omega = order - omega }; enum { nK = n * K }; R *tmp_buffer = new R[nK * 2 + n * 3]; R *x = tmp_buffer, *y = x + nK, *tmp = y + nK; schonhage_strassen_decompose(x, X, K, n, Xn, m); schonhage_strassen_decompose(y, Y, K, n, Yn, m); base3_fft(false, k, x, p, tmp); base3_fft(false, k, y, p, tmp); { alignas(16) R tmp_x[n_4 * 4], tmp_y[n_4 * 4], tmp_res[n_4 * 4 * 2]; for(int i = 0; i < nK; i += n) { R::copy(tmp_x, x + i, n); R::fill_zero(tmp_x + n, n_4 * 4 - n); R::copy(tmp_y, y + i, n); R::fill_zero(tmp_y + n, n_4 * 4 - n); R::convolute_template(tmp_res, tmp_x, tmp_y); base3_modulo(tmp_res, p); R::copy(x + i, tmp_res, n); } } base3_fft(true, k, x, p, tmp); R invK = R::inverse(R(K)); R::multiply_scalar(x, nK, invK); schonhage_strassen_compose(res, x, K, n, N, m); delete[] tmp_buffer; } void IntPolynomial::schonhage_strassen_decompose(R *x, const R *X, int K, int n, int N, int m) { assert((N + m - 1) / m <= K); R::fill_zero(x, n * K); for(int i = 0, j = 0; ; i += n, j += m) { if(j + m <= N) { R::copy(x + i, X + j, m); }else { R::copy(x + i, X + j, N - j); break; } } } void IntPolynomial::schonhage_strassen_compose(R *res, const R *x, int K, int n, int N, int m) { assert(N >= n); int nK = n * K; R::fill_zero(res, N); for(int i = 0, j = 0; i < nK; i += n) { if(j + n <= N) { R::add(res + j, x + i, n); }else { int s = N - j; R::add(res + j, x + i, s); R::add(res, x + i + s, n - s); } if((j += m) >= N) j -= N; } } //4p-1サイズのxを(X^{2p} + X^p + 1)で剰余を取る void IntPolynomial::base3_modulo(R *x, int p) { //4p-2..2pのうち、距離がp未満なら相互に作用しないので、 //4p-2..3p, 3p-1..2p の2つに分割して、その中では連続した演算となる R::subtract(x + p , x + p * 3, p - 1); R::subtract(x + p * 2, x + p * 3, p - 1); R::subtract(x , x + p * 2, p); R::subtract(x + p , x + p * 2, p); } //サイズ2pのxに対してX^shiftをかけた後(X^{2p} + X^p + 1)で剰余を取る //tmpは 2p のサイズが必要 void IntPolynomial::base3_shift(R *res, const R *x, int p, int shift, R *tmp) { assert(0 <= shift && shift < 3 * p); assert(res != x); if(shift == 0) { R::copy(res, x, p * 2); }else if(shift == p) { //X^p x = X^{2p} x2 + X^p x1 //res = X^p (x1 - x2) - x2 R::negate_all(res, x + p, p); R::copy(res + p, x, p); R::subtract(res + p, x + p, p); }else if(shift < p) { //X^shift x = X^{2p} x2 + X^p x1 + x0 //res = X^p (x1 - x2) + (x0 - x2) R::copy(res + shift, x, p * 2 - shift); R::fill_zero(res, shift); R::subtract(res, x + (p * 2 - shift), shift); R::subtract(res + p, x + (p * 2 - shift), shift); }else if(shift == p * 2) { //X^{2p} x = X^{3p} x3 + X^{2p} x2 //res = X^p (-x2) + (x3 - x2) R::negate_all(res + p, x, p); R::copy(res, x + p, p); R::subtract(res, x, p); }else if(shift < p * 2) { //X^shift x = X^{3p} x3 + X^{2p} x2 + X^p x1 //res = (x1 - x2) X^p + (x3 - x2) int s = shift - p; const R *x1 = x, *x2 = x + (p - s), *x3 = x + (2 * p - s); R::copy(res + p + s, x1, p - s); R::fill_zero(res + p, s); R::copy(res, x3, s); R::fill_zero(res + s, p - s); R::subtract(res, x2, p); R::subtract(res + p, x2, p); return; }else { //X^shift x = X^{4p} x4 + X^{3p} x3 + X^{2p} x2 //res = (x4 - x2) X^p + (x3 - x2) int s = shift - p * 2; R::copy(res, x + (p - s), p + s); R::fill_zero(res + (p + s), p - s); R::subtract(res + s, x, p - s); R::subtract(res + p + s, x, p - s); return; } } //tmp_bufferは3nのサイズ void IntPolynomial::base3_fft(bool inv, int k, R *x, int p, R *tmp_buffer) { int n = p * 2, order = p * 3; int K = power_of_three(k), K_3 = K / 3; for(int i = 0; i < K; ++ i) { int j = 0; for(int t = 0, a = i; t < k; ++ t){ j = j * 3 + a % 3; a /= 3; } if(i < j) R::swap_range(x + i * n, x + j * n, n); } R *t1 = tmp_buffer, *t2 = t1 + n, *tmp = t2 + n; int omega = !inv ? order / K : order - order / K; int omegaK_3 = omega * K_3 % order; for(int L = 3; L <= K; L *= 3) { int L_3 = L / 3, nL_3 = n * L_3; int ww = omega * (K / L) % order, ww2 = ww * 2 % order; for(int r = 0; r < K; r += L) { int w = 0, w2 = 0; R *x0 = x + r * n, *x1 = x0 + nL_3, *x2 = x1 + nL_3; for(int j = 0; j < L_3; ++ j) { base3_shift(t1, x1, p, w, tmp); base3_shift(t2, x2, p, w2, tmp); if(!inv) R::three_point_fft(p, x0, x1, x2, t1, t2); else R::three_point_fft(p, x0, x1, x2, t2, t1); if((w += ww) >= order) w -= order; if((w2 += ww2) >= order) w2 -= order; x0 += n, x1 += n, x2 += n; } } } } alignas(16) uint32_t p[100000], q[100000], res[200000]; int main() { int L, M, N; scanf("%d%d%d", &L, &M, &N); rep(i, L) { int a; scanf("%d", &a), -- a; p[N-1-a] = 1; } rep(i, M) { int b; scanf("%d", &b), -- b; q[b] = 1; } using R = IntPolynomial::R; IntPolynomial::schonhage_strassen_template<100000,100000,200232,5>((R*)res, (const R*)p, (const R*)q); int Q; scanf("%d", &Q); rep(i, Q) { int ans = res[N-1-i]; printf("%d\n", ans); } return 0; } CODE system "g++ -m64 -O2 -lm -mavx -std=c++11 tmp.cpp -o $exe_name 2> my_compile.log"; if($? != 0) { open(my $fh, '<', 'my_compile.log'); while(<$fh>) { print STDERR $_; } die 'compile error'; } }