# Self-compiling launcher: a Perl driver that carries a C++ program in a heredoc.
#
# Perl side
#   * BEGIN block: picks $exe_name ('a.exe' when $^O is 'MSWin32', otherwise
#     'a.out') and returns immediately if that file already exists. Otherwise
#     it writes the heredoc text to 'tmp.cpp', runs g++ on it with stderr
#     redirected to 'my_compile.log', and when $? is non-zero copies that log
#     to STDERR and dies with 'compile error'.
#   * The first statement (executed after BEGIN has finished) runs the binary
#     through system './' . $exe_name.
#
# Embedded C++ side (the text between <<'CODE' and the CODE terminator)
#   * IntOpDefault: element-wise helpers over arrays of R (copy, fill_zero,
#     negate_all, add, subtract, multiply_scalar), a double-loop
#     convolute_schoolbook, and inverse(), which repeats
#     p = i * x; i *= TWO - p; until p == ONE.
#   * u32x4: thin wrapper around __m128i exposing 32-bit lane add / sub /
#     mullo and whole-register byte shifts by 4, 8 and 12 bytes.
#   * add_template / subtract_template: block-wise (4 x uint32_t per step)
#     sum / difference, in a three-pointer form that copies the tail of p and
#     in a two-pointer in-place form.
#   * IntOp32: uint32_t value type with +, -, *, unary minus (~x + 1) and ==,
#     plus the static convolution routines:
#       - convolute_schoolbook_template accumulates products into a local
#         u32x4 array and stores it to res; its in-code note says res receives
#         (PN_of_4 + QN_of_4) * 4 elements.
#       - convolute_template has two overloads chosen through enable_if and
#         the ENABLE_KARATSUBA macro (both sizes >= KARATSUBA_THRESHOLD_OF_4,
#         which is 4). One splits the inputs at LOo4 and combines three
#         recursive products (r0, r1, rinf); the other forwards to
#         convolute_schoolbook_template.
#   * main: reads L, M, N; sets p[N-1-a] = 1 for each of L values a (after
#     --a) and q[b] = 1 for each of M values b (after --b); convolves p and q
#     into res; then reads Q and prints res[N-1-i] for i = 0 .. Q-1.
#
# NOTE(review): this copy appears to have lost its original line breaks and
# every piece of text that sat between angle brackets (header names after
# #include, template parameter lists, template arguments such as the ones on
# vector / pair / enable_if / reinterpret_cast). The heredoc terminator CODE is
# also no longer on a line of its own. As written, neither the Perl nor the
# embedded C++ looks parseable -- restore from the original before running.
# NOTE(review): the payload starts with '#line 8', presumably so that compiler
# diagnostics refer to line numbers of this script; if so, lines added above
# the heredoc shift that mapping -- confirm when restoring.
system './' . $exe_name; BEGIN { $exe_name = $^O eq 'MSWin32' ? 'a.exe' : 'a.out'; return if -e $exe_name; open my $fh, '>', 'tmp.cpp'; print $fh <<'CODE'; #line 8 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NDEBUG #undef NDEBUG #endif #include #include #if defined(_MSC_VER) #include #endif #ifdef _DEBUG #undef assert #include "C:\Dropbox\backup\implements\Util\MyAssert.hpp" #define assert my_assert #else #undef assert #define assert(x) #endif #define rep(i,n) for(int (i)=0;(i)<(int)(n);++(i)) #define rer(i,l,u) for(int (i)=(int)(l);(i)<=(int)(u);++(i)) #define reu(i,l,u) for(int (i)=(int)(l);(i)<(int)(u);++(i)) #if defined(_MSC_VER) || __cplusplus > 199711L #define aut(r,v) auto r = (v) #else #define aut(r,v) __typeof(v) r = (v) #endif #define each(it,o) for(aut(it, (o).begin()); it != (o).end(); ++ it) #define all(o) (o).begin(), (o).end() #define pb(x) push_back(x) #define mp(x,y) make_pair((x),(y)) #define mset(m,v) memset(m,v,sizeof(m)) #define INF 0x3f3f3f3f #define INFL 0x3f3f3f3f3f3f3f3fLL using namespace std; typedef vector vi; typedef pair pii; typedef vector > vpii; typedef long long ll; template inline void amin(T &x, U y) { if(y < x) x = y; } template inline void amax(T &x, U y) { if(x < y) x = y; } #ifdef _MSC_VER #define alignas(x) __declspec(align(x)) #endif template //typedef int R; struct IntOpDefault { typedef R_ R; static void copy(R *res, const R *p, int n) { for(int i = 0; i < n; ++ i) res[i] = p[i]; } static void fill_zero(R *p, int n) { for(int i = 0; i < n; ++ i) p[i] = R(); } static void negate_all(R *res, const R *p, int n) { for(int i = 0; i < n; ++ i) res[i] = -p[i]; } static void convolute_schoolbook(R *res, const R *p, int pn, const R *q, int qn) { fill_zero(res, pn + qn - 1); for(int i = 0; i < pn; ++ i) for(int j = 0; j < qn; ++ j) res[i + j] += p[i] * q[j]; } static void add(R *res, const R *p, int n) { for(int i = 0; 
i < n; ++ i) res[i] += p[i]; } static void subtract(R *res, const R *p, int n) { for(int i = 0; i < n; ++ i) res[i] -= p[i]; } static R inverse(R x) { R i = x, p, TWO = R(2), ONE = R(1); do { p = i * x; i *= TWO - p; }while(!(p == ONE)); return i; } static void multiply_scalar(R *p, int n, R scalar) { for(int i = 0; i < n; ++ i) p[i] *= scalar; } }; struct u32x4 { __m128i v; u32x4(): v(_mm_setzero_si128()) { } u32x4(const __m128i &v_): v(v_) { } static u32x4 set1(uint32_t x) { return u32x4(_mm_set1_epi32(x)); } template static u32x4 loadu(const T *p) { return u32x4(_mm_loadu_si128(reinterpret_cast(p))); } template void storeu(T *p) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); } u32x4 operator*(const u32x4 &that) const { return u32x4(_mm_mullo_epi32(v, that.v)); } u32x4 operator+(const u32x4 &that) const { return u32x4(_mm_add_epi32(v, that.v)); } u32x4 operator-(const u32x4 &that) const { return u32x4(_mm_sub_epi32(v, that.v)); } u32x4 &operator+=(const u32x4 &that) { return *this = *this + that; } template u32x4 slli() const { return u32x4(_mm_slli_si128(v, s)); } u32x4 slli4() const { return slli<4>(); } u32x4 slli8() const { return slli<8>(); } u32x4 slli12() const { return slli<12>(); } template u32x4 srli() const { return u32x4(_mm_srli_si128(v, s)); } u32x4 srli4() const { return srli<4>(); } u32x4 srli8() const { return srli<8>(); } u32x4 srli12() const { return srli<12>(); } }; template inline void add_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { static_assert(PN_of_4 >= QN_of_4, "PN_of_4 >= QN_of_4"); for(int i = 0; i < QN_of_4; ++ i) { u32x4 sum = u32x4::loadu(p + i * 4) + u32x4::loadu(q + i * 4); sum.storeu(res + i * 4); } for(int i = QN_of_4 * 4; i < PN_of_4 * 4; ++ i) res[i] = p[i]; } template inline void add_template(uint32_t *p, const uint32_t *q) { for(int i = 0; i < N_of_4; ++ i) { u32x4 sum = u32x4::loadu(p + i * 4) + u32x4::loadu(q + i * 4); sum.storeu(p + i * 4); } } template inline void 
subtract_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { static_assert(PN_of_4 >= QN_of_4, "PN_of_4 >= QN_of_4"); for(int i = 0; i < QN_of_4; ++ i) { u32x4 diff = u32x4::loadu(p + i * 4) - u32x4::loadu(q + i * 4); diff.storeu(res + i * 4); } for(int i = QN_of_4 * 4; i < PN_of_4 * 4; ++ i) res[i] = p[i]; } template inline void subtract_template(uint32_t *p, const uint32_t *q) { for(int i = 0; i < N_of_4; ++ i) { u32x4 diff = u32x4::loadu(p + i * 4) - u32x4::loadu(q + i * 4); diff.storeu(p + i * 4); } } struct IntOp32 : IntOpDefault { uint32_t x; IntOp32(): x(0) { } explicit IntOp32(uint32_t x_): x(x_) { } IntOp32 &operator+=(const IntOp32 &that) { x += that.x; return *this; } IntOp32 &operator-=(const IntOp32 &that) { x -= that.x; return *this; } IntOp32 &operator*=(const IntOp32 &that) { x *= that.x; return *this; } IntOp32 operator+(const IntOp32 &that) const { return IntOp32(x + that.x); } IntOp32 operator-(const IntOp32 &that) const { return IntOp32(x - that.x); } IntOp32 operator*(const IntOp32 &that) const { return IntOp32(x * that.x); } IntOp32 operator-() const { return IntOp32(~x + 1); } bool operator==(const IntOp32 &that) const { return x == that.x; } //resは (PN_of_4 + QN_of_4) * 4 のサイズを書き込む template static void convolute_schoolbook_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { u32x4 sum[PN_of_4 + QN_of_4]; for(int i = 0; i < PN_of_4; ++ i) { u32x4 x0 = u32x4::set1(p[i * 4 + 0]); u32x4 x1 = u32x4::set1(p[i * 4 + 1]); u32x4 x2 = u32x4::set1(p[i * 4 + 2]); u32x4 x3 = u32x4::set1(p[i * 4 + 3]); for(int j = 0; j < QN_of_4; ++ j) { u32x4 y = u32x4::loadu(q + j * 4); u32x4 z0 = x0 * y; u32x4 z1 = x1 * y; u32x4 z2 = x2 * y; u32x4 z3 = x3 * y; sum[i + j + 0] += (z0 + z1.slli4()) + (z2.slli8() + z3.slli12()); sum[i + j + 1] += (z1.srli8() + z2.srli4() + z3).srli4(); } } for(int i = 0; i < PN_of_4 + QN_of_4; ++ i) sum[i].storeu(res + i * 4); } enum { KARATSUBA_THRESHOLD_OF_4 = 4 }; #define ENABLE_KARATSUBA(PNo4, QNo4) \ ((PNo4) >= 
KARATSUBA_THRESHOLD_OF_4 && (QNo4) >= KARATSUBA_THRESHOLD_OF_4) template static typename enable_if::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { enum { LOo4 = (PNo4 + 1) / 2, HPo4 = PNo4 - LOo4, HQo4 = QNo4 - LOo4 }; static_assert(0 < LOo4 && 0 < HQo4 && HPo4 <= LOo4 && HQo4 <= LOo4, "parameters"); uint32_t t0[LOo4 * 4], t1[LOo4 * 4], r1[LOo4 * 4 * 2]; uint32_t * const r0 = res, * const rinf = res + LOo4 * 4 * 2; add_template(t0, p, p + LOo4 * 4); add_template(t1, q, q + LOo4 * 4); convolute_template(r1, t0, t1); convolute_template(r0, p, q); convolute_template(rinf, p + LOo4 * 4, q + LOo4 * 4); subtract_template(r1, r0); subtract_template(r1, rinf); add_template(res + LOo4 * 4, r1); } template static typename enable_if::type convolute_template(uint32_t *res, const uint32_t *p, const uint32_t *q) { return convolute_schoolbook_template(res, p, q); } #undef ENABLE_KARATSUBA }; alignas(16) uint32_t p[100000], q[100000], res[200000]; int main() { int L, M, N; scanf("%d%d%d", &L, &M, &N); rep(i, L) { int a; scanf("%d", &a), -- a; p[N-1-a] = 1; } rep(i, M) { int b; scanf("%d", &b), -- b; q[b] = 1; } IntOp32::convolute_template<100000/4,100000/4>(res, p, q); int Q; scanf("%d", &Q); rep(i, Q) { int ans = res[N-1-i]; printf("%d\n", ans); } return 0; } CODE system "g++ -m64 -O2 -lm -mavx -std=c++11 tmp.cpp -o $exe_name 2> my_compile.log"; if($? != 0) { open(my $fh, '<', 'my_compile.log'); while(<$fh>) { print STDERR $_; } die 'compile error'; } }