結果
問題 | No.1574 Swap and Repaint |
ユーザー | NyaanNyaan |
提出日時 | 2021-07-04 18:52:11 |
言語 | C++17 (gcc 12.3.0 + boost 1.83.0) |
結果 |
CE
(最新)
AC
(最初)
|
実行時間 | - |
コード長 | 22,536 bytes |
コンパイル時間 | 1,895 ms |
コンパイル使用メモリ | 244,184 KB |
最終ジャッジ日時 | 2024-10-03 12:37:17 |
合計ジャッジ時間 | 2,382 ms |
ジャッジサーバーID (参考情報) |
judge3 / judge2 |
(要ログイン)
コンパイルエラー時のメッセージ・ソースコードは、提出者または管理者しか表示できないようにしております。(リジャッジ後のコンパイルエラーは公開されます)
ただし、clay言語の場合は開発者のデバッグのため、公開されます。
ただし、clay言語の場合は開発者のデバッグのため、公開されます。
コンパイルメッセージ
main.cpp: In function 'mmint operator*(const mmint&, const mmint&)': main.cpp:632:44: warning: AVX vector return without AVX enabled changes the ABI [-Wpsabi] 632 | m256 a13 = _mm256_shuffle_epi32(A, 0xF5); | ^ In file included from /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.3.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/immintrin.h:47, from main.cpp:11: /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.3.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h: In function 'mmint operator+(const mmint&, const mmint&)': /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.3.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h:119:1: error: inlining failed in call to 'always_inline' '__m256i _mm256_add_epi32(__m256i, __m256i)': target specific option mismatch 119 | _mm256_add_epi32 (__m256i __A, __m256i __B) | ^~~~~~~~~~~~~~~~ main.cpp:621:28: note: called from here 621 | return _mm256_add_epi32(add, ret); | ~~~~~~~~~~~~~~~~^~~~~~~~~~ /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.3.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h:179:1: error: inlining failed in call to 'always_inline' '__m256i _mm256_and_si256(__m256i, __m256i)': target specific option mismatch 179 | _mm256_and_si256 (__m256i __A, __m256i __B) | ^~~~~~~~~~~~~~~~ main.cpp:620:32: note: called from here 620 | m256 add = _mm256_and_si256(cmp, M2); | ~~~~~~~~~~~~~~~~^~~~~~~~~ /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.3.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h:273:1: error: inlining failed in call to 'always_inline' '__m256i _mm256_cmpgt_epi32(__m256i, __m256i)': target specific option mismatch 273 | _mm256_cmpgt_epi32 (__m256i __A, __m256i __B) | ^~~~~~~~~~~~~~~~~~ main.cpp:619:34: note: called from here 619 | m256 cmp = _mm256_cmpgt_epi32(M0, ret); | ~~~~~~~~~~~~~~~~~~^~~~~~~~~ /home/linuxbrew/.linux
ソースコード
// This solution is O(N^2). Please forgive me.
/**
 * date : 2021-07-04 01:10:55
 */

#define NDEBUG

// intrinsic
#include <immintrin.h>

#include <algorithm>
#include <array>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cfenv>
#include <cfloat>
#include <chrono>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <complex>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <initializer_list>
#include <iomanip>
#include <ios>
#include <iostream>
#include <istream>
#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ostream>
#include <queue>
#include <random>
#include <set>
#include <sstream>
#include <stack>
#include <streambuf>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// Placed after the includes so that namespace std is already declared.
using namespace std;

// Every function that touches AVX2 intrinsics, or that passes/returns a
// 256-bit vector by value, carries this attribute explicitly.  Relying on
// `#pragma GCC target("avx2")` alone was rejected by GCC 12.3 with
// "inlining failed in call to 'always_inline' ...: target specific option
// mismatch" for the intrinsics used inside mmint.
#define AVX2_FN __attribute__((target("avx2")))

// utility
namespace Nyaan {
using ll = long long;
using i64 = long long;
using u64 = unsigned long long;
using i128 = __int128_t;
using u128 = __uint128_t;

template <typename T>
using V = vector<T>;
template <typename T>
using VV = vector<vector<T>>;
using vi = vector<int>;
using vl = vector<long long>;
using vd = V<double>;
using vs = V<string>;
using vvi = vector<vector<int>>;
using vvl = vector<vector<long long>>;

// pair with x()/y() accessors and component-wise arithmetic.
template <typename T, typename U>
struct P : pair<T, U> {
  template <typename... Args>
  P(Args... args) : pair<T, U>(args...) {}

  using pair<T, U>::first;
  using pair<T, U>::second;

  T &x() { return first; }
  const T &x() const { return first; }
  U &y() { return second; }
  const U &y() const { return second; }

  P &operator+=(const P &r) {
    first += r.first;
    second += r.second;
    return *this;
  }
  P &operator-=(const P &r) {
    first -= r.first;
    second -= r.second;
    return *this;
  }
  P &operator*=(const P &r) {
    first *= r.first;
    second *= r.second;
    return *this;
  }
  P operator+(const P &r) const { return P(*this) += r; }
  P operator-(const P &r) const { return P(*this) -= r; }
  P operator*(const P &r) const { return P(*this) *= r; }
};

using pl = P<ll, ll>;
using pi = P<int, int>;
using vp = V<pl>;

constexpr int inf = 1001001001;
constexpr long long infLL = 4004004004004004004LL;

template <typename T>
int sz(const T &t) {
  return t.size();
}

// x = min(x, y); returns true when x was updated.
template <typename T, typename U>
inline bool amin(T &x, U y) {
  return (y < x) ? (x = y, true) : false;
}
// x = max(x, y); returns true when x was updated.
template <typename T, typename U>
inline bool amax(T &x, U y) {
  return (x < y) ? (x = y, true) : false;
}

template <typename T>
inline T Max(const vector<T> &v) {
  return *max_element(begin(v), end(v));
}
template <typename T>
inline T Min(const vector<T> &v) {
  return *min_element(begin(v), end(v));
}
template <typename T>
inline long long Sum(const vector<T> &v) {
  return accumulate(begin(v), end(v), 0LL);
}

template <typename T>
int lb(const vector<T> &v, const T &a) {
  return lower_bound(begin(v), end(v), a) - begin(v);
}
template <typename T>
int ub(const vector<T> &v, const T &a) {
  return upper_bound(begin(v), end(v), a) - begin(v);
}

// 10^n by binary exponentiation.
constexpr long long TEN(int n) {
  long long ret = 1, x = 10;
  for (; n; x *= x, n >>= 1) ret *= (n & 1 ? x : 1);
  return ret;
}

template <typename T, typename U>
pair<T, U> mkp(const T &t, const U &u) {
  return make_pair(t, u);
}

// Prefix sums (or suffix sums when rev is true); result has size v.size()+1.
template <typename T>
vector<T> mkrui(const vector<T> &v, bool rev = false) {
  vector<T> ret(v.size() + 1);
  if (rev) {
    for (int i = int(v.size()) - 1; i >= 0; i--) ret[i] = v[i] + ret[i + 1];
  } else {
    for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
  }
  return ret;
};

// Sorted copy of v with duplicates removed.
template <typename T>
vector<T> mkuni(const vector<T> &v) {
  vector<T> ret(v);
  sort(ret.begin(), ret.end());
  ret.erase(unique(ret.begin(), ret.end()), ret.end());
  return ret;
}

// Indices 0..N-1 sorted by comparator f.
template <typename F>
vector<int> mkord(int N, F f) {
  vector<int> ord(N);
  iota(begin(ord), end(ord), 0);
  sort(begin(ord), end(ord), f);
  return ord;
}

// inv[v[i]] = i; entries for values that do not occur are -1.
template <typename T>
vector<int> mkinv(vector<T> &v) {
  int max_val = *max_element(begin(v), end(v));
  vector<int> inv(max_val + 1, -1);
  for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
  return inv;
}

}  // namespace Nyaan

// bit operation
namespace Nyaan {
__attribute__((target("popcnt"))) inline int popcnt(const u64 &a) {
  return _mm_popcnt_u64(a);
}
inline int lsb(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
inline int ctz(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
inline int msb(const u64 &a) { return a ? 63 - __builtin_clzll(a) : -1; }
template <typename T>
inline int gbit(const T &a, int i) {
  return (a >> i) & 1;
}
template <typename T>
inline void sbit(T &a, int i, bool b) {
  if (gbit(a, i) != b) a ^= T(1) << i;
}
constexpr long long PW(int n) { return 1LL << n; }
constexpr long long MSK(int n) { return (1LL << n) - 1; }
}  // namespace Nyaan

// inout
namespace Nyaan {

template <typename T, typename U>
ostream &operator<<(ostream &os, const pair<T, U> &p) {
  os << p.first << " " << p.second;
  return os;
}
template <typename T, typename U>
istream &operator>>(istream &is, pair<T, U> &p) {
  is >> p.first >> p.second;
  return is;
}

template <typename T>
ostream &operator<<(ostream &os, const vector<T> &v) {
  int s = (int)v.size();
  for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
  return os;
}
template <typename T>
istream &operator>>(istream &is, vector<T> &v) {
  for (auto &x : v) is >> x;
  return is;
}

void in() {}
template <typename T, class... U>
void in(T &t, U &... u) {
  cin >> t;
  in(u...);
}

// Prints the arguments separated by `sep`, followed by a newline.
void out() { cout << "\n"; }
template <typename T, class... U, char sep = ' '>
void out(const T &t, const U &... u) {
  cout << t;
  if (sizeof...(u)) cout << sep;
  out(u...);
}

// Prints the arguments back to back with no separator and no newline.
void outr() {}
template <typename T, class... U, char sep = ' '>
void outr(const T &t, const U &... u) {
  cout << t;
  outr(u...);
}

// Static object whose constructor configures the iostreams once at startup.
struct IoSetupNya {
  IoSetupNya() {
    cin.tie(nullptr);
    ios::sync_with_stdio(false);
    cout << fixed << setprecision(15);
    cerr << fixed << setprecision(7);
  }
} iosetupnya;

}  // namespace Nyaan

// debug
namespace DebugImpl {

template <typename U, typename = void>
struct is_specialize : false_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, typename U::iterator, void>::type>
    : true_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, decltype(U::first), void>::type>
    : true_type {};
template <typename U>
struct is_specialize<U, enable_if_t<is_integral<U>::value, void>> : true_type {
};

void dump(const char& t) { cerr << t; }

void dump(const string& t) { cerr << t; }

void dump(const bool& t) { cerr << (t ? "true" : "false"); }

template <typename U,
          enable_if_t<!is_specialize<U>::value, nullptr_t> = nullptr>
void dump(const U& t) {
  cerr << t;
}

template <typename T>
void dump(const T& t, enable_if_t<is_integral<T>::value>* = nullptr) {
  string res;
  if (t == Nyaan::inf) res = "inf";
  if constexpr (is_signed<T>::value) {
    if (t == -Nyaan::inf) res = "-inf";
  }
  if constexpr (sizeof(T) == 8) {
    if (t == Nyaan::infLL) res = "inf";
    if constexpr (is_signed<T>::value) {
      if (t == -Nyaan::infLL) res = "-inf";
    }
  }
  if (res.empty()) res = to_string(t);
  cerr << res;
}

template <typename T, typename U>
void dump(const pair<T, U>&);
template <typename T>
void dump(const pair<T*, int>&);

template <typename T>
void dump(const T& t,
          enable_if_t<!is_void<typename T::iterator>::value>* = nullptr) {
  cerr << "[ ";
  for (auto it = t.begin(); it != t.end();) {
    dump(*it);
    cerr << (++it == t.end() ? "" : ", ");
  }
  cerr << " ]";
}

template <typename T, typename U>
void dump(const pair<T, U>& t) {
  cerr << "( ";
  dump(t.first);
  cerr << ", ";
  dump(t.second);
  cerr << " )";
}

template <typename T>
void dump(const pair<T*, int>& t) {
  cerr << "[ ";
  for (int i = 0; i < t.second; i++) {
    dump(t.first[i]);
    cerr << (i == t.second - 1 ? "" : ", ");
  }
  cerr << " ]";
}

void trace() { cerr << endl; }
template <typename Head, typename... Tail>
void trace(Head&& head, Tail&&... tail) {
  cerr << " ";
  dump(head);
  if (sizeof...(tail) != 0) cerr << ",";
  trace(forward<Tail>(tail)...);
}

}  // namespace DebugImpl

#ifdef NyaanDebug
#define trc(...)                            \
  do {                                      \
    cerr << "## " << #__VA_ARGS__ << " = "; \
    DebugImpl::trace(__VA_ARGS__);          \
  } while (0)
#else
#define trc(...) (void(0))
#endif

// macro
#define each(x, v) for (auto&& x : v)
#define each2(x, y, v) for (auto&& [x, y] : v)
#define all(v) (v).begin(), (v).end()
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define regr(i, a, b) for (long long i = (b)-1; i >= (a); i--)
#define fi first
#define se second
#define ini(...)   \
  int __VA_ARGS__; \
  in(__VA_ARGS__)
#define inl(...)         \
  long long __VA_ARGS__; \
  in(__VA_ARGS__)
#define ins(...)      \
  string __VA_ARGS__; \
  in(__VA_ARGS__)
#define in2(s, t)                           \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i]);                         \
  }
#define in3(s, t, u)                        \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i]);                   \
  }
#define in4(s, t, u, v)                     \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i], v[i]);             \
  }
#define die(...)             \
  do {                       \
    Nyaan::out(__VA_ARGS__); \
    return;                  \
  } while (0)

namespace Nyaan {
void solve();
}
int main() { Nyaan::solve(); }

//

// Wall-clock stopwatch reporting elapsed milliseconds since the last reset().
struct Timer {
  chrono::high_resolution_clock::time_point st;

  Timer() { reset(); }

  void reset() { st = chrono::high_resolution_clock::now(); }

  chrono::milliseconds::rep elapsed() {
    auto ed = chrono::high_resolution_clock::now();
    return chrono::duration_cast<chrono::milliseconds>(ed - st).count();
  }
};

// Modular integer kept in Montgomery form.  The stored value `a` is only
// reduced lazily: operator+= / operator-= keep it below 2 * mod, and get()
// performs the final reduction to [0, mod).
template <uint32_t mod>
struct LazyMontgomeryModInt {
  using mint = LazyMontgomeryModInt;
  using i32 = int32_t;
  using u32 = uint32_t;
  using u64 = uint64_t;

  // Newton iteration for the inverse of mod modulo 2^32.
  static constexpr u32 get_r() {
    u32 ret = mod;
    for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
    return ret;
  }

  static constexpr u32 r = get_r();
  static constexpr u32 n2 = -u64(mod) % mod;  // 2^64 mod `mod`
  static_assert(r * mod == 1, "invalid, r * mod != 1");
  static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
  static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");

  u32 a;

  constexpr LazyMontgomeryModInt() : a(0) {}
  constexpr LazyMontgomeryModInt(const int64_t &b)
      : a(reduce(u64(b % mod + mod) * n2)){};

  static constexpr u32 reduce(const u64 &b) {
    return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
  }

  constexpr mint &operator+=(const mint &b) {
    if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
    return *this;
  }

  constexpr mint &operator-=(const mint &b) {
    if (i32(a -= b.a) < 0) a += 2 * mod;
    return *this;
  }

  constexpr mint &operator*=(const mint &b) {
    a = reduce(u64(a) * b.a);
    return *this;
  }

  constexpr mint &operator/=(const mint &b) {
    *this *= b.inverse();
    return *this;
  }

  constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
  constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
  constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
  constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
  constexpr bool operator==(const mint &b) const {
    return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
  }
  constexpr bool operator!=(const mint &b) const {
    return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
  }
  constexpr mint operator-() const { return mint() - mint(*this); }

  constexpr mint pow(u64 n) const {
    mint ret(1), mul(*this);
    while (n > 0) {
      if (n & 1) ret *= mul;
      mul *= mul;
      n >>= 1;
    }
    return ret;
  }

  // Inverse via Fermat's little theorem (pow(mod - 2)).
  constexpr mint inverse() const { return pow(mod - 2); }

  friend ostream &operator<<(ostream &os, const mint &b) {
    return os << b.get();
  }

  friend istream &operator>>(istream &is, mint &b) {
    int64_t t;
    is >> t;
    b = LazyMontgomeryModInt<mod>(t);
    return (is);
  }

  constexpr u32 get() const {
    u32 ret = reduce(a);
    return ret >= mod ? ret - mod : ret;
  }

  static constexpr u32 get_mod() { return mod; }
};

#pragma GCC optimize("O3,unroll-loops")
#pragma GCC target("avx2")

using m256 = __m256i;

// Eight Montgomery-form residues packed into one 256-bit register.
// The static members hold broadcast constants and must be initialised with
// set_mod<mint>() before any arithmetic is performed.
struct alignas(32) mmint {
  m256 x;
  static mmint R, M0, M1, M2, N2;

  mmint() : x() {}
  inline mmint(const m256& _x) : x(_x) {}
  AVX2_FN inline mmint(unsigned int a) : x(_mm256_set1_epi32(a)) {}
  AVX2_FN inline mmint(unsigned int a0, unsigned int a1, unsigned int a2,
                       unsigned int a3, unsigned int a4, unsigned int a5,
                       unsigned int a6, unsigned int a7)
      : x(_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)) {}
  inline operator m256&() { return x; }
  inline operator const m256&() const { return x; }
  // Lane access through an int view of the register.
  inline int& operator[](int i) { return *(reinterpret_cast<int*>(&x) + i); }
  inline const int& operator[](int i) const {
    return *(reinterpret_cast<const int*>(&x) + i);
  }

  // Prints the eight lanes converted out of Montgomery form.
  friend ostream& operator<<(ostream& os, const mmint& m) {
    unsigned r = R[0], mod = M1[0];
    auto reduce1 = [&](const uint64_t& b) {
      unsigned res = (b + uint64_t(unsigned(b) * unsigned(-r)) * mod) >> 32;
      return res >= mod ? res - mod : res;
    };
    for (int i = 0; i < 8; i++) {
      os << reduce1(m[i]) << (i == 7 ? "" : " ");
    }
    return os;
  }

  // Broadcasts the constants of `mint` into R, M0 (= 0), M1 (= mod),
  // M2 (= 2 * mod) and N2.
  template <typename mint>
  AVX2_FN static void set_mod() {
    R = _mm256_set1_epi32(mint::r);
    M0 = _mm256_setzero_si256();
    M1 = _mm256_set1_epi32(mint::get_mod());
    M2 = _mm256_set1_epi32(mint::get_mod() * 2);
    N2 = _mm256_set1_epi32(mint::n2);
  }

  // Montgomery reduction of eight 64-bit products; prod02 holds the products
  // of the even lanes and prod13 those of the odd lanes.
  AVX2_FN static inline mmint reduce(const mmint& prod02,
                                     const mmint& prod13) {
    m256 unpalo = _mm256_unpacklo_epi32(prod02, prod13);
    m256 unpahi = _mm256_unpackhi_epi32(prod02, prod13);
    m256 prodlo = _mm256_unpacklo_epi64(unpalo, unpahi);
    m256 prodhi = _mm256_unpackhi_epi64(unpalo, unpahi);
    m256 hiplm1 = _mm256_add_epi32(prodhi, M1);
    m256 prodlohi = _mm256_shuffle_epi32(prodlo, 0xF5);
    m256 lmlr02 = _mm256_mul_epu32(prodlo, R);
    m256 lmlr13 = _mm256_mul_epu32(prodlohi, R);
    m256 prod02_ = _mm256_mul_epu32(lmlr02, M1);
    m256 prod13_ = _mm256_mul_epu32(lmlr13, M1);
    m256 unpalo_ = _mm256_unpacklo_epi32(prod02_, prod13_);
    m256 unpahi_ = _mm256_unpackhi_epi32(prod02_, prod13_);
    m256 prod = _mm256_unpackhi_epi64(unpalo_, unpahi_);
    return _mm256_sub_epi32(hiplm1, prod);
  }

  // Plain integers -> Montgomery form (multiplication by N2).
  AVX2_FN static inline mmint itom(const mmint& A) { return A * N2; }

  // Montgomery form -> plain integers.
  AVX2_FN static inline mmint mtoi(const mmint& A) {
    m256 A13 = _mm256_shuffle_epi32(A, 0xF5);
    m256 lmlr02 = _mm256_mul_epu32(A, R);
    m256 lmlr13 = _mm256_mul_epu32(A13, R);
    m256 prod02_ = _mm256_mul_epu32(lmlr02, M1);
    m256 prod13_ = _mm256_mul_epu32(lmlr13, M1);
    m256 unpalo_ = _mm256_unpacklo_epi32(prod02_, prod13_);
    m256 unpahi_ = _mm256_unpackhi_epi32(prod02_, prod13_);
    m256 prod = _mm256_unpackhi_epi64(unpalo_, unpahi_);
    m256 cmp = _mm256_cmpgt_epi32(prod, M0);
    m256 dif = _mm256_and_si256(cmp, M1);
    return _mm256_sub_epi32(dif, prod);
  }

  AVX2_FN friend inline mmint operator+(const mmint& A, const mmint& B) {
    m256 apb = _mm256_add_epi32(A, B);
    m256 ret = _mm256_sub_epi32(apb, M2);
    // Add 2 * mod back on the lanes that went negative.
    m256 cmp = _mm256_cmpgt_epi32(M0, ret);
    m256 add = _mm256_and_si256(cmp, M2);
    return _mm256_add_epi32(add, ret);
  }

  AVX2_FN friend inline mmint operator-(const mmint& A, const mmint& B) {
    m256 ret = _mm256_sub_epi32(A, B);
    m256 cmp = _mm256_cmpgt_epi32(M0, ret);
    m256 add = _mm256_and_si256(cmp, M2);
    return _mm256_add_epi32(add, ret);
  }

  AVX2_FN friend inline mmint operator*(const mmint& A, const mmint& B) {
    m256 a13 = _mm256_shuffle_epi32(A, 0xF5);
    m256 b13 = _mm256_shuffle_epi32(B, 0xF5);
    m256 prod02 = _mm256_mul_epu32(A, B);
    m256 prod13 = _mm256_mul_epu32(a13, b13);
    return reduce(prod02, prod13);
  }

  AVX2_FN inline mmint& operator+=(const mmint& A) {
    return (*this) = (*this) + A;
  }
  AVX2_FN inline mmint& operator-=(const mmint& A) {
    return (*this) = (*this) - A;
  }
  AVX2_FN inline mmint& operator*=(const mmint& A) {
    return (*this) = (*this) * A;
  }

  // Bitwise comparison of the stored lanes (no modular normalisation).
  AVX2_FN bool operator==(const mmint& A) {
    m256 sub = _mm256_sub_epi32(x, A.x);
    return _mm256_testz_si256(sub, sub) == 1;
  }
  AVX2_FN bool operator!=(const mmint& A) { return !((*this) == A); }
};

__attribute__((aligned(32))) mmint mmint::R;
__attribute__((aligned(32))) mmint mmint::M0, mmint::M1, mmint::M2, mmint::N2;

/**
 * @brief vectorize modint
 */

// Lazily growing tables of factorials (f), inverse factorials (g) and
// inverses (h); the tables double in size whenever a larger index is needed.
template <typename T>
struct Binomial {
  vector<T> f, g, h;
  Binomial(int MAX = 0) : f(1, T(1)), g(1, T(1)), h(1, T(1)) {
    while (MAX >= (int)f.size()) extend();
  }

  void extend() {
    int n = f.size();
    int m = n * 2;
    f.resize(m);
    g.resize(m);
    h.resize(m);
    for (int i = n; i < m; i++) f[i] = f[i - 1] * T(i);
    g[m - 1] = f[m - 1].inverse();
    h[m - 1] = g[m - 1] * f[m - 2];
    for (int i = m - 2; i >= n; i--) {
      g[i] = g[i + 1] * T(i + 1);
      h[i] = g[i] * f[i - 1];
    }
  }

  T fac(int i) {
    if (i < 0) return T(0);
    while (i >= (int)f.size()) extend();
    return f[i];
  }

  T finv(int i) {
    if (i < 0) return T(0);
    while (i >= (int)g.size()) extend();
    return g[i];
  }

  T inv(int i) {
    if (i < 0) return -inv(-i);
    while (i >= (int)h.size()) extend();
    return h[i];
  }

  T C(int n, int r) {
    if (n < 0 || n < r || r < 0) return T(0);
    return fac(n) * finv(n - r) * finv(r);
  }

  inline T operator()(int n, int r) { return C(n, r); }

  T C_naive(int n, int r) {
    if (n < 0 || n < r || r < 0) return T(0);
    T ret = T(1);
    r = min(r, n - r);
    for (int i = 1; i <= r; ++i) ret *= inv(i) * (n--);
    return ret;
  }

  T P(int n, int r) {
    if (n < 0 || n < r || r < 0) return T(0);
    return fac(n) * finv(n - r);
  }

  T H(int n, int r) {
    if (n < 0 || r < 0) return T(0);
    return r == 0 ? 1 : C(n + r - 1, r);
  }
};

using namespace Nyaan;
using mint = LazyMontgomeryModInt<998244353>;
using vm = vector<mint>;
Binomial<mint> C;

mmint F[13000];   // per-position coefficients, 8 lanes per element
mmint DP[13000];  // current values (used through DP + 1)
mmint NX[13000];  // next values (used through NX + 1)

// Brings every lane that is greater than MOD back down by subtracting MOD.
AVX2_FN static inline m256 normalize_lanes(const m256& data, const m256& MOD) {
  m256 flag = _mm256_cmpgt_epi32(data, MOD);
  m256 dif = _mm256_and_si256(flag, MOD);
  return _mm256_sub_epi32(data, dif);
}

#define INIT_X(x)                          \
  m256 prod02##x = _mm256_setzero_si256(); \
  m256 prod13##x = _mm256_setzero_si256()
#define ADD(x)                                        \
  m256 f02##x = normalize_lanes(F[j + x], MOD);       \
  m256 f13##x = _mm256_shuffle_epi32(f02##x, 0xF5);   \
  m256 dp02##x = normalize_lanes(dp[j + x], MOD);     \
  m256 dp13##x = _mm256_shuffle_epi32(dp02##x, 0xF5); \
  m256 fd02##x = _mm256_mul_epi32(f02##x, dp02##x);   \
  m256 fd13##x = _mm256_mul_epi32(f13##x, dp13##x);   \
  prod02##x = _mm256_add_epi64(prod02##x, fd02##x);   \
  prod13##x = _mm256_add_epi64(prod13##x, fd13##x)
#define COMP(x)                                         \
  m256 cmp02##x = _mm256_cmpgt_epi64(zero, prod02##x);  \
  m256 cmp13##x = _mm256_cmpgt_epi64(zero, prod13##x);  \
  m256 dif02##x = _mm256_and_si256(cmp02##x, th2);      \
  m256 dif13##x = _mm256_and_si256(cmp13##x, th2);      \
  prod02##x = _mm256_sub_epi64(prod02##x, dif02##x);    \
  prod13##x = _mm256_sub_epi64(prod13##x, dif13##x)
#define REDUCE(x)                                       \
  for (int _ = 0; _ < 2; _++) {                         \
    m256 cmp02 = _mm256_cmpgt_epi64(prod02##x, th1);    \
    m256 cmp13 = _mm256_cmpgt_epi64(prod13##x, th1);    \
    m256 dif02 = _mm256_and_si256(cmp02, th1);          \
    m256 dif13 = _mm256_and_si256(cmp13, th1);          \
    prod02##x = _mm256_sub_epi64(prod02##x, dif02);     \
    prod13##x = _mm256_sub_epi64(prod13##x, dif13);     \
  }                                                     \
  buf += mmint::reduce(prod02##x, prod13##x)

// Returns sum_i F[i] * dp[i] as a mint.  Products are accumulated as raw
// 64-bit values in four independent accumulator pairs; after every 32
// vectors, COMP subtracts (2 * mod) << 32 from any accumulator whose sign bit
// got set so that the running sums keep fitting in 64 bits.
AVX2_FN static mint weighted_sum(const mmint* dp, int N) {
  const m256 MOD = _mm256_set1_epi32(998244353);
  const m256 zero = _mm256_setzero_si256();
  // 64-bit lanes holding mod << 32 and (2 * mod) << 32 respectively.
  const int m1 = mmint::M1[0];
  const int m2 = mmint::M2[0];
  const m256 th1 = _mm256_set_epi32(m1, 0, m1, 0, m1, 0, m1, 0);
  const m256 th2 = _mm256_set_epi32(m2, 0, m2, 0, m2, 0, m2, 0);

  INIT_X(0);
  INIT_X(1);
  INIT_X(2);
  INIT_X(3);
  for (int i = 0; i < N / 8 + 32; i += 32) {
    for (int j = i; j < i + 32; j += 4) {
      ADD(0);
      ADD(1);
      ADD(2);
      ADD(3);
    }
    COMP(0);
    COMP(1);
    COMP(2);
    COMP(3);
  }
  mmint buf{0};
  REDUCE(0);
  REDUCE(1);
  REDUCE(2);
  REDUCE(3);
  buf = mmint::mtoi(buf);
  mint res = 0;
  for (int i = 0; i < 8; i++) res += buf[i];
  return res;
}

// One vectorised transition: nx[i] = dp[i] * (N - 3) + dp[i - 1] + dp[i + 1]
// for every element.  The two boundary elements are corrected by the caller.
// nm3_raw is the Montgomery representation of N - 3.
AVX2_FN static void advance_interior(const mmint* dp, mmint* nx, int N,
                                     unsigned int nm3_raw) {
  const mint* dp0 = reinterpret_cast<const mint*>(dp);
  const mmint NM3(nm3_raw);
  for (int i = 0; i < N / 8 + 3; i++) {
    // Unaligned loads of the neighbours shifted by one element each way;
    // dp points one mmint past the start of its array, so index -1 is valid.
    __m256i p1 = _mm256_loadu_si256((const __m256i*)(dp0 + i * 8 - 1));
    __m256i p2 = _mm256_loadu_si256((const __m256i*)(dp0 + i * 8 + 1));
    nx[i] = dp[i] * NM3 + p1 + p2;
  }
}

// Reads N and the array a, then prints N + 1 lines: the weighted sum of the
// current array (scaled by coe) before any transition and after each of the
// N transitions.
void Nyaan::solve() {
  Timer timer;
  mmint::set_mod<mint>();
  memset(F, 0, sizeof(F));
  memset(DP, 0, sizeof(DP));
  memset(NX, 0, sizeof(NX));

  ini(N);
  vi a(N);
  in(a);

  // calc coeff
  vm f(N);
  // all positions except the right end
  f[0] = mint(1) / 2;
  reg(i, 1, N - 1) {
    f[i] = C.finv(i + 1) * (C.inv(2) + i) * (C.inv(2).pow(i));
    f[i] += f[i - 1];
  }
  // right end
  {
    mint buf = 1;
    for (int i = 0; i < N; i++) {
      f.back() += buf;
      buf *= C.inv(2) * C.inv(i + 1);
    }
  }

  // Offset by one element so that reads at index -1 stay inside the arrays.
  mmint* dp = DP + 1;
  mmint* nx = NX + 1;
  for (int i = 0; i < N; i++) F[i / 8][i % 8] = f[i].a;
  for (int i = 0; i < N; i++) dp[i / 8][i % 8] = mint(a[i]).a;
  mint coe = mint(2).pow(1LL * 2 * (N - 1)) * C.fac(N - 1);

  out(weighted_sum(dp, N) * coe);

  mint Nm2 = N - 2;
  mint Nm3 = N - 3;
  rep(_, N) {
    mint* dp0 = reinterpret_cast<mint*>(dp);
    mint* nx0 = reinterpret_cast<mint*>(nx);
    advance_interior(dp, nx, N, Nm3.a);
    // The two ends have a single neighbour and use the factor N - 2.
    nx0[0] = dp0[1] + dp0[0] * Nm2;
    nx0[N - 1] = dp0[N - 2] + dp0[N - 1] * Nm2;
    swap(dp, nx);
    trc(dp);
    out(weighted_sum(dp, N) * coe);
  }
  cerr << timer.elapsed() << "\n";
}