結果
問題 | No.1574 Swap and Repaint |
ユーザー |
|
提出日時 | 2021-07-04 18:52:11 |
言語 | C++17(gcc12) (gcc 12.3.0 + boost 1.87.0) |
結果 |
CE
(最新)
AC
(最初)
|
実行時間 | - |
コード長 | 22,536 bytes |
コンパイル時間 | 6,189 ms |
コンパイル使用メモリ | 280,908 KB |
最終ジャッジ日時 | 2025-01-22 17:41:44 |
ジャッジサーバーID (参考情報) |
judge4 / judge2 |
(要ログイン)
コンパイルエラー時のメッセージ・ソースコードは、提出者または管理者しか表示できないようにしております。(リジャッジ後のコンパイルエラーは公開されます)
ただし、clay言語の場合は開発者のデバッグのため、公開されます。
コンパイルメッセージ
main.cpp: In function 'mmint operator*(const mmint&, const mmint&)': main.cpp:632:44: warning: AVX vector return without AVX enabled changes the ABI [-Wpsabi] 632 | m256 a13 = _mm256_shuffle_epi32(A, 0xF5); | ^ In file included from /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.4.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/immintrin.h:47, from main.cpp:11: /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.4.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h: In function 'mmint operator+(const mmint&, const mmint&)': /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.4.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h:119:1: error: inlining failed in call to 'always_inline' '__m256i _mm256_add_epi32(__m256i, __m256i)': target specific option mismatch 119 | _mm256_add_epi32 (__m256i __A, __m256i __B) | ^~~~~~~~~~~~~~~~ main.cpp:621:28: note: called from here 621 | return _mm256_add_epi32(add, ret); | ~~~~~~~~~~~~~~~~^~~~~~~~~~ /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.4.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h:179:1: error: inlining failed in call to 'always_inline' '__m256i _mm256_and_si256(__m256i, __m256i)': target specific option mismatch 179 | _mm256_and_si256 (__m256i __A, __m256i __B) | ^~~~~~~~~~~~~~~~ main.cpp:620:32: note: called from here 620 | m256 add = _mm256_and_si256(cmp, M2); | ~~~~~~~~~~~~~~~~^~~~~~~~~ /home/linuxbrew/.linuxbrew/Cellar/gcc@12/12.4.0/lib/gcc/12/gcc/x86_64-pc-linux-gnu/12/include/avx2intrin.h:273:1: error: inlining failed in call to 'always_inline' '__m256i _mm256_cmpgt_epi32(__m256i, __m256i)': target specific option mismatch 273 | _mm256_cmpgt_epi32 (__m256i __A, __m256i __B) | ^~~~~~~~~~~~~~~~~~ main.cpp:619:34: note: called from here 619 | m256 cmp = _mm256_cmpgt_epi32(M0, ret); | ~~~~~~~~~~~~~~~~~~^~~~~~~~~ /home/linuxbrew/.linux
ソースコード
// This is O(N^2). Please forgive me.
/**
 * date : 2021-07-04 01:10:55
 */

// Must precede <cassert> so that assert() expands to nothing.
#define NDEBUG

// intrinsic
#include <immintrin.h>

#include <algorithm>
#include <array>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cfenv>
#include <cfloat>
#include <chrono>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <complex>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <initializer_list>
#include <iomanip>
#include <ios>
#include <iostream>
#include <istream>
#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ostream>
#include <queue>
#include <random>
#include <set>
#include <sstream>
#include <stack>
#include <streambuf>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// Placed after the includes so that namespace std is declared before it is
// named (the original relied on the compiler pre-declaring it).
using namespace std;

// utility
namespace Nyaan {
using ll = long long;
using i64 = long long;
using u64 = unsigned long long;
using i128 = __int128_t;
using u128 = __uint128_t;

template <typename T>
using V = vector<T>;
template <typename T>
using VV = vector<vector<T>>;
using vi = vector<int>;
using vl = vector<long long>;
using vd = V<double>;
using vs = V<string>;
using vvi = vector<vector<int>>;
using vvl = vector<vector<long long>>;

// std::pair with x()/y() accessors and component-wise +, -, *.
template <typename T, typename U>
struct P : pair<T, U> {
  template <typename... Args>
  P(Args... args) : pair<T, U>(args...) {}

  using pair<T, U>::first;
  using pair<T, U>::second;

  T &x() { return first; }
  const T &x() const { return first; }
  U &y() { return second; }
  const U &y() const { return second; }

  P &operator+=(const P &r) {
    first += r.first;
    second += r.second;
    return *this;
  }
  P &operator-=(const P &r) {
    first -= r.first;
    second -= r.second;
    return *this;
  }
  P &operator*=(const P &r) {
    first *= r.first;
    second *= r.second;
    return *this;
  }
  P operator+(const P &r) const { return P(*this) += r; }
  P operator-(const P &r) const { return P(*this) -= r; }
  P operator*(const P &r) const { return P(*this) *= r; }
};

using pl = P<ll, ll>;
using pi = P<int, int>;
using vp = V<pl>;

constexpr int inf = 1001001001;
constexpr long long infLL = 4004004004004004004LL;

template <typename T>
int sz(const T &t) {
  return t.size();
}

// x = min(x, y); returns true when x was updated.
template <typename T, typename U>
inline bool amin(T &x, U y) {
  return (y < x) ? (x = y, true) : false;
}
// x = max(x, y); returns true when x was updated.
template <typename T, typename U>
inline bool amax(T &x, U y) {
  return (x < y) ? (x = y, true) : false;
}

template <typename T>
inline T Max(const vector<T> &v) {
  return *max_element(begin(v), end(v));
}
template <typename T>
inline T Min(const vector<T> &v) {
  return *min_element(begin(v), end(v));
}
template <typename T>
inline long long Sum(const vector<T> &v) {
  return accumulate(begin(v), end(v), 0LL);
}

template <typename T>
int lb(const vector<T> &v, const T &a) {
  return lower_bound(begin(v), end(v), a) - begin(v);
}
template <typename T>
int ub(const vector<T> &v, const T &a) {
  return upper_bound(begin(v), end(v), a) - begin(v);
}

// 10^n by binary exponentiation.
constexpr long long TEN(int n) {
  long long ret = 1, x = 10;
  for (; n; x *= x, n >>= 1) ret *= (n & 1 ? x : 1);
  return ret;
}

template <typename T, typename U>
pair<T, U> mkp(const T &t, const U &u) {
  return make_pair(t, u);
}

// Prefix sums (size n + 1); suffix sums when rev is true.
template <typename T>
vector<T> mkrui(const vector<T> &v, bool rev = false) {
  vector<T> ret(v.size() + 1);
  if (rev) {
    for (int i = int(v.size()) - 1; i >= 0; i--) ret[i] = v[i] + ret[i + 1];
  } else {
    for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
  }
  return ret;
}

// Sorted copy of v with duplicates removed.
template <typename T>
vector<T> mkuni(const vector<T> &v) {
  vector<T> ret(v);
  sort(ret.begin(), ret.end());
  ret.erase(unique(ret.begin(), ret.end()), ret.end());
  return ret;
}

// Indices 0..N-1 sorted by comparator f.
template <typename F>
vector<int> mkord(int N, F f) {
  vector<int> ord(N);
  iota(begin(ord), end(ord), 0);
  sort(begin(ord), end(ord), f);
  return ord;
}

// inv[v[i]] = i; values not present in v map to -1.
template <typename T>
vector<int> mkinv(vector<T> &v) {
  int max_val = *max_element(begin(v), end(v));
  vector<int> inv(max_val + 1, -1);
  for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
  return inv;
}

}  // namespace Nyaan

// bit operation
namespace Nyaan {
__attribute__((target("popcnt"))) inline int popcnt(const u64 &a) {
  return _mm_popcnt_u64(a);
}
inline int lsb(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
inline int ctz(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
inline int msb(const u64 &a) { return a ? 63 - __builtin_clzll(a) : -1; }
template <typename T>
inline int gbit(const T &a, int i) {
  return (a >> i) & 1;
}
template <typename T>
inline void sbit(T &a, int i, bool b) {
  if (gbit(a, i) != b) a ^= T(1) << i;
}
constexpr long long PW(int n) { return 1LL << n; }
constexpr long long MSK(int n) { return (1LL << n) - 1; }
}  // namespace Nyaan

// inout
namespace Nyaan {

template <typename T, typename U>
ostream &operator<<(ostream &os, const pair<T, U> &p) {
  os << p.first << " " << p.second;
  return os;
}
template <typename T, typename U>
istream &operator>>(istream &is, pair<T, U> &p) {
  is >> p.first >> p.second;
  return is;
}

template <typename T>
ostream &operator<<(ostream &os, const vector<T> &v) {
  int s = (int)v.size();
  for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
  return os;
}
template <typename T>
istream &operator>>(istream &is, vector<T> &v) {
  for (auto &x : v) is >> x;
  return is;
}

void in() {}
template <typename T, class... U>
void in(T &t, U &... u) {
  cin >> t;
  in(u...);
}

void out() { cout << "\n"; }
template <typename T, class... U, char sep = ' '>
void out(const T &t, const U &... u) {
  cout << t;
  if (sizeof...(u)) cout << sep;
  out(u...);
}

void outr() {}
template <typename T, class... U, char sep = ' '>
void outr(const T &t, const U &... u) {
  cout << t;
  outr(u...);
}

// Configures the standard streams once, during static initialization.
struct IoSetupNya {
  IoSetupNya() {
    cin.tie(nullptr);
    ios::sync_with_stdio(false);
    cout << fixed << setprecision(15);
    cerr << fixed << setprecision(7);
  }
} iosetupnya;

}  // namespace Nyaan

// debug
namespace DebugImpl {

template <typename U, typename = void>
struct is_specialize : false_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, typename U::iterator, void>::type>
    : true_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, decltype(U::first), void>::type>
    : true_type {};
template <typename U>
struct is_specialize<U, enable_if_t<is_integral<U>::value, void>> : true_type {
};

void dump(const char& t) { cerr << t; }

void dump(const string& t) { cerr << t; }

void dump(const bool& t) { cerr << (t ? "true" : "false"); }

template <typename U,
          enable_if_t<!is_specialize<U>::value, nullptr_t> = nullptr>
void dump(const U& t) {
  cerr << t;
}

// Integral values equal to +-inf / +-infLL are printed symbolically.
template <typename T>
void dump(const T& t, enable_if_t<is_integral<T>::value>* = nullptr) {
  string res;
  if (t == Nyaan::inf) res = "inf";
  if constexpr (is_signed<T>::value) {
    if (t == -Nyaan::inf) res = "-inf";
  }
  if constexpr (sizeof(T) == 8) {
    if (t == Nyaan::infLL) res = "inf";
    if constexpr (is_signed<T>::value) {
      if (t == -Nyaan::infLL) res = "-inf";
    }
  }
  if (res.empty()) res = to_string(t);
  cerr << res;
}

template <typename T, typename U>
void dump(const pair<T, U>&);
template <typename T>
void dump(const pair<T*, int>&);

template <typename T>
void dump(const T& t,
          enable_if_t<!is_void<typename T::iterator>::value>* = nullptr) {
  cerr << "[ ";
  for (auto it = t.begin(); it != t.end();) {
    dump(*it);
    cerr << (++it == t.end() ? "" : ", ");
  }
  cerr << " ]";
}

template <typename T, typename U>
void dump(const pair<T, U>& t) {
  cerr << "( ";
  dump(t.first);
  cerr << ", ";
  dump(t.second);
  cerr << " )";
}

// (pointer, length) pair: dumps the first t.second elements.
template <typename T>
void dump(const pair<T*, int>& t) {
  cerr << "[ ";
  for (int i = 0; i < t.second; i++) {
    dump(t.first[i]);
    cerr << (i == t.second - 1 ? "" : ", ");
  }
  cerr << " ]";
}

void trace() { cerr << endl; }
template <typename Head, typename... Tail>
void trace(Head&& head, Tail&&... tail) {
  cerr << " ";
  dump(head);
  if (sizeof...(tail) != 0) cerr << ",";
  trace(forward<Tail>(tail)...);
}

}  // namespace DebugImpl

#ifdef NyaanDebug
#define trc(...)                            \
  do {                                      \
    cerr << "## " << #__VA_ARGS__ << " = "; \
    DebugImpl::trace(__VA_ARGS__);          \
  } while (0)
#else
#define trc(...) (void(0))
#endif

// macro
#define each(x, v) for (auto&& x : v)
#define each2(x, y, v) for (auto&& [x, y] : v)
#define all(v) (v).begin(), (v).end()
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define regr(i, a, b) for (long long i = (b)-1; i >= (a); i--)
#define fi first
#define se second
#define ini(...)   \
  int __VA_ARGS__; \
  in(__VA_ARGS__)
#define inl(...)         \
  long long __VA_ARGS__; \
  in(__VA_ARGS__)
#define ins(...)      \
  string __VA_ARGS__; \
  in(__VA_ARGS__)
#define in2(s, t)                           \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i]);                         \
  }
#define in3(s, t, u)                        \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i]);                   \
  }
#define in4(s, t, u, v)                     \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i], v[i]);             \
  }
#define die(...)             \
  do {                       \
    Nyaan::out(__VA_ARGS__); \
    return;                  \
  } while (0)

namespace Nyaan {
void solve();
}
int main() { Nyaan::solve(); }

//

// Wall-clock stopwatch; elapsed() returns milliseconds since the last reset().
struct Timer {
  chrono::high_resolution_clock::time_point st;

  Timer() { reset(); }

  void reset() { st = chrono::high_resolution_clock::now(); }

  chrono::milliseconds::rep elapsed() {
    auto ed = chrono::high_resolution_clock::now();
    return chrono::duration_cast<chrono::milliseconds>(ed - st).count();
  }
};

// Modular integer stored in Montgomery form. The stored value `a` is kept
// lazily reduced (operator+=/-= correct by 2 * mod, and comparisons/get()
// subtract mod once more), hence the mod < 2^30 requirement.
template <uint32_t mod>
struct LazyMontgomeryModInt {
  using mint = LazyMontgomeryModInt;
  using i32 = int32_t;
  using u32 = uint32_t;
  using u64 = uint64_t;

  // Inverse of mod modulo 2^32 (checked by the static_assert below).
  static constexpr u32 get_r() {
    u32 ret = mod;
    for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
    return ret;
  }

  static constexpr u32 r = get_r();
  // 2^64 mod `mod`; multiplying by it and reducing converts into Montgomery
  // form.
  static constexpr u32 n2 = -u64(mod) % mod;
  static_assert(r * mod == 1, "invalid, r * mod != 1");
  static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
  static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");

  u32 a;

  constexpr LazyMontgomeryModInt() : a(0) {}
  constexpr LazyMontgomeryModInt(const int64_t &b)
      : a(reduce(u64(b % mod + mod) * n2)){};

  static constexpr u32 reduce(const u64 &b) {
    return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
  }

  constexpr mint &operator+=(const mint &b) {
    if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
    return *this;
  }

  constexpr mint &operator-=(const mint &b) {
    if (i32(a -= b.a) < 0) a += 2 * mod;
    return *this;
  }

  constexpr mint &operator*=(const mint &b) {
    a = reduce(u64(a) * b.a);
    return *this;
  }

  constexpr mint &operator/=(const mint &b) {
    *this *= b.inverse();
    return *this;
  }

  constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
  constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
  constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
  constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
  constexpr bool operator==(const mint &b) const {
    return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
  }
  constexpr bool operator!=(const mint &b) const {
    return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
  }
  constexpr mint operator-() const { return mint() - mint(*this); }

  constexpr mint pow(u64 n) const {
    mint ret(1), mul(*this);
    while (n > 0) {
      if (n & 1) ret *= mul;
      mul *= mul;
      n >>= 1;
    }
    return ret;
  }

  // Computed as this^(mod - 2).
  constexpr mint inverse() const { return pow(mod - 2); }

  friend ostream &operator<<(ostream &os, const mint &b) {
    return os << b.get();
  }

  friend istream &operator>>(istream &is, mint &b) {
    int64_t t;
    is >> t;
    b = LazyMontgomeryModInt<mod>(t);
    return (is);
  }

  // Leaves Montgomery form and returns the value in [0, mod).
  constexpr u32 get() const {
    u32 ret = reduce(a);
    return ret >= mod ? ret - mod : ret;
  }

  static constexpr u32 get_mod() { return mod; }
};

// NOTE(review): a GCC 12 rejudge of this program reportedly failed with
// "inlining failed in call to 'always_inline' ... target specific option
// mismatch" for the AVX2 intrinsics inside mmint's friend operators, i.e.
// those functions were apparently compiled without AVX2 despite the pragma
// below. Root cause not confirmed from here — TODO investigate before relying
// on these pragmas with that toolchain.
#pragma GCC optimize("O3,unroll-loops")
#pragma GCC target("avx2")

using m256 = __m256i;

// Eight 32-bit lanes of Montgomery-form residues packed in one __m256i.
// set_mod<mint>() must be called before any arithmetic so that the static
// constants below are populated.
struct alignas(32) mmint {
  m256 x;
  // R: mint::r, M0: zero, M1: mod, M2: 2 * mod, N2: mint::n2 (each broadcast
  // to all lanes by set_mod).
  static mmint R, M0, M1, M2, N2;

  mmint() : x() {}
  inline mmint(const m256& _x) : x(_x) {}
  inline mmint(unsigned int a) : x(_mm256_set1_epi32(a)) {}
  inline mmint(unsigned int a0, unsigned int a1, unsigned int a2,
               unsigned int a3, unsigned int a4, unsigned int a5,
               unsigned int a6, unsigned int a7)
      : x(_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0)) {}
  inline operator m256&() { return x; }
  inline operator const m256&() const { return x; }
  inline int& operator[](int i) { return *(reinterpret_cast<int*>(&x) + i); }
  inline const int& operator[](int i) const {
    return *(reinterpret_cast<const int*>(&x) + i);
  }

  // Prints the eight lanes, converted out of Montgomery form, space-separated.
  friend ostream& operator<<(ostream& os, const mmint& m) {
    unsigned r = R[0], mod = M1[0];
    auto reduce1 = [&](const uint64_t& b) {
      unsigned res = (b + uint64_t(unsigned(b) * unsigned(-r)) * mod) >> 32;
      return res >= mod ? res - mod : res;
    };
    for (int i = 0; i < 8; i++) {
      os << reduce1(m[i]) << (i == 7 ? "" : " ");
    }
    return os;
  }

  template <typename mint>
  static void set_mod() {
    R = _mm256_set1_epi32(mint::r);
    M0 = _mm256_setzero_si256();
    M1 = _mm256_set1_epi32(mint::get_mod());
    M2 = _mm256_set1_epi32(mint::get_mod() * 2);
    N2 = _mm256_set1_epi32(mint::n2);
  }

  // Montgomery reduction of eight 64-bit products: prod02 holds the products
  // of the even 32-bit lanes, prod13 those of the odd lanes.
  static inline mmint reduce(const mmint& prod02, const mmint& prod13) {
    m256 unpalo = _mm256_unpacklo_epi32(prod02, prod13);
    m256 unpahi = _mm256_unpackhi_epi32(prod02, prod13);
    m256 prodlo = _mm256_unpacklo_epi64(unpalo, unpahi);
    m256 prodhi = _mm256_unpackhi_epi64(unpalo, unpahi);
    m256 hiplm1 = _mm256_add_epi32(prodhi, M1);
    m256 prodlohi = _mm256_shuffle_epi32(prodlo, 0xF5);
    m256 lmlr02 = _mm256_mul_epu32(prodlo, R);
    m256 lmlr13 = _mm256_mul_epu32(prodlohi, R);
    m256 prod02_ = _mm256_mul_epu32(lmlr02, M1);
    m256 prod13_ = _mm256_mul_epu32(lmlr13, M1);
    m256 unpalo_ = _mm256_unpacklo_epi32(prod02_, prod13_);
    m256 unpahi_ = _mm256_unpackhi_epi32(prod02_, prod13_);
    m256 prod = _mm256_unpackhi_epi64(unpalo_, unpahi_);
    return _mm256_sub_epi32(hiplm1, prod);
  }

  // Plain integers -> Montgomery form.
  static inline mmint itom(const mmint& A) { return A * N2; }

  // Montgomery form -> plain integers.
  static inline mmint mtoi(const mmint& A) {
    m256 A13 = _mm256_shuffle_epi32(A, 0xF5);
    m256 lmlr02 = _mm256_mul_epu32(A, R);
    m256 lmlr13 = _mm256_mul_epu32(A13, R);
    m256 prod02_ = _mm256_mul_epu32(lmlr02, M1);
    m256 prod13_ = _mm256_mul_epu32(lmlr13, M1);
    m256 unpalo_ = _mm256_unpacklo_epi32(prod02_, prod13_);
    m256 unpahi_ = _mm256_unpackhi_epi32(prod02_, prod13_);
    m256 prod = _mm256_unpackhi_epi64(unpalo_, unpahi_);
    m256 cmp = _mm256_cmpgt_epi32(prod, M0);
    m256 dif = _mm256_and_si256(cmp, M1);
    return _mm256_sub_epi32(dif, prod);
  }

  friend inline mmint operator+(const mmint& A, const mmint& B) {
    m256 apb = _mm256_add_epi32(A, B);
    m256 ret = _mm256_sub_epi32(apb, M2);
    m256 cmp = _mm256_cmpgt_epi32(M0, ret);
    m256 add = _mm256_and_si256(cmp, M2);
    return _mm256_add_epi32(add, ret);
  }

  friend inline mmint operator-(const mmint& A, const mmint& B) {
    m256 ret = _mm256_sub_epi32(A, B);
    m256 cmp = _mm256_cmpgt_epi32(M0, ret);
    m256 add = _mm256_and_si256(cmp, M2);
    return _mm256_add_epi32(add, ret);
  }

  friend inline mmint operator*(const mmint& A, const mmint& B) {
    m256 a13 = _mm256_shuffle_epi32(A, 0xF5);
    m256 b13 = _mm256_shuffle_epi32(B, 0xF5);
    m256 prod02 = _mm256_mul_epu32(A, B);
    m256 prod13 = _mm256_mul_epu32(a13, b13);
    return reduce(prod02, prod13);
  }

  inline mmint& operator+=(const mmint& A) { return (*this) = (*this) + A; }
  inline mmint& operator-=(const mmint& A) { return (*this) = (*this) - A; }
  inline mmint& operator*=(const mmint& A) { return (*this) = (*this) * A; }

  // Bitwise comparison of the raw lanes (no normalization is applied).
  bool operator==(const mmint& A) const {
    m256 sub = _mm256_sub_epi32(x, A.x);
    return _mm256_testz_si256(sub, sub) == 1;
  }
  bool operator!=(const mmint& A) const { return !((*this) == A); }
};
__attribute__((aligned(32))) mmint mmint::R;
__attribute__((aligned(32))) mmint mmint::M0, mmint::M1, mmint::M2, mmint::N2;

/**
 * @brief vectorize modint
 */

// Lazily growing tables: f[i] = i!, g[i] = 1 / i!, h[i] = 1 / i.
// The tables double in size whenever an index beyond the current end is
// requested.
template <typename T>
struct Binomial {
  vector<T> f, g, h;
  Binomial(int MAX = 0) : f(1, T(1)), g(1, T(1)), h(1, T(1)) {
    while (MAX >= (int)f.size()) extend();
  }

  void extend() {
    int n = f.size();
    int m = n * 2;
    f.resize(m);
    g.resize(m);
    h.resize(m);
    for (int i = n; i < m; i++) f[i] = f[i - 1] * T(i);
    // One modular inverse per doubling; the rest is filled in backwards.
    g[m - 1] = f[m - 1].inverse();
    h[m - 1] = g[m - 1] * f[m - 2];
    for (int i = m - 2; i >= n; i--) {
      g[i] = g[i + 1] * T(i + 1);
      h[i] = g[i] * f[i - 1];
    }
  }

  T fac(int i) {
    if (i < 0) return T(0);
    while (i >= (int)f.size()) extend();
    return f[i];
  }

  T finv(int i) {
    if (i < 0) return T(0);
    while (i >= (int)g.size()) extend();
    return g[i];
  }

  T inv(int i) {
    if (i < 0) return -inv(-i);
    while (i >= (int)h.size()) extend();
    return h[i];
  }

  T C(int n, int r) {
    if (n < 0 || n < r || r < 0) return T(0);
    return fac(n) * finv(n - r) * finv(r);
  }

  inline T operator()(int n, int r) { return C(n, r); }

  // Binomial coefficient via min(r, n - r) multiplications, without fac(n).
  T C_naive(int n, int r) {
    if (n < 0 || n < r || r < 0) return T(0);
    T ret = T(1);
    r = min(r, n - r);
    for (int i = 1; i <= r; ++i) ret *= inv(i) * (n--);
    return ret;
  }

  T P(int n, int r) {
    if (n < 0 || n < r || r < 0) return T(0);
    return fac(n) * finv(n - r);
  }

  T H(int n, int r) {
    if (n < 0 || r < 0) return T(0);
    return r == 0 ? 1 : C(n + r - 1, r);
  }
};

using namespace Nyaan;
using mint = LazyMontgomeryModInt<998244353>;
using vm = vector<mint>;
Binomial<mint> C;

// Vectorized work buffers, 8 residues per element. DP and NX are accessed
// through pointers offset by one element so that index -1 stays in bounds.
mmint F[13000];
mmint DP[13000];
mmint NX[13000];

// Reads N and the array a, then prints N + 1 lines: the weighted sum
// coe * sum_i f[i] * dp[i] for the initial dp = a and after each of N
// neighbour-mixing transitions of dp.
void Nyaan::solve() {
  Timer timer;
  mmint::set_mod<mint>();
  // Each buffer is cleared with its own size (the original passed sizeof(F)
  // for all three, which only worked because the arrays have equal sizes).
  memset(F, 0, sizeof(F));
  memset(DP, 0, sizeof(DP));
  memset(NX, 0, sizeof(NX));

  ini(N);
  vi a(N);
  in(a);

  // calc coeff
  vm f(N);
  // everything except the right end
  f[0] = mint(1) / 2;
  reg(i, 1, N - 1) {
    f[i] = C.finv(i + 1) * (C.inv(2) + i) * (C.inv(2).pow(i));
    f[i] += f[i - 1];
  }
  // right end
  {
    mint buf = 1;
    for (int i = 0; i < N; i++) {
      f.back() += buf;
      buf *= C.inv(2) * C.inv(i + 1);
    }
  }

  // Offset by one vector so that the unaligned load at element index -1 in
  // the transition below reads zero padding instead of out-of-bounds memory.
  mmint* dp = DP + 1;
  mmint* nx = NX + 1;
  for (int i = 0; i < N; i++) F[i / 8][i % 8] = f[i].a;
  for (int i = 0; i < N; i++) dp[i / 8][i % 8] = mint(a[i]).a;

  mint coe = mint(2).pow(1LL * 2 * (N - 1)) * C.fac(N - 1);

  m256 MOD = _mm256_set1_epi32(static_cast<int>(mint::get_mod()));
  mmint th1, th2, zero = _mm256_setzero_si256();
  // Viewed as 64-bit lanes, th1 = mod << 32 and th2 = (2 * mod) << 32.
  th1[1] = th1[3] = th1[5] = th1[7] = mmint::M1[0];
  th2[1] = th2[3] = th2[5] = th2[7] = mmint::M2[0];

  // Subtracts MOD from every lane that is greater than MOD.
  auto normalize = [&MOD](const mmint& data) -> m256 {
    m256 flag = _mm256_cmpgt_epi32(data, MOD);
    m256 dif = _mm256_and_si256(flag, MOD);
    return _mm256_sub_epi32(data, dif);
  };

// Four independent pairs of 64-bit accumulators (even lanes / odd lanes).
#define INIT_X(x)                          \
  m256 prod02##x = _mm256_setzero_si256(); \
  m256 prod13##x = _mm256_setzero_si256()

// Accumulates the lane-wise 64-bit products F[j + x] * dp[j + x].
#define ADD(x)                                          \
  m256 f02##x = normalize(F[j + x]);                    \
  m256 f13##x = _mm256_shuffle_epi32(f02##x, 0xF5);     \
  m256 dp02##x = normalize(dp[j + x]);                  \
  m256 dp13##x = _mm256_shuffle_epi32(dp02##x, 0xF5);   \
  m256 fd02##x = _mm256_mul_epi32(f02##x, dp02##x);     \
  m256 fd13##x = _mm256_mul_epi32(f13##x, dp13##x);     \
  prod02##x = _mm256_add_epi64(prod02##x, fd02##x);     \
  prod13##x = _mm256_add_epi64(prod13##x, fd13##x)

// Subtracts th2 from every 64-bit accumulator that has become negative.
#define COMP(x)                                          \
  m256 cmp02##x = _mm256_cmpgt_epi64(zero, prod02##x);   \
  m256 cmp13##x = _mm256_cmpgt_epi64(zero, prod13##x);   \
  m256 dif02##x = _mm256_and_si256(cmp02##x, th2);       \
  m256 dif13##x = _mm256_and_si256(cmp13##x, th2);       \
  prod02##x = _mm256_sub_epi64(prod02##x, dif02##x);     \
  prod13##x = _mm256_sub_epi64(prod13##x, dif13##x)

// Conditionally subtracts th1 twice, then Montgomery-reduces into buf.
#define REDUCE(x)                                        \
  for (int _ = 0; _ < 2; _++) {                          \
    m256 cmp02 = _mm256_cmpgt_epi64(prod02##x, th1);     \
    m256 cmp13 = _mm256_cmpgt_epi64(prod13##x, th1);     \
    m256 dif02 = _mm256_and_si256(cmp02, th1);           \
    m256 dif13 = _mm256_and_si256(cmp13, th1);           \
    prod02##x = _mm256_sub_epi64(prod02##x, dif02);      \
    prod13##x = _mm256_sub_epi64(prod13##x, dif13);      \
  }                                                      \
  buf += mmint::reduce(prod02##x, prod13##x)

  // Prints coe * sum_i F[i] * dp[i] for the current dp.
  auto done = [&]() {
    INIT_X(0);
    INIT_X(1);
    INIT_X(2);
    INIT_X(3);
    // Processes 32 vectors per outer step; the zero padding past N makes the
    // overrun harmless.
    for (int i = 0; i < N / 8 + 32; i += 32) {
      for (int j = i; j < i + 32; j += 4) {
        ADD(0);
        ADD(1);
        ADD(2);
        ADD(3);
      }
      COMP(0);
      COMP(1);
      COMP(2);
      COMP(3);
    }
    mmint buf{0};
    REDUCE(0);
    REDUCE(1);
    REDUCE(2);
    REDUCE(3);
    buf = mmint::mtoi(buf);
    mint res = 0;
    rep(i, 8) res += buf[i];
    out(res * coe);
  };

  done();

  mint Nm2 = N - 2;
  mint Nm3 = N - 3;

  rep(_, N) {
    // Scalar views of the vector buffers, used for the shifted loads and the
    // two boundary fix-ups.
    mint* dp0 = reinterpret_cast<mint*>(dp);
    mint* nx0 = reinterpret_cast<mint*>(nx);
    mmint NM3{Nm3.a};
    // nx[k] = (N - 3) * dp[k] + dp[k - 1] + dp[k + 1] for all k at once.
    for (int i = 0; i < N / 8 + 3; i++) {
      __m256i p1 = _mm256_loadu_si256((__m256i*)(dp0 + i * 8 - 1));
      __m256i p2 = _mm256_loadu_si256((__m256i*)(dp0 + i * 8 + 1));
      nx[i] = dp[i] * NM3 + p1 + p2;
    }
    // Both ends have a single neighbour and use the factor N - 2 instead.
    nx0[0] = dp0[1] + dp0[0] * Nm2;
    nx0[N - 1] = dp0[N - 2] + dp0[N - 1] * Nm2;
    swap(dp, nx);
    trc(dp);
    done();
  }
  cerr << timer.elapsed() << "\n";
}