結果

問題 No.1303 Inconvenient Kingdom
ユーザー NyaanNyaan
提出日時 2020-11-27 22:06:56
言語 C++17
(gcc 13.3.0 + boost 1.87.0)
結果
AC  
実行時間 269 ms / 3,000 ms
コード長 59,353 bytes
コンパイル時間 5,625 ms
コンパイル使用メモリ 361,616 KB
最終ジャッジ日時 2025-01-16 07:27:56
ジャッジサーバーID
(参考情報)
judge5 / judge5
このコードへのチャレンジ
(要ログイン)
ファイルパターン 結果
sample AC * 4
other AC * 34
権限があれば一括ダウンロードができます
コンパイルメッセージ
main.cpp:381:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  381 | montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:373:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  373 | montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:365:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  365 | montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:354:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  354 | my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:349:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  349 | my256_mullo_epu32(const __m256i &a, const __m256i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:342:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  342 | montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:335:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  335 | montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:327:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  327 | montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:316:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  316 | my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:311:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  311 | my128_mullo_epu32(const __m128i &a, const __m128i &b) {
      | ^~~~~~~~~~~~~~~~~

ソースコード

diff #
プレゼンテーションモードにする

/**
* date : 2020-11-27 22:06:51
*/
#pragma region kyopro_template
#define Nyaan_template
#include <immintrin.h>
#include <bits/stdc++.h>
#define pb push_back
#define eb emplace_back
#define fi first
#define se second
#define each(x, v) for (auto &x : v)
#define all(v) (v).begin(), (v).end()
#define sz(v) ((int)(v).size())
#define mem(a, val) memset(a, val, sizeof(a))
#define ini(...) \
int __VA_ARGS__; \
in(__VA_ARGS__)
#define inl(...) \
long long __VA_ARGS__; \
in(__VA_ARGS__)
#define ins(...) \
string __VA_ARGS__; \
in(__VA_ARGS__)
#define inc(...) \
char __VA_ARGS__; \
in(__VA_ARGS__)
#define in2(s, t) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i]); \
}
#define in3(s, t, u) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i], u[i]); \
}
#define in4(s, t, u, v) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i], u[i], v[i]); \
}
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define die(...) \
do { \
out(__VA_ARGS__); \
return; \
} while (0)
using namespace std;
using ll = long long;
template <class T>
using V = vector<T>;
using vi = vector<int>;
using vl = vector<long long>;
using vvi = vector<vector<int>>;
using vd = V<double>;
using vs = V<string>;
using vvl = vector<vector<long long>>;
using P = pair<long long, long long>;
using vp = vector<P>;
using pii = pair<int, int>;
using vpi = vector<pair<int, int>>;
constexpr int inf = 1001001001;
constexpr long long infLL = (1LL << 61) - 1;
template <typename T, typename U>
inline bool amin(T &x, U y) {
return (y < x) ? (x = y, true) : false;
}
template <typename T, typename U>
inline bool amax(T &x, U y) {
return (x < y) ? (x = y, true) : false;
}
template <typename T, typename U>
ostream &operator<<(ostream &os, const pair<T, U> &p) {
os << p.first << " " << p.second;
return os;
}
template <typename T, typename U>
istream &operator>>(istream &is, pair<T, U> &p) {
is >> p.first >> p.second;
return is;
}
template <typename T>
ostream &operator<<(ostream &os, const vector<T> &v) {
int s = (int)v.size();
for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
return os;
}
template <typename T>
istream &operator>>(istream &is, vector<T> &v) {
for (auto &x : v) is >> x;
return is;
}
void in() {}
template <typename T, class... U>
void in(T &t, U &... u) {
cin >> t;
in(u...);
}
void out() { cout << "\n"; }
template <typename T, class... U>
void out(const T &t, const U &... u) {
cout << t;
if (sizeof...(u)) cout << " ";
out(u...);
}
#ifdef NyaanDebug
#define trc(...) \
do { \
cerr << #__VA_ARGS__ << " = "; \
dbg_out(__VA_ARGS__); \
} while (0)
#define trca(v, N) \
do { \
cerr << #v << " = "; \
array_out(v, N); \
} while (0)
#define trcc(v) \
do { \
cerr << #v << " = {"; \
each(x, v) { cerr << " " << x << ","; } \
cerr << "}" << endl; \
} while (0)
template <typename T>
void _cout(const T &c) {
cerr << c;
}
void _cout(const int &c) {
if (c == 1001001001)
cerr << "inf";
else if (c == -1001001001)
cerr << "-inf";
else
cerr << c;
}
void _cout(const unsigned int &c) {
if (c == 1001001001)
cerr << "inf";
else
cerr << c;
}
void _cout(const long long &c) {
if (c == 1001001001 || c == (1LL << 61) - 1)
cerr << "inf";
else if (c == -1001001001 || c == -((1LL << 61) - 1))
cerr << "-inf";
else
cerr << c;
}
void _cout(const unsigned long long &c) {
if (c == 1001001001 || c == (1LL << 61) - 1)
cerr << "inf";
else
cerr << c;
}
template <typename T, typename U>
void _cout(const pair<T, U> &p) {
cerr << "{ ";
_cout(p.fi);
cerr << ", ";
_cout(p.se);
cerr << " } ";
}
template <typename T>
void _cout(const vector<T> &v) {
int s = v.size();
cerr << "{ ";
for (int i = 0; i < s; i++) {
cerr << (i ? ", " : "");
_cout(v[i]);
}
cerr << " } ";
}
template <typename T>
void _cout(const vector<vector<T>> &v) {
cerr << "[ ";
for (const auto &x : v) {
cerr << endl;
_cout(x);
cerr << ", ";
}
cerr << endl << " ] ";
}
void dbg_out() { cerr << endl; }
template <typename T, class... U>
void dbg_out(const T &t, const U &... u) {
_cout(t);
if (sizeof...(u)) cerr << ", ";
dbg_out(u...);
}
template <typename T>
void array_out(const T &v, int s) {
cerr << "{ ";
for (int i = 0; i < s; i++) {
cerr << (i ? ", " : "");
_cout(v[i]);
}
cerr << " } " << endl;
}
template <typename T>
void array_out(const T &v, int H, int W) {
cerr << "[ ";
for (int i = 0; i < H; i++) {
cerr << (i ? ", " : "");
array_out(v[i], W);
}
cerr << " ] " << endl;
}
#else
#define trc(...)
#define trca(...)
#define trcc(...)
#endif
inline int popcnt(unsigned long long a) { return __builtin_popcountll(a); }
inline int lsb(unsigned long long a) { return __builtin_ctzll(a); }
inline int msb(unsigned long long a) { return 63 - __builtin_clzll(a); }
template <typename T>
inline int getbit(T a, int i) {
return (a >> i) & 1;
}
template <typename T>
inline void setbit(T &a, int i) {
a |= (1LL << i);
}
template <typename T>
inline void delbit(T &a, int i) {
a &= ~(1LL << i);
}
template <typename T>
int lb(const vector<T> &v, const T &a) {
return lower_bound(begin(v), end(v), a) - begin(v);
}
template <typename T>
int ub(const vector<T> &v, const T &a) {
return upper_bound(begin(v), end(v), a) - begin(v);
}
template <typename T>
int btw(T a, T x, T b) {
return a <= x && x < b;
}
template <typename T, typename U>
T ceil(T a, U b) {
return (a + b - 1) / b;
}
constexpr long long TEN(int n) {
long long ret = 1, x = 10;
while (n) {
if (n & 1) ret *= x;
x *= x;
n >>= 1;
}
return ret;
}
template <typename T>
vector<T> mkrui(const vector<T> &v) {
vector<T> ret(v.size() + 1);
for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
return ret;
};
template <typename T>
vector<T> mkuni(const vector<T> &v) {
vector<T> ret(v);
sort(ret.begin(), ret.end());
ret.erase(unique(ret.begin(), ret.end()), ret.end());
return ret;
}
template <typename F>
vector<int> mkord(int N, F f) {
vector<int> ord(N);
iota(begin(ord), end(ord), 0);
sort(begin(ord), end(ord), f);
return ord;
}
template <typename T = int>
vector<T> mkiota(int N) {
vector<T> ret(N);
iota(begin(ret), end(ret), 0);
return ret;
}
template <typename T>
vector<int> mkinv(vector<T> &v) {
vector<int> inv(v.size());
for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
return inv;
}
struct IoSetupNya {
IoSetupNya() {
cin.tie(nullptr);
ios::sync_with_stdio(false);
cout << fixed << setprecision(15);
cerr << fixed << setprecision(7);
}
} iosetupnya;
void solve();
int main() { solve(); }
#pragma endregion
using namespace std;
using namespace std;
using namespace std;
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
my128_mullo_epu32(const __m128i &a, const __m128i &b) {
return _mm_mullo_epi32(a, b);
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
__m128i a13 = _mm_shuffle_epi32(a, 0xF5);
__m128i b13 = _mm_shuffle_epi32(b, 0xF5);
__m128i prod02 = _mm_mul_epu32(a, b);
__m128i prod13 = _mm_mul_epu32(a13, b13);
__m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13),
_mm_unpackhi_epi32(prod02, prod13));
return prod;
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
const __m128i &m1) {
return _mm_sub_epi32(
_mm_add_epi32(my128_mulhi_epu32(a, b), m1),
my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1));
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
const __m128i &m0) {
__m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2);
return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
const __m128i &m0) {
__m128i ret = _mm_sub_epi32(a, b);
return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
my256_mullo_epu32(const __m256i &a, const __m256i &b) {
return _mm256_mullo_epi32(a, b);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
__m256i a13 = _mm256_shuffle_epi32(a, 0xF5);
__m256i b13 = _mm256_shuffle_epi32(b, 0xF5);
__m256i prod02 = _mm256_mul_epu32(a, b);
__m256i prod13 = _mm256_mul_epu32(a13, b13);
__m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13),
_mm256_unpackhi_epi32(prod02, prod13));
return prod;
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
const __m256i &m1) {
return _mm256_sub_epi32(
_mm256_add_epi32(my256_mulhi_epu32(a, b), m1),
my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1));
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
const __m256i &m0) {
__m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);
return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
ret);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
const __m256i &m0) {
__m256i ret = _mm256_sub_epi32(a, b);
return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
ret);
}
constexpr int SZ_FFT_BUF = 1 << 23;
uint32_t buf1_[SZ_FFT_BUF] __attribute__((aligned(64)));
uint32_t buf2_[SZ_FFT_BUF] __attribute__((aligned(64)));
template <typename mint>
struct NTT {
static constexpr uint32_t get_pr() {
uint32_t mod = mint::get_mod();
using u64 = uint64_t;
u64 ds[32] = {};
int idx = 0;
u64 m = mod - 1;
for (u64 i = 2; i * i <= m; ++i) {
if (m % i == 0) {
ds[idx++] = i;
while (m % i == 0) m /= i;
}
}
if (m != 1) ds[idx++] = m;
uint32_t pr = 2;
while (1) {
int flg = 1;
for (int i = 0; i < idx; ++i) {
u64 a = pr, b = (mod - 1) / ds[i], r = 1;
while (b) {
if (b & 1) r = r * a % mod;
a = a * a % mod;
b >>= 1;
}
if (r == 1) {
flg = 0;
break;
}
}
if (flg == 1) break;
++pr;
}
return pr;
};
static constexpr uint32_t mod = mint::get_mod();
static constexpr uint32_t pr = get_pr();
static constexpr int level = __builtin_ctzll(mod - 1);
mint dw[level], dy[level];
mint *buf1, *buf2;
constexpr NTT() {
setwy(level);
buf1 = reinterpret_cast<mint *>(::buf1_);
buf2 = reinterpret_cast<mint *>(::buf2_);
}
constexpr void setwy(int k) {
mint w[level], y[level];
w[k - 1] = mint(pr).pow((mod - 1) / (1 << k));
y[k - 1] = w[k - 1].inverse();
for (int i = k - 2; i > 0; --i)
w[i] = w[i + 1] * w[i + 1], y[i] = y[i + 1] * y[i + 1];
dw[0] = dy[0] = w[1] * w[1];
dw[1] = w[1], dy[1] = y[1], dw[2] = w[2], dy[2] = y[2];
for (int i = 3; i < k; ++i) {
dw[i] = dw[i - 1] * y[i - 2] * w[i];
dy[i] = dy[i - 1] * w[i - 2] * y[i];
}
}
__attribute__((target("avx2"))) void ntt(mint *a, int n) {
int k = n ? __builtin_ctz(n) : 0;
if (k == 0) return;
if (k == 1) {
mint a1 = a[1];
a[1] = a[0] - a[1];
a[0] = a[0] + a1;
return;
}
if (k & 1) {
int v = 1 << (k - 1);
if (v < 8) {
for (int j = 0; j < v; ++j) {
mint ajv = a[j + v];
a[j + v] = a[j] - ajv;
a[j] += ajv;
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
int j0 = 0;
int j1 = v;
for (; j0 < v; j0 += 8, j1 += 8) {
__m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
__m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
__m256i naj = montgomery_add_256(T0, T1, m2, m0);
__m256i najv = montgomery_sub_256(T0, T1, m2, m0);
_mm256_storeu_si256((__m256i *)(a + j0), naj);
_mm256_storeu_si256((__m256i *)(a + j1), najv);
}
}
}
int u = 1 << (2 + (k & 1));
int v = 1 << (k - 2 - (k & 1));
mint one = mint(1);
mint imag = dw[1];
while (v) {
if (v == 1) {
mint ww = one, xx = one, wx = one;
for (int jh = 0; jh < u;) {
ww = xx * xx, wx = ww * xx;
mint t0 = a[jh + 0], t1 = a[jh + 1] * xx;
mint t2 = a[jh + 2] * ww, t3 = a[jh + 3] * wx;
mint t0p2 = t0 + t2, t1p3 = t1 + t3;
mint t0m2 = t0 - t2, t1m3 = (t1 - t3) * imag;
a[jh + 0] = t0p2 + t1p3, a[jh + 1] = t0p2 - t1p3;
a[jh + 2] = t0m2 + t1m3, a[jh + 3] = t0m2 - t1m3;
xx *= dw[__builtin_ctz((jh += 4))];
}
} else if (v == 4) {
const __m128i m0 = _mm_set1_epi32(0);
const __m128i m1 = _mm_set1_epi32(mod);
const __m128i m2 = _mm_set1_epi32(mod + mod);
const __m128i r = _mm_set1_epi32(mint::r);
const __m128i Imag = _mm_set1_epi32(imag.a);
mint ww = one, xx = one, wx = one;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = v;
for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i T0P2 = montgomery_add_128(T0, T2, m2, m0);
const __m128i T1P3 = montgomery_add_128(T1, T3, m2, m0);
const __m128i T0M2 = montgomery_sub_128(T0, T2, m2, m0);
const __m128i T1M3 = montgomery_mul_128(
montgomery_sub_128(T1, T3, m2, m0), Imag, r, m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_sub_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j2),
montgomery_add_128(T0M2, T1M3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j3),
montgomery_sub_128(T0M2, T1M3, m2, m0));
}
} else {
ww = xx * xx, wx = ww * xx;
const __m128i WW = _mm_set1_epi32(ww.a);
const __m128i WX = _mm_set1_epi32(wx.a);
const __m128i XX = _mm_set1_epi32(xx.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i MT1 = montgomery_mul_128(T1, XX, r, m1);
const __m128i MT2 = montgomery_mul_128(T2, WW, r, m1);
const __m128i MT3 = montgomery_mul_128(T3, WX, r, m1);
const __m128i T0P2 = montgomery_add_128(T0, MT2, m2, m0);
const __m128i T1P3 = montgomery_add_128(MT1, MT3, m2, m0);
const __m128i T0M2 = montgomery_sub_128(T0, MT2, m2, m0);
const __m128i T1M3 = montgomery_mul_128(
montgomery_sub_128(MT1, MT3, m2, m0), Imag, r, m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_sub_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j2),
montgomery_add_128(T0M2, T1M3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j3),
montgomery_sub_128(T0M2, T1M3, m2, m0));
}
}
xx *= dw[__builtin_ctz((jh += 4))];
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i Imag = _mm256_set1_epi32(imag.a);
mint ww = one, xx = one, wx = one;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = v;
for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i T0P2 = montgomery_add_256(T0, T2, m2, m0);
const __m256i T1P3 = montgomery_add_256(T1, T3, m2, m0);
const __m256i T0M2 = montgomery_sub_256(T0, T2, m2, m0);
const __m256i T1M3 = montgomery_mul_256(
montgomery_sub_256(T1, T3, m2, m0), Imag, r, m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_sub_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j2),
montgomery_add_256(T0M2, T1M3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j3),
montgomery_sub_256(T0M2, T1M3, m2, m0));
}
} else {
ww = xx * xx, wx = ww * xx;
const __m256i WW = _mm256_set1_epi32(ww.a);
const __m256i WX = _mm256_set1_epi32(wx.a);
const __m256i XX = _mm256_set1_epi32(xx.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i MT1 = montgomery_mul_256(T1, XX, r, m1);
const __m256i MT2 = montgomery_mul_256(T2, WW, r, m1);
const __m256i MT3 = montgomery_mul_256(T3, WX, r, m1);
const __m256i T0P2 = montgomery_add_256(T0, MT2, m2, m0);
const __m256i T1P3 = montgomery_add_256(MT1, MT3, m2, m0);
const __m256i T0M2 = montgomery_sub_256(T0, MT2, m2, m0);
const __m256i T1M3 = montgomery_mul_256(
montgomery_sub_256(MT1, MT3, m2, m0), Imag, r, m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_sub_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j2),
montgomery_add_256(T0M2, T1M3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j3),
montgomery_sub_256(T0M2, T1M3, m2, m0));
}
}
xx *= dw[__builtin_ctz((jh += 4))];
}
}
u <<= 2;
v >>= 2;
}
}
__attribute__((target("avx2"))) void intt(mint *a, int n,
int normalize = true) {
int k = n ? __builtin_ctz(n) : 0;
if (k == 0) return;
if (k == 1) {
mint a1 = a[1];
a[1] = a[0] - a[1];
a[0] = a[0] + a1;
if (normalize) {
a[0] *= mint(2).inverse();
a[1] *= mint(2).inverse();
}
return;
}
int u = 1 << (k - 2);
int v = 1;
mint one = mint(1);
mint imag = dy[1];
while (u) {
if (v == 1) {
mint ww = one, xx = one, yy = one;
u <<= 2;
for (int jh = 0; jh < u;) {
ww = xx * xx, yy = xx * imag;
mint t0 = a[jh + 0], t1 = a[jh + 1];
mint t2 = a[jh + 2], t3 = a[jh + 3];
mint t0p1 = t0 + t1, t2p3 = t2 + t3;
mint t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
a[jh + 0] = t0p1 + t2p3, a[jh + 2] = (t0p1 - t2p3) * ww;
a[jh + 1] = t0m1 + t2m3, a[jh + 3] = (t0m1 - t2m3) * ww;
xx *= dy[__builtin_ctz(jh += 4)];
}
} else if (v == 4) {
const __m128i m0 = _mm_set1_epi32(0);
const __m128i m1 = _mm_set1_epi32(mod);
const __m128i m2 = _mm_set1_epi32(mod + mod);
const __m128i r = _mm_set1_epi32(mint::r);
const __m128i Imag = _mm_set1_epi32(imag.a);
mint ww = one, xx = one, yy = one;
u <<= 2;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = v + v;
int j3 = j2 + v;
for (; j0 < v; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i T0P1 = montgomery_add_128(T0, T1, m2, m0);
const __m128i T2P3 = montgomery_add_128(T2, T3, m2, m0);
const __m128i T0M1 = montgomery_sub_128(T0, T1, m2, m0);
const __m128i T2M3 = montgomery_mul_128(
montgomery_sub_128(T2, T3, m2, m0), Imag, r, m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P1, T2P3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j2),
montgomery_sub_128(T0P1, T2P3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_add_128(T0M1, T2M3, m2, m0));
_mm_storeu_si128((__m128i *)(a + j3),
montgomery_sub_128(T0M1, T2M3, m2, m0));
}
} else {
ww = xx * xx, yy = xx * imag;
const __m128i WW = _mm_set1_epi32(ww.a);
const __m128i XX = _mm_set1_epi32(xx.a);
const __m128i YY = _mm_set1_epi32(yy.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i T0P1 = montgomery_add_128(T0, T1, m2, m0);
const __m128i T2P3 = montgomery_add_128(T2, T3, m2, m0);
const __m128i T0M1 = montgomery_mul_128(
montgomery_sub_128(T0, T1, m2, m0), XX, r, m1);
__m128i T2M3 = montgomery_mul_128(
montgomery_sub_128(T2, T3, m2, m0), YY, r, m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P1, T2P3, m2, m0));
_mm_storeu_si128(
(__m128i *)(a + j2),
montgomery_mul_128(montgomery_sub_128(T0P1, T2P3, m2, m0), WW,
r, m1));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_add_128(T0M1, T2M3, m2, m0));
_mm_storeu_si128(
(__m128i *)(a + j3),
montgomery_mul_128(montgomery_sub_128(T0M1, T2M3, m2, m0), WW,
r, m1));
}
}
xx *= dy[__builtin_ctz(jh += 4)];
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i Imag = _mm256_set1_epi32(imag.a);
mint ww = one, xx = one, yy = one;
u <<= 2;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = v + v;
int j3 = j2 + v;
for (; j0 < v; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i T0P1 = montgomery_add_256(T0, T1, m2, m0);
const __m256i T2P3 = montgomery_add_256(T2, T3, m2, m0);
const __m256i T0M1 = montgomery_sub_256(T0, T1, m2, m0);
const __m256i T2M3 = montgomery_mul_256(
montgomery_sub_256(T2, T3, m2, m0), Imag, r, m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P1, T2P3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j2),
montgomery_sub_256(T0P1, T2P3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_add_256(T0M1, T2M3, m2, m0));
_mm256_storeu_si256((__m256i *)(a + j3),
montgomery_sub_256(T0M1, T2M3, m2, m0));
}
} else {
ww = xx * xx, yy = xx * imag;
const __m256i WW = _mm256_set1_epi32(ww.a);
const __m256i XX = _mm256_set1_epi32(xx.a);
const __m256i YY = _mm256_set1_epi32(yy.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i T0P1 = montgomery_add_256(T0, T1, m2, m0);
const __m256i T2P3 = montgomery_add_256(T2, T3, m2, m0);
const __m256i T0M1 = montgomery_mul_256(
montgomery_sub_256(T0, T1, m2, m0), XX, r, m1);
const __m256i T2M3 = montgomery_mul_256(
montgomery_sub_256(T2, T3, m2, m0), YY, r, m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P1, T2P3, m2, m0));
_mm256_storeu_si256(
(__m256i *)(a + j2),
montgomery_mul_256(montgomery_sub_256(T0P1, T2P3, m2, m0), WW,
r, m1));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_add_256(T0M1, T2M3, m2, m0));
_mm256_storeu_si256(
(__m256i *)(a + j3),
montgomery_mul_256(montgomery_sub_256(T0M1, T2M3, m2, m0), WW,
r, m1));
}
}
xx *= dy[__builtin_ctz(jh += 4)];
}
}
u >>= 4;
v <<= 2;
}
if (k & 1) {
v = 1 << (k - 1);
if (v < 8) {
for (int j = 0; j < v; ++j) {
mint ajv = a[j] - a[j + v];
a[j] += a[j + v];
a[j + v] = ajv;
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
int j0 = 0;
int j1 = v;
for (; j0 < v; j0 += 8, j1 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
__m256i naj = montgomery_add_256(T0, T1, m2, m0);
__m256i najv = montgomery_sub_256(T0, T1, m2, m0);
_mm256_storeu_si256((__m256i *)(a + j0), naj);
_mm256_storeu_si256((__m256i *)(a + j1), najv);
}
}
}
if (normalize) {
mint invn = mint(n).inverse();
for (int i = 0; i < n; i++) a[i] *= invn;
}
}
__attribute__((target("avx2"))) void inplace_multiply(
int l1, int l2, int zero_padding = true) {
int l = l1 + l2 - 1;
int M = 4;
while (M < l) M <<= 1;
if (zero_padding) {
for (int i = l1; i < M; i++) buf1_[i] = 0;
for (int i = l2; i < M; i++) buf2_[i] = 0;
}
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i N2 = _mm256_set1_epi32(mint::n2);
for (int i = 0; i < l1; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
__m256i b = montgomery_mul_256(a, N2, r, m1);
_mm256_storeu_si256((__m256i *)(buf1_ + i), b);
}
for (int i = 0; i < l2; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf2_ + i));
__m256i b = montgomery_mul_256(a, N2, r, m1);
_mm256_storeu_si256((__m256i *)(buf2_ + i), b);
}
ntt(buf1, M);
ntt(buf2, M);
for (int i = 0; i < M; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
__m256i b = _mm256_loadu_si256((__m256i *)(buf2_ + i));
__m256i c = montgomery_mul_256(a, b, r, m1);
_mm256_storeu_si256((__m256i *)(buf1_ + i), c);
}
intt(buf1, M, false);
const __m256i INVM = _mm256_set1_epi32((mint(M).inverse()).a);
for (int i = 0; i < l; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
__m256i b = montgomery_mul_256(a, INVM, r, m1);
__m256i c = my256_mulhi_epu32(my256_mullo_epu32(b, r), m1);
__m256i d = _mm256_and_si256(_mm256_cmpgt_epi32(c, m0), m1);
__m256i e = _mm256_sub_epi32(d, c);
_mm256_storeu_si256((__m256i *)(buf1_ + i), e);
}
}
void ntt(vector<mint> &a) {
int M = (int)a.size();
for (int i = 0; i < M; i++) buf1[i].a = a[i].a;
ntt(buf1, M);
for (int i = 0; i < M; i++) a[i].a = buf1[i].a;
}
void intt(vector<mint> &a) {
int M = (int)a.size();
for (int i = 0; i < M; i++) buf1[i].a = a[i].a;
intt(buf1, M, true);
for (int i = 0; i < M; i++) a[i].a = buf1[i].a;
}
vector<mint> multiply(const vector<mint> &a, const vector<mint> &b) {
if (a.size() == 0 && b.size() == 0) return vector<mint>{};
int l = a.size() + b.size() - 1;
if (min<int>(a.size(), b.size()) <= 40) {
vector<mint> s(l);
for (int i = 0; i < (int)a.size(); ++i)
for (int j = 0; j < (int)b.size(); ++j) s[i + j] += a[i] * b[j];
return s;
}
assert(l <= SZ_FFT_BUF);
int M = 4;
while (M < l) M <<= 1;
for (int i = 0; i < (int)a.size(); ++i) buf1[i].a = a[i].a;
for (int i = (int)a.size(); i < M; ++i) buf1[i].a = 0;
for (int i = 0; i < (int)b.size(); ++i) buf2[i].a = b[i].a;
for (int i = (int)b.size(); i < M; ++i) buf2[i].a = 0;
ntt(buf1, M);
ntt(buf2, M);
for (int i = 0; i < M; ++i)
buf1[i].a = mint::reduce(uint64_t(buf1[i].a) * buf2[i].a);
intt(buf1, M, false);
vector<mint> s(l);
mint invm = mint(M).inverse();
for (int i = 0; i < l; ++i) s[i] = buf1[i] * invm;
return s;
}
void ntt_doubling(vector<mint> &a) {
int M = (int)a.size();
for (int i = 0; i < M; i++) buf1[i].a = a[i].a;
intt(buf1, M);
mint r = 1, zeta = mint(pr).pow((mint::get_mod() - 1) / (M << 1));
for (int i = 0; i < M; i++) buf1[i] *= r, r *= zeta;
ntt(buf1, M);
a.resize(2 * M);
for (int i = 0; i < M; i++) a[M + i].a = buf1[i].a;
}
};
using namespace std;
template <typename mint>
struct FormalPowerSeries : vector<mint> {
using vector<mint>::vector;
using FPS = FormalPowerSeries;
FPS &operator+=(const FPS &r) {
if (r.size() > this->size()) this->resize(r.size());
for (int i = 0; i < (int)r.size(); i++) (*this)[i] += r[i];
return *this;
}
FPS &operator+=(const mint &r) {
if (this->empty()) this->resize(1);
(*this)[0] += r;
return *this;
}
FPS &operator-=(const FPS &r) {
if (r.size() > this->size()) this->resize(r.size());
for (int i = 0; i < (int)r.size(); i++) (*this)[i] -= r[i];
return *this;
}
FPS &operator-=(const mint &r) {
if (this->empty()) this->resize(1);
(*this)[0] -= r;
return *this;
}
FPS &operator*=(const mint &v) {
for (int k = 0; k < (int)this->size(); k++) (*this)[k] *= v;
return *this;
}
FPS &operator/=(const FPS &r) {
if (this->size() < r.size()) {
this->clear();
return *this;
}
int n = this->size() - r.size() + 1;
if ((int)r.size() <= 64) {
FPS f(*this), g(r);
g.shrink();
mint coeff = g.back().inverse();
for (auto &x : g) x *= coeff;
int deg = (int)f.size() - (int)g.size() + 1;
int gs = g.size();
FPS quo(deg);
for (int i = deg - 1; i >= 0; i--) {
quo[i] = f[i + gs - 1];
for (int j = 0; j < gs; j++) f[i + j] -= quo[i] * g[j];
}
*this = quo * coeff;
this->resize(n, mint(0));
return *this;
}
return *this = ((*this).rev().pre(n) * r.rev().inv(n)).pre(n).rev();
}
FPS &operator%=(const FPS &r) {
*this -= *this / r * r;
shrink();
return *this;
}
FPS operator+(const FPS &r) const { return FPS(*this) += r; }
FPS operator+(const mint &v) const { return FPS(*this) += v; }
FPS operator-(const FPS &r) const { return FPS(*this) -= r; }
FPS operator-(const mint &v) const { return FPS(*this) -= v; }
FPS operator*(const FPS &r) const { return FPS(*this) *= r; }
FPS operator*(const mint &v) const { return FPS(*this) *= v; }
FPS operator/(const FPS &r) const { return FPS(*this) /= r; }
FPS operator%(const FPS &r) const { return FPS(*this) %= r; }
FPS operator-() const {
FPS ret(this->size());
for (int i = 0; i < (int)this->size(); i++) ret[i] = -(*this)[i];
return ret;
}
void shrink() {
while (this->size() && this->back() == mint(0)) this->pop_back();
}
FPS rev() const {
FPS ret(*this);
reverse(begin(ret), end(ret));
return ret;
}
FPS dot(FPS r) const {
FPS ret(min(this->size(), r.size()));
for (int i = 0; i < (int)ret.size(); i++) ret[i] = (*this)[i] * r[i];
return ret;
}
FPS pre(int sz) const {
return FPS(begin(*this), begin(*this) + min((int)this->size(), sz));
}
FPS operator>>(int sz) const {
if ((int)this->size() <= sz) return {};
FPS ret(*this);
ret.erase(ret.begin(), ret.begin() + sz);
return ret;
}
FPS operator<<(int sz) const {
FPS ret(*this);
ret.insert(ret.begin(), sz, mint(0));
return ret;
}
FPS diff() const {
const int n = (int)this->size();
FPS ret(max(0, n - 1));
mint one(1), coeff(1);
for (int i = 1; i < n; i++) {
ret[i - 1] = (*this)[i] * coeff;
coeff += one;
}
return ret;
}
FPS integral() const {
const int n = (int)this->size();
FPS ret(n + 1);
ret[0] = mint(0);
if (n > 0) ret[1] = mint(1);
auto mod = mint::get_mod();
for (int i = 2; i <= n; i++) ret[i] = (-ret[mod % i]) * (mod / i);
for (int i = 0; i < n; i++) ret[i + 1] *= (*this)[i];
return ret;
}
mint eval(mint x) const {
mint r = 0, w = 1;
for (auto &v : *this) r += w * v, w *= x;
return r;
}
FPS log(int deg = -1) const {
assert((*this)[0] == mint(1));
if (deg == -1) deg = (int)this->size();
return (this->diff() * this->inv(deg)).pre(deg - 1).integral();
}
FPS pow(int64_t k, int deg = -1) const {
const int n = (int)this->size();
if (deg == -1) deg = n;
for (int i = 0; i < n; i++) {
if ((*this)[i] != mint(0)) {
if (i * k > deg) return FPS(deg, mint(0));
mint rev = mint(1) / (*this)[i];
FPS ret = (((*this * rev) >> i).log() * k).exp() * ((*this)[i].pow(k));
ret = (ret << (i * k)).pre(deg);
if ((int)ret.size() < deg) ret.resize(deg, mint(0));
return ret;
}
}
return FPS(deg, mint(0));
}
static void *ntt_ptr;
static void set_fft();
FPS &operator*=(const FPS &r);
void ntt();
void intt();
void ntt_doubling();
static int ntt_pr();
FPS inv(int deg = -1) const;
FPS exp(int deg = -1) const;
};
template <typename mint>
void *FormalPowerSeries<mint>::ntt_ptr = nullptr;
/**
* @brief /
* @docs docs/fps/formal-power-series.md
*/
template <typename mint>
void FormalPowerSeries<mint>::set_fft() {
if (!ntt_ptr) ntt_ptr = new NTT<mint>;
}
template <typename mint>
FormalPowerSeries<mint>& FormalPowerSeries<mint>::operator*=(
const FormalPowerSeries<mint>& r) {
if (this->empty() || r.empty()) {
this->clear();
return *this;
}
set_fft();
auto ret = static_cast<NTT<mint>*>(ntt_ptr)->multiply(*this, r);
return *this = FormalPowerSeries<mint>(ret.begin(), ret.end());
}
template <typename mint>
void FormalPowerSeries<mint>::ntt() {
set_fft();
static_cast<NTT<mint>*>(ntt_ptr)->ntt(*this);
}
template <typename mint>
void FormalPowerSeries<mint>::intt() {
set_fft();
static_cast<NTT<mint>*>(ntt_ptr)->intt(*this);
}
template <typename mint>
void FormalPowerSeries<mint>::ntt_doubling() {
set_fft();
static_cast<NTT<mint>*>(ntt_ptr)->ntt_doubling(*this);
}
template <typename mint>
int FormalPowerSeries<mint>::ntt_pr() {
set_fft();
return static_cast<NTT<mint>*>(ntt_ptr)->pr;
}
template <typename mint>
FormalPowerSeries<mint> FormalPowerSeries<mint>::inv(int deg) const {
assert((*this)[0] != mint(0));
if (deg == -1) deg = (int)this->size();
FormalPowerSeries<mint> res(deg);
res[0] = {mint(1) / (*this)[0]};
for (int d = 1; d < deg; d <<= 1) {
FormalPowerSeries<mint> f(2 * d), g(2 * d);
for (int j = 0; j < min((int)this->size(), 2 * d); j++) f[j] = (*this)[j];
for (int j = 0; j < d; j++) g[j] = res[j];
f.ntt();
g.ntt();
for (int j = 0; j < 2 * d; j++) f[j] *= g[j];
f.intt();
for (int j = 0; j < d; j++) f[j] = 0;
f.ntt();
for (int j = 0; j < 2 * d; j++) f[j] *= g[j];
f.intt();
for (int j = d; j < min(2 * d, deg); j++) res[j] = -f[j];
}
return res.pre(deg);
}
template <typename mint>
FormalPowerSeries<mint> FormalPowerSeries<mint>::exp(int deg) const {
using fps = FormalPowerSeries<mint>;
assert((*this).size() == 0 || (*this)[0] == mint(0));
if (deg == -1) deg = this->size();
fps inv;
inv.reserve(deg + 1);
inv.push_back(mint(0));
inv.push_back(mint(1));
auto inplace_integral = [&](fps& F) -> void {
const int n = (int)F.size();
auto mod = mint::get_mod();
while ((int)inv.size() <= n) {
int i = inv.size();
inv.push_back((-inv[mod % i]) * (mod / i));
}
F.insert(begin(F), mint(0));
for (int i = 1; i <= n; i++) F[i] *= inv[i];
};
auto inplace_diff = [](fps& F) -> void {
if (F.empty()) return;
F.erase(begin(F));
mint coeff = 1, one = 1;
for (int i = 0; i < (int)F.size(); i++) {
F[i] *= coeff;
coeff += one;
}
};
fps b{1, 1 < (int)this->size() ? (*this)[1] : 0}, c{1}, z1, z2{1, 1};
for (int m = 2; m < deg; m *= 2) {
auto y = b;
y.resize(2 * m);
y.ntt();
z1 = z2;
fps z(m);
for (int i = 0; i < m; ++i) z[i] = y[i] * z1[i];
z.intt();
fill(begin(z), begin(z) + m / 2, mint(0));
z.ntt();
for (int i = 0; i < m; ++i) z[i] *= -z1[i];
z.intt();
c.insert(end(c), begin(z) + m / 2, end(z));
z2 = c;
z2.resize(2 * m);
z2.ntt();
fps x(begin(*this), begin(*this) + min<int>(this->size(), m));
inplace_diff(x);
x.push_back(mint(0));
x.ntt();
for (int i = 0; i < m; ++i) x[i] *= y[i];
x.intt();
x -= b.diff();
x.resize(2 * m);
for (int i = 0; i < m - 1; ++i) x[m + i] = x[i], x[i] = mint(0);
x.ntt();
for (int i = 0; i < 2 * m; ++i) x[i] *= z2[i];
x.intt();
x.pop_back();
inplace_integral(x);
for (int i = m; i < min<int>(this->size(), 2 * m); ++i) x[i] += (*this)[i];
fill(begin(x), begin(x) + m, mint(0));
x.ntt();
for (int i = 0; i < 2 * m; ++i) x[i] *= y[i];
x.intt();
b.insert(end(b), begin(x) + m, end(x));
}
return fps{begin(b), begin(b) + deg};
}
/**
* @brief NTT modFPS
* @docs docs/fps/ntt-friendly-fps.md
*/
using namespace std;
template <uint32_t mod>
struct LazyMontgomeryModInt {
using mint = LazyMontgomeryModInt;
using i32 = int32_t;
using u32 = uint32_t;
using u64 = uint64_t;
static constexpr u32 get_r() {
u32 ret = mod;
for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
return ret;
}
static constexpr u32 r = get_r();
static constexpr u32 n2 = -u64(mod) % mod;
static_assert(r * mod == 1, "invalid, r * mod != 1");
static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");
u32 a;
constexpr LazyMontgomeryModInt() : a(0) {}
constexpr LazyMontgomeryModInt(const int64_t &b)
: a(reduce(u64(b % mod + mod) * n2)){};
static constexpr u32 reduce(const u64 &b) {
return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
}
constexpr mint &operator+=(const mint &b) {
if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
return *this;
}
constexpr mint &operator-=(const mint &b) {
if (i32(a -= b.a) < 0) a += 2 * mod;
return *this;
}
constexpr mint &operator*=(const mint &b) {
a = reduce(u64(a) * b.a);
return *this;
}
constexpr mint &operator/=(const mint &b) {
*this *= b.inverse();
return *this;
}
constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
constexpr bool operator==(const mint &b) const {
return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
}
constexpr bool operator!=(const mint &b) const {
return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
}
constexpr mint operator-() const { return mint() - mint(*this); }
constexpr mint pow(u64 n) const {
mint ret(1), mul(*this);
while (n > 0) {
if (n & 1) ret *= mul;
mul *= mul;
n >>= 1;
}
return ret;
}
constexpr mint inverse() const { return pow(mod - 2); }
friend ostream &operator<<(ostream &os, const mint &b) {
return os << b.get();
}
friend istream &operator>>(istream &is, mint &b) {
int64_t t;
is >> t;
b = LazyMontgomeryModInt<mod>(t);
return (is);
}
constexpr u32 get() const {
u32 ret = reduce(a);
return ret >= mod ? ret - mod : ret;
}
static constexpr u32 get_mod() { return mod; }
};
using namespace std;
template <typename T>
struct Binomial {
vector<T> fac_, finv_, inv_;
Binomial(int MAX = 0) : fac_(MAX + 10), finv_(MAX + 10), inv_(MAX + 10) {
assert(T::get_mod() != 0);
MAX += 9;
fac_[0] = finv_[0] = inv_[0] = 1;
for (int i = 1; i <= MAX; i++) fac_[i] = fac_[i - 1] * i;
finv_[MAX] = fac_[MAX].inverse();
for (int i = MAX - 1; i > 0; i--) finv_[i] = finv_[i + 1] * (i + 1);
for (int i = 1; i <= MAX; i++) inv_[i] = finv_[i] * fac_[i - 1];
}
void extend() {
int n = fac_.size();
T fac = fac_.back() * n;
T inv = (-inv_[T::get_mod() % n]) * (T::get_mod() / n);
T finv = finv_.back() * inv;
fac_.push_back(fac);
finv_.push_back(finv);
inv_.push_back(inv);
}
T fac(int i) {
while (i >= (int)fac_.size()) extend();
return fac_[i];
}
T finv(int i) {
while (i >= (int)finv_.size()) extend();
return finv_[i];
}
T inv(int i) {
while (i >= (int)inv_.size()) extend();
return inv_[i];
}
T C(int n, int r) {
if (n < r || r < 0) return T(0);
return fac(n) * finv(n - r) * finv(r);
}
T C_naive(int n, int r) {
if (n < r || r < 0) return T(0);
T ret = T(1);
r = min(r, n - r);
for (int i = 1; i <= r; ++i) ret *= inv(i) * (n--);
return ret;
}
T P(int n, int r) {
if (n < r || r < 0) return T(0);
return fac(n) * finv(n - r);
}
T H(int n, int r) {
if (n < 0 || r < 0) return T(0);
return r == 0 ? 1 : C(n + r - 1, r);
}
};
using mint = LazyMontgomeryModInt<998244353>;
// #include "fps/arbitrary-fps.hpp"
// using mint = LazyMontgomeryModInt<1000000007>;
Binomial<mint> C;
using vm = vector<mint>;
using vvm = vector<vm>;
using fps = FormalPowerSeries<mint>;
using namespace std;
struct UnionFind {
vector<int> data;
UnionFind(int N) : data(N, -1) {}
int find(int k) { return data[k] < 0 ? k : data[k] = find(data[k]); }
int unite(int x, int y) {
if ((x = find(x)) == (y = find(y))) return false;
if (data[x] > data[y]) swap(x, y);
data[x] += data[y];
data[y] = x;
return true;
}
// f ... merge function
template<typename F>
int unite(int x, int y,const F &f) {
if ((x = find(x)) == (y = find(y))) return false;
if (data[x] > data[y]) swap(x, y);
data[x] += data[y];
data[y] = x;
f(x, y);
return true;
}
int size(int k) { return -data[find(k)]; }
int same(int x, int y) { return find(x) == find(y); }
};
/**
* @brief Union Find(Disjoint Set Union)
* @docs docs/data-structure/union-find.md
*/
using namespace std;
using namespace std;
template <typename mint>
struct ProductTree {
using fps = FormalPowerSeries<mint>;
const vector<mint> &xs;
vector<fps> buf;
int N, xsz;
vector<int> l, r;
ProductTree(const vector<mint> &xs_) : xs(xs_), xsz(xs.size()) {
N = 1;
while (N < (int)xs.size()) N *= 2;
buf.resize(2 * N);
l.resize(2 * N, xs.size());
r.resize(2 * N, xs.size());
fps::set_fft();
if (fps::ntt_ptr == nullptr)
build();
else
build_ntt();
}
void build() {
for (int i = 0; i < xsz; i++) {
l[i + N] = i;
r[i + N] = i + 1;
buf[i + N] = {-xs[i], 1};
}
for (int i = N - 1; i > 0; i--) {
l[i] = l[(i << 1) | 0];
r[i] = r[(i << 1) | 1];
if (buf[(i << 1) | 0].empty())
continue;
else if (buf[(i << 1) | 1].empty())
buf[i] = buf[(i << 1) | 0];
else
buf[i] = buf[(i << 1) | 0] * buf[(i << 1) | 1];
}
}
void build_ntt() {
fps f;
f.reserve(N * 2);
for (int i = 0; i < xsz; i++) {
l[i + N] = i;
r[i + N] = i + 1;
buf[i + N] = {-xs[i] + 1, -xs[i] - 1};
}
for (int i = N - 1; i > 0; i--) {
l[i] = l[(i << 1) | 0];
r[i] = r[(i << 1) | 1];
if (buf[(i << 1) | 0].empty())
continue;
else if (buf[(i << 1) | 1].empty())
buf[i] = buf[(i << 1) | 0];
else if (buf[(i << 1) | 0].size() == buf[(i << 1) | 1].size()) {
buf[i] = buf[(i << 1) | 0];
f.clear();
copy(begin(buf[(i << 1) | 1]), end(buf[(i << 1) | 1]),
back_inserter(f));
buf[i].ntt_doubling();
f.ntt_doubling();
for (int j = 0; j < (int)buf[i].size(); j++) buf[i][j] *= f[j];
} else {
buf[i] = buf[(i << 1) | 0];
f.clear();
copy(begin(buf[(i << 1) | 1]), end(buf[(i << 1) | 1]),
back_inserter(f));
buf[i].ntt_doubling();
f.intt();
f.resize(buf[i].size(), mint(0));
f.ntt();
for (int j = 0; j < (int)buf[i].size(); j++) buf[i][j] *= f[j];
}
}
for (int i = 0; i < 2 * N; i++) {
buf[i].intt();
buf[i].shrink();
}
}
};
template <typename mint>
vector<mint> InnerMultipointEvaluation(const FormalPowerSeries<mint> &f,
const vector<mint> &xs,
const ProductTree<mint> &ptree) {
using fps = FormalPowerSeries<mint>;
vector<mint> ret;
ret.reserve(xs.size());
auto rec = [&](auto self, fps a, int idx) {
if (ptree.l[idx] == ptree.r[idx]) return;
a %= ptree.buf[idx];
if ((int)a.size() <= 64) {
for (int i = ptree.l[idx]; i < ptree.r[idx]; i++)
ret.push_back(a.eval(xs[i]));
return;
}
self(self, a, (idx << 1) | 0);
self(self, a, (idx << 1) | 1);
};
rec(rec, f, 1);
return ret;
}
template <typename mint>
vector<mint> MultipointEvaluation(const FormalPowerSeries<mint> &f,
const vector<mint> &xs) {
return InnerMultipointEvaluation(f, xs, ProductTree<mint>(xs));
}
template <class mint>
FormalPowerSeries<mint> PolynomialInterpolation(const vector<mint> &xs,
const vector<mint> &ys) {
using fps = FormalPowerSeries<mint>;
assert(xs.size() == ys.size());
ProductTree<mint> ptree(xs);
fps w = ptree.buf[1].diff();
vector<mint> vs = InnerMultipointEvaluation<mint>(w, xs, ptree);
auto rec = [&](auto self, int idx) -> fps {
if (idx >= ptree.N) {
if (idx - ptree.N < (int)xs.size())
return {ys[idx - ptree.N] / vs[idx - ptree.N]};
else
return {mint(1)};
}
if (ptree.buf[idx << 1 | 0].empty())
return {};
else if (ptree.buf[idx << 1 | 1].empty())
return self(self, idx << 1 | 0);
return self(self, idx << 1 | 0) * ptree.buf[idx << 1 | 1] +
self(self, idx << 1 | 1) * ptree.buf[idx << 1 | 0];
};
return rec(rec, 1);
}
using namespace std;
using namespace std;
template <typename T>
struct edge {
int src, to;
T cost;
edge(int _to, T _cost) : src(-1), to(_to), cost(_cost) {}
edge(int _src, int _to, T _cost) : src(_src), to(_to), cost(_cost) {}
edge &operator=(const int &x) {
to = x;
return *this;
}
operator int() const { return to; }
};
template <typename T>
using Edges = vector<edge<T>>;
template <typename T>
using WeightedGraph = vector<Edges<T>>;
using UnweightedGraph = vector<vector<int>>;
// Input of (Unweighted) Graph
UnweightedGraph graph(int N, int M = -1, bool is_directed = false,
bool is_1origin = true) {
UnweightedGraph g(N);
if (M == -1) M = N - 1;
for (int _ = 0; _ < M; _++) {
int x, y;
cin >> x >> y;
if (is_1origin) x--, y--;
g[x].push_back(y);
if (!is_directed) g[y].push_back(x);
}
return g;
}
// Input of Weighted Graph
template <typename T>
WeightedGraph<T> wgraph(int N, int M = -1, bool is_directed = false,
bool is_1origin = true) {
WeightedGraph<T> g(N);
if (M == -1) M = N - 1;
for (int _ = 0; _ < M; _++) {
int x, y;
cin >> x >> y;
T c;
cin >> c;
if (is_1origin) x--, y--;
g[x].eb(x, y, c);
if (!is_directed) g[y].eb(y, x, c);
}
return g;
}
// Input of Edges
template <typename T>
Edges<T> esgraph(int N, int M, int is_weighted = true, bool is_1origin = true) {
Edges<T> es;
for (int _ = 0; _ < M; _++) {
int x, y;
cin >> x >> y;
T c;
if (is_weighted)
cin >> c;
else
c = 1;
if (is_1origin) x--, y--;
es.emplace_back(x, y, c);
}
return es;
}
// Input of Adjacency Matrix
template <typename T>
vector<vector<T>> adjgraph(int N, int M, T INF, int is_weighted = true,
bool is_directed = false, bool is_1origin = true) {
vector<vector<T>> d(N, vector<T>(N, INF));
for (int _ = 0; _ < M; _++) {
int x, y;
cin >> x >> y;
T c;
if (is_weighted)
cin >> c;
else
c = 1;
if (is_1origin) x--, y--;
d[x][y] = c;
if (!is_directed) d[y][x] = c;
}
return d;
}
// LowLink ... enumerate bridge and articulation point
// bridge ... articulation point ...
template <typename G>
struct LowLink {
int N;
const G &g;
vector<int> ord, low, articulation;
vector<pair<int, int> > bridge;
LowLink(const G &g) : g(g) {
N = g.size();
ord.resize(N, -1);
low.resize(N, -1);
int k = 0;
for (int i = 0; i < N; i++)
if (!(~ord[i])) k = dfs(i, k, -1);
}
int dfs(int idx, int k, int par) {
low[idx] = (ord[idx] = k++);
int cnt = 0;
bool is_arti = false, flg = false;
for (auto &to : g[idx]) {
if (ord[to] == -1) {
cnt++;
k = dfs(to, k, idx);
low[idx] = min(low[idx], low[to]);
is_arti |= (par != -1) && (low[to] >= ord[idx]);
if (ord[idx] < low[to]) {
bridge.emplace_back(minmax(idx, (int)to));
}
} else if (to != par || exchange(flg, true)) {
low[idx] = min(low[idx], ord[to]);
}
}
is_arti |= par == -1 && cnt > 1;
if (is_arti) articulation.push_back(idx);
return k;
}
};
using namespace std;
template <class T>
struct Matrix {
vector<vector<T> > A;
Matrix() {}
Matrix(int n, int m) : A(n, vector<T>(m, T())) {}
Matrix(int n) : A(n, vector<T>(n, T())){};
int height() const { return (A.size()); }
int width() const { return (A[0].size()); }
inline const vector<T> &operator[](int k) const { return (A.at(k)); }
inline vector<T> &operator[](int k) { return (A.at(k)); }
static Matrix I(int n) {
Matrix mat(n);
for (int i = 0; i < n; i++) mat[i][i] = 1;
return (mat);
}
Matrix &operator+=(const Matrix &B) {
int n = height(), m = width();
assert(n == B.height() && m == B.width());
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) (*this)[i][j] += B[i][j];
return (*this);
}
Matrix &operator-=(const Matrix &B) {
int n = height(), m = width();
assert(n == B.height() && m == B.width());
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++) (*this)[i][j] -= B[i][j];
return (*this);
}
Matrix &operator*=(const Matrix &B) {
int n = height(), m = B.width(), p = width();
assert(p == B.height());
vector<vector<T> > C(n, vector<T>(m, 0));
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
for (int k = 0; k < p; k++)
C[i][j] = (C[i][j] + (*this)[i][k] * B[k][j]);
A.swap(C);
return (*this);
}
Matrix &operator^=(long long k) {
Matrix B = Matrix::I(height());
while (k > 0) {
if (k & 1) B *= *this;
*this *= *this;
k >>= 1LL;
}
A.swap(B.A);
return (*this);
}
Matrix operator+(const Matrix &B) const { return (Matrix(*this) += B); }
Matrix operator-(const Matrix &B) const { return (Matrix(*this) -= B); }
Matrix operator*(const Matrix &B) const { return (Matrix(*this) *= B); }
Matrix operator^(const long long k) const { return (Matrix(*this) ^= k); }
friend ostream &operator<<(ostream &os, Matrix &p) {
int n = p.height(), m = p.width();
for (int i = 0; i < n; i++) {
os << "[";
for (int j = 0; j < m; j++) {
os << p[i][j] << (j + 1 == m ? "]\n" : ",");
}
}
return (os);
}
T determinant() {
Matrix B(*this);
assert(width() == height());
T ret = 1;
for (int i = 0; i < width(); i++) {
int idx = -1;
for (int j = i; j < width(); j++) {
if (B[j][i] != 0) idx = j;
}
if (idx == -1) return (0);
if (i != idx) {
ret *= -1;
swap(B[i], B[idx]);
}
ret *= B[i][i];
T vv = B[i][i];
for (int j = 0; j < width(); j++) {
B[i][j] /= vv;
}
for (int j = i + 1; j < width(); j++) {
T a = B[j][i];
for (int k = 0; k < width(); k++) {
B[j][k] -= B[i][k] * a;
}
}
}
return (ret);
}
};
int a[111][111];
mint calc(vi v) {
if (sz(v) == 1) return 1;
Matrix<mint> m(sz(v) - 1);
rep(i, sz(v) - 1) rep(j, sz(v)) {
if (i == j) continue;
if (a[v[i]][v[j]] == 0) continue;
if (j != sz(v) - 1) m[i][j] -= 1;
m[i][i] += 1;
}
return m.determinant();
}
vm calc2(vi v) {
Matrix<fps> m(sz(v) - 1);
rep(i, sz(v) - 1) rep(j, sz(v)) {
if (i == j) continue;
if (a[v[i]][v[j]] == 0) {
if (j != sz(v) - 1) m[i][j] -= fps{1};
m[i][i] += fps{1};
} else {
if (j != sz(v) - 1) m[i][j] -= fps{0, 1};
m[i][i] += fps{0, 1};
}
}
vm x(sz(v) + 1);
vm y(sz(v) + 1);
Matrix<mint> m2(sz(v) - 1);
rep(x_, sz(v) + 1) {
x[x_] = x_;
rep(i, sz(v) - 1) rep(j, sz(v) - 1) { m2[i][j] = m[i][j].eval(x_); }
y[x_] = m2.determinant();
}
fps f = PolynomialInterpolation(x, y);
return f;
}
void solve() {
ini(N, M);
UnionFind uf(N);
rep(i, M) {
ini(u, v);
--u, --v;
a[u][v] = a[v][u] = 1;
uf.unite(u, v);
}
using P = pair<mint, int>;
vvi memo(N);
rep(i, N) memo[uf.find(i)].push_back(i);
V<P> v;
rep(i, N) {
if (uf.find(i) == i) v.emplace_back(calc(memo[i]), sz(memo[i]));
}
if (sz(v) == 1) {
out(0);
auto f = calc2(mkiota(N));
out(f[N - 1] + f[N - 2]);
return;
}
sort(all(v), [](P a, P b) { return a.second > b.second; });
int n1 = v[0].second, n2 = v[1].second;
ll h = -n1 * n2 * 2;
mint ans = 1;
each(p, v) ans *= p.first;
mint sm = 0;
rep(j, sz(v)) rep(i, j) {
h += v[i].second * v[j].second * 2;
if (v[i].second == n1 and v[j].second == n2) {
sm += v[i].second * v[j].second;
}
}
out(h);
out(ans*sm);
}
הההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההה
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
0