結果
| 問題 |
No.2747 Permutation Adjacent Sum
|
| コンテスト | |
| ユーザー |
|
| 提出日時 | 2024-04-20 20:00:21 |
| 言語 | C++17 (gcc 13.3.0 + boost 1.87.0) |
| 結果 |
AC
|
| 実行時間 | 591 ms / 3,000 ms |
| コード長 | 57,385 bytes |
| コンパイル時間 | 4,710 ms |
| コンパイル使用メモリ | 336,416 KB |
| 最終ジャッジ日時 | 2025-02-21 07:19:19 |
|
ジャッジサーバーID (参考情報) |
judge2 / judge4 |
(要ログイン)
| ファイルパターン | 結果 |
|---|---|
| sample | AC * 2 |
| other | AC * 40 |
コンパイルメッセージ
main.cpp:414:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
414 | montgomery_sub_256(const __m256i &a,const __m256i &b,const __m256i &m2,
| ^~~~~~~~~~~~~~~~~~
main.cpp:406:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
406 | montgomery_add_256(const __m256i &a,const __m256i &b,const __m256i &m2,
| ^~~~~~~~~~~~~~~~~~
main.cpp:398:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
398 | montgomery_mul_256(const __m256i &a,const __m256i &b,const __m256i &r,
| ^~~~~~~~~~~~~~~~~~
main.cpp:387:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
387 | my256_mulhi_epu32(const __m256i &a,const __m256i &b) {
| ^~~~~~~~~~~~~~~~~
main.cpp:382:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
382 | my256_mullo_epu32(const __m256i &a,const __m256i &b) {
| ^~~~~~~~~~~~~~~~~
main.cpp:375:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
375 | montgomery_sub_128(const __m128i &a,const __m128i &b,const __m128i &m2,
| ^~~~~~~~~~~~~~~~~~
main.cpp:368:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
368 | montgomery_add_128(const __m128i &a,const __m128i &b,const __m128i &m2,
| ^~~~~~~~~~~~~~~~~~
main.cpp:360:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
360 | montgomery_mul_128(const __m128i &a,const __m128i &b,const __m128i &r,
| ^~~~~~~~~~~~~~~~~~
main.cpp:349:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
349 | my128_mulhi_epu32(const __m128i &a,const __m128i &b) {
| ^~~~~~~~~~~~~~~~~
main.cpp:344:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
344 | my128_mullo_epu32(const __m128i &a,const __m128i &b) {
| ^~~~~~~~~~~~~~~~~
ソースコード
#include <bits/stdc++.h>
using namespace std;
//#include<atcoder/math>
//using namespace atcoder;
using ll = long long int;
using ull = unsigned long long int;
using ld = long double;
constexpr ll MAX = 2000000000000000000;
constexpr ld PI = 3.14159265358979;
constexpr ll MOD = 998244353;//2024948111;
ld dotorad(ld K){ return PI * K / 180.0; }
ld radtodo(ld K){ return K * 180.0 / PI; }
mt19937 mt;
void randinit(){ srand((unsigned)time(NULL));mt = mt19937(rand()); }
ll modpow(ll A,ll B,ll M){
ll r = 1,p = A;
while(B > 0){
if(B % 2 == 1) r = (r * p) % M;
p = (p * p) % M;
B /= 2;
}
return r;
}
vector<ll> fac,finv,inv;
void COMinit(ll n = 1000000){
fac.resize(n);
finv.resize(n);
inv.resize(n);
inv[1] = 1;
fac[0] = fac[1] = 1;
finv[0] = finv[1] = 1;
ll i;
for(i = 2;i < n;i++){
fac[i] = (fac[i - 1] * i) % MOD;
inv[i] = MOD - ((inv[MOD % i] * (MOD / i)) % MOD);
finv[i] = (finv[i - 1] * inv[i]) % MOD;
}
}
ll COM(ll n,ll m){
//nCm
if(n < m) return 0;
if(n < 0 || m < 0) return 0;
return (fac[n] * (finv[m] * finv[n - m] % MOD)) % MOD;
}
ll f(ll a,ll b){
return COM(a + b,a);
}
// https://judge.yosupo.jp/submission/17797
#pragma region kyopro_template
#define Nyaan_template
#include <immintrin.h>
#include <bits/stdc++.h>
#define pb push_back
#define eb emplace_back
#define fi first
#define se second
#define each(x, v) for (auto &x : v)
#define all(v) (v).begin(), (v).end()
#define sz(v) ((int)(v).size())
#define mem(a, val) memset(a, val, sizeof(a))
#define ini(...) \
int __VA_ARGS__; \
in(__VA_ARGS__)
#define inl(...) \
long long __VA_ARGS__; \
in(__VA_ARGS__)
#define ins(...) \
string __VA_ARGS__; \
in(__VA_ARGS__)
#define inc(...) \
char __VA_ARGS__; \
in(__VA_ARGS__)
#define in2(s, t) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i]); \
}
#define in3(s, t, u) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i], u[i]); \
}
#define in4(s, t, u, v) \
for (int i = 0; i < (int)s.size(); i++) { \
in(s[i], t[i], u[i], v[i]); \
}
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define die(...) \
do { \
out(__VA_ARGS__); \
return; \
} while (0)
using namespace std;
using ll = long long;
template <class T>
using V = vector<T>;
using vi = vector<int>;
using vl = vector<long long>;
using vvi = vector<vector<int>>;
using vd = V<double>;
using vs = V<string>;
using vvl = vector<vector<long long>>;
using P = pair<long long,long long>;
using vp = vector<P>;
using pii = pair<int,int>;
using vpi = vector<pair<int,int>>;
constexpr int inf = 1001001001;
constexpr long long infLL = (1LL << 61) - 1;
template <typename T,typename U>
inline bool amin(T &x,U y) {
return (y < x) ? (x = y,true) : false;
}
template <typename T,typename U>
inline bool amax(T &x,U y) {
return (x < y) ? (x = y,true) : false;
}
template <typename T,typename U>
ostream &operator<<(ostream &os,const pair<T,U> &p) {
os << p.first << " " << p.second;
return os;
}
template <typename T,typename U>
istream &operator>>(istream &is,pair<T,U> &p) {
is >> p.first >> p.second;
return is;
}
template <typename T>
ostream &operator<<(ostream &os,const vector<T> &v) {
int s = (int)v.size();
for(int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
return os;
}
template <typename T>
istream &operator>>(istream &is,vector<T> &v) {
for(auto &x : v) is >> x;
return is;
}
#ifdef NyaanDebug
#define trc(...) \
do { \
cerr << #__VA_ARGS__ << " = "; \
dbg_out(__VA_ARGS__); \
} while (0)
#define trca(v, N) \
do { \
cerr << #v << " = "; \
array_out(v, N); \
} while (0)
#define trcc(v) \
do { \
cerr << #v << " = {"; \
each(x, v) { cerr << " " << x << ","; } \
cerr << "}" << endl; \
} while (0)
template <typename T>
void _cout(const T &c) {
cerr << c;
}
void _cout(const int &c) {
if(c == 1001001001)
cerr << "inf";
else if(c == -1001001001)
cerr << "-inf";
else
cerr << c;
}
void _cout(const unsigned int &c) {
if(c == 1001001001)
cerr << "inf";
else
cerr << c;
}
void _cout(const long long &c) {
if(c == 1001001001 || c == (1LL << 61) - 1)
cerr << "inf";
else if(c == -1001001001 || c == -((1LL << 61) - 1))
cerr << "-inf";
else
cerr << c;
}
void _cout(const unsigned long long &c) {
if(c == 1001001001 || c == (1LL << 61) - 1)
cerr << "inf";
else
cerr << c;
}
template <typename T,typename U>
void _cout(const pair<T,U> &p) {
cerr << "{ ";
_cout(p.fi);
cerr << ", ";
_cout(p.se);
cerr << " } ";
}
template <typename T>
void _cout(const vector<T> &v) {
int s = v.size();
cerr << "{ ";
for(int i = 0; i < s; i++) {
cerr << (i ? ", " : "");
_cout(v[i]);
}
cerr << " } ";
}
template <typename T>
void _cout(const vector<vector<T>> &v) {
cerr << "[ ";
for(const auto &x : v) {
cerr << endl;
_cout(x);
cerr << ", ";
}
cerr << endl << " ] ";
}
void dbg_out() { cerr << endl; }
template <typename T,class... U>
void dbg_out(const T &t,const U &... u) {
_cout(t);
if(sizeof...(u)) cerr << ", ";
dbg_out(u...);
}
template <typename T>
void array_out(const T &v,int s) {
cerr << "{ ";
for(int i = 0; i < s; i++) {
cerr << (i ? ", " : "");
_cout(v[i]);
}
cerr << " } " << endl;
}
template <typename T>
void array_out(const T &v,int H,int W) {
cerr << "[ ";
for(int i = 0; i < H; i++) {
cerr << (i ? ", " : "");
array_out(v[i],W);
}
cerr << " ] " << endl;
}
#else
#define trc(...)
#define trca(...)
#define trcc(...)
#endif
inline int popcnt(unsigned long long a) { return __builtin_popcountll(a); }
inline int lsb(unsigned long long a) { return __builtin_ctzll(a); }
inline int msb(unsigned long long a) { return 63 - __builtin_clzll(a); }
template <typename T>
inline int getbit(T a,int i) {
return (a >> i) & 1;
}
template <typename T>
inline void setbit(T &a,int i) {
a |= (1LL << i);
}
template <typename T>
inline void delbit(T &a,int i) {
a &= ~(1LL << i);
}
template <typename T>
int lb(const vector<T> &v,const T &a) {
return lower_bound(begin(v),end(v),a) - begin(v);
}
template <typename T>
int ub(const vector<T> &v,const T &a) {
return upper_bound(begin(v),end(v),a) - begin(v);
}
template <typename T>
int btw(T a,T x,T b) {
return a <= x && x < b;
}
template <typename T,typename U>
T ceil(T a,U b) {
return (a + b - 1) / b;
}
constexpr long long TEN(int n) {
long long ret = 1,x = 10;
while(n) {
if(n & 1) ret *= x;
x *= x;
n >>= 1;
}
return ret;
}
template <typename T>
vector<T> mkrui(const vector<T> &v) {
vector<T> ret(v.size() + 1);
for(int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
return ret;
};
template <typename T>
vector<T> mkuni(const vector<T> &v) {
vector<T> ret(v);
sort(ret.begin(),ret.end());
ret.erase(unique(ret.begin(),ret.end()),ret.end());
return ret;
}
template <typename F>
vector<int> mkord(int N,F f) {
vector<int> ord(N);
iota(begin(ord),end(ord),0);
sort(begin(ord),end(ord),f);
return ord;
}
template <typename T = int>
vector<T> mkiota(int N) {
vector<T> ret(N);
iota(begin(ret),end(ret),0);
return ret;
}
template <typename T>
vector<int> mkinv(vector<T> &v) {
vector<int> inv(v.size());
for(int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
return inv;
}
struct IoSetupNya {
IoSetupNya() {
cin.tie(nullptr);
ios::sync_with_stdio(false);
cout << fixed << setprecision(15);
cerr << fixed << setprecision(7);
}
} iosetupnya;
#pragma endregion
using namespace std;
using namespace std;
using namespace std;
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
my128_mullo_epu32(const __m128i &a,const __m128i &b) {
return _mm_mullo_epi32(a,b);
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
my128_mulhi_epu32(const __m128i &a,const __m128i &b) {
__m128i a13 = _mm_shuffle_epi32(a,0xF5);
__m128i b13 = _mm_shuffle_epi32(b,0xF5);
__m128i prod02 = _mm_mul_epu32(a,b);
__m128i prod13 = _mm_mul_epu32(a13,b13);
__m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02,prod13),
_mm_unpackhi_epi32(prod02,prod13));
return prod;
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_mul_128(const __m128i &a,const __m128i &b,const __m128i &r,
const __m128i &m1) {
return _mm_sub_epi32(
_mm_add_epi32(my128_mulhi_epu32(a,b),m1),
my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a,b),r),m1));
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_add_128(const __m128i &a,const __m128i &b,const __m128i &m2,
const __m128i &m0) {
__m128i ret = _mm_sub_epi32(_mm_add_epi32(a,b),m2);
return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0,ret),m2),ret);
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_sub_128(const __m128i &a,const __m128i &b,const __m128i &m2,
const __m128i &m0) {
__m128i ret = _mm_sub_epi32(a,b);
return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0,ret),m2),ret);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
my256_mullo_epu32(const __m256i &a,const __m256i &b) {
return _mm256_mullo_epi32(a,b);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
my256_mulhi_epu32(const __m256i &a,const __m256i &b) {
__m256i a13 = _mm256_shuffle_epi32(a,0xF5);
__m256i b13 = _mm256_shuffle_epi32(b,0xF5);
__m256i prod02 = _mm256_mul_epu32(a,b);
__m256i prod13 = _mm256_mul_epu32(a13,b13);
__m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02,prod13),
_mm256_unpackhi_epi32(prod02,prod13));
return prod;
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_mul_256(const __m256i &a,const __m256i &b,const __m256i &r,
const __m256i &m1) {
return _mm256_sub_epi32(
_mm256_add_epi32(my256_mulhi_epu32(a,b),m1),
my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a,b),r),m1));
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_add_256(const __m256i &a,const __m256i &b,const __m256i &m2,
const __m256i &m0) {
__m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a,b),m2);
return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0,ret),m2),
ret);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_sub_256(const __m256i &a,const __m256i &b,const __m256i &m2,
const __m256i &m0) {
__m256i ret = _mm256_sub_epi32(a,b);
return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0,ret),m2),
ret);
}
constexpr int SZ = 1 << 19;
uint32_t buf1_[SZ * 2] __attribute__((aligned(64)));
uint32_t buf2_[SZ * 2] __attribute__((aligned(64)));
template <typename mint>
struct NTT {
static constexpr uint32_t get_pr() {
uint32_t mod = mint::get_mod();
using u64 = uint64_t;
u64 ds[32] = {};
int idx = 0;
u64 m = mod - 1;
for(u64 i = 2; i * i <= m; ++i) {
if(m % i == 0) {
ds[idx++] = i;
while(m % i == 0) m /= i;
}
}
if(m != 1) ds[idx++] = m;
uint32_t pr = 2;
while(1) {
int flg = 1;
for(int i = 0; i < idx; ++i) {
u64 a = pr,b = (mod - 1) / ds[i],r = 1;
while(b) {
if(b & 1) r = r * a % mod;
a = a * a % mod;
b >>= 1;
}
if(r == 1) {
flg = 0;
break;
}
}
if(flg == 1) break;
++pr;
}
return pr;
};
static constexpr uint32_t mod = mint::get_mod();
static constexpr uint32_t pr = get_pr();
static constexpr int level = __builtin_ctzll(mod - 1);
mint dw[level],dy[level];
mint *buf1,*buf2;
constexpr NTT() {
setwy(level);
buf1 = reinterpret_cast<mint *>(::buf1_);
buf2 = reinterpret_cast<mint *>(::buf2_);
}
constexpr void setwy(int k) {
mint w[level],y[level];
w[k - 1] = mint(pr).pow((mod - 1) / (1 << k));
y[k - 1] = w[k - 1].inverse();
for(int i = k - 2; i > 0; --i)
w[i] = w[i + 1] * w[i + 1],y[i] = y[i + 1] * y[i + 1];
dw[0] = dy[0] = w[1] * w[1];
dw[1] = w[1],dy[1] = y[1],dw[2] = w[2],dy[2] = y[2];
for(int i = 3; i < k; ++i) {
dw[i] = dw[i - 1] * y[i - 2] * w[i];
dy[i] = dy[i - 1] * w[i - 2] * y[i];
}
}
__attribute__((target("avx2"))) void ntt(mint *a,int n) {
int k = n ? __builtin_ctz(n) : 0;
if(k == 0) return;
if(k == 1) {
mint a1 = a[1];
a[1] = a[0] - a[1];
a[0] = a[0] + a1;
return;
}
if(k & 1) {
int v = 1 << (k - 1);
if(v < 8) {
for(int j = 0; j < v; ++j) {
mint ajv = a[j + v];
a[j + v] = a[j] - ajv;
a[j] += ajv;
}
}
else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
int j0 = 0;
int j1 = v;
for(; j0 < v; j0 += 8,j1 += 8) {
__m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
__m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
__m256i naj = montgomery_add_256(T0,T1,m2,m0);
__m256i najv = montgomery_sub_256(T0,T1,m2,m0);
_mm256_storeu_si256((__m256i *)(a + j0),naj);
_mm256_storeu_si256((__m256i *)(a + j1),najv);
}
}
}
int u = 1 << (2 + (k & 1));
int v = 1 << (k - 2 - (k & 1));
mint one = mint(1);
mint imag = dw[1];
while(v) {
if(v == 1) {
mint ww = one,xx = one,wx = one;
for(int jh = 0; jh < u;) {
ww = xx * xx,wx = ww * xx;
mint t0 = a[jh + 0],t1 = a[jh + 1] * xx;
mint t2 = a[jh + 2] * ww,t3 = a[jh + 3] * wx;
mint t0p2 = t0 + t2,t1p3 = t1 + t3;
mint t0m2 = t0 - t2,t1m3 = (t1 - t3) * imag;
a[jh + 0] = t0p2 + t1p3,a[jh + 1] = t0p2 - t1p3;
a[jh + 2] = t0m2 + t1m3,a[jh + 3] = t0m2 - t1m3;
xx *= dw[__builtin_ctz((jh += 4))];
}
}
else if(v == 4) {
const __m128i m0 = _mm_set1_epi32(0);
const __m128i m1 = _mm_set1_epi32(mod);
const __m128i m2 = _mm_set1_epi32(mod + mod);
const __m128i r = _mm_set1_epi32(mint::r);
const __m128i Imag = _mm_set1_epi32(imag.a);
mint ww = one,xx = one,wx = one;
for(int jh = 0; jh < u;) {
if(jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = v;
for(; j0 < je; j0 += 4,j1 += 4,j2 += 4,j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i T0P2 = montgomery_add_128(T0,T2,m2,m0);
const __m128i T1P3 = montgomery_add_128(T1,T3,m2,m0);
const __m128i T0M2 = montgomery_sub_128(T0,T2,m2,m0);
const __m128i T1M3 = montgomery_mul_128(
montgomery_sub_128(T1,T3,m2,m0),Imag,r,m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P2,T1P3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_sub_128(T0P2,T1P3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j2),
montgomery_add_128(T0M2,T1M3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j3),
montgomery_sub_128(T0M2,T1M3,m2,m0));
}
}
else {
ww = xx * xx,wx = ww * xx;
const __m128i WW = _mm_set1_epi32(ww.a);
const __m128i WX = _mm_set1_epi32(wx.a);
const __m128i XX = _mm_set1_epi32(xx.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for(; j0 < je; j0 += 4,j1 += 4,j2 += 4,j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i MT1 = montgomery_mul_128(T1,XX,r,m1);
const __m128i MT2 = montgomery_mul_128(T2,WW,r,m1);
const __m128i MT3 = montgomery_mul_128(T3,WX,r,m1);
const __m128i T0P2 = montgomery_add_128(T0,MT2,m2,m0);
const __m128i T1P3 = montgomery_add_128(MT1,MT3,m2,m0);
const __m128i T0M2 = montgomery_sub_128(T0,MT2,m2,m0);
const __m128i T1M3 = montgomery_mul_128(
montgomery_sub_128(MT1,MT3,m2,m0),Imag,r,m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P2,T1P3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_sub_128(T0P2,T1P3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j2),
montgomery_add_128(T0M2,T1M3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j3),
montgomery_sub_128(T0M2,T1M3,m2,m0));
}
}
xx *= dw[__builtin_ctz((jh += 4))];
}
}
else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i Imag = _mm256_set1_epi32(imag.a);
mint ww = one,xx = one,wx = one;
for(int jh = 0; jh < u;) {
if(jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = v;
for(; j0 < je; j0 += 8,j1 += 8,j2 += 8,j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i T0P2 = montgomery_add_256(T0,T2,m2,m0);
const __m256i T1P3 = montgomery_add_256(T1,T3,m2,m0);
const __m256i T0M2 = montgomery_sub_256(T0,T2,m2,m0);
const __m256i T1M3 = montgomery_mul_256(
montgomery_sub_256(T1,T3,m2,m0),Imag,r,m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P2,T1P3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_sub_256(T0P2,T1P3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j2),
montgomery_add_256(T0M2,T1M3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j3),
montgomery_sub_256(T0M2,T1M3,m2,m0));
}
}
else {
ww = xx * xx,wx = ww * xx;
const __m256i WW = _mm256_set1_epi32(ww.a);
const __m256i WX = _mm256_set1_epi32(wx.a);
const __m256i XX = _mm256_set1_epi32(xx.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for(; j0 < je; j0 += 8,j1 += 8,j2 += 8,j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i MT1 = montgomery_mul_256(T1,XX,r,m1);
const __m256i MT2 = montgomery_mul_256(T2,WW,r,m1);
const __m256i MT3 = montgomery_mul_256(T3,WX,r,m1);
const __m256i T0P2 = montgomery_add_256(T0,MT2,m2,m0);
const __m256i T1P3 = montgomery_add_256(MT1,MT3,m2,m0);
const __m256i T0M2 = montgomery_sub_256(T0,MT2,m2,m0);
const __m256i T1M3 = montgomery_mul_256(
montgomery_sub_256(MT1,MT3,m2,m0),Imag,r,m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P2,T1P3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_sub_256(T0P2,T1P3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j2),
montgomery_add_256(T0M2,T1M3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j3),
montgomery_sub_256(T0M2,T1M3,m2,m0));
}
}
xx *= dw[__builtin_ctz((jh += 4))];
}
}
u <<= 2;
v >>= 2;
}
}
__attribute__((target("avx2"))) void intt(mint *a,int n,
int normalize = true) {
int k = n ? __builtin_ctz(n) : 0;
if(k == 0) return;
if(k == 1) {
mint a1 = a[1];
a[1] = a[0] - a[1];
a[0] = a[0] + a1;
if(normalize) {
a[0] *= mint(2).inverse();
a[1] *= mint(2).inverse();
}
return;
}
int u = 1 << (k - 2);
int v = 1;
mint one = mint(1);
mint imag = dy[1];
while(u) {
if(v == 1) {
mint ww = one,xx = one,yy = one;
u <<= 2;
for(int jh = 0; jh < u;) {
ww = xx * xx,yy = xx * imag;
mint t0 = a[jh + 0],t1 = a[jh + 1];
mint t2 = a[jh + 2],t3 = a[jh + 3];
mint t0p1 = t0 + t1,t2p3 = t2 + t3;
mint t0m1 = (t0 - t1) * xx,t2m3 = (t2 - t3) * yy;
a[jh + 0] = t0p1 + t2p3,a[jh + 2] = (t0p1 - t2p3) * ww;
a[jh + 1] = t0m1 + t2m3,a[jh + 3] = (t0m1 - t2m3) * ww;
xx *= dy[__builtin_ctz(jh += 4)];
}
}
else if(v == 4) {
const __m128i m0 = _mm_set1_epi32(0);
const __m128i m1 = _mm_set1_epi32(mod);
const __m128i m2 = _mm_set1_epi32(mod + mod);
const __m128i r = _mm_set1_epi32(mint::r);
const __m128i Imag = _mm_set1_epi32(imag.a);
mint ww = one,xx = one,yy = one;
u <<= 2;
for(int jh = 0; jh < u;) {
if(jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = v + v;
int j3 = j2 + v;
for(; j0 < v; j0 += 4,j1 += 4,j2 += 4,j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i T0P1 = montgomery_add_128(T0,T1,m2,m0);
const __m128i T2P3 = montgomery_add_128(T2,T3,m2,m0);
const __m128i T0M1 = montgomery_sub_128(T0,T1,m2,m0);
const __m128i T2M3 = montgomery_mul_128(
montgomery_sub_128(T2,T3,m2,m0),Imag,r,m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P1,T2P3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j2),
montgomery_sub_128(T0P1,T2P3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_add_128(T0M1,T2M3,m2,m0));
_mm_storeu_si128((__m128i *)(a + j3),
montgomery_sub_128(T0M1,T2M3,m2,m0));
}
}
else {
ww = xx * xx,yy = xx * imag;
const __m128i WW = _mm_set1_epi32(ww.a);
const __m128i XX = _mm_set1_epi32(xx.a);
const __m128i YY = _mm_set1_epi32(yy.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for(; j0 < je; j0 += 4,j1 += 4,j2 += 4,j3 += 4) {
const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
const __m128i T0P1 = montgomery_add_128(T0,T1,m2,m0);
const __m128i T2P3 = montgomery_add_128(T2,T3,m2,m0);
const __m128i T0M1 = montgomery_mul_128(
montgomery_sub_128(T0,T1,m2,m0),XX,r,m1);
__m128i T2M3 = montgomery_mul_128(
montgomery_sub_128(T2,T3,m2,m0),YY,r,m1);
_mm_storeu_si128((__m128i *)(a + j0),
montgomery_add_128(T0P1,T2P3,m2,m0));
_mm_storeu_si128(
(__m128i *)(a + j2),
montgomery_mul_128(montgomery_sub_128(T0P1,T2P3,m2,m0),WW,
r,m1));
_mm_storeu_si128((__m128i *)(a + j1),
montgomery_add_128(T0M1,T2M3,m2,m0));
_mm_storeu_si128(
(__m128i *)(a + j3),
montgomery_mul_128(montgomery_sub_128(T0M1,T2M3,m2,m0),WW,
r,m1));
}
}
xx *= dy[__builtin_ctz(jh += 4)];
}
}
else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i Imag = _mm256_set1_epi32(imag.a);
mint ww = one,xx = one,yy = one;
u <<= 2;
for(int jh = 0; jh < u;) {
if(jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = v + v;
int j3 = j2 + v;
for(; j0 < v; j0 += 8,j1 += 8,j2 += 8,j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i T0P1 = montgomery_add_256(T0,T1,m2,m0);
const __m256i T2P3 = montgomery_add_256(T2,T3,m2,m0);
const __m256i T0M1 = montgomery_sub_256(T0,T1,m2,m0);
const __m256i T2M3 = montgomery_mul_256(
montgomery_sub_256(T2,T3,m2,m0),Imag,r,m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P1,T2P3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j2),
montgomery_sub_256(T0P1,T2P3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_add_256(T0M1,T2M3,m2,m0));
_mm256_storeu_si256((__m256i *)(a + j3),
montgomery_sub_256(T0M1,T2M3,m2,m0));
}
}
else {
ww = xx * xx,yy = xx * imag;
const __m256i WW = _mm256_set1_epi32(ww.a);
const __m256i XX = _mm256_set1_epi32(xx.a);
const __m256i YY = _mm256_set1_epi32(yy.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for(; j0 < je; j0 += 8,j1 += 8,j2 += 8,j3 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
const __m256i T0P1 = montgomery_add_256(T0,T1,m2,m0);
const __m256i T2P3 = montgomery_add_256(T2,T3,m2,m0);
const __m256i T0M1 = montgomery_mul_256(
montgomery_sub_256(T0,T1,m2,m0),XX,r,m1);
const __m256i T2M3 = montgomery_mul_256(
montgomery_sub_256(T2,T3,m2,m0),YY,r,m1);
_mm256_storeu_si256((__m256i *)(a + j0),
montgomery_add_256(T0P1,T2P3,m2,m0));
_mm256_storeu_si256(
(__m256i *)(a + j2),
montgomery_mul_256(montgomery_sub_256(T0P1,T2P3,m2,m0),WW,
r,m1));
_mm256_storeu_si256((__m256i *)(a + j1),
montgomery_add_256(T0M1,T2M3,m2,m0));
_mm256_storeu_si256(
(__m256i *)(a + j3),
montgomery_mul_256(montgomery_sub_256(T0M1,T2M3,m2,m0),WW,
r,m1));
}
}
xx *= dy[__builtin_ctz(jh += 4)];
}
}
u >>= 4;
v <<= 2;
}
if(k & 1) {
v = 1 << (k - 1);
if(v < 8) {
for(int j = 0; j < v; ++j) {
mint ajv = a[j] - a[j + v];
a[j] += a[j + v];
a[j + v] = ajv;
}
}
else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
int j0 = 0;
int j1 = v;
for(; j0 < v; j0 += 8,j1 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
__m256i naj = montgomery_add_256(T0,T1,m2,m0);
__m256i najv = montgomery_sub_256(T0,T1,m2,m0);
_mm256_storeu_si256((__m256i *)(a + j0),naj);
_mm256_storeu_si256((__m256i *)(a + j1),najv);
}
}
}
if(normalize) {
mint invn = mint(n).inverse();
for(int i = 0; i < n; i++) a[i] *= invn;
}
}
__attribute__((target("avx2"))) void inplace_multiply(
int l1,int l2,int zero_padding = true) {
int l = l1 + l2 - 1;
int M = 4;
while(M < l) M <<= 1;
if(zero_padding) {
for(int i = l1; i < M; i++) buf1_[i] = 0;
for(int i = l2; i < M; i++) buf2_[i] = 0;
}
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i N2 = _mm256_set1_epi32(mint::n2);
for(int i = 0; i < l1; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
__m256i b = montgomery_mul_256(a,N2,r,m1);
_mm256_storeu_si256((__m256i *)(buf1_ + i),b);
}
for(int i = 0; i < l2; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf2_ + i));
__m256i b = montgomery_mul_256(a,N2,r,m1);
_mm256_storeu_si256((__m256i *)(buf2_ + i),b);
}
ntt(buf1,M);
ntt(buf2,M);
for(int i = 0; i < M; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
__m256i b = _mm256_loadu_si256((__m256i *)(buf2_ + i));
__m256i c = montgomery_mul_256(a,b,r,m1);
_mm256_storeu_si256((__m256i *)(buf1_ + i),c);
}
intt(buf1,M,false);
const __m256i INVM = _mm256_set1_epi32((mint(M).inverse()).a);
for(int i = 0; i < l; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
__m256i b = montgomery_mul_256(a,INVM,r,m1);
__m256i c = my256_mulhi_epu32(my256_mullo_epu32(b,r),m1);
__m256i d = _mm256_and_si256(_mm256_cmpgt_epi32(c,m0),m1);
__m256i e = _mm256_sub_epi32(d,c);
_mm256_storeu_si256((__m256i *)(buf1_ + i),e);
}
}
void ntt(vector<mint> &a) {
int M = (int)a.size();
for(int i = 0; i < M; i++) buf1[i].a = a[i].a;
ntt(buf1,M);
for(int i = 0; i < M; i++) a[i].a = buf1[i].a;
}
void intt(vector<mint> &a) {
int M = (int)a.size();
for(int i = 0; i < M; i++) buf1[i].a = a[i].a;
intt(buf1,M,true);
for(int i = 0; i < M; i++) a[i].a = buf1[i].a;
}
vector<mint> multiply(const vector<mint> &a,const vector<mint> &b) {
int l = a.size() + b.size() - 1;
if(min<int>(a.size(),b.size()) <= 40) {
vector<mint> s(l);
for(int i = 0; i < (int)a.size(); ++i)
for(int j = 0; j < (int)b.size(); ++j) s[i + j] += a[i] * b[j];
return s;
}
int M = 4;
while(M < l) M <<= 1;
for(int i = 0; i < (int)a.size(); ++i) buf1[i].a = a[i].a;
for(int i = (int)a.size(); i < M; ++i) buf1[i].a = 0;
for(int i = 0; i < (int)b.size(); ++i) buf2[i].a = b[i].a;
for(int i = (int)b.size(); i < M; ++i) buf2[i].a = 0;
ntt(buf1,M);
ntt(buf2,M);
for(int i = 0; i < M; ++i)
buf1[i].a = mint::reduce(uint64_t(buf1[i].a) * buf2[i].a);
intt(buf1,M,false);
vector<mint> s(l);
mint invm = mint(M).inverse();
for(int i = 0; i < l; ++i) s[i] = buf1[i] * invm;
return s;
}
void ntt_doubling(vector<mint> &a) {
int M = (int)a.size();
for(int i = 0; i < M; i++) buf1[i].a = a[i].a;
intt(buf1,M);
mint r = 1,zeta = mint(pr).pow((mint::get_mod() - 1) / (M << 1));
for(int i = 0; i < M; i++) buf1[i] *= r,r *= zeta;
ntt(buf1,M);
a.resize(2 * M);
for(int i = 0; i < M; i++) a[M + i].a = buf1[i].a;
}
};using namespace std;
template <typename mint>
struct FormalPowerSeries : vector<mint> {
using vector<mint>::vector;
using FPS = FormalPowerSeries;
FPS &operator+=(const FPS &r) {
if(r.size() > this->size()) this->resize(r.size());
for(int i = 0; i < (int)r.size(); i++) (*this)[i] += r[i];
return *this;
}
FPS &operator+=(const mint &r) {
if(this->empty()) this->resize(1);
(*this)[0] += r;
return *this;
}
FPS &operator-=(const FPS &r) {
if(r.size() > this->size()) this->resize(r.size());
for(int i = 0; i < (int)r.size(); i++) (*this)[i] -= r[i];
return *this;
}
FPS &operator-=(const mint &r) {
if(this->empty()) this->resize(1);
(*this)[0] -= r;
return *this;
}
FPS &operator*=(const mint &v) {
for(int k = 0; k < (int)this->size(); k++) (*this)[k] *= v;
return *this;
}
FPS &operator/=(const FPS &r) {
if(this->size() < r.size()) {
this->clear();
return *this;
}
int n = this->size() - r.size() + 1;
return *this = ((*this).rev().pre(n) * r.rev().inv(n)).pre(n).rev();
}
FPS &operator%=(const FPS &r) {
*this -= *this / r * r;
shrink();
return *this;
}
FPS operator+(const FPS &r) const { return FPS(*this) += r; }
FPS operator+(const mint &v) const { return FPS(*this) += v; }
FPS operator-(const FPS &r) const { return FPS(*this) -= r; }
FPS operator-(const mint &v) const { return FPS(*this) -= v; }
FPS operator*(const FPS &r) const { return FPS(*this) *= r; }
FPS operator*(const mint &v) const { return FPS(*this) *= v; }
FPS operator/(const FPS &r) const { return FPS(*this) /= r; }
FPS operator%(const FPS &r) const { return FPS(*this) %= r; }
FPS operator-() const {
FPS ret(this->size());
for(int i = 0; i < (int)this->size(); i++) ret[i] = -(*this)[i];
return ret;
}
void shrink() {
while(this->size() && this->back() == mint(0)) this->pop_back();
}
FPS rev() const {
FPS ret(*this);
reverse(begin(ret),end(ret));
return ret;
}
FPS dot(FPS r) const {
FPS ret(min(this->size(),r.size()));
for(int i = 0; i < (int)ret.size(); i++) ret[i] = (*this)[i] * r[i];
return ret;
}
FPS pre(int sz) const {
return FPS(begin(*this),begin(*this) + min((int)this->size(),sz));
}
FPS operator>>(int sz) const {
if((int)this->size() <= sz) return {};
FPS ret(*this);
ret.erase(ret.begin(),ret.begin() + sz);
return ret;
}
FPS operator<<(int sz) const {
FPS ret(*this);
ret.insert(ret.begin(),sz,mint(0));
return ret;
}
FPS diff() const {
const int n = (int)this->size();
FPS ret(max(0,n - 1));
for(int i = 1; i < n; i++) ret[i - 1] = (*this)[i] * mint(i);
return ret;
}
FPS integral() const {
const int n = (int)this->size();
FPS ret(n + 1);
ret[0] = mint(0);
for(int i = 0; i < n; i++) ret[i + 1] = (*this)[i] / mint(i + 1);
return ret;
}
mint eval(mint x) const {
mint r = 0,w = 1;
for(auto &v : *this) r += w * v,w *= x;
return r;
}
FPS log(int deg = -1) const {
assert((*this)[0] == mint(1));
if(deg == -1) deg = (int)this->size();
return (this->diff() * this->inv(deg)).pre(deg - 1).integral();
}
FPS pow(int64_t k,int deg = -1) const {
const int n = (int)this->size();
if(deg == -1) deg = n;
for(int i = 0; i < n; i++) {
if((*this)[i] != mint(0)) {
if(i * k > deg) return FPS(deg,mint(0));
mint rev = mint(1) / (*this)[i];
FPS ret = (((*this * rev) >> i).log() * k).exp() * ((*this)[i].pow(k));
ret = (ret << (i * k)).pre(deg);
if((int)ret.size() < deg) ret.resize(deg,mint(0));
return ret;
}
}
return FPS(deg,mint(0));
}
static void *ntt_ptr;
static void set_fft();
FPS &operator*=(const FPS &r);
void ntt();
void intt();
void ntt_doubling();
static int ntt_pr();
FPS inv(int deg = -1) const;
FPS exp(int deg = -1) const;
};
template <typename mint>
void *FormalPowerSeries<mint>::ntt_ptr = nullptr;
/**
* @brief 多項式/形式的冪級数ライブラリ
* @docs docs/formal-power-series.md
*/
template <typename mint>
void FormalPowerSeries<mint>::set_fft() {
if(!ntt_ptr) ntt_ptr = new NTT<mint>;
}
template <typename mint>
FormalPowerSeries<mint>& FormalPowerSeries<mint>::operator*=(
const FormalPowerSeries<mint>& r) {
if(this->empty() || r.empty()) {
this->clear();
return *this;
}
set_fft();
auto ret = static_cast<NTT<mint>*>(ntt_ptr)->multiply(*this,r);
return *this = FormalPowerSeries<mint>(ret.begin(),ret.end());
}
template <typename mint>
void FormalPowerSeries<mint>::ntt() {
set_fft();
static_cast<NTT<mint>*>(ntt_ptr)->ntt(*this);
}
template <typename mint>
void FormalPowerSeries<mint>::intt() {
set_fft();
static_cast<NTT<mint>*>(ntt_ptr)->intt(*this);
}
template <typename mint>
void FormalPowerSeries<mint>::ntt_doubling() {
set_fft();
static_cast<NTT<mint>*>(ntt_ptr)->ntt_doubling(*this);
}
template <typename mint>
int FormalPowerSeries<mint>::ntt_pr() {
set_fft();
return static_cast<NTT<mint>*>(ntt_ptr)->pr;
}
template <typename mint>
FormalPowerSeries<mint> FormalPowerSeries<mint>::inv(int deg) const {
assert((*this)[0] != mint(0));
if(deg == -1) deg = (int)this->size();
FormalPowerSeries<mint> res(deg);
res[0] = {mint(1) / (*this)[0]};
for(int d = 1; d < deg; d <<= 1) {
FormalPowerSeries<mint> f(2 * d),g(2 * d);
for(int j = 0; j < min((int)this->size(),2 * d); j++) f[j] = (*this)[j];
for(int j = 0; j < d; j++) g[j] = res[j];
f.ntt();
g.ntt();
for(int j = 0; j < 2 * d; j++) f[j] *= g[j];
f.intt();
for(int j = 0; j < d; j++) f[j] = 0;
f.ntt();
for(int j = 0; j < 2 * d; j++) f[j] *= g[j];
f.intt();
for(int j = d; j < min(2 * d,deg); j++) res[j] = -f[j];
}
return res.pre(deg);
}
template <typename mint>
FormalPowerSeries<mint> FormalPowerSeries<mint>::exp(int deg) const {
assert((*this).size() == 0 || (*this)[0] == mint(0));
if(deg == -1) deg = (int)this->size();
FormalPowerSeries<mint> ret({mint(1)});
for(int i = 1; i < deg; i <<= 1) {
ret = (ret * (pre(i << 1) + mint(1) - ret.log(i << 1))).pre(i << 1);
}
return ret.pre(deg);
}
using namespace std;
template <typename T>
struct Binomial {
vector<T> fac_,finv_,inv_;
Binomial(int MAX): fac_(MAX + 10),finv_(MAX + 10),inv_(MAX + 10) {
MAX += 9;
fac_[0] = finv_[0] = inv_[0] = 1;
for(int i = 1; i <= MAX; i++) fac_[i] = fac_[i - 1] * i;
finv_[MAX] = fac_[MAX].inverse();
for(int i = MAX - 1; i > 0; i--) finv_[i] = finv_[i + 1] * (i + 1);
for(int i = 1; i <= MAX; i++) inv_[i] = finv_[i] * fac_[i - 1];
}
inline T fac(int i) const { return fac_[i]; }
inline T finv(int i) const { return finv_[i]; }
inline T inv(int i) const { return inv_[i]; }
T C(int n,int r) const {
if(n < r || r < 0) return T(0);
return fac_[n] * finv_[n - r] * finv_[r];
}
T C_naive(int n,int r) const {
if(n < r || r < 0) return T(0);
T ret = 1;
for(T i = 1; i <= r; i += T(1)) {
ret *= n--;
ret *= i.inverse();
}
return ret;
}
T P(int n,int r) const {
if(n < r || r < 0) return T(0);
return fac_[n] * finv_[n - r];
}
T H(int n,int r) const {
if(n < 0 || r < 0) return (0);
return r == 0 ? 1 : C(n + r - 1,r);
}
};
// calculate F(x + a)
template <typename mint>
FormalPowerSeries<mint> TaylorShift(FormalPowerSeries<mint> f,mint a,
Binomial<mint>& C) {
using fps = FormalPowerSeries<mint>;
assert(C.fac_.size() >= f.size() + 1);
int N = f.size();
for(int i = 0; i < N; i++) f[i] *= C.fac(i);
reverse(begin(f),end(f));
fps g(N,mint(1));
for(int i = 1; i < N; i++) g[i] = g[i - 1] * a * C.inv(i);
f = (f * g).pre(N);
reverse(begin(f),end(f));
for(int i = 0; i < N; i++) f[i] *= C.finv(i);
return f;
}
/**
* @brief 平行移動
* @docs docs/fps-taylor-shift.md
*/
using namespace std;
namespace fastio {
static constexpr int SZ = 1 << 17;
char ibuf[SZ],obuf[SZ];
int pil = 0,pir = 0,por = 0;
struct Pre {
char num[40000];
constexpr Pre(): num() {
for(int i = 0; i < 10000; i++) {
int n = i;
for(int j = 3; j >= 0; j--) {
num[i * 4 + j] = n % 10 + '0';
n /= 10;
}
}
}
} constexpr pre;
inline void load() {
memcpy(ibuf,ibuf + pil,pir - pil);
pir = pir - pil + fread(ibuf + pir - pil,1,SZ - pir + pil,stdin);
pil = 0;
}
inline void flush() {
fwrite(obuf,1,por,stdout);
por = 0;
}
inline void rd(char& c) { c = ibuf[pil++]; }
template <typename T>
inline void rd(T& x) {
if(pil + 32 > pir) load();
char c;
do
c = ibuf[pil++];
while(c < '-');
bool minus = 0;
if(c == '-') {
minus = 1;
c = ibuf[pil++];
}
x = 0;
while(c >= '0') {
x = x * 10 + (c & 15);
c = ibuf[pil++];
}
if(minus) x = -x;
}
inline void wt(char c) { obuf[por++] = c; }
template <typename T>
inline void wt(T x) {
if(por > SZ - 32) flush();
if(!x) {
obuf[por++] = '0';
return;
}
if(x < 0) {
obuf[por++] = '-';
x = -x;
}
int i = 12;
char buf[16];
while(x >= 10000) {
memcpy(buf + i,pre.num + (x % 10000) * 4,4);
x /= 10000;
i -= 4;
}
int d = x < 100 ? (x < 10 ? 1 : 2) : (x < 1000 ? 3 : 4);
memcpy(obuf + por,pre.num + x * 4 + 4 - d,d);
por += d;
memcpy(obuf + por,buf + i + 4,12 - i);
por += 12 - i;
}
struct Dummy {
Dummy() { atexit(flush); }
} dummy;
} // namespace fastio
using fastio::rd;
using fastio::wt;
using namespace std;
template <uint32_t mod>
struct LazyMontgomeryModInt {
using mint = LazyMontgomeryModInt;
using i32 = int32_t;
using u32 = uint32_t;
using u64 = uint64_t;
static constexpr u32 get_r() {
u32 ret = mod;
for(i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
return ret;
}
static constexpr u32 r = get_r();
static constexpr u32 n2 = -u64(mod) % mod;
static_assert(r * mod == 1,"invalid, r * mod != 1");
static_assert(mod < (1 << 30),"invalid, mod >= 2 ^ 30");
static_assert((mod & 1) == 1,"invalid, mod % 2 == 0");
u32 a;
constexpr LazyMontgomeryModInt(): a(0) {}
constexpr LazyMontgomeryModInt(const int64_t &b)
: a(reduce(u64(b % mod + mod) * n2)){};
static constexpr u32 reduce(const u64 &b) {
return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
}
constexpr mint &operator+=(const mint &b) {
if(i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
return *this;
}
constexpr mint &operator-=(const mint &b) {
if(i32(a -= b.a) < 0) a += 2 * mod;
return *this;
}
constexpr mint &operator*=(const mint &b) {
a = reduce(u64(a) * b.a);
return *this;
}
constexpr mint &operator/=(const mint &b) {
*this *= b.inverse();
return *this;
}
constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
constexpr bool operator==(const mint &b) const {
return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
}
constexpr bool operator!=(const mint &b) const {
return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
}
constexpr mint operator-() const { return mint() - mint(*this); }
constexpr mint pow(u64 n) const {
mint ret(1),mul(*this);
while(n > 0) {
if(n & 1) ret *= mul;
mul *= mul;
n >>= 1;
}
return ret;
}
constexpr mint inverse() const { return pow(mod - 2); }
friend ostream &operator<<(ostream &os,const mint &b) {
return os << b.get();
}
friend istream &operator>>(istream &is,mint &b) {
int64_t t;
is >> t;
b = LazyMontgomeryModInt<mod>(t);
return (is);
}
constexpr u32 get() const {
u32 ret = reduce(a);
return ret >= mod ? ret - mod : ret;
}
static constexpr u32 get_mod() { return mod; }
};
template <typename mint>
FormalPowerSeries<mint> Stirling1st(int N,Binomial<mint> &C) {
using fps = FormalPowerSeries<mint>;
if(N <= 0) return fps{1};
int lg = 31 - __builtin_clz(N);
fps f = {0, 1};
for(int i = lg - 1; i >= 0; i--) {
int n = N >> i;
f *= TaylorShift(f,mint(n >> 1),C);
if(n & 1) f = (f << 1) + f * (n - 1);
}
return f;
}
template <typename mint>
FormalPowerSeries<mint> Stirling2nd(int N,Binomial<mint> &C) {
using fps = FormalPowerSeries<mint>;
fps f(N + 1),g(N + 1);
for(int i = 0; i <= N; i++) {
f[i] = mint(i).pow(N) * C.finv(i);
g[i] = (i & 1) ? -C.finv(i) : C.finv(i);
}
return (f * g).pre(N + 1);
}
template <typename mint>
FormalPowerSeries<mint> BernoulliEGF(int N,Binomial<mint> &C) {
using fps = FormalPowerSeries<mint>;
fps f(N + 1);
for(int i = 0; i <= N; i++) f[i] = C.finv(i + 1);
return f.inv(N + 1);
}
template <typename mint>
FormalPowerSeries<mint> SamplePointShift(FormalPowerSeries<mint>& y,mint t,
int m = -1) {
if(m == -1) m = y.size();
long long T = t.get();
int k = (int)y.size() - 1;
T %= mint::get_mod();
if(T <= k) {
FormalPowerSeries<mint> ret(m);
int ptr = 0;
for(int64_t i = T; i <= k and ptr < m; i++) {
ret[ptr++] = y[i];
}
if(k + 1 < T + m) {
auto suf = SamplePointShift<mint>(y,k + 1,m - ptr);
for(int i = k + 1; i < T + m; i++) {
ret[ptr++] = suf[i - (k + 1)];
}
}
return ret;
}
if(T + m > mint::get_mod()) {
auto pref = SamplePointShift<mint>(y,T,mint::get_mod() - T);
auto suf = SamplePointShift<mint>(y,0,m - pref.size());
copy(begin(suf),end(suf),back_inserter(pref));
return pref;
}
FormalPowerSeries<mint> finv(k + 1,1),d(k + 1);
for(int i = 2; i <= k; i++) finv[k] *= i;
finv[k] = mint(1) / finv[k];
for(int i = k; i >= 1; i--) finv[i - 1] = finv[i] * i;
for(int i = 0; i <= k; i++) {
d[i] = finv[i] * finv[k - i] * y[i];
if((k - i) & 1) d[i] = -d[i];
}
FormalPowerSeries<mint> h(m + k);
for(int i = 0; i < m + k; i++) {
h[i] = mint(1) / (T - k + i);
}
auto dh = d * h;
FormalPowerSeries<mint> ret(m);
mint cur = T;
for(int i = 1; i <= k; i++) cur *= T - i;
for(int i = 0; i < m; i++) {
ret[i] = cur * dh[k + i];
cur *= T + i + 1;
cur *= h[i];
}
return ret;
}
ll modinv(ll A,ll M){
//A*r+B*n=gcd(A,B)
A %= M;
if(A == 0 || __gcd(A,M) != 1){
//cout << "Error modinv(" << A << "," << M << ")" << endl;
return -1;
}
ll B = M,U = 1,V = 0;
while(B){
ll T = A / B;
A -= T * B;
swap(A,B);
U -= T * V;
swap(U,V);
}
U %= M;
if(U < 0) U += M;
return U;
}
template <typename mint>
mint factorial(int n) {
if(n <= 1) return 1;
using fps = FormalPowerSeries<mint>;
long long v = 1;
while(v * v < n) v *= 2;
mint iv = mint(v).inverse();
fps G{1, v + 1};
for(long long d = 1; d != v; d <<= 1) {
fps G1 = SamplePointShift(G,mint(d) * iv);
fps G2 = SamplePointShift(G,mint(d * v + v) * iv);
fps G3 = SamplePointShift(G,mint(d * v + d + v) * iv);
for(int i = 0; i <= d; i++) G[i] *= G1[i],G2[i] *= G3[i];
copy(begin(G2),end(G2) - 1,back_inserter(G));
}
mint res = 1;
long long i = 0;
while(i + v <= n) res *= G[i / v],i += v;
while(i < n) res *= ++i;
return res;
}
int main(){
constexpr int MOD9 = 998244353;
using mint = LazyMontgomeryModInt<MOD9>;
Binomial<mint> C(1530000);
auto seq = BernoulliEGF(1000010,C);
vector<ll> B(1000010);
for(int i = 0; i < 1000010; i++) {
B[i] = ((seq[i] * C.fac(i)).get());
}
wt('\n');
ll N,M;
ll n,k;
cin >> N >> M;
n = N;
k = M;
N -= 1;
/*for(ll i = 0;i <= N;i++){
for(ll j = 0;j <= K;j++){
ll x = 0;
for(ll p = 1;p < i;p++){
x += modpow(p,j,MOD) * (i - p);
}
cout << x << " ";
}
cout << endl;
}*/
COMinit(5000000);
ll a1 = 0;
vector<ll> modp(M + 200,1);
for(ll i = 1;i <= M + 100;i++) modp[i] = (modp[i - 1] * (N + 1 % MOD9)) % MOD9;
for(ll i = 0;i <= M;i++){
ll x = COM(M + 1,i) * B[i] % MOD9;
x *= modp[M + 1 - i];
x %= MOD9;
a1 += x;
a1 %= MOD9;
if(a1 < 0) a1 += MOD9;
}
a1 *= N + 1;
a1 %= MOD9;
a1 *= modinv(M + 1,MOD9);
a1 %= MOD9;
ll a2 = 0;
for(ll i = 0;i <= M + 1;i++){
ll x = COM(M + 2,i) * B[i] % MOD9;
x *= modp[M + 2 - i];
x %= MOD9;
a2 += x;
a2 %= MOD9;
if(a2 < 0) a2 += MOD9;
}
a2 *= modinv(M + 2,MOD9);
a2 %= MOD9;
if(a2 < 0) a2 += MOD9;
ll ans = a1 - a2;
ans %= MOD9;
if(ans < 0) ans += MOD9;
//cerr << ans << endl;
ans *= factorial<mint>(n - 1).get();
//for(ll i = 1;i <= n - 1;i++) ans = (ans * i) % MOD;
ans *= 2;
ans %= MOD;
if(ans < 0) ans += MOD;
cout << ans;
}