結果

問題 No.3044 よくあるカエルさん
ユーザー hato336
提出日時 2025-02-28 22:58:14
言語 C++23
(gcc 13.3.0 + boost 1.87.0)
結果
AC  
実行時間 3 ms / 2,000 ms
コード長 52,070 bytes
コンパイル時間 7,778 ms
コンパイル使用メモリ 432,176 KB
実行使用メモリ 6,824 KB
最終ジャッジ日時 2025-02-28 22:58:23
合計ジャッジ時間 8,595 ms
ジャッジサーバーID
(参考情報)
judge3 / judge1
このコードへのチャレンジ
(要ログイン)
ファイルパターン 結果
sample AC * 1
other AC * 20
権限があれば一括ダウンロードができます
コンパイルメッセージ
main.cpp:476:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  476 |  montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      |  ^~~~~~~~~~~~~~~~~~
main.cpp:468:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  468 |  montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      |  ^~~~~~~~~~~~~~~~~~
main.cpp:460:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  460 |  montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
      |  ^~~~~~~~~~~~~~~~~~
main.cpp:449:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  449 |  my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
      |  ^~~~~~~~~~~~~~~~~
main.cpp:444:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  444 |  my256_mullo_epu32(const __m256i &a, const __m256i &b) {
      |  ^~~~~~~~~~~~~~~~~
main.cpp:437:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  437 |  montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      |  ^~~~~~~~~~~~~~~~~~
main.cpp:430:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  430 |  montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      |  ^~~~~~~~~~~~~~~~~~
main.cpp:422:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  422 |  montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
      |  ^~~~~~~~~~~~~~~~~~
main.cpp:411:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  411 |  my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
      |  ^~~~~~~~~~~~~~~~~
main.cpp:406:2: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  406 |  my128_mullo_epu32(const __m128i &a, const __m128i &b) {
      |  ^~~~~~~~~~~~~~~~~

ソースコード

diff #

//https://yukicoder.me/submissions/583989
/**
 *  date : 2020-11-25 08:26:54
 */

 #pragma region kyopro_template
 #define Nyaan_template
 #include <immintrin.h>
 #include <bits/stdc++.h>
 #define pb push_back
 #define eb emplace_back
 #define fi first
 #define se second
 #define each(x, v) for (auto &x : v)
 #define all(v) (v).begin(), (v).end()
 #define sz(v) ((int)(v).size())
 #define mem(a, val) memset(a, val, sizeof(a))
 #define ini(...)   \
   int __VA_ARGS__; \
   in(__VA_ARGS__)
 #define inl(...)         \
   long long __VA_ARGS__; \
   in(__VA_ARGS__)
 #define ins(...)      \
   string __VA_ARGS__; \
   in(__VA_ARGS__)
 #define inc(...)    \
   char __VA_ARGS__; \
   in(__VA_ARGS__)
 #define in2(s, t)                           \
   for (int i = 0; i < (int)s.size(); i++) { \
     in(s[i], t[i]);                         \
   }
 #define in3(s, t, u)                        \
   for (int i = 0; i < (int)s.size(); i++) { \
     in(s[i], t[i], u[i]);                   \
   }
 #define in4(s, t, u, v)                     \
   for (int i = 0; i < (int)s.size(); i++) { \
     in(s[i], t[i], u[i], v[i]);             \
   }
 #define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
 #define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
 #define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
 #define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
 #define reg(i, a, b) for (long long i = (a); i < (b); i++)
 #define die(...)      \
   do {                \
     out(__VA_ARGS__); \
     return;           \
   } while (0)
 using namespace std;
 using ll = long long;
 template <class T>
 using V = vector<T>;
 using vi = vector<int>;
 using vl = vector<long long>;
 using vvi = vector<vector<int>>;
 using vd = V<double>;
 using vs = V<string>;
 using vvl = vector<vector<long long>>;
 using P = pair<long long, long long>;
 using vp = vector<P>;
 using pii = pair<int, int>;
 using vpi = vector<pair<int, int>>;
 constexpr int inf = 1001001001;
 constexpr long long infLL = (1LL << 61) - 1;
 template <typename T, typename U>
 inline bool amin(T &x, U y) {
   return (y < x) ? (x = y, true) : false;
 }
 template <typename T, typename U>
 inline bool amax(T &x, U y) {
   return (x < y) ? (x = y, true) : false;
 }
 template <typename T, typename U>
 ostream &operator<<(ostream &os, const pair<T, U> &p) {
   os << p.first << " " << p.second;
   return os;
 }
 template <typename T, typename U>
 istream &operator>>(istream &is, pair<T, U> &p) {
   is >> p.first >> p.second;
   return is;
 }
 template <typename T>
 ostream &operator<<(ostream &os, const vector<T> &v) {
   int s = (int)v.size();
   for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
   return os;
 }
 template <typename T>
 istream &operator>>(istream &is, vector<T> &v) {
   for (auto &x : v) is >> x;
   return is;
 }
 void in() {}
 template <typename T, class... U>
 void in(T &t, U &... u) {
   cin >> t;
   in(u...);
 }
 void out() { cout << "\n"; }
 template <typename T, class... U>
 void out(const T &t, const U &... u) {
   cout << t;
   if (sizeof...(u)) cout << " ";
   out(u...);
 }
 
 #ifdef NyaanDebug
 #define trc(...)                   \
   do {                             \
     cerr << #__VA_ARGS__ << " = "; \
     dbg_out(__VA_ARGS__);          \
   } while (0)
 #define trca(v, N)       \
   do {                   \
     cerr << #v << " = "; \
     array_out(v, N);     \
   } while (0)
 #define trcc(v)                             \
   do {                                      \
     cerr << #v << " = {";                   \
     each(x, v) { cerr << " " << x << ","; } \
     cerr << "}" << endl;                    \
   } while (0)
 template <typename T>
 void _cout(const T &c) {
   cerr << c;
 }
 void _cout(const int &c) {
   if (c == 1001001001)
     cerr << "inf";
   else if (c == -1001001001)
     cerr << "-inf";
   else
     cerr << c;
 }
 void _cout(const unsigned int &c) {
   if (c == 1001001001)
     cerr << "inf";
   else
     cerr << c;
 }
 void _cout(const long long &c) {
   if (c == 1001001001 || c == (1LL << 61) - 1)
     cerr << "inf";
   else if (c == -1001001001 || c == -((1LL << 61) - 1))
     cerr << "-inf";
   else
     cerr << c;
 }
 void _cout(const unsigned long long &c) {
   if (c == 1001001001 || c == (1LL << 61) - 1)
     cerr << "inf";
   else
     cerr << c;
 }
 template <typename T, typename U>
 void _cout(const pair<T, U> &p) {
   cerr << "{ ";
   _cout(p.fi);
   cerr << ", ";
   _cout(p.se);
   cerr << " } ";
 }
 template <typename T>
 void _cout(const vector<T> &v) {
   int s = v.size();
   cerr << "{ ";
   for (int i = 0; i < s; i++) {
     cerr << (i ? ", " : "");
     _cout(v[i]);
   }
   cerr << " } ";
 }
 template <typename T>
 void _cout(const vector<vector<T>> &v) {
   cerr << "[ ";
   for (const auto &x : v) {
     cerr << endl;
     _cout(x);
     cerr << ", ";
   }
   cerr << endl << " ] ";
 }
 void dbg_out() { cerr << endl; }
 template <typename T, class... U>
 void dbg_out(const T &t, const U &... u) {
   _cout(t);
   if (sizeof...(u)) cerr << ", ";
   dbg_out(u...);
 }
 template <typename T>
 void array_out(const T &v, int s) {
   cerr << "{ ";
   for (int i = 0; i < s; i++) {
     cerr << (i ? ", " : "");
     _cout(v[i]);
   }
   cerr << " } " << endl;
 }
 template <typename T>
 void array_out(const T &v, int H, int W) {
   cerr << "[ ";
   for (int i = 0; i < H; i++) {
     cerr << (i ? ", " : "");
     array_out(v[i], W);
   }
   cerr << " ] " << endl;
 }
 #else
 #define trc(...)
 #define trca(...)
 #define trcc(...)
 #endif
 
 inline int popcnt(unsigned long long a) { return __builtin_popcountll(a); }
 inline int lsb(unsigned long long a) { return __builtin_ctzll(a); }
 inline int msb(unsigned long long a) { return 63 - __builtin_clzll(a); }
 template <typename T>
 inline int getbit(T a, int i) {
   return (a >> i) & 1;
 }
 template <typename T>
 inline void setbit(T &a, int i) {
   a |= (1LL << i);
 }
 template <typename T>
 inline void delbit(T &a, int i) {
   a &= ~(1LL << i);
 }
 template <typename T>
 int lb(const vector<T> &v, const T &a) {
   return lower_bound(begin(v), end(v), a) - begin(v);
 }
 template <typename T>
 int ub(const vector<T> &v, const T &a) {
   return upper_bound(begin(v), end(v), a) - begin(v);
 }
 template <typename T>
 int btw(T a, T x, T b) {
   return a <= x && x < b;
 }
 template <typename T, typename U>
 T ceil(T a, U b) {
   return (a + b - 1) / b;
 }
 constexpr long long TEN(int n) {
   long long ret = 1, x = 10;
   while (n) {
     if (n & 1) ret *= x;
     x *= x;
     n >>= 1;
   }
   return ret;
 }
 template <typename T>
 vector<T> mkrui(const vector<T> &v) {
   vector<T> ret(v.size() + 1);
   for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
   return ret;
 };
 template <typename T>
 vector<T> mkuni(const vector<T> &v) {
   vector<T> ret(v);
   sort(ret.begin(), ret.end());
   ret.erase(unique(ret.begin(), ret.end()), ret.end());
   return ret;
 }
 template <typename F>
 vector<int> mkord(int N, F f) {
   vector<int> ord(N);
   iota(begin(ord), end(ord), 0);
   sort(begin(ord), end(ord), f);
   return ord;
 }
 template <typename T = int>
 vector<T> mkiota(int N) {
   vector<T> ret(N);
   iota(begin(ret), end(ret), 0);
   return ret;
 }
 template <typename T>
 vector<int> mkinv(vector<T> &v) {
   vector<int> inv(v.size());
   for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
   return inv;
 }
 
 struct IoSetupNya {
   IoSetupNya() {
     cin.tie(nullptr);
     ios::sync_with_stdio(false);
     cout << fixed << setprecision(15);
     cerr << fixed << setprecision(7);
   }
 } iosetupnya;
 
 void solve();

 
 #pragma endregion
 using namespace std;
 
 using namespace std;
 
 using namespace std;
 
 template <uint32_t mod>
 struct LazyMontgomeryModInt {
   using mint = LazyMontgomeryModInt;
   using i32 = int32_t;
   using u32 = uint32_t;
   using u64 = uint64_t;
 
   static constexpr u32 get_r() {
     u32 ret = mod;
     for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
     return ret;
   }
 
   static constexpr u32 r = get_r();
   static constexpr u32 n2 = -u64(mod) % mod;
   static_assert(r * mod == 1, "invalid, r * mod != 1");
   static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
   static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");
 
   u32 a;
 
   constexpr LazyMontgomeryModInt() : a(0) {}
   constexpr LazyMontgomeryModInt(const int64_t &b)
       : a(reduce(u64(b % mod + mod) * n2)){};
 
   static constexpr u32 reduce(const u64 &b) {
     return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
   }
 
   constexpr mint &operator+=(const mint &b) {
     if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
     return *this;
   }
 
   constexpr mint &operator-=(const mint &b) {
     if (i32(a -= b.a) < 0) a += 2 * mod;
     return *this;
   }
 
   constexpr mint &operator*=(const mint &b) {
     a = reduce(u64(a) * b.a);
     return *this;
   }
 
   constexpr mint &operator/=(const mint &b) {
     *this *= b.inverse();
     return *this;
   }
 
   constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
   constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
   constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
   constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
   constexpr bool operator==(const mint &b) const {
     return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
   }
   constexpr bool operator!=(const mint &b) const {
     return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
   }
   constexpr mint operator-() const { return mint() - mint(*this); }
 
   constexpr mint pow(u64 n) const {
     mint ret(1), mul(*this);
     while (n > 0) {
       if (n & 1) ret *= mul;
       mul *= mul;
       n >>= 1;
     }
     return ret;
   }
   
   constexpr mint inverse() const { return pow(mod - 2); }
 
   friend ostream &operator<<(ostream &os, const mint &b) {
     return os << b.get();
   }
 
   friend istream &operator>>(istream &is, mint &b) {
     int64_t t;
     is >> t;
     b = LazyMontgomeryModInt<mod>(t);
     return (is);
   }
   
   constexpr u32 get() const {
     u32 ret = reduce(a);
     return ret >= mod ? ret - mod : ret;
   }
 
   static constexpr u32 get_mod() { return mod; }
 };using namespace std;
 
 using namespace std;
 
 __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
 my128_mullo_epu32(const __m128i &a, const __m128i &b) {
   return _mm_mullo_epi32(a, b);
 }
 
 __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
 my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
   __m128i a13 = _mm_shuffle_epi32(a, 0xF5);
   __m128i b13 = _mm_shuffle_epi32(b, 0xF5);
   __m128i prod02 = _mm_mul_epu32(a, b);
   __m128i prod13 = _mm_mul_epu32(a13, b13);
   __m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13),
                                     _mm_unpackhi_epi32(prod02, prod13));
   return prod;
 }
 
 __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
 montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
                    const __m128i &m1) {
   return _mm_sub_epi32(
       _mm_add_epi32(my128_mulhi_epu32(a, b), m1),
       my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1));
 }
 
 __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
 montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
                    const __m128i &m0) {
   __m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2);
   return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
 }
 
 __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
 montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
                    const __m128i &m0) {
   __m128i ret = _mm_sub_epi32(a, b);
   return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
 }
 
 __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
 my256_mullo_epu32(const __m256i &a, const __m256i &b) {
   return _mm256_mullo_epi32(a, b);
 }
 
 __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
 my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
   __m256i a13 = _mm256_shuffle_epi32(a, 0xF5);
   __m256i b13 = _mm256_shuffle_epi32(b, 0xF5);
   __m256i prod02 = _mm256_mul_epu32(a, b);
   __m256i prod13 = _mm256_mul_epu32(a13, b13);
   __m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13),
                                        _mm256_unpackhi_epi32(prod02, prod13));
   return prod;
 }
 
 __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
 montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
                    const __m256i &m1) {
   return _mm256_sub_epi32(
       _mm256_add_epi32(my256_mulhi_epu32(a, b), m1),
       my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1));
 }
 
 __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
 montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
                    const __m256i &m0) {
   __m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);
   return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
                           ret);
 }
 
 __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
 montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
                    const __m256i &m0) {
   __m256i ret = _mm256_sub_epi32(a, b);
   return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
                           ret);
 }
 constexpr int SZ_FFT_BUF = 1 << 23;
 uint32_t buf1_[SZ_FFT_BUF] __attribute__((aligned(64)));
 uint32_t buf2_[SZ_FFT_BUF] __attribute__((aligned(64)));
 
 template <typename mint>
 struct NTT {
   static constexpr uint32_t get_pr() {
     uint32_t mod = mint::get_mod();
     using u64 = uint64_t;
     u64 ds[32] = {};
     int idx = 0;
     u64 m = mod - 1;
     for (u64 i = 2; i * i <= m; ++i) {
       if (m % i == 0) {
         ds[idx++] = i;
         while (m % i == 0) m /= i;
       }
     }
     if (m != 1) ds[idx++] = m;
 
     uint32_t pr = 2;
     while (1) {
       int flg = 1;
       for (int i = 0; i < idx; ++i) {
         u64 a = pr, b = (mod - 1) / ds[i], r = 1;
         while (b) {
           if (b & 1) r = r * a % mod;
           a = a * a % mod;
           b >>= 1;
         }
         if (r == 1) {
           flg = 0;
           break;
         }
       }
       if (flg == 1) break;
       ++pr;
     }
     return pr;
   };
 
   static constexpr uint32_t mod = mint::get_mod();
   static constexpr uint32_t pr = get_pr();
   static constexpr int level = __builtin_ctzll(mod - 1);
   mint dw[level], dy[level];
   mint *buf1, *buf2;
 
   constexpr NTT() {
     setwy(level);
     buf1 = reinterpret_cast<mint *>(::buf1_);
     buf2 = reinterpret_cast<mint *>(::buf2_);
   }
 
   constexpr void setwy(int k) {
     mint w[level], y[level];
     w[k - 1] = mint(pr).pow((mod - 1) / (1 << k));
     y[k - 1] = w[k - 1].inverse();
     for (int i = k - 2; i > 0; --i)
       w[i] = w[i + 1] * w[i + 1], y[i] = y[i + 1] * y[i + 1];
     dw[0] = dy[0] = w[1] * w[1];
     dw[1] = w[1], dy[1] = y[1], dw[2] = w[2], dy[2] = y[2];
     for (int i = 3; i < k; ++i) {
       dw[i] = dw[i - 1] * y[i - 2] * w[i];
       dy[i] = dy[i - 1] * w[i - 2] * y[i];
     }
   }
 
   __attribute__((target("avx2"))) void ntt(mint *a, int n) {
     int k = n ? __builtin_ctz(n) : 0;
     if (k == 0) return;
     if (k == 1) {
       mint a1 = a[1];
       a[1] = a[0] - a[1];
       a[0] = a[0] + a1;
       return;
     }
     if (k & 1) {
       int v = 1 << (k - 1);
       if (v < 8) {
         for (int j = 0; j < v; ++j) {
           mint ajv = a[j + v];
           a[j + v] = a[j] - ajv;
           a[j] += ajv;
         }
       } else {
         const __m256i m0 = _mm256_set1_epi32(0);
         const __m256i m2 = _mm256_set1_epi32(mod + mod);
         int j0 = 0;
         int j1 = v;
         for (; j0 < v; j0 += 8, j1 += 8) {
           __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
           __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
           __m256i naj = montgomery_add_256(T0, T1, m2, m0);
           __m256i najv = montgomery_sub_256(T0, T1, m2, m0);
           _mm256_storeu_si256((__m256i *)(a + j0), naj);
           _mm256_storeu_si256((__m256i *)(a + j1), najv);
         }
       }
     }
     int u = 1 << (2 + (k & 1));
     int v = 1 << (k - 2 - (k & 1));
     mint one = mint(1);
     mint imag = dw[1];
     while (v) {
       if (v == 1) {
         mint ww = one, xx = one, wx = one;
         for (int jh = 0; jh < u;) {
           ww = xx * xx, wx = ww * xx;
           mint t0 = a[jh + 0], t1 = a[jh + 1] * xx;
           mint t2 = a[jh + 2] * ww, t3 = a[jh + 3] * wx;
           mint t0p2 = t0 + t2, t1p3 = t1 + t3;
           mint t0m2 = t0 - t2, t1m3 = (t1 - t3) * imag;
           a[jh + 0] = t0p2 + t1p3, a[jh + 1] = t0p2 - t1p3;
           a[jh + 2] = t0m2 + t1m3, a[jh + 3] = t0m2 - t1m3;
           xx *= dw[__builtin_ctz((jh += 4))];
         }
       } else if (v == 4) {
         const __m128i m0 = _mm_set1_epi32(0);
         const __m128i m1 = _mm_set1_epi32(mod);
         const __m128i m2 = _mm_set1_epi32(mod + mod);
         const __m128i r = _mm_set1_epi32(mint::r);
         const __m128i Imag = _mm_set1_epi32(imag.a);
         mint ww = one, xx = one, wx = one;
         for (int jh = 0; jh < u;) {
           if (jh == 0) {
             int j0 = 0;
             int j1 = v;
             int j2 = j1 + v;
             int j3 = j2 + v;
             int je = v;
             for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
               const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
               const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
               const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
               const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
               const __m128i T0P2 = montgomery_add_128(T0, T2, m2, m0);
               const __m128i T1P3 = montgomery_add_128(T1, T3, m2, m0);
               const __m128i T0M2 = montgomery_sub_128(T0, T2, m2, m0);
               const __m128i T1M3 = montgomery_mul_128(
                   montgomery_sub_128(T1, T3, m2, m0), Imag, r, m1);
               _mm_storeu_si128((__m128i *)(a + j0),
                                montgomery_add_128(T0P2, T1P3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j1),
                                montgomery_sub_128(T0P2, T1P3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j2),
                                montgomery_add_128(T0M2, T1M3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j3),
                                montgomery_sub_128(T0M2, T1M3, m2, m0));
             }
           } else {
             ww = xx * xx, wx = ww * xx;
             const __m128i WW = _mm_set1_epi32(ww.a);
             const __m128i WX = _mm_set1_epi32(wx.a);
             const __m128i XX = _mm_set1_epi32(xx.a);
             int j0 = jh * v;
             int j1 = j0 + v;
             int j2 = j1 + v;
             int j3 = j2 + v;
             int je = j1;
             for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
               const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
               const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
               const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
               const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
               const __m128i MT1 = montgomery_mul_128(T1, XX, r, m1);
               const __m128i MT2 = montgomery_mul_128(T2, WW, r, m1);
               const __m128i MT3 = montgomery_mul_128(T3, WX, r, m1);
               const __m128i T0P2 = montgomery_add_128(T0, MT2, m2, m0);
               const __m128i T1P3 = montgomery_add_128(MT1, MT3, m2, m0);
               const __m128i T0M2 = montgomery_sub_128(T0, MT2, m2, m0);
               const __m128i T1M3 = montgomery_mul_128(
                   montgomery_sub_128(MT1, MT3, m2, m0), Imag, r, m1);
               _mm_storeu_si128((__m128i *)(a + j0),
                                montgomery_add_128(T0P2, T1P3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j1),
                                montgomery_sub_128(T0P2, T1P3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j2),
                                montgomery_add_128(T0M2, T1M3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j3),
                                montgomery_sub_128(T0M2, T1M3, m2, m0));
             }
           }
           xx *= dw[__builtin_ctz((jh += 4))];
         }
       } else {
         const __m256i m0 = _mm256_set1_epi32(0);
         const __m256i m1 = _mm256_set1_epi32(mod);
         const __m256i m2 = _mm256_set1_epi32(mod + mod);
         const __m256i r = _mm256_set1_epi32(mint::r);
         const __m256i Imag = _mm256_set1_epi32(imag.a);
         mint ww = one, xx = one, wx = one;
         for (int jh = 0; jh < u;) {
           if (jh == 0) {
             int j0 = 0;
             int j1 = v;
             int j2 = j1 + v;
             int j3 = j2 + v;
             int je = v;
             for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
               const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
               const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
               const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
               const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
               const __m256i T0P2 = montgomery_add_256(T0, T2, m2, m0);
               const __m256i T1P3 = montgomery_add_256(T1, T3, m2, m0);
               const __m256i T0M2 = montgomery_sub_256(T0, T2, m2, m0);
               const __m256i T1M3 = montgomery_mul_256(
                   montgomery_sub_256(T1, T3, m2, m0), Imag, r, m1);
               _mm256_storeu_si256((__m256i *)(a + j0),
                                   montgomery_add_256(T0P2, T1P3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j1),
                                   montgomery_sub_256(T0P2, T1P3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j2),
                                   montgomery_add_256(T0M2, T1M3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j3),
                                   montgomery_sub_256(T0M2, T1M3, m2, m0));
             }
           } else {
             ww = xx * xx, wx = ww * xx;
             const __m256i WW = _mm256_set1_epi32(ww.a);
             const __m256i WX = _mm256_set1_epi32(wx.a);
             const __m256i XX = _mm256_set1_epi32(xx.a);
             int j0 = jh * v;
             int j1 = j0 + v;
             int j2 = j1 + v;
             int j3 = j2 + v;
             int je = j1;
             for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
               const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
               const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
               const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
               const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
               const __m256i MT1 = montgomery_mul_256(T1, XX, r, m1);
               const __m256i MT2 = montgomery_mul_256(T2, WW, r, m1);
               const __m256i MT3 = montgomery_mul_256(T3, WX, r, m1);
               const __m256i T0P2 = montgomery_add_256(T0, MT2, m2, m0);
               const __m256i T1P3 = montgomery_add_256(MT1, MT3, m2, m0);
               const __m256i T0M2 = montgomery_sub_256(T0, MT2, m2, m0);
               const __m256i T1M3 = montgomery_mul_256(
                   montgomery_sub_256(MT1, MT3, m2, m0), Imag, r, m1);
               _mm256_storeu_si256((__m256i *)(a + j0),
                                   montgomery_add_256(T0P2, T1P3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j1),
                                   montgomery_sub_256(T0P2, T1P3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j2),
                                   montgomery_add_256(T0M2, T1M3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j3),
                                   montgomery_sub_256(T0M2, T1M3, m2, m0));
             }
           }
           xx *= dw[__builtin_ctz((jh += 4))];
         }
       }
       u <<= 2;
       v >>= 2;
     }
   }
 
   __attribute__((target("avx2"))) void intt(mint *a, int n,
                                             int normalize = true) {
     int k = n ? __builtin_ctz(n) : 0;
     if (k == 0) return;
     if (k == 1) {
       mint a1 = a[1];
       a[1] = a[0] - a[1];
       a[0] = a[0] + a1;
       if (normalize) {
         a[0] *= mint(2).inverse();
         a[1] *= mint(2).inverse();
       }
       return;
     }
     int u = 1 << (k - 2);
     int v = 1;
     mint one = mint(1);
     mint imag = dy[1];
     while (u) {
       if (v == 1) {
         mint ww = one, xx = one, yy = one;
         u <<= 2;
         for (int jh = 0; jh < u;) {
           ww = xx * xx, yy = xx * imag;
           mint t0 = a[jh + 0], t1 = a[jh + 1];
           mint t2 = a[jh + 2], t3 = a[jh + 3];
           mint t0p1 = t0 + t1, t2p3 = t2 + t3;
           mint t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
           a[jh + 0] = t0p1 + t2p3, a[jh + 2] = (t0p1 - t2p3) * ww;
           a[jh + 1] = t0m1 + t2m3, a[jh + 3] = (t0m1 - t2m3) * ww;
           xx *= dy[__builtin_ctz(jh += 4)];
         }
       } else if (v == 4) {
         const __m128i m0 = _mm_set1_epi32(0);
         const __m128i m1 = _mm_set1_epi32(mod);
         const __m128i m2 = _mm_set1_epi32(mod + mod);
         const __m128i r = _mm_set1_epi32(mint::r);
         const __m128i Imag = _mm_set1_epi32(imag.a);
         mint ww = one, xx = one, yy = one;
         u <<= 2;
         for (int jh = 0; jh < u;) {
           if (jh == 0) {
             int j0 = 0;
             int j1 = v;
             int j2 = v + v;
             int j3 = j2 + v;
             for (; j0 < v; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
               const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
               const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
               const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
               const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
               const __m128i T0P1 = montgomery_add_128(T0, T1, m2, m0);
               const __m128i T2P3 = montgomery_add_128(T2, T3, m2, m0);
               const __m128i T0M1 = montgomery_sub_128(T0, T1, m2, m0);
               const __m128i T2M3 = montgomery_mul_128(
                   montgomery_sub_128(T2, T3, m2, m0), Imag, r, m1);
               _mm_storeu_si128((__m128i *)(a + j0),
                                montgomery_add_128(T0P1, T2P3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j2),
                                montgomery_sub_128(T0P1, T2P3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j1),
                                montgomery_add_128(T0M1, T2M3, m2, m0));
               _mm_storeu_si128((__m128i *)(a + j3),
                                montgomery_sub_128(T0M1, T2M3, m2, m0));
             }
           } else {
             ww = xx * xx, yy = xx * imag;
             const __m128i WW = _mm_set1_epi32(ww.a);
             const __m128i XX = _mm_set1_epi32(xx.a);
             const __m128i YY = _mm_set1_epi32(yy.a);
             int j0 = jh * v;
             int j1 = j0 + v;
             int j2 = j1 + v;
             int j3 = j2 + v;
             int je = j1;
             for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
               const __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
               const __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
               const __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
               const __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
               const __m128i T0P1 = montgomery_add_128(T0, T1, m2, m0);
               const __m128i T2P3 = montgomery_add_128(T2, T3, m2, m0);
               const __m128i T0M1 = montgomery_mul_128(
                   montgomery_sub_128(T0, T1, m2, m0), XX, r, m1);
               __m128i T2M3 = montgomery_mul_128(
                   montgomery_sub_128(T2, T3, m2, m0), YY, r, m1);
               _mm_storeu_si128((__m128i *)(a + j0),
                                montgomery_add_128(T0P1, T2P3, m2, m0));
               _mm_storeu_si128(
                   (__m128i *)(a + j2),
                   montgomery_mul_128(montgomery_sub_128(T0P1, T2P3, m2, m0), WW,
                                      r, m1));
               _mm_storeu_si128((__m128i *)(a + j1),
                                montgomery_add_128(T0M1, T2M3, m2, m0));
               _mm_storeu_si128(
                   (__m128i *)(a + j3),
                   montgomery_mul_128(montgomery_sub_128(T0M1, T2M3, m2, m0), WW,
                                      r, m1));
             }
           }
           xx *= dy[__builtin_ctz(jh += 4)];
         }
       } else {
         const __m256i m0 = _mm256_set1_epi32(0);
         const __m256i m1 = _mm256_set1_epi32(mod);
         const __m256i m2 = _mm256_set1_epi32(mod + mod);
         const __m256i r = _mm256_set1_epi32(mint::r);
         const __m256i Imag = _mm256_set1_epi32(imag.a);
         mint ww = one, xx = one, yy = one;
         u <<= 2;
         for (int jh = 0; jh < u;) {
           if (jh == 0) {
             int j0 = 0;
             int j1 = v;
             int j2 = v + v;
             int j3 = j2 + v;
             for (; j0 < v; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
               const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
               const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
               const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
               const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
               const __m256i T0P1 = montgomery_add_256(T0, T1, m2, m0);
               const __m256i T2P3 = montgomery_add_256(T2, T3, m2, m0);
               const __m256i T0M1 = montgomery_sub_256(T0, T1, m2, m0);
               const __m256i T2M3 = montgomery_mul_256(
                   montgomery_sub_256(T2, T3, m2, m0), Imag, r, m1);
               _mm256_storeu_si256((__m256i *)(a + j0),
                                   montgomery_add_256(T0P1, T2P3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j2),
                                   montgomery_sub_256(T0P1, T2P3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j1),
                                   montgomery_add_256(T0M1, T2M3, m2, m0));
               _mm256_storeu_si256((__m256i *)(a + j3),
                                   montgomery_sub_256(T0M1, T2M3, m2, m0));
             }
           } else {
             ww = xx * xx, yy = xx * imag;
             const __m256i WW = _mm256_set1_epi32(ww.a);
             const __m256i XX = _mm256_set1_epi32(xx.a);
             const __m256i YY = _mm256_set1_epi32(yy.a);
             int j0 = jh * v;
             int j1 = j0 + v;
             int j2 = j1 + v;
             int j3 = j2 + v;
             int je = j1;
             for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
               const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
               const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
               const __m256i T2 = _mm256_loadu_si256((__m256i *)(a + j2));
               const __m256i T3 = _mm256_loadu_si256((__m256i *)(a + j3));
               const __m256i T0P1 = montgomery_add_256(T0, T1, m2, m0);
               const __m256i T2P3 = montgomery_add_256(T2, T3, m2, m0);
               const __m256i T0M1 = montgomery_mul_256(
                   montgomery_sub_256(T0, T1, m2, m0), XX, r, m1);
               const __m256i T2M3 = montgomery_mul_256(
                   montgomery_sub_256(T2, T3, m2, m0), YY, r, m1);
               _mm256_storeu_si256((__m256i *)(a + j0),
                                   montgomery_add_256(T0P1, T2P3, m2, m0));
               _mm256_storeu_si256(
                   (__m256i *)(a + j2),
                   montgomery_mul_256(montgomery_sub_256(T0P1, T2P3, m2, m0), WW,
                                      r, m1));
               _mm256_storeu_si256((__m256i *)(a + j1),
                                   montgomery_add_256(T0M1, T2M3, m2, m0));
               _mm256_storeu_si256(
                   (__m256i *)(a + j3),
                   montgomery_mul_256(montgomery_sub_256(T0M1, T2M3, m2, m0), WW,
                                      r, m1));
             }
           }
           xx *= dy[__builtin_ctz(jh += 4)];
         }
       }
       u >>= 4;
       v <<= 2;
     }
     if (k & 1) {
       v = 1 << (k - 1);
       if (v < 8) {
         for (int j = 0; j < v; ++j) {
           mint ajv = a[j] - a[j + v];
           a[j] += a[j + v];
           a[j + v] = ajv;
         }
       } else {
         const __m256i m0 = _mm256_set1_epi32(0);
         const __m256i m2 = _mm256_set1_epi32(mod + mod);
         int j0 = 0;
         int j1 = v;
         for (; j0 < v; j0 += 8, j1 += 8) {
           const __m256i T0 = _mm256_loadu_si256((__m256i *)(a + j0));
           const __m256i T1 = _mm256_loadu_si256((__m256i *)(a + j1));
           __m256i naj = montgomery_add_256(T0, T1, m2, m0);
           __m256i najv = montgomery_sub_256(T0, T1, m2, m0);
           _mm256_storeu_si256((__m256i *)(a + j0), naj);
           _mm256_storeu_si256((__m256i *)(a + j1), najv);
         }
       }
     }
     if (normalize) {
       mint invn = mint(n).inverse();
       for (int i = 0; i < n; i++) a[i] *= invn;
     }
   }
 
   __attribute__((target("avx2"))) void inplace_multiply(
       int l1, int l2, int zero_padding = true) {
     int l = l1 + l2 - 1;
     int M = 4;
     while (M < l) M <<= 1;
     if (zero_padding) {
       for (int i = l1; i < M; i++) buf1_[i] = 0;
       for (int i = l2; i < M; i++) buf2_[i] = 0;
     }
     const __m256i m0 = _mm256_set1_epi32(0);
     const __m256i m1 = _mm256_set1_epi32(mod);
     const __m256i r = _mm256_set1_epi32(mint::r);
     const __m256i N2 = _mm256_set1_epi32(mint::n2);
     for (int i = 0; i < l1; i += 8) {
       __m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
       __m256i b = montgomery_mul_256(a, N2, r, m1);
       _mm256_storeu_si256((__m256i *)(buf1_ + i), b);
     }
     for (int i = 0; i < l2; i += 8) {
       __m256i a = _mm256_loadu_si256((__m256i *)(buf2_ + i));
       __m256i b = montgomery_mul_256(a, N2, r, m1);
       _mm256_storeu_si256((__m256i *)(buf2_ + i), b);
     }
     ntt(buf1, M);
     ntt(buf2, M);
     for (int i = 0; i < M; i += 8) {
       __m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
       __m256i b = _mm256_loadu_si256((__m256i *)(buf2_ + i));
       __m256i c = montgomery_mul_256(a, b, r, m1);
       _mm256_storeu_si256((__m256i *)(buf1_ + i), c);
     }
     intt(buf1, M, false);
     const __m256i INVM = _mm256_set1_epi32((mint(M).inverse()).a);
     for (int i = 0; i < l; i += 8) {
       __m256i a = _mm256_loadu_si256((__m256i *)(buf1_ + i));
       __m256i b = montgomery_mul_256(a, INVM, r, m1);
       __m256i c = my256_mulhi_epu32(my256_mullo_epu32(b, r), m1);
       __m256i d = _mm256_and_si256(_mm256_cmpgt_epi32(c, m0), m1);
       __m256i e = _mm256_sub_epi32(d, c);
       _mm256_storeu_si256((__m256i *)(buf1_ + i), e);
     }
   }
 
   void ntt(vector<mint> &a) {
     int M = (int)a.size();
     for (int i = 0; i < M; i++) buf1[i].a = a[i].a;
     ntt(buf1, M);
     for (int i = 0; i < M; i++) a[i].a = buf1[i].a;
   }
 
   void intt(vector<mint> &a) {
     int M = (int)a.size();
     for (int i = 0; i < M; i++) buf1[i].a = a[i].a;
     intt(buf1, M, true);
     for (int i = 0; i < M; i++) a[i].a = buf1[i].a;
   }
 
   vector<mint> multiply(const vector<mint> &a, const vector<mint> &b) {
     if (a.size() == 0 && b.size() == 0) return vector<mint>{};
     int l = a.size() + b.size() - 1;
     if (min<int>(a.size(), b.size()) <= 40) {
       vector<mint> s(l);
       for (int i = 0; i < (int)a.size(); ++i)
         for (int j = 0; j < (int)b.size(); ++j) s[i + j] += a[i] * b[j];
       return s;
     }
     assert(l <= SZ_FFT_BUF);
     int M = 4;
     while (M < l) M <<= 1;
     for (int i = 0; i < (int)a.size(); ++i) buf1[i].a = a[i].a;
     for (int i = (int)a.size(); i < M; ++i) buf1[i].a = 0;
     for (int i = 0; i < (int)b.size(); ++i) buf2[i].a = b[i].a;
     for (int i = (int)b.size(); i < M; ++i) buf2[i].a = 0;
     ntt(buf1, M);
     ntt(buf2, M);
     for (int i = 0; i < M; ++i)
       buf1[i].a = mint::reduce(uint64_t(buf1[i].a) * buf2[i].a);
     intt(buf1, M, false);
     vector<mint> s(l);
     mint invm = mint(M).inverse();
     for (int i = 0; i < l; ++i) s[i] = buf1[i] * invm;
     return s;
   }
 
   void ntt_doubling(vector<mint> &a) {
     int M = (int)a.size();
     for (int i = 0; i < M; i++) buf1[i].a = a[i].a;
     intt(buf1, M);
     mint r = 1, zeta = mint(pr).pow((mint::get_mod() - 1) / (M << 1));
     for (int i = 0; i < M; i++) buf1[i] *= r, r *= zeta;
     ntt(buf1, M);
     a.resize(2 * M);
     for (int i = 0; i < M; i++) a[M + i].a = buf1[i].a;
   }
 };
 
 namespace ArbitraryNTT {
 using i64 = int64_t;
 using u128 = __uint128_t;
 constexpr int32_t m0 = 167772161;
 constexpr int32_t m1 = 469762049;
 constexpr int32_t m2 = 754974721;
 using mint0 = LazyMontgomeryModInt<m0>;
 using mint1 = LazyMontgomeryModInt<m1>;
 using mint2 = LazyMontgomeryModInt<m2>;
 constexpr int r01 = mint1(m0).inverse().get();
 constexpr int r02 = mint2(m0).inverse().get();
 constexpr int r12 = mint2(m1).inverse().get();
 constexpr int r02r12 = i64(r02) * r12 % m2;
 constexpr i64 w1 = m0;
 constexpr i64 w2 = i64(m0) * m1;
 
 template <typename T, typename submint>
 vector<submint> mul(const vector<T> &a, const vector<T> &b) {
   NTT<submint> ntt;
   vector<submint> s(a.size()), t(b.size());
   for (int i = 0; i < (int)a.size(); ++i) s[i] = i64(a[i] % submint::get_mod());
   for (int i = 0; i < (int)b.size(); ++i) t[i] = i64(b[i] % submint::get_mod());
   return ntt.multiply(s, t);
 }
 
 template <typename T>
 vector<int> multiply(const vector<T> &s, const vector<T> &t, int mod) {
   auto d0 = mul<T, mint0>(s, t);
   auto d1 = mul<T, mint1>(s, t);
   auto d2 = mul<T, mint2>(s, t);
   int n = d0.size();
   vector<int> ret(n);
   const int W1 = w1 % mod;
   const int W2 = w2 % mod;
   for (int i = 0; i < n; i++) {
     int n1 = d1[i].get(), n2 = d2[i].get(), a = d0[i].get();
     int b = i64(n1 + m1 - a) * r01 % m1;
     int c = (i64(n2 + m2 - a) * r02r12 + i64(m2 - b) * r12) % m2;
     ret[i] = (i64(a) + i64(b) * W1 + i64(c) * W2) % mod;
   }
   return ret;
 }
 
 template <typename mint>
 vector<mint> multiply(const vector<mint> &a, const vector<mint> &b) {
   if (a.size() == 0 && b.size() == 0) return {};
   if (min<int>(a.size(), b.size()) < 128) {
     vector<mint> ret(a.size() + b.size() - 1);
     for (int i = 0; i < (int)a.size(); ++i)
       for (int j = 0; j < (int)b.size(); ++j) ret[i + j] += a[i] * b[j];
     return ret;
   }
   vector<int> s(a.size()), t(b.size());
   for (int i = 0; i < (int)a.size(); ++i) s[i] = a[i].get();
   for (int i = 0; i < (int)b.size(); ++i) t[i] = b[i].get();
   vector<int> u = multiply<int>(s, t, mint::get_mod());
   vector<mint> ret(u.size());
   for (int i = 0; i < (int)u.size(); ++i) ret[i] = mint(u[i]);
   return ret;
 }
 
 template <typename T>
 vector<u128> multiply_u128(const vector<T> &s, const vector<T> &t) {
   if (s.size() == 0 && t.size() == 0) return {};
   if (min<int>(s.size(), t.size()) < 128) {
     vector<u128> ret(s.size() + t.size() - 1);
     for (int i = 0; i < (int)s.size(); ++i)
       for (int j = 0; j < (int)t.size(); ++j) ret[i + j] += i64(s[i]) * t[j];
     return ret;
   }
   auto d0 = mul<T, mint0>(s, t);
   auto d1 = mul<T, mint1>(s, t);
   auto d2 = mul<T, mint2>(s, t);
   int n = d0.size();
   vector<u128> ret(n);
   for (int i = 0; i < n; i++) {
     i64 n1 = d1[i].get(), n2 = d2[i].get();
     i64 a = d0[i].get();
     u128 b = (n1 + m1 - a) * r01 % m1;
     u128 c = ((n2 + m2 - a) * r02r12 + (m2 - b) * r12) % m2;
     ret[i] = a + b * w1 + c * w2;
   }
   return ret;
 }
 }  // namespace ArbitraryNTTusing namespace std;
 
 template <typename mint>
 struct FormalPowerSeries : vector<mint> {
   using vector<mint>::vector;
   using FPS = FormalPowerSeries;
 
   FPS &operator+=(const FPS &r) {
     if (r.size() > this->size()) this->resize(r.size());
     for (int i = 0; i < (int)r.size(); i++) (*this)[i] += r[i];
     return *this;
   }
 
   FPS &operator+=(const mint &r) {
     if (this->empty()) this->resize(1);
     (*this)[0] += r;
     return *this;
   }
 
   FPS &operator-=(const FPS &r) {
     if (r.size() > this->size()) this->resize(r.size());
     for (int i = 0; i < (int)r.size(); i++) (*this)[i] -= r[i];
     return *this;
   }
 
   FPS &operator-=(const mint &r) {
     if (this->empty()) this->resize(1);
     (*this)[0] -= r;
     return *this;
   }
 
   FPS &operator*=(const mint &v) {
     for (int k = 0; k < (int)this->size(); k++) (*this)[k] *= v;
     return *this;
   }
 
   FPS &operator/=(const FPS &r) {
     if (this->size() < r.size()) {
       this->clear();
       return *this;
     }
     int n = this->size() - r.size() + 1;
     if ((int)r.size() <= 64) {
       FPS f(*this), g(r);
       g.shrink();
       mint coeff = g.back().inverse();
       for (auto &x : g) x *= coeff;
       int deg = (int)f.size() - (int)g.size() + 1;
       int gs = g.size();
       FPS quo(deg);
       for (int i = deg - 1; i >= 0; i--) {
         quo[i] = f[i + gs - 1];
         for (int j = 0; j < gs; j++) f[i + j] -= quo[i] * g[j];
       }
       *this = quo * coeff;
       this->resize(n, mint(0));
       return *this;
     }
     return *this = ((*this).rev().pre(n) * r.rev().inv(n)).pre(n).rev();
   }
 
   FPS &operator%=(const FPS &r) {
     *this -= *this / r * r;
     shrink();
     return *this;
   }
 
   FPS operator+(const FPS &r) const { return FPS(*this) += r; }
   FPS operator+(const mint &v) const { return FPS(*this) += v; }
   FPS operator-(const FPS &r) const { return FPS(*this) -= r; }
   FPS operator-(const mint &v) const { return FPS(*this) -= v; }
   FPS operator*(const FPS &r) const { return FPS(*this) *= r; }
   FPS operator*(const mint &v) const { return FPS(*this) *= v; }
   FPS operator/(const FPS &r) const { return FPS(*this) /= r; }
   FPS operator%(const FPS &r) const { return FPS(*this) %= r; }
   FPS operator-() const {
     FPS ret(this->size());
     for (int i = 0; i < (int)this->size(); i++) ret[i] = -(*this)[i];
     return ret;
   }
 
   void shrink() {
     while (this->size() && this->back() == mint(0)) this->pop_back();
   }
 
   FPS rev() const {
     FPS ret(*this);
     reverse(begin(ret), end(ret));
     return ret;
   }
 
   FPS dot(FPS r) const {
     FPS ret(min(this->size(), r.size()));
     for (int i = 0; i < (int)ret.size(); i++) ret[i] = (*this)[i] * r[i];
     return ret;
   }
 
   FPS pre(int sz) const {
     return FPS(begin(*this), begin(*this) + min((int)this->size(), sz));
   }
 
   FPS operator>>(int sz) const {
     if ((int)this->size() <= sz) return {};
     FPS ret(*this);
     ret.erase(ret.begin(), ret.begin() + sz);
     return ret;
   }
 
   FPS operator<<(int sz) const {
     FPS ret(*this);
     ret.insert(ret.begin(), sz, mint(0));
     return ret;
   }
 
   FPS diff() const {
     const int n = (int)this->size();
     FPS ret(max(0, n - 1));
     mint one(1), coeff(1);
     for (int i = 1; i < n; i++) {
       ret[i - 1] = (*this)[i] * coeff;
       coeff += one;
     }
     return ret;
   }
 
   FPS integral() const {
     const int n = (int)this->size();
     FPS ret(n + 1);
     ret[0] = mint(0);
     if (n > 0) ret[1] = mint(1);
     auto mod = mint::get_mod();
     for (int i = 2; i <= n; i++) ret[i] = (-ret[mod % i]) * (mod / i);
     for (int i = 0; i < n; i++) ret[i + 1] *= (*this)[i];
     return ret;
   }
 
   mint eval(mint x) const {
     mint r = 0, w = 1;
     for (auto &v : *this) r += w * v, w *= x;
     return r;
   }
 
   FPS log(int deg = -1) const {
     assert((*this)[0] == mint(1));
     if (deg == -1) deg = (int)this->size();
     return (this->diff() * this->inv(deg)).pre(deg - 1).integral();
   }
 
   FPS pow(int64_t k, int deg = -1) const {
     const int n = (int)this->size();
     if (deg == -1) deg = n;
     for (int i = 0; i < n; i++) {
       if ((*this)[i] != mint(0)) {
         if (i * k > deg) return FPS(deg, mint(0));
         mint rev = mint(1) / (*this)[i];
         FPS ret = (((*this * rev) >> i).log() * k).exp() * ((*this)[i].pow(k));
         ret = (ret << (i * k)).pre(deg);
         if ((int)ret.size() < deg) ret.resize(deg, mint(0));
         return ret;
       }
     }
     return FPS(deg, mint(0));
   }
 
   static void *ntt_ptr;
   static void set_fft();
   FPS &operator*=(const FPS &r);
   void ntt();
   void intt();
   void ntt_doubling();
   static int ntt_pr();
   FPS inv(int deg = -1) const;
   FPS exp(int deg = -1) const;
 };
 template <typename mint>
 void *FormalPowerSeries<mint>::ntt_ptr = nullptr;
 
 /**
  * @brief 多項式/形式的冪級数ライブラリ
  * @docs docs/fps/formal-power-series.md
  */
 
 template <typename mint>
 void FormalPowerSeries<mint>::set_fft() {
   ntt_ptr = nullptr;
 }
 
 template <typename mint>
 void FormalPowerSeries<mint>::ntt() {
   exit(1);
 }
 
 template <typename mint>
 void FormalPowerSeries<mint>::intt() {
   exit(1);
 }
 
 template <typename mint>
 void FormalPowerSeries<mint>::ntt_doubling() {
   exit(1);
 }
 
 template <typename mint>
 int FormalPowerSeries<mint>::ntt_pr() {
   exit(1);
 }
 
 template <typename mint>
 FormalPowerSeries<mint>& FormalPowerSeries<mint>::operator*=(
     const FormalPowerSeries<mint>& r) {
   if (this->empty() || r.empty()) {
     this->clear();
     return *this;
   }
   auto ret = ArbitraryNTT::multiply(*this, r);
   return *this = FormalPowerSeries<mint>(ret.begin(), ret.end());
 }
 
 template <typename mint>
 FormalPowerSeries<mint> FormalPowerSeries<mint>::inv(int deg) const {
   assert((*this)[0] != mint(0));
   if (deg == -1) deg = (*this).size();
   FormalPowerSeries<mint> ret({mint(1) / (*this)[0]});
   for (int i = 1; i < deg; i <<= 1)
     ret = (ret + ret - ret * ret * (*this).pre(i << 1)).pre(i << 1);
   return ret.pre(deg);
 }
 
 template <typename mint>
 FormalPowerSeries<mint> FormalPowerSeries<mint>::exp(int deg) const {
   assert((*this).size() == 0 || (*this)[0] == mint(0));
   if (deg == -1) deg = (int)this->size();
   FormalPowerSeries<mint> ret({mint(1)});
   for (int i = 1; i < deg; i <<= 1) {
     ret = (ret * (pre(i << 1) + mint(1) - ret.log(i << 1))).pre(i << 1);
   }
   return ret.pre(deg);
 }
 
 
 using mint = LazyMontgomeryModInt<998244353>;
 using vm = vector<mint>;
 using vvm = vector<vm>;
 using fps = FormalPowerSeries<mint>;
 
 using namespace std;
 
 using namespace std;
 
 template <typename mint>
 vector<mint> BerlekampMassey(const vector<mint> &s) {
   const int N = (int)s.size();
   vector<mint> b, c;
   b.reserve(N + 1);
   c.reserve(N + 1);
   b.push_back(mint(1));
   c.push_back(mint(1));
   mint y = mint(1);
   for (int ed = 1; ed <= N; ed++) {
     int l = int(c.size()), m = int(b.size());
     mint x = 0;
     for (int i = 0; i < l; i++) x += c[i] * s[ed - l + i];
     b.emplace_back(mint(0));
     m++;
     if (x == mint(0)) continue;
     mint freq = x / y;
     if (l < m) {
       auto tmp = c;
       c.insert(begin(c), m - l, mint(0));
       for (int i = 0; i < m; i++) c[m - 1 - i] -= freq * b[m - 1 - i];
       b = tmp;
       y = x;
     } else {
       for (int i = 0; i < m; i++) c[l - 1 - i] -= freq * b[m - 1 - i];
     }
   }
   reverse(begin(c), end(c));
   return c;
 }
 using namespace std;
 
 
 template <typename mint>
 mint LinearRecursionFormula(long long k, FormalPowerSeries<mint> Q,
                             FormalPowerSeries<mint> P) {
   Q.shrink();
   mint ret = 0;
   if (P.size() >= Q.size()) {
     auto R = P / Q;
     P -= R * Q;
     P.shrink();
     if (k < (int)R.size()) ret += R[k];
   }
   if ((int)P.size() == 0) return ret;
 
   FormalPowerSeries<mint>::set_fft();
   if (FormalPowerSeries<mint>::ntt_ptr == nullptr) {
     P.resize((int)Q.size() - 1);
     while (k) {
       auto Q2 = Q;
       for (int i = 1; i < (int)Q2.size(); i += 2) Q2[i] = -Q2[i];
       auto S = P * Q2;
       auto T = Q * Q2;
       if (k & 1) {
         for (int i = 1; i < (int)S.size(); i += 2) P[i >> 1] = S[i];
         for (int i = 0; i < (int)T.size(); i += 2) Q[i >> 1] = T[i];
       } else {
         for (int i = 0; i < (int)S.size(); i += 2) P[i >> 1] = S[i];
         for (int i = 0; i < (int)T.size(); i += 2) Q[i >> 1] = T[i];
       }
       k >>= 1;
     }
     return ret + P[0];
   } else {
     int N = 1;
     while (N < (int)Q.size()) N <<= 1;
 
     P.resize(2 * N);
     Q.resize(2 * N);
     P.ntt();
     Q.ntt();
     vector<mint> S(2 * N), T(2 * N);
 
     vector<int> btr(N);
     for (int i = 0, logn = __builtin_ctz(N); i < (1 << logn); i++) {
       btr[i] = (btr[i >> 1] >> 1) + ((i & 1) << (logn - 1));
     }
     mint dw = mint(FormalPowerSeries<mint>::ntt_pr())
                   .inverse()
                   .pow((mint::get_mod() - 1) / (2 * N));
 
     while (k) {
       mint inv2 = mint(2).inverse();
 
       // even degree of Q(x)Q(-x)
       T.resize(N);
       for (int i = 0; i < N; i++) T[i] = Q[(i << 1) | 0] * Q[(i << 1) | 1];
 
       S.resize(N);
       if (k & 1) {
         // odd degree of P(x)Q(-x)
         for (auto &i : btr) {
           S[i] = (P[(i << 1) | 0] * Q[(i << 1) | 1] -
                   P[(i << 1) | 1] * Q[(i << 1) | 0]) *
                  inv2;
           inv2 *= dw;
         }
       } else {
         // even degree of P(x)Q(-x)
         for (int i = 0; i < N; i++) {
           S[i] = (P[(i << 1) | 0] * Q[(i << 1) | 1] +
                   P[(i << 1) | 1] * Q[(i << 1) | 0]) *
                  inv2;
         }
       }
       swap(P, S);
       swap(Q, T);
       k >>= 1;
       if (k < N) break;
       P.ntt_doubling();
       Q.ntt_doubling();
     }
     P.intt();
     Q.intt();
     return ret + (P * (Q.inv()))[k];
   }
 }
 
 template <typename mint>
 mint kitamasa(long long N, FormalPowerSeries<mint> Q,
               FormalPowerSeries<mint> a) {
   assert(!Q.empty() && Q[0] != 0);
   if(N < (int)a.size()) return a[N];
   assert((int)a.size() >= int(Q.size()) - 1);
   auto P = a.pre((int)Q.size() - 1) * Q;
   P.resize(Q.size() - 1);
   return LinearRecursionFormula<mint>(N, Q, P);
 }
 
 /**
  * @brief 線形漸化式の高速計算
  * @docs docs/fps/kitamasa.md
  */
 
 template <typename mint>
 mint nth_term(long long n, const vector<mint> &s) {
   using fps = FormalPowerSeries<mint>;
   auto bm = BerlekampMassey<mint>(s);
   return kitamasa(n, fps{begin(bm), end(bm)}, fps{begin(s), end(s)});
 }
 
 /**
  * @brief 与えられた数列の第$n$項を計算(Berlekamp-Massey/Bostan-Mori)
  */


int main(){
    ll n,t,k,l;
    cin >> n >> t >> k >> l;
    ll a = 0;
    ll b = 0;
    ll c = 0;
    for(int i=1;i<=6;i++){
        if(i < k){
            a++;
        }
        else if(k <= i && i < l){
            b++;
        }
        else{
            c++;
        }
    }
    mint p = mint(a) * mint(6).inverse();
    mint q = mint(b) * mint(6).inverse();
    mint r = mint(c) * mint(6).inverse();
    vector<mint> dp(300);
    dp[1] = 1;
    for(int i=1;i<300;i++){
        if(i+1 < 300){
            dp[i+1] += p * dp[i];
        }
        if(i+2 < 300){
            dp[i+2] += q * dp[i];
        }
        if(i+t < 300){
            dp[i+t] += r * dp[i];
        }
    }
    mint ans = nth_term(n,dp);
    cout << ans;
}
0