#pragma GCC target("avx2") #pragma GCC optimize("O3") #pragma GCC optimize("unroll-loops") #include template struct montgomery_modint { using Self = montgomery_modint; using i32 = int32_t; using u32 = uint32_t; using u64 = uint64_t; static constexpr u32 get_r() { u32 res = M; for(int i = 0; i < 4; i++) res *= 2 - M * res; return res; } static constexpr u32 reduce(u64 a) { return (a + u64(u32(a) * u32(-r)) * M) >> 32; } static constexpr u32 r = get_r(); static constexpr u32 n2 = -u64(M) % M; u32 a; constexpr montgomery_modint() : a(0) {} constexpr montgomery_modint(int64_t a) : a(reduce(u64(a % M + M) * n2)) {} constexpr u32 val() const { u32 res = reduce(a); return res >= M ? res - M : res; } constexpr Self pow(u64 r) const { Self ans(1); Self aa = *this; while(r) { if(r & 1) ans *= aa; aa *= aa; r >>= 1; } return ans; } constexpr Self inv() const { return this->pow(M - 2); } constexpr Self& operator+=(const Self& r) { if(i32(a += r.a - 2 * M) < 0) a += 2 * M; return *this; } constexpr Self& operator-=(const Self& r) { if(i32(a -= r.a) < 0) a += 2 * M; return *this; } constexpr Self& operator*=(const Self& r) { a = reduce(u64(a) * r.a); return *this; } constexpr Self& operator/=(const Self& r) { *this *= r.inv(); return *this; } constexpr Self operator+(const Self r) const { return Self(*this) += r; } constexpr Self operator-(const Self r) const { return Self(*this) -= r; } constexpr Self operator-() const { return Self() - Self(*this); } constexpr Self operator*(const Self r) const { return Self(*this) *= r; } constexpr Self operator/(const Self r) const { return Self(*this) /= r; } constexpr bool operator==(const Self& r) const { return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a); } constexpr bool operator!=(const Self& r) const { return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a); } }; template std::ostream& operator<<(std::ostream& os, const montgomery_modint& m) { return os << m.val(); } template std::istream& operator>>(std::istream& is, montgomery_modint& m) { int64_t t; is >> t; m = montgomery_modint(t); return is; } template using modint = montgomery_modint; #include using namespace std; using i64 = long long; constexpr i64 NTT_PRIMES[][2] = { {1224736769, 3}, // 2^24 * 73 + 1, {1053818881, 7}, // 2^20 * 3 * 5 * 67 + 1 {1051721729, 6}, // 2^20 * 17 * 59 + 1 {1045430273, 3}, // 2^20 * 997 + 1 {1012924417, 5}, // 2^21 * 3 * 7 * 23 + 1 {1007681537, 3}, // 2^20 * 31^2 + 1 {1004535809, 3}, // 2^21 * 479 + 1 {998244353, 3}, // 2^23 * 7 * 17 + 1 {985661441, 3}, // 2^22 * 5 * 47 + 1 {976224257, 3}, // 2^20 * 7^2 * 19 + 1 {975175681, 17}, // 2^21 * 3 * 5 * 31 + 1 {962592769, 7}, // 2^21 * 3^3 * 17 + 1 {950009857, 7}, // 2^21 * 4 * 151 + 1 {943718401, 7}, // 2^22 * 3^2 * 5^2 + 1 {935329793, 3}, // 2^22 * 223 + 1 {924844033, 5}, // 2^21 * 3^2 * 7^2 + 1 {469762049, 3}, // 2^26 * 7 + 1 {167772161, 3}, // 2^25 * 5 + 1 }; template vector> number_theoretic_transform4(vector> a) { i64 n = a.size(); vector> b(a.size()); auto unit_i = modint(primitive).pow((mod - 1) / 4); for(i64 s = 1, m = n; s < n; s <<= 1, m >>= 1) { if(m == 2) { for(i64 j = 0;j < s;j++) { auto x = a[j + 0]; auto y = a[j + s]; b[j + 0] = x + y; b[j + s] = x - y; } } else { modint zi1 = 1; modint zi2 = 1; modint zi3 = 1; i64 m1 = m >> 2; i64 m2 = m >> 1; i64 m3 = m1 | m2; modint zeta = modint(primitive).pow((mod - 1) / m); for(i64 i = 0;i < m1;i++) { for(i64 j = 0;j < s;j++) { auto w = a[j + s * (i + 0)]; auto x = a[j + s * (i + m1)]; auto y = a[j + s * (i + m2)]; auto z = a[j + s * (i + m3)]; auto wy1 = w + y; auto wy2 = w - y; auto xz1 = x + z; auto xz2 = (x - z) * unit_i; b[j + s * (4 * i + 0)] = wy1 + xz1; b[j + s * (4 * i + 1)] = (wy2 + xz2) * zi1; b[j + s * (4 * i + 2)] = (wy1 - xz1) * zi2; b[j + s * (4 * i + 3)] = (wy2 - xz2) * zi3; } zi1 = zi1 * zeta; zi2 = zi1 * zi1; zi3 = zi1 * zi2; } s <<= 1; m >>= 1; } swap(a, b); } return a; } template vector> inverse_number_theoretic_transform4(vector> a) { i64 n = a.size(); vector> b(a.size()); auto unit_i = modint(primitive).pow((mod - 1) / 4).inv(); i64 s = n; i64 m = 1; if(__builtin_ctzll(n) & 1) { s >>= 1; m <<= 1; for(i64 j = 0;j < s;j++) { auto x = a[j + 0]; auto y = a[j + s]; b[j + 0] = x + y; b[j + s] = x - y; } swap(a, b); } for(; s >>= 2, m <<= 2, s >= 1;) { { modint zi1 = 1; modint zi2 = 1; modint zi3 = 1; i64 m1 = m >> 2; i64 m2 = m >> 1; i64 m3 = m1 | m2; modint zeta = modint(primitive).pow((mod - 1) / m).inv(); for(i64 i = 0;i < m1;i++) { for(i64 j = 0;j < s;j++) { auto w = a[j + s * (4 * i + 0)]; auto x = a[j + s * (4 * i + 1)] * zi1; auto y = a[j + s * (4 * i + 2)] * zi2; auto z = a[j + s * (4 * i + 3)] * zi3; auto wy1 = w + y; auto wy2 = x + z; auto xz1 = w - y; auto xz2 = (x - z) * unit_i; b[j + s * (i + 0)] = wy1 + wy2; b[j + s * (i + m1)] = xz1 + xz2; b[j + s * (i + m2)] = wy1 - wy2; b[j + s * (i + m3)] = xz1 - xz2; } zi1 = zi1 * zeta; zi2 = zi1 * zi1; zi3 = zi1 * zi2; } } swap(a, b); } auto inv_n = modint(n).pow(mod - 2); for(int i = 0;i < n;i++) a[i] *= inv_n; return a; } template struct fps_ntt_multiply { using fps_type = modint; using conv_type = modint; static std::vector dft(std::vector arr) { return number_theoretic_transform4(std::move(arr)); } static std::vector idft(std::vector arr) { return inverse_number_theoretic_transform4(std::move(arr)); } static std::vector multiply(std::vector a, const std::vector& b) { for(int i = 0;i < a.size();i++) a[i] *= b[i]; return a; } static std::vector self_multiply(std::vector a) { for(int i = 0;i < a.size();i++) a[i] *= a[i]; return a; } }; #include using i64 = long long; std::size_t bound_pow2(std::size_t sz) { return 1ll << (__lg(sz - 1) + 1); } template struct FPS { std::vector coef; FPS(std::vector arr): coef(std::move(arr)) {} size_t size() const { return coef.size(); } void bound_resize() { this->coef.resize(bound_pow2(this->size())); } const T& operator[](int i) const { return coef[i]; } T & operator[](int i) { return coef[i]; } FPS pre(int n) const { std::vector nex(n, T(0)); for(int i = 0;i < coef.size() && i < n; i++) nex[i] = coef[i]; return FPS(nex); } // F(0) must not be 0 FPS inv() const { FPS g = FPS(std::vector{ T(1) / (*this)[0] }); i64 n = bound_pow2(this->size()); for(int i = 1;i < n;i <<= 1) { g = g.pre(i << 1); auto gdft = fps_multiply::dft(g.coef); auto e = fps_multiply::idft( fps_multiply::multiply( fps_multiply::dft(this->pre(i << 1).coef), gdft ) ); for(int j = 0;j < i;j++) { e[j] = T(0); e[j + i] = e[j + i] * T(-1); } auto f = fps_multiply::idft( fps_multiply::multiply( fps_multiply::dft(e), std::move(gdft) ) ); for(int j = 0;j < i;j++) { f[j] = g[j]; } g.coef = std::move(f); } return g.pre(n); } FPS diff() const { FPS res(std::vector(this->size() - 1, T(0))); for(i64 i = 1;i < this->size();i++) res[i - 1] = coef[i] * T(i); return res; } FPS integral() const { FPS res(std::vector(this->size() + 1, T(0))); for(i64 i = 0;i < this->size();i++) res[i + 1] = coef[i] / T(i + 1); return res; } // F(0) must be 0 FPS log() const { return (this->diff() * this->inv()).integral().pre(this->size()); } FPS exp() const { FPS f(std::vector{ T(1) }); FPS g = *this; g.bound_resize(); g[0] += T(1); for(i64 i = 1;i < size();i <<= 1 ) { f = (f * (g.pre(i << 1) - f.pre(i << 1).log())).pre(i << 1); } return f; } FPS& operator+=(const FPS& rhs) { i64 n = std::max(this->size(), rhs.size()); this->coef.resize(n, T(0)); for(int i = 0;i < rhs.size();i++) this->coef[i] += rhs[i]; return *this; } FPS& operator-=(const FPS& rhs) { i64 n = std::max(this->size(), rhs.size()); this->coef.resize(n, T(0)); for(int i = 0;i < rhs.size();i++) this->coef[i] -= rhs[i]; return *this; } FPS operator+(const FPS& rhs) const { return (*this) += rhs; } FPS operator-(const FPS& rhs) const { return (*this) -= rhs; } FPS operator*(const FPS& rhs) { i64 m = this->size() + rhs.size() - 1; i64 n = bound_pow2(m); auto res = fps_multiply::idft( fps_multiply::multiply( fps_multiply::dft(this->pre(n).coef), fps_multiply::dft(rhs.pre(n).coef) ) ); res.resize(m); return res; } }; #include using i64 = long long; template std::vector fast_kitamasa(std::vector c, i64 n) { using fps = FPS; i64 k = c.size() - 1; fps cf(std::move(c)); fps ic = cf.inv().pre(k - 1); i64 c_len = bound_pow2(k - 1 + cf.size() - 1); i64 ic_len = bound_pow2(k - 1 + ic.size() - 1); auto dc = FM::dft(cf.pre(c_len).coef); auto dic = FM::dft(ic.pre(ic_len).coef); i64 b_len = bound_pow2(k); std::vector b(b_len, F(0)); b[k - 1] = F(1); i64 m = bound_pow2(n); while(m) { b.resize(b_len * 2, F(0)); auto bt = FM::dft(std::move(b)); bt = FM::self_multiply(std::move(bt)); fps beta(FM::idft(std::move(bt))); // let q = (beta.clone().pre(k - 1) * ic.clone()).pre(k - 1); auto dbeta = FM::dft(beta.pre(k - 1).pre(ic_len).coef); auto q = FM::idft(FM::multiply(std::move(dbeta), dic)); q.resize(c_len); for(int i = k - 1; i < ic_len; i++) q[i] = F(0); //beta -= cf * q; fps cfq(FM::idft(FM::multiply(FM::dft(std::move(q)), dc))); cfq.coef.resize(k - 1 + cf.size() - 1); beta -= cfq; b.assign(b_len * 2, F(0)); for(int i = k - 1; i < 2 * k - 1; i++) { b[i - (k - 1)] = beta[i]; } if(m & n) { F freq = b[0]; for(int i = 0; i < k - 1; i++) { b[i] = b[i + 1] + freq * cf[i + 1]; } b[k - 1] = freq * cf[k]; } m >>= 1; } b.resize(k); return b; } #include template std::vector berlekamp_massey(const std::vector& s) { int n = s.size(); std::vector b { F(1) }; std::vector c { F(1) }; F y(1); int shift = 0; for(int len = 0; len < n; len++) { shift++; F x(0); for(int i = 0; i < c.size(); i++) { x += c[i] * s[len - i]; } if(x == F(0)) { continue; } std::vector old_c = c; F freq = x / y; c.resize(std::max(c.size(), b.size() + shift), F(0)); for(int i = 0; i < b.size(); i++) { c[i + shift] -= freq * b[i]; } if(old_c.size() < c.size()) { b = std::move(old_c); y = x; shift = 0; } } std::vector ans(c.size() - 1); for(int i = 1; i < c.size(); i++) { ans[i - 1] = -c[i]; } return ans; } #include template std::vector find_minimal_polynomial(const std::vector& a) { std::vector c = berlekamp_massey(a); c.insert(c.begin(), -F(1)); return c; } template std::vector find_minimal_polynomial_from_vector(int n, const std::vector>& a, NonZeroRandGen rnd) { std::vector u(n); for(int i = 0; i < n; i++) u[i] = rnd(); std::vector b(a.size(), F(0)); for(int i = 0; i < a.size(); i++) { for(int j = 0; j < n; j++) { b[i] += a[i][j] * u[j]; } } return find_minimal_polynomial(b); } template std::vector find_minimal_polynomial_from_dense_matrix_pow_b(const std::vector>& a, std::vector b, NonZeroRandGen rnd) { int n = a.size(); std::vector bf; std::vector u(n); for(int i = 0; i < n; i++) u[i] = rnd(); std::vector c(n * 2); for(int i = 0; i < 2 * n; i++) { for(int j = 0; j < n; j++) { c[i] += b[j] * u[j]; } if(i + 1 < 2 * n) { bf = b; for(int j = 0; j < n; j++) { b[j] = F(0); for(int k = 0; k < n; k++) { b[j] += a[j][k] * bf[k]; } } } } return find_minimal_polynomial(c); } // fast for dense matrix template std::vector find_minimal_polynomial_from_dense_matrix_pow(const std::vector>& a, NonZeroRandGen rnd) { int n = a.size(); std::vector b(n); for(int i = 0; i < n; i++) b[i] = rnd(); return find_minimal_polynomial_from_dense_matrix_pow_b(a, std::move(b), rnd); } #include template std::vector find_minimal_polynomial_from_sparse_matrix_pow_b(const std::vector>& a, std::vector b, int n, NonZeroRandGen rnd) { std::vector bf; std::vector u(n); for(int i = 0; i < n; i++) u[i] = rnd(); std::vector c(n * 2); for(int i = 0; i < 2 * n; i++) { for(int j = 0; j < n; j++) { c[i] += b[j] * u[j]; } if(i + 1 < 2 * n) { bf = b; for(int j = 0; j < n; j++) { b[j] = F(0); } for(auto& [j, k, v]: a) { b[j] += v * bf[k]; } } } return find_minimal_polynomial(c); } template std::vector find_minimal_polynomial_from_sparse_matrix_pow(const std::vector>& a, int n, NonZeroRandGen rnd) { std::vector b(n); for(int i = 0; i < n; i++) b[i] = rnd(); return find_minimal_polynomial_from_sparse_matrix_pow_b(a, std::move(b), n, rnd); } template std::vector bbla_dense_matrix_pow(const std::vector>& a, std::vector b, long long r, NonZeroRandGen rnd) { int n = a.size(); auto c = find_minimal_polynomial_from_dense_matrix_pow_b(a, b, rnd); auto d = fast_kitamasa(std::move(c), r); std::vector ans(n); std::vector bf; for(int i = 0; i < d.size(); i++) { for(int j = 0; j < n; j++) { ans[j] += d[d.size() - 1 - i] * b[j]; } if(i + 1 < d.size()) { bf = b; for(int j = 0; j < n; j++) { b[j] = F(0); for(int k = 0; k < n; k++) { b[j] += a[j][k] * bf[k]; } } } } return ans; } template std::vector bbla_sparse_matrix_pow(const std::vector>& a, std::vector b, long long r, NonZeroRandGen rnd) { int n = b.size(); auto c = find_minimal_polynomial_from_sparse_matrix_pow_b(a, b, n, rnd); auto d = fast_kitamasa(std::move(c), r); std::vector ans(n); std::vector bf; for(int i = 0; i < d.size(); i++) { for(int j = 0; j < n; j++) { ans[j] += d[d.size() - 1 - i] * b[j]; } if(i + 1 < d.size()) { bf = b; for(int j = 0; j < n; j++) { b[j] = F(0); } for(auto& [j, k, v]: a) { b[j] += v * bf[k]; } } } return ans; } #include using fp = modint<998244353>; constexpr fp li[] {fp(863303859),fp(339390315),fp(979599571),fp(199714654),fp(301357173),fp(400865054),fp(770490863),fp(356820773),fp(894972774),fp(967324291),fp(166944925),fp(654393003),fp(304140790),fp(20830991),fp(847528850),fp(156976274),fp(547185625),fp(621028859),fp(844422675),fp(326938261),fp(902391593),fp(397750615),fp(433030622),fp(244536178),fp(375895659),fp(902822037),fp(909826671),fp(718623378),fp(114518526),fp(347491628),fp(80068807),fp(269866676),fp(671564431),fp(352910149),fp(451609918),fp(726700835),fp(167580922),fp(40120952),fp(642222159),fp(738419087),fp(588793680),fp(624451442),fp(642541577),fp(314409044),fp(57883979),fp(87180399),fp(285373494),fp(813291789),fp(936336163),fp(285787459),fp(651754948),fp(899691323),fp(35048108),fp(297454293),fp(291950455),fp(377885604),fp(722386864),fp(676329426),fp(149479947),fp(13963991),fp(757101185),fp(940125946),fp(83989251),fp(685331496),fp(83323677),fp(260532830),fp(926688801),fp(262319285),fp(693808883),fp(944267560),fp(462457327),fp(206130177),fp(995396560),fp(577713216),fp(365155728),fp(579784548),fp(693449315),fp(8335689),fp(473361466),fp(41379181),fp(162849967),fp(365513965),fp(744881959),fp(925032280),fp(754083979),fp(696698057),fp(738556167),fp(5650271),fp(199816497),fp(947029884),fp(42818675),fp(856515444),fp(16191429),fp(297068619),fp(134930396),fp(375623842),fp(606645976),fp(246526416),fp(497437038),fp(928693228),fp(946582662),fp(862847352),fp(358513338),fp(463498435),fp(519603693),fp(991973628),fp(937271185),fp(275295290),fp(866218055),fp(536205886),fp(769241787),fp(322515550),fp(463584342),fp(747948043),fp(671365378),fp(373253000),fp(77410578),fp(387956371),fp(379285769),fp(696147192),fp(788593759),fp(70240204),fp(174654106),fp(435442880),fp(558134876)}; int main() { using multi = fps_ntt_multiply<998244353, 3>; cin.tie(nullptr); std::ios::sync_with_stdio(false); i64 N, k; cin >> N >> k; std::vector> a(k * k * k, std::vector(k * k * k)); std::vector b(k * k * k); b[0] = fp(1); for(int i = 0;i < k;i++) { for(int j = 0;j < k;j++) { for(int l = 0;l < k;l++) { a[(i+1)%k*k*k+j*k+l][i*k*k+j*k+l]++; a[i*k*k+(j+i)%k*k+l][i*k*k+j*k+l]++; a[i*k*k+j*k+(l+j)%k][i*k*k+j*k+l]++; } } } std::vector l { fp(0), fp(1), fp(2) }; std::vector> A; A.reserve(370); int sum = 0; for(int i = 0; i < a.size(); i++) { for(int j = 0; j < a.size(); j++) { if(a[i][j]) { A.push_back( { i, j, l[a[i][j]] } ); } } } int cnt = 0; auto rnd = [&]() { return li[cnt++]; }; auto res = bbla_sparse_matrix_pow(A, b, N, rnd); fp ans; for(int i = 0; i < k; i++) { for(int j = 0; j < k; j++) { ans += res[i * k * k + j * k]; } } cout << ans << endl; }