結果
| 問題 |
No.840 ほむほむほむら
|
| コンテスト | |
| ユーザー |
|
| 提出日時 | 2022-04-08 12:23:18 |
| 言語 | C++17(gcc12) (gcc 12.3.0 + boost 1.87.0) |
| 結果 |
AC
|
| 実行時間 | 16 ms / 4,000 ms |
| コード長 | 19,626 bytes |
| コンパイル時間 | 7,937 ms |
| コンパイル使用メモリ | 157,016 KB |
| 最終ジャッジ日時 | 2025-01-28 15:42:55 |
|
ジャッジサーバーID (参考情報) |
judge4 / judge5 |
(要ログイン)
| ファイルパターン | 結果 |
|---|---|
| sample | AC * 3 |
| other | AC * 25 |
ソースコード
#include <iostream>
template<uint32_t M>
struct montgomery_modint {
using Self = montgomery_modint<M>;
using i32 = int32_t;
using u32 = uint32_t;
using u64 = uint64_t;
static constexpr u32 get_r() {
u32 res = M;
for(int i = 0; i < 4; i++) res *= 2 - M * res;
return res;
}
static constexpr u32 reduce(u64 a) {
return (a + u64(u32(a) * u32(-r)) * M) >> 32;
}
static constexpr u32 r = get_r();
static constexpr u32 n2 = -u64(M) % M;
u32 a;
constexpr montgomery_modint() : a(0) {}
constexpr montgomery_modint(int64_t a) : a(reduce(u64(a % M + M) * n2)) {}
constexpr u32 val() const {
u32 res = reduce(a);
return res >= M ? res - M : res;
}
constexpr Self pow(u64 r) const {
Self ans(1); Self aa = *this;
while(r) { if(r & 1) ans *= aa; aa *= aa; r >>= 1; }
return ans;
}
constexpr Self inv() const { return this->pow(M - 2); }
constexpr Self& operator+=(const Self& r) {
if(i32(a += r.a - 2 * M) < 0) a += 2 * M;
return *this;
}
constexpr Self& operator-=(const Self& r) {
if(i32(a -= r.a) < 0) a += 2 * M;
return *this;
}
constexpr Self& operator*=(const Self& r) {
a = reduce(u64(a) * r.a);
return *this;
}
constexpr Self& operator/=(const Self& r) {
*this *= r.inv();
return *this;
}
constexpr Self operator+(const Self r) const { return Self(*this) += r; }
constexpr Self operator-(const Self r) const { return Self(*this) -= r; }
constexpr Self operator-() const { return Self() - Self(*this); }
constexpr Self operator*(const Self r) const { return Self(*this) *= r; }
constexpr Self operator/(const Self r) const { return Self(*this) /= r; }
constexpr bool operator==(const Self& r) const {
return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a);
}
constexpr bool operator!=(const Self& r) const {
return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a);
}
};
template<uint32_t M>
std::ostream& operator<<(std::ostream& os, const montgomery_modint<M>& m) {
return os << m.val();
}
template<uint32_t M>
std::istream& operator>>(std::istream& is, montgomery_modint<M>& m) {
int64_t t;
is >> t;
m = montgomery_modint<M>(t);
return is;
}
template<uint32_t mod>
using modint = montgomery_modint<mod>;
#include <vector>
using namespace std;
using i64 = long long;
constexpr i64 NTT_PRIMES[][2] = {
{1224736769, 3}, // 2^24 * 73 + 1,
{1053818881, 7}, // 2^20 * 3 * 5 * 67 + 1
{1051721729, 6}, // 2^20 * 17 * 59 + 1
{1045430273, 3}, // 2^20 * 997 + 1
{1012924417, 5}, // 2^21 * 3 * 7 * 23 + 1
{1007681537, 3}, // 2^20 * 31^2 + 1
{1004535809, 3}, // 2^21 * 479 + 1
{998244353, 3}, // 2^23 * 7 * 17 + 1
{985661441, 3}, // 2^22 * 5 * 47 + 1
{976224257, 3}, // 2^20 * 7^2 * 19 + 1
{975175681, 17}, // 2^21 * 3 * 5 * 31 + 1
{962592769, 7}, // 2^21 * 3^3 * 17 + 1
{950009857, 7}, // 2^21 * 4 * 151 + 1
{943718401, 7}, // 2^22 * 3^2 * 5^2 + 1
{935329793, 3}, // 2^22 * 223 + 1
{924844033, 5}, // 2^21 * 3^2 * 7^2 + 1
{469762049, 3}, // 2^26 * 7 + 1
{167772161, 3}, // 2^25 * 5 + 1
};
template<const i64 mod, const i64 primitive>
vector<modint<mod>> number_theoretic_transform4(vector<modint<mod>> a) {
i64 n = a.size();
vector<modint<mod>> b(a.size());
auto unit_i = modint<mod>(primitive).pow((mod - 1) / 4);
for(i64 s = 1, m = n; s < n; s <<= 1, m >>= 1) {
if(m == 2) {
for(i64 j = 0;j < s;j++) {
auto x = a[j + 0];
auto y = a[j + s];
b[j + 0] = x + y;
b[j + s] = x - y;
}
}
else {
modint<mod> zi1 = 1;
modint<mod> zi2 = 1;
modint<mod> zi3 = 1;
i64 m1 = m >> 2;
i64 m2 = m >> 1;
i64 m3 = m1 | m2;
modint<mod> zeta = modint<mod>(primitive).pow((mod - 1) / m);
for(i64 i = 0;i < m1;i++) {
for(i64 j = 0;j < s;j++) {
auto w = a[j + s * (i + 0)];
auto x = a[j + s * (i + m1)];
auto y = a[j + s * (i + m2)];
auto z = a[j + s * (i + m3)];
auto wy1 = w + y;
auto wy2 = w - y;
auto xz1 = x + z;
auto xz2 = (x - z) * unit_i;
b[j + s * (4 * i + 0)] = wy1 + xz1;
b[j + s * (4 * i + 1)] = (wy2 + xz2) * zi1;
b[j + s * (4 * i + 2)] = (wy1 - xz1) * zi2;
b[j + s * (4 * i + 3)] = (wy2 - xz2) * zi3;
}
zi1 = zi1 * zeta;
zi2 = zi1 * zi1;
zi3 = zi1 * zi2;
}
s <<= 1;
m >>= 1;
}
swap(a, b);
}
return a;
}
template<const i64 mod, const i64 primitive>
vector<modint<mod>> inverse_number_theoretic_transform4(vector<modint<mod>> a) {
i64 n = a.size();
vector<modint<mod>> b(a.size());
auto unit_i = modint<mod>(primitive).pow((mod - 1) / 4).inv();
i64 s = n;
i64 m = 1;
if(__builtin_ctzll(n) & 1) {
s >>= 1;
m <<= 1;
for(i64 j = 0;j < s;j++) {
auto x = a[j + 0];
auto y = a[j + s];
b[j + 0] = x + y;
b[j + s] = x - y;
}
swap(a, b);
}
for(; s >>= 2, m <<= 2, s >= 1;) {
{
modint<mod> zi1 = 1;
modint<mod> zi2 = 1;
modint<mod> zi3 = 1;
i64 m1 = m >> 2;
i64 m2 = m >> 1;
i64 m3 = m1 | m2;
modint<mod> zeta = modint<mod>(primitive).pow((mod - 1) / m).inv();
for(i64 i = 0;i < m1;i++) {
for(i64 j = 0;j < s;j++) {
auto w = a[j + s * (4 * i + 0)];
auto x = a[j + s * (4 * i + 1)] * zi1;
auto y = a[j + s * (4 * i + 2)] * zi2;
auto z = a[j + s * (4 * i + 3)] * zi3;
auto wy1 = w + y;
auto wy2 = x + z;
auto xz1 = w - y;
auto xz2 = (x - z) * unit_i;
b[j + s * (i + 0)] = wy1 + wy2;
b[j + s * (i + m1)] = xz1 + xz2;
b[j + s * (i + m2)] = wy1 - wy2;
b[j + s * (i + m3)] = xz1 - xz2;
}
zi1 = zi1 * zeta;
zi2 = zi1 * zi1;
zi3 = zi1 * zi2;
}
}
swap(a, b);
}
auto inv_n = modint<mod>(n).pow(mod - 2);
for(int i = 0;i < n;i++) a[i] *= inv_n;
return a;
}
template<const i64 mod, const i64 primitive>
struct fps_ntt_multiply {
using fps_type = modint<mod>;
using conv_type = modint<mod>;
static std::vector<conv_type> dft(std::vector<fps_type> arr) {
return number_theoretic_transform4<mod, primitive>(std::move(arr));
}
static std::vector<fps_type> idft(std::vector<conv_type> arr) {
return inverse_number_theoretic_transform4<mod, primitive>(std::move(arr));
}
static std::vector<conv_type> multiply(std::vector<conv_type> a, const std::vector<conv_type>& b) {
for(int i = 0;i < a.size();i++) a[i] *= b[i];
return a;
}
static std::vector<conv_type> self_multiply(std::vector<conv_type> a) {
for(int i = 0;i < a.size();i++) a[i] *= a[i];
return a;
}
};
#include <vector>
using i64 = long long;
std::size_t bound_pow2(std::size_t sz) {
return 1ll << (__lg(sz - 1) + 1);
}
template<class T, class fps_multiply>
struct FPS {
std::vector<T> coef;
FPS(std::vector<T> arr): coef(std::move(arr)) {}
size_t size() const { return coef.size(); }
void bound_resize() {
this->coef.resize(bound_pow2(this->size()));
}
const T& operator[](int i) const {
return coef[i];
}
T & operator[](int i) { return coef[i]; }
FPS pre(int n) const {
std::vector<T> nex(n, T(0));
for(int i = 0;i < coef.size() && i < n; i++) nex[i] = coef[i];
return FPS(nex);
}
// F(0) must not be 0
FPS inv() const {
FPS g = FPS(std::vector<T>{ T(1) / (*this)[0] });
i64 n = bound_pow2(this->size());
for(int i = 1;i < n;i <<= 1) {
g = g.pre(i << 1);
auto gdft = fps_multiply::dft(g.coef);
auto e = fps_multiply::idft(
fps_multiply::multiply(
fps_multiply::dft(this->pre(i << 1).coef), gdft
)
);
for(int j = 0;j < i;j++) {
e[j] = T(0);
e[j + i] = e[j + i] * T(-1);
}
auto f = fps_multiply::idft(
fps_multiply::multiply(
fps_multiply::dft(e), std::move(gdft)
)
);
for(int j = 0;j < i;j++) {
f[j] = g[j];
}
g.coef = std::move(f);
}
return g.pre(n);
}
FPS diff() const {
FPS res(std::vector<T>(this->size() - 1, T(0)));
for(i64 i = 1;i < this->size();i++) res[i - 1] = coef[i] * T(i);
return res;
}
FPS integral() const {
FPS res(std::vector<T>(this->size() + 1, T(0)));
for(i64 i = 0;i < this->size();i++) res[i + 1] = coef[i] / T(i + 1);
return res;
}
// F(0) must be 0
FPS log() const {
return (this->diff() * this->inv()).integral().pre(this->size());
}
FPS exp() const {
FPS f(std::vector<T>{ T(1) });
FPS g = *this;
g.bound_resize();
g[0] += T(1);
for(i64 i = 1;i < size();i <<= 1 ) {
f = (f * (g.pre(i << 1) - f.pre(i << 1).log())).pre(i << 1);
}
return f;
}
FPS& operator+=(const FPS& rhs) {
i64 n = std::max(this->size(), rhs.size());
this->coef.resize(n, T(0));
for(int i = 0;i < rhs.size();i++) this->coef[i] += rhs[i];
return *this;
}
FPS& operator-=(const FPS& rhs) {
i64 n = std::max(this->size(), rhs.size());
this->coef.resize(n, T(0));
for(int i = 0;i < rhs.size();i++) this->coef[i] -= rhs[i];
return *this;
}
FPS operator+(const FPS& rhs) const {
return (*this) += rhs;
}
FPS operator-(const FPS& rhs) const {
return (*this) -= rhs;
}
FPS operator*(const FPS& rhs) {
i64 m = this->size() + rhs.size() - 1;
i64 n = bound_pow2(m);
auto res = fps_multiply::idft(
fps_multiply::multiply(
fps_multiply::dft(this->pre(n).coef), fps_multiply::dft(rhs.pre(n).coef)
)
);
res.resize(m);
return res;
}
};
#include <vector>
using i64 = long long;
template<class F, class FM>
std::vector<F> fast_kitamasa(std::vector<F> c, i64 n) {
using fps = FPS<F, FM>;
i64 k = c.size() - 1;
fps cf(std::move(c));
fps ic = cf.inv().pre(k - 1);
i64 c_len = bound_pow2(k - 1 + cf.size() - 1);
i64 ic_len = bound_pow2(k - 1 + ic.size() - 1);
auto dc = FM::dft(cf.pre(c_len).coef);
auto dic = FM::dft(ic.pre(ic_len).coef);
i64 b_len = bound_pow2(k);
std::vector<F> b(b_len, F(0));
b[k - 1] = F(1);
i64 m = bound_pow2(n);
while(m) {
b.resize(b_len * 2, F(0));
auto bt = FM::dft(std::move(b));
bt = FM::self_multiply(std::move(bt));
fps beta(FM::idft(std::move(bt)));
// let q = (beta.clone().pre(k - 1) * ic.clone()).pre(k - 1);
auto dbeta = FM::dft(beta.pre(k - 1).pre(ic_len).coef);
auto q = FM::idft(FM::multiply(std::move(dbeta), dic));
q.resize(c_len);
for(int i = k - 1; i < ic_len; i++) q[i] = F(0);
//beta -= cf * q;
fps cfq(FM::idft(FM::multiply(FM::dft(std::move(q)), dc)));
cfq.coef.resize(k - 1 + cf.size() - 1);
beta -= cfq;
b.assign(b_len * 2, F(0));
for(int i = k - 1; i < 2 * k - 1; i++) {
b[i - (k - 1)] = beta[i];
}
if(m & n) {
F freq = b[0];
for(int i = 0; i < k - 1; i++) {
b[i] = b[i + 1] + freq * cf[i + 1];
}
b[k - 1] = freq * cf[k];
}
m >>= 1;
}
b.resize(k);
return b;
}
#include <vector>
template<class F>
std::vector<F> berlekamp_massey(const std::vector<F>& s) {
int n = s.size();
std::vector<F> b { F(1) };
std::vector<F> c { F(1) };
F y(1);
int shift = 0;
for(int len = 0; len < n; len++) {
shift++;
F x(0);
for(int i = 0; i < c.size(); i++) {
x += c[i] * s[len - i];
}
if(x == F(0)) { continue; }
std::vector<F> old_c = c;
F freq = x / y; c.resize(std::max(c.size(), b.size() + shift), F(0));
for(int i = 0; i < b.size(); i++) {
c[i + shift] -= freq * b[i];
}
if(old_c.size() < c.size()) {
b = std::move(old_c);
y = x;
shift = 0;
}
}
std::vector<F> ans(c.size() - 1);
for(int i = 1; i < c.size(); i++) {
ans[i - 1] = -c[i];
}
return ans;
}
#include <vector>
template<class F>
std::vector<F> find_minimal_polynomial(const std::vector<F>& a) {
std::vector<F> c = berlekamp_massey(a);
c.insert(c.begin(), -F(1));
return c;
}
template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_vector(int n, const std::vector<std::vector<F>>& a, NonZeroRandGen rnd) {
std::vector<F> u(n);
for(int i = 0; i < n; i++) u[i] = rnd();
std::vector<F> b(a.size(), F(0));
for(int i = 0; i < a.size(); i++) {
for(int j = 0; j < n; j++) {
b[i] += a[i][j] * u[j];
}
}
return find_minimal_polynomial(b);
}
template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_dense_matrix_pow_b(const std::vector<std::vector<F>>& a, std::vector<F> b, NonZeroRandGen rnd) {
int n = a.size();
std::vector<F> bf;
std::vector<F> u(n);
for(int i = 0; i < n; i++) u[i] = rnd();
std::vector<F> c(n * 2);
for(int i = 0; i < 2 * n; i++) {
for(int j = 0; j < n; j++) {
c[i] += b[j] * u[j];
}
if(i + 1 < 2 * n) {
bf = b;
for(int j = 0; j < n; j++) {
b[j] = F(0);
for(int k = 0; k < n; k++) {
b[j] += a[j][k] * bf[k];
}
}
}
}
return find_minimal_polynomial(c);
}
// fast for dense matrix
template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_dense_matrix_pow(const std::vector<std::vector<F>>& a, NonZeroRandGen rnd) {
int n = a.size();
std::vector<F> b(n);
for(int i = 0; i < n; i++) b[i] = rnd();
return find_minimal_polynomial_from_dense_matrix_pow_b(a, std::move(b), rnd);
}
#include <tuple>
template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_sparse_matrix_pow(const std::vector<std::tuple<int, int, F>>& a, int n, NonZeroRandGen rnd) {
std::vector<F> b(n);
std::vector<F> bf;
for(int i = 0; i < n; i++) b[i] = rnd();
std::vector<F> u(n);
for(int i = 0; i < n; i++) u[i] = rnd();
std::vector<F> c(n * 2);
for(int i = 0; i < 2 * n; i++) {
for(int j = 0; j < n; j++) {
c[i] += b[j] * u[j];
}
if(i + 1 < 2 * n) {
bf = b;
for(int j = 0; j < n; j++) {
b[j] = F(0);
}
for(auto& [j, k, v]: a) {
b[j] += v * bf[k];
}
}
}
return find_minimal_polynomial(c);
}
template<class F, class FM, class NonZeroRandGen>
std::vector<F> bbla_matrix_pow(const std::vector<std::vector<F>>& a, std::vector<F> b, long long r, NonZeroRandGen rnd) {
int n = a.size();
auto c = find_minimal_polynomial_from_dense_matrix_pow_b(a, b, rnd);
auto d = fast_kitamasa<F, FM>(std::move(c), r);
std::vector<F> ans(n);
std::vector<F> bf;
for(int i = 0; i < d.size(); i++) {
for(int j = 0; j < n; j++) {
ans[j] += d[d.size() - 1 - i] * b[j];
}
if(i + 1 < d.size()) {
bf = b;
for(int j = 0; j < n; j++) {
b[j] = F(0);
for(int k = 0; k < n; k++) {
b[j] += a[j][k] * bf[k];
}
}
}
}
return ans;
}
/*
#include <unistd.h>
namespace niu {
struct fastin {
static const int bufsize = 1 << 24;
char buf[bufsize];
char* iter;
fastin() {
iter = buf;
for(int t = 0, k; (k = read(STDIN_FILENO, buf + t, sizeof(buf)) - t) > 0; t += k);
}
template<class I>
fastin& operator>>(I& num) {
num = 0;
bool neg = false;
while(*iter < '+') iter++;
if(*iter == '-') { neg = true; iter++; }
else if(*iter == '+') iter++;
while(*iter >= '0') num = 10 * num + *(iter++) - '0';
if(neg) num = -num;
return *this;
}
} fin;
}
*/
using namespace std;
using i64 = long long;
i64 pow_mod(i64 x, i64 r, i64 mod) {
i64 ans = 1;
while(r) {
if(r & 1) ans = (ans * x) % mod;
r >>= 1;
x = x * x % mod;
}
return ans;
}
i64 inv_mod(i64 x, i64 mod) {
return pow_mod(x, mod - 2, mod);
}
i64 garner(const vector<i64> &x, vector<i64> mods, i64 mod) {
mods.emplace_back(mod);
vector<i64> coeffs(x.size() + 1, 1);
vector<i64> constants(x.size() + 1, 0);
for(i64 i = 0; i < x.size(); i++) {
i64 v = (x[i] - constants[i]) * inv_mod(coeffs[i], mods[i]) % mods[i];
if(v < 0) v += mods[i];
for(i64 j = i + 1; j < x.size() + 1; j++) {
constants[j] = (constants[j] + coeffs[j] * v) % mods[j];
coeffs[j] = (coeffs[j] * mods[i]) % mods[j];
}
}
return constants.back();
}
template<i64 M, i64... NTTis>
struct fps_multiply_arb {
using fps_type = std::vector<modint<M>>;
using conv_type = std::tuple<std::vector<modint<NTT_PRIMES[NTTis][0]>>...>;
const static std::size_t tsize = std::tuple_size<conv_type>::value;
template<i64 M2, i64 primitive>
static std::vector<modint<M2>> dft_m2(const fps_type& arr) {
std::vector<modint<M2>> res(arr.size());
for(std::size_t i = 0; i < arr.size(); i++)
res[i] = modint<M2>(arr[i].val());
return number_theoretic_transform4<M2, primitive>(std::move(res));
}
template<i64 M2, i64 primitive>
static std::vector<modint<M2>> idft_m2(std::vector<modint<M2>> arr) {
return inverse_number_theoretic_transform4<M2, primitive>(std::move(arr));
}
template<std::size_t... I>
static fps_type idft_func(std::index_sequence<I...>, conv_type arr) {
arr = std::make_tuple(idft_m2<NTT_PRIMES[NTTis][0], NTT_PRIMES[NTTis][1]>(std::get<I>(arr))...);
std::size_t len = std::get<0>(arr).size();
static std::vector<i64> primes = { NTT_PRIMES[NTTis][0]... };
fps_type res(len);
for(std::size_t i = 0; i < len; i++) {
std::vector<i64> x = { std::get<I>(arr)[i].val()... };
res[i] = modint<M>(garner(x, primes, M));
}
return std::move(res);
}
template<i64 M2>
static char mult_m2(std::vector<modint<M2>>& a, const std::vector<modint<M2>>& b) {
for(int i = 0;i < a.size();i++) a[i] *= b[i];
return 0;
}
template<std::size_t... I>
static void mult_func(std::index_sequence<I...>, conv_type& a, const conv_type& b) {
auto res = std::make_tuple(mult_m2<NTT_PRIMES[NTTis][0]>(std::get<I>(a), std::get<I>(b))...);
}
static conv_type dft(fps_type arr) {
return std::make_tuple(dft_m2<NTT_PRIMES[NTTis][0], NTT_PRIMES[NTTis][1]>(arr)...);
}
static fps_type idft(conv_type arr) {
return idft_func(std::make_index_sequence<tsize>(), std::move(arr));
}
static conv_type multiply(conv_type a, const conv_type& b) {
mult_func(std::make_index_sequence<tsize>(), a, b);
return a;
}
static conv_type self_multiply(conv_type a) {
mult_func(std::make_index_sequence<tsize>(), a, a);
return a;
}
};
#include <random>
int main() {
using fp = modint<998244353>;
using multi = fps_ntt_multiply<998244353, 3>;
cin.tie(nullptr);
std::ios::sync_with_stdio(false);
i64 N, k;
cin >> N >> k;
std::vector<std::vector<int>> a(k * k * k, std::vector<int>(k * k * k));
std::vector<fp> b(k * k * k);
b[0] = fp(1);
for(int i = 0;i < k;i++) {
for(int j = 0;j < k;j++) {
for(int l = 0;l < k;l++) {
a[(i+1)%k*k*k+j*k+l][i*k*k+j*k+l]++;
a[i*k*k+(j+i)%k*k+l][i*k*k+j*k+l]++;
a[i*k*k+j*k+(l+j)%k][i*k*k+j*k+l]++;
}
}
}
std::vector<std::vector<fp>> A(k * k * k, std::vector<fp>(k * k * k));
for(int i = 0; i < A.size(); i++) {
for(int j = 0; j < A.size(); j++) {
A[i][j] = fp(a[i][j]);
}
}
std::mt19937 mt(91);
std::uniform_int_distribution dist(1, 998244352);
auto rnd = [&]() {
return fp(dist(mt));
};
auto res = bbla_matrix_pow<fp, multi>(A, b, N, rnd);
fp ans;
for(int i = 0; i < k; i++) {
for(int j = 0; j < k; j++) {
ans += res[i * k * k + j * k];
}
}
cout << ans << endl;
}