#752293 (C++17(gcc12)) No.840 ほむほむほむら

提出ソース
結果

問題	No.840 ほむほむほむら
コンテスト
ユーザー	niuez
提出日時	2022-04-08 12:23:18
言語	C++17(gcc12) (gcc 12.4.0 + boost 1.89.0) コンパイル: `g++-12 -O2 -lm -std=c++17 -Wuninitialized -DONLINE_JUDGE -o a.out _filename_` 実行: `./a.out`
結果	AC
実行時間	16 ms / 4,000 ms
コード長	19,626 bytes
記録記録タグの例: 初AC ショートコード純ショートコード純主流ショートコード最速実行時間
コンパイル時間	7,937 ms
コンパイル使用メモリ	157,016 KB
最終ジャッジ日時	2025-01-28 15:42:55
ジャッジサーバーID （参考情報）	judge4 / judge5
このコードへのチャレンジ
（要ログイン）
ファイルパターン	結果
sample	AC * 3
other	AC * 25
権限があれば一括ダウンロードができます
ソースコード

raw source code
#include <iostream>

template<uint32_t M>
struct montgomery_modint {
  using Self = montgomery_modint<M>;
  using i32 = int32_t;
  using u32 = uint32_t;
  using u64 = uint64_t;

  static constexpr u32 get_r() {
    u32 res = M;
    for(int i = 0; i < 4; i++) res *= 2 - M * res;
    return res;
  }
  static constexpr u32 reduce(u64 a) {
    return (a + u64(u32(a) * u32(-r)) * M) >> 32;
  }

  static constexpr u32 r = get_r();
  static constexpr u32 n2 = -u64(M) % M;

  u32 a;

  constexpr montgomery_modint() : a(0) {}
  constexpr montgomery_modint(int64_t a) : a(reduce(u64(a % M + M) * n2)) {}

  constexpr u32 val() const {
    u32 res = reduce(a);
    return res >= M ? res - M : res;
  }
  constexpr Self pow(u64 r) const {
    Self ans(1); Self aa = *this;
    while(r) { if(r & 1) ans *= aa; aa *= aa; r >>= 1; }
    return ans;
  }
  constexpr Self inv() const { return this->pow(M - 2); }
  constexpr Self& operator+=(const Self& r) {
    if(i32(a += r.a - 2 * M) < 0) a += 2 * M;
    return *this;
  }
  constexpr Self& operator-=(const Self& r) {
    if(i32(a -= r.a) < 0) a += 2 * M;
    return *this;
  }
  constexpr Self& operator*=(const Self& r) {
    a = reduce(u64(a) * r.a);
    return *this;
  }
  constexpr Self& operator/=(const Self& r) {
    *this *= r.inv();
    return *this;
  }
  constexpr Self operator+(const Self r) const { return Self(*this) += r; }
  constexpr Self operator-(const Self r) const { return Self(*this) -= r; }
  constexpr Self operator-() const { return Self() - Self(*this); }
  constexpr Self operator*(const Self r) const { return Self(*this) *= r; }
  constexpr Self operator/(const Self r) const { return Self(*this) /= r; }
  constexpr bool operator==(const Self& r) const {
    return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a);
  }
  constexpr bool operator!=(const Self& r) const {
    return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a);
  }
};

template<uint32_t M>
std::ostream& operator<<(std::ostream& os, const montgomery_modint<M>& m) {
  return os << m.val();
}
template<uint32_t M>
std::istream& operator>>(std::istream& is, montgomery_modint<M>& m) {
  int64_t t;
  is >> t;
  m = montgomery_modint<M>(t);
  return is;
}

template<uint32_t mod>
using modint = montgomery_modint<mod>;


#include <vector>
using namespace std;
using i64 = long long;
constexpr i64 NTT_PRIMES[][2] = {
    {1224736769, 3}, // 2^24 * 73 + 1,
    {1053818881, 7}, // 2^20 * 3 * 5 * 67 + 1
    {1051721729, 6}, // 2^20 * 17 * 59 + 1
    {1045430273, 3}, // 2^20 * 997 + 1
    {1012924417, 5}, // 2^21 * 3 * 7 * 23 + 1
    {1007681537, 3}, // 2^20 * 31^2 + 1
    {1004535809, 3}, // 2^21 * 479 + 1
    {998244353, 3},  // 2^23 * 7 * 17 + 1
    {985661441, 3},  // 2^22 * 5 * 47 + 1
    {976224257, 3},  // 2^20 * 7^2 * 19 + 1
    {975175681, 17}, // 2^21 * 3 * 5 * 31 + 1
    {962592769, 7},  // 2^21 * 3^3 * 17 + 1
    {950009857, 7},  // 2^21 * 4 * 151 + 1
    {943718401, 7},  // 2^22 * 3^2 * 5^2 + 1
    {935329793, 3},  // 2^22 * 223 + 1
    {924844033, 5},  // 2^21 * 3^2 * 7^2 + 1
    {469762049, 3},  // 2^26 * 7 + 1
    {167772161, 3},  // 2^25 * 5 + 1
};

template<const i64 mod, const i64 primitive>
vector<modint<mod>> number_theoretic_transform4(vector<modint<mod>> a) {
  i64 n = a.size();
  vector<modint<mod>> b(a.size());
  auto unit_i = modint<mod>(primitive).pow((mod - 1) / 4);
  for(i64 s = 1, m = n; s < n; s <<= 1, m >>= 1) {
    if(m == 2) {
      for(i64 j = 0;j < s;j++) {
        auto x = a[j + 0];
        auto y = a[j + s];
        b[j + 0] = x + y;
        b[j + s] = x - y;
      }
    }
    else {
      modint<mod> zi1 = 1;
      modint<mod> zi2 = 1;
      modint<mod> zi3 = 1;
      i64 m1 = m >> 2;
      i64 m2 = m >> 1;
      i64 m3 = m1 | m2;
      modint<mod> zeta = modint<mod>(primitive).pow((mod - 1) / m);
      for(i64 i = 0;i < m1;i++) {
        for(i64 j = 0;j < s;j++) {
          auto w = a[j + s * (i + 0)];
          auto x = a[j + s * (i + m1)];
          auto y = a[j + s * (i + m2)];
          auto z = a[j + s * (i + m3)];
          auto wy1 = w + y;
          auto wy2 = w - y;
          auto xz1 = x + z;
          auto xz2 = (x - z) * unit_i;
          b[j + s * (4 * i + 0)] =  wy1 + xz1;
          b[j + s * (4 * i + 1)] = (wy2 + xz2) * zi1;
          b[j + s * (4 * i + 2)] = (wy1 - xz1) * zi2;
          b[j + s * (4 * i + 3)] = (wy2 - xz2) * zi3;
        }
        zi1 = zi1 * zeta;
        zi2 = zi1 * zi1;
        zi3 = zi1 * zi2;
      }
      s <<= 1;
      m >>= 1;
    }
    swap(a, b);
  }
  return a;
}

template<const i64 mod, const i64 primitive>
vector<modint<mod>> inverse_number_theoretic_transform4(vector<modint<mod>> a) {
  i64 n = a.size();
  vector<modint<mod>> b(a.size());
  auto unit_i = modint<mod>(primitive).pow((mod - 1) / 4).inv();
  i64 s = n;
  i64 m = 1;
  if(__builtin_ctzll(n) & 1) {
    s >>= 1;
    m <<= 1;
    for(i64 j = 0;j < s;j++) {
      auto x = a[j + 0];
      auto y = a[j + s];
      b[j + 0] = x + y;
      b[j + s] = x - y;
    }
    swap(a, b);
  }
  for(; s >>= 2, m <<= 2, s >= 1;) {
    {
      modint<mod> zi1 = 1;
      modint<mod> zi2 = 1;
      modint<mod> zi3 = 1;
      i64 m1 = m >> 2;
      i64 m2 = m >> 1;
      i64 m3 = m1 | m2;
      modint<mod> zeta = modint<mod>(primitive).pow((mod - 1) / m).inv();
      for(i64 i = 0;i < m1;i++) {
        for(i64 j = 0;j < s;j++) {
          auto w = a[j + s * (4 * i + 0)];
          auto x = a[j + s * (4 * i + 1)] * zi1;
          auto y = a[j + s * (4 * i + 2)] * zi2;
          auto z = a[j + s * (4 * i + 3)] * zi3;
          auto wy1 = w + y;
          auto wy2 = x + z;
          auto xz1 = w - y;
          auto xz2 = (x - z) * unit_i;
          b[j + s * (i + 0)]  = wy1 + wy2;
          b[j + s * (i + m1)] = xz1 + xz2;
          b[j + s * (i + m2)] = wy1 - wy2;
          b[j + s * (i + m3)] = xz1 - xz2;
        }
        zi1 = zi1 * zeta;
        zi2 = zi1 * zi1;
        zi3 = zi1 * zi2;
      }
    }
    swap(a, b);
  }
  auto inv_n = modint<mod>(n).pow(mod - 2);
  for(int i = 0;i < n;i++) a[i] *= inv_n;
  return a;
}


template<const i64 mod, const i64 primitive>
struct fps_ntt_multiply {
  using fps_type = modint<mod>;
  using conv_type = modint<mod>;
  static std::vector<conv_type> dft(std::vector<fps_type> arr) {
    return number_theoretic_transform4<mod, primitive>(std::move(arr));
  }
  static std::vector<fps_type> idft(std::vector<conv_type> arr) {
    return inverse_number_theoretic_transform4<mod, primitive>(std::move(arr));
  }
  static std::vector<conv_type> multiply(std::vector<conv_type> a, const std::vector<conv_type>& b) {
    for(int i = 0;i < a.size();i++) a[i] *= b[i];
    return a;
  }
  static std::vector<conv_type> self_multiply(std::vector<conv_type> a) {
    for(int i = 0;i < a.size();i++) a[i] *= a[i];
    return a;
  }
};


#include <vector>
using i64 = long long;

std::size_t bound_pow2(std::size_t sz) {
  return 1ll << (__lg(sz - 1) + 1);
}

template<class T, class fps_multiply>
struct FPS {
  std::vector<T> coef;

  FPS(std::vector<T> arr): coef(std::move(arr)) {}
  size_t size() const { return coef.size(); }
  void bound_resize() {
    this->coef.resize(bound_pow2(this->size()));
  }
  const T& operator[](int i) const {
    return coef[i];
  }
  T & operator[](int i) { return coef[i]; }
  FPS pre(int n) const {
    std::vector<T> nex(n, T(0));
    for(int i = 0;i < coef.size() && i < n; i++) nex[i] = coef[i];
    return FPS(nex);
  }
  
  // F(0) must not be 0
  FPS inv() const {
    FPS g = FPS(std::vector<T>{ T(1) / (*this)[0] });
    i64 n = bound_pow2(this->size());
    for(int i = 1;i < n;i <<= 1) {
      g = g.pre(i << 1);
      auto gdft = fps_multiply::dft(g.coef);
      auto e = fps_multiply::idft(
          fps_multiply::multiply(
            fps_multiply::dft(this->pre(i << 1).coef), gdft
            )
          );
      for(int j = 0;j < i;j++) {
        e[j] = T(0);
        e[j + i] = e[j + i] * T(-1);
      }
      auto f = fps_multiply::idft(
          fps_multiply::multiply(
            fps_multiply::dft(e), std::move(gdft)
            )
          );
      for(int j = 0;j < i;j++) {
        f[j] = g[j];
      }
      g.coef = std::move(f);
    }
    return g.pre(n);
  }

  FPS diff() const {
    FPS res(std::vector<T>(this->size() - 1, T(0)));
    for(i64 i = 1;i < this->size();i++) res[i - 1] = coef[i] * T(i);
    return res;
  }

  FPS integral() const {
    FPS res(std::vector<T>(this->size() + 1, T(0)));
    for(i64 i = 0;i < this->size();i++) res[i + 1] = coef[i] / T(i + 1);
    return res;
  }

  // F(0) must be 0
  FPS log() const {
    return (this->diff() * this->inv()).integral().pre(this->size());
  }

  FPS exp() const {
    FPS f(std::vector<T>{ T(1) });
    FPS g = *this;
    g.bound_resize();
    g[0] += T(1);
    for(i64 i = 1;i < size();i <<= 1 ) {
      f = (f * (g.pre(i << 1) - f.pre(i << 1).log())).pre(i << 1);
    }
    return f;
  }

  FPS& operator+=(const FPS& rhs) {
    i64 n = std::max(this->size(), rhs.size());
    this->coef.resize(n, T(0));
    for(int i = 0;i < rhs.size();i++) this->coef[i] += rhs[i];
    return *this;
  }
  FPS& operator-=(const FPS& rhs) {
    i64 n = std::max(this->size(), rhs.size());
    this->coef.resize(n, T(0));
    for(int i = 0;i < rhs.size();i++) this->coef[i] -= rhs[i];
    return *this;
  }
  
  FPS operator+(const FPS& rhs) const {
    return (*this) += rhs;
  }
  FPS operator-(const FPS& rhs) const {
    return (*this) -= rhs;
  }
  FPS operator*(const FPS& rhs) {
    i64 m = this->size() + rhs.size() - 1;
    i64 n = bound_pow2(m);
    auto res = fps_multiply::idft(
        fps_multiply::multiply(
          fps_multiply::dft(this->pre(n).coef), fps_multiply::dft(rhs.pre(n).coef)
          )
        );
    res.resize(m);
    return res;
  }

};



#include <vector>
using i64 = long long;

template<class F, class FM>
std::vector<F> fast_kitamasa(std::vector<F> c, i64 n) {
  using fps = FPS<F, FM>;
  i64 k = c.size() - 1;
  fps cf(std::move(c));
  fps ic = cf.inv().pre(k - 1);
  
  i64 c_len = bound_pow2(k - 1 + cf.size() - 1);
  i64 ic_len = bound_pow2(k - 1 + ic.size() - 1);
  auto dc = FM::dft(cf.pre(c_len).coef);
  auto dic = FM::dft(ic.pre(ic_len).coef);

  i64 b_len = bound_pow2(k);
  std::vector<F> b(b_len, F(0));
  b[k - 1] = F(1);
  i64 m = bound_pow2(n);
  while(m) {
    b.resize(b_len * 2, F(0));
    auto bt = FM::dft(std::move(b));
    bt = FM::self_multiply(std::move(bt));
    fps beta(FM::idft(std::move(bt)));
    
    //  let q = (beta.clone().pre(k - 1) * ic.clone()).pre(k - 1);

    auto dbeta = FM::dft(beta.pre(k - 1).pre(ic_len).coef);
    auto q = FM::idft(FM::multiply(std::move(dbeta), dic));
    q.resize(c_len);
    for(int i = k - 1; i < ic_len; i++) q[i] = F(0);

    //beta -= cf * q;
    fps cfq(FM::idft(FM::multiply(FM::dft(std::move(q)), dc)));
    cfq.coef.resize(k - 1 + cf.size() - 1);
    beta -= cfq;

    b.assign(b_len * 2, F(0));
    for(int i = k - 1; i < 2 * k - 1; i++) {
      b[i - (k - 1)] = beta[i];
    }
    if(m & n) {
      F freq = b[0];
      for(int i = 0; i < k - 1; i++) {
        b[i] = b[i + 1] + freq * cf[i + 1];
      }
      b[k - 1] = freq * cf[k];
    }
    m >>= 1;
  }
  b.resize(k);
  return b;
}

#include <vector>

template<class F>
std::vector<F> berlekamp_massey(const std::vector<F>& s) {
  int n = s.size();
  std::vector<F> b { F(1) };
  std::vector<F> c { F(1) };
  F y(1);
  int shift = 0;
  for(int len = 0; len < n; len++) {
    shift++;
    F x(0);
    for(int i = 0; i < c.size(); i++) {
      x += c[i] * s[len - i];
    }
    if(x == F(0)) { continue; }
    std::vector<F> old_c = c;
    F freq = x / y; c.resize(std::max(c.size(), b.size() + shift), F(0));
    for(int i = 0; i < b.size(); i++) {
      c[i + shift] -= freq * b[i];
    }
    if(old_c.size() < c.size()) {
      b = std::move(old_c);
      y = x;
      shift = 0;
    }
  }
  std::vector<F> ans(c.size() - 1);
  for(int i = 1; i < c.size(); i++) {
    ans[i - 1] = -c[i];
  }
  return ans;
}

#include <vector>

template<class F>
std::vector<F> find_minimal_polynomial(const std::vector<F>& a) {
  std::vector<F> c = berlekamp_massey(a);
  c.insert(c.begin(), -F(1));
  return c;
}

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_vector(int n, const std::vector<std::vector<F>>& a, NonZeroRandGen rnd) {
  std::vector<F> u(n);
  for(int i = 0; i < n; i++) u[i] = rnd();
  std::vector<F> b(a.size(), F(0));
  for(int i = 0; i < a.size(); i++) {
    for(int j = 0; j < n; j++) {
      b[i] += a[i][j] * u[j];
    }
  }
  return find_minimal_polynomial(b);
}

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_dense_matrix_pow_b(const std::vector<std::vector<F>>& a, std::vector<F> b, NonZeroRandGen rnd) {
  int n = a.size();
  std::vector<F> bf;

  std::vector<F> u(n);
  for(int i = 0; i < n; i++) u[i] = rnd();

  std::vector<F> c(n * 2);
  for(int i = 0; i < 2 * n; i++) {
    for(int j = 0; j < n; j++) {
      c[i] += b[j] * u[j];
    }
    if(i + 1 < 2 * n) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
        for(int k = 0; k < n; k++) {
          b[j] += a[j][k] * bf[k];
        }
      }
    }
  }
  return find_minimal_polynomial(c);
}

// fast for dense matrix
template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_dense_matrix_pow(const std::vector<std::vector<F>>& a, NonZeroRandGen rnd) {
  int n = a.size();
  std::vector<F> b(n);
  for(int i = 0; i < n; i++) b[i] = rnd();
  return find_minimal_polynomial_from_dense_matrix_pow_b(a, std::move(b), rnd);
}

#include <tuple>

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_sparse_matrix_pow(const std::vector<std::tuple<int, int, F>>& a, int n, NonZeroRandGen rnd) {
  std::vector<F> b(n);
  std::vector<F> bf;
  for(int i = 0; i < n; i++) b[i] = rnd();

  std::vector<F> u(n);
  for(int i = 0; i < n; i++) u[i] = rnd();

  std::vector<F> c(n * 2);
  for(int i = 0; i < 2 * n; i++) {
    for(int j = 0; j < n; j++) {
      c[i] += b[j] * u[j];
    }
    if(i + 1 < 2 * n) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
      }
      for(auto& [j, k, v]: a) {
        b[j] += v * bf[k];
      }
    }
  }
  return find_minimal_polynomial(c);
}

template<class F, class FM, class NonZeroRandGen>
std::vector<F> bbla_matrix_pow(const std::vector<std::vector<F>>& a, std::vector<F> b, long long r, NonZeroRandGen rnd) {
  int n = a.size();
  auto c = find_minimal_polynomial_from_dense_matrix_pow_b(a, b, rnd);
  auto d = fast_kitamasa<F, FM>(std::move(c), r);
  std::vector<F> ans(n);
  std::vector<F> bf;
  for(int i = 0; i < d.size(); i++) {
    for(int j = 0; j < n; j++) {
      ans[j] += d[d.size() - 1 - i] * b[j];
    }
    if(i + 1 < d.size()) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
        for(int k = 0; k < n; k++) {
          b[j] += a[j][k] * bf[k];
        }
      }
    }
  }
  return ans;
}

/*
#include <unistd.h>

namespace niu {

  struct fastin {
    static const int bufsize = 1 << 24;
    char buf[bufsize];
    char* iter;
    fastin() {
      iter = buf;
      for(int t = 0, k; (k = read(STDIN_FILENO, buf + t, sizeof(buf)) - t) > 0; t += k);
    }
    template<class I>
    fastin& operator>>(I& num) {
      num = 0;
      bool neg = false;
      while(*iter < '+') iter++;
      if(*iter == '-') { neg = true; iter++; }
      else if(*iter == '+') iter++;
      while(*iter >= '0') num = 10 * num + *(iter++) - '0';
      if(neg) num = -num;
      return *this;
    } 
  } fin;
}
*/

using namespace std;
using i64 = long long;

i64 pow_mod(i64 x, i64 r, i64 mod) {
  i64 ans = 1;
  while(r) {
    if(r & 1) ans = (ans * x) % mod;
    r >>= 1;
    x = x * x % mod;
  }
  return ans;
}

i64 inv_mod(i64 x, i64 mod) {
  return pow_mod(x, mod - 2, mod);
}

i64 garner(const vector<i64> &x, vector<i64> mods, i64 mod) {
  mods.emplace_back(mod);
  vector<i64> coeffs(x.size() + 1, 1);
  vector<i64> constants(x.size() + 1, 0);
  for(i64 i = 0; i < x.size(); i++) {
    i64 v = (x[i] - constants[i]) * inv_mod(coeffs[i], mods[i]) % mods[i];
    if(v < 0) v += mods[i];
    for(i64 j = i + 1; j < x.size() + 1; j++) {
      constants[j] = (constants[j] + coeffs[j] * v) % mods[j];
      coeffs[j] = (coeffs[j] * mods[i]) % mods[j];
    }
  }
  return constants.back();
}



template<i64 M, i64... NTTis>
struct fps_multiply_arb {
  using fps_type = std::vector<modint<M>>;
  using conv_type = std::tuple<std::vector<modint<NTT_PRIMES[NTTis][0]>>...>;
  const static std::size_t tsize = std::tuple_size<conv_type>::value;

  template<i64 M2, i64 primitive>
  static std::vector<modint<M2>> dft_m2(const fps_type& arr) {
    std::vector<modint<M2>> res(arr.size());
    for(std::size_t i = 0; i < arr.size(); i++)
      res[i] = modint<M2>(arr[i].val());
    return number_theoretic_transform4<M2, primitive>(std::move(res));
  }
  template<i64 M2, i64 primitive>
  static std::vector<modint<M2>> idft_m2(std::vector<modint<M2>> arr) {
    return inverse_number_theoretic_transform4<M2, primitive>(std::move(arr));
  }
  template<std::size_t... I>
  static fps_type idft_func(std::index_sequence<I...>, conv_type arr) {
    arr = std::make_tuple(idft_m2<NTT_PRIMES[NTTis][0], NTT_PRIMES[NTTis][1]>(std::get<I>(arr))...);
    std::size_t len = std::get<0>(arr).size();
    static std::vector<i64> primes = { NTT_PRIMES[NTTis][0]... };
    fps_type res(len);
    for(std::size_t i = 0; i < len; i++) {
      std::vector<i64> x = { std::get<I>(arr)[i].val()... };
      res[i] = modint<M>(garner(x, primes, M));
    }
    return std::move(res);
  }
  template<i64 M2>
  static char mult_m2(std::vector<modint<M2>>& a, const std::vector<modint<M2>>& b) {
    for(int i = 0;i < a.size();i++) a[i] *= b[i];
    return 0;
  }
  template<std::size_t... I>
  static void mult_func(std::index_sequence<I...>, conv_type& a, const conv_type& b) {
    auto res = std::make_tuple(mult_m2<NTT_PRIMES[NTTis][0]>(std::get<I>(a), std::get<I>(b))...);
  }
  static conv_type dft(fps_type arr) {
    return std::make_tuple(dft_m2<NTT_PRIMES[NTTis][0], NTT_PRIMES[NTTis][1]>(arr)...);
  }
  static fps_type idft(conv_type arr) {
    return idft_func(std::make_index_sequence<tsize>(), std::move(arr));
  }
  static conv_type multiply(conv_type a, const conv_type& b) {
    mult_func(std::make_index_sequence<tsize>(), a, b);
    return a;
  }
  static conv_type self_multiply(conv_type a) {
    mult_func(std::make_index_sequence<tsize>(), a, a);
    return a;
  }
};



#include <random>

int main() {
  using fp = modint<998244353>;
  using multi = fps_ntt_multiply<998244353, 3>;
  cin.tie(nullptr);
  std::ios::sync_with_stdio(false);

  i64 N, k;
  cin >> N >> k;

  std::vector<std::vector<int>> a(k * k * k, std::vector<int>(k * k * k));
  std::vector<fp> b(k * k * k);
  b[0] = fp(1);
  for(int i = 0;i < k;i++) {
    for(int j = 0;j < k;j++) {
      for(int l = 0;l < k;l++) {
        a[(i+1)%k*k*k+j*k+l][i*k*k+j*k+l]++;
        a[i*k*k+(j+i)%k*k+l][i*k*k+j*k+l]++;
        a[i*k*k+j*k+(l+j)%k][i*k*k+j*k+l]++;
      }
    }
  }
  std::vector<std::vector<fp>> A(k * k * k, std::vector<fp>(k * k * k));
  for(int i = 0; i < A.size(); i++) {
    for(int j = 0; j < A.size(); j++) {
      A[i][j] = fp(a[i][j]);
    }
  }
  
  std::mt19937 mt(91);
  std::uniform_int_distribution dist(1, 998244352);
  auto rnd = [&]() {
    return fp(dist(mt));
  };
  auto res = bbla_matrix_pow<fp, multi>(A, b, N, rnd);
  fp ans;
  for(int i = 0; i < k; i++) {
    for(int j = 0; j < k; j++) {
      ans += res[i * k * k + j * k];
    }
  }
  cout << ans << endl;
}
yukicoder

結果

ソースコード