結果

問題 No.840 ほむほむほむら
ユーザー niuezniuez
提出日時 2022-04-09 00:49:58
言語 C++17
(gcc 12.3.0 + boost 1.83.0)
結果
AC  
実行時間 3 ms / 4,000 ms
コード長 18,836 bytes
コンパイル時間 1,841 ms
コンパイル使用メモリ 129,736 KB
実行使用メモリ 5,248 KB
最終ジャッジ日時 2024-11-28 16:19:19
合計ジャッジ時間 2,648 ms
ジャッジサーバーID
(参考情報)
judge3 / judge5
このコードへのチャレンジ
(要ログイン)

テストケース

テストケース表示
入力 結果 実行時間
実行使用メモリ
testcase_00 AC 1 ms
5,248 KB
testcase_01 AC 2 ms
5,248 KB
testcase_02 AC 2 ms
5,248 KB
testcase_03 AC 2 ms
5,248 KB
testcase_04 AC 2 ms
5,248 KB
testcase_05 AC 1 ms
5,248 KB
testcase_06 AC 2 ms
5,248 KB
testcase_07 AC 2 ms
5,248 KB
testcase_08 AC 3 ms
5,248 KB
testcase_09 AC 1 ms
5,248 KB
testcase_10 AC 2 ms
5,248 KB
testcase_11 AC 2 ms
5,248 KB
testcase_12 AC 2 ms
5,248 KB
testcase_13 AC 3 ms
5,248 KB
testcase_14 AC 2 ms
5,248 KB
testcase_15 AC 2 ms
5,248 KB
testcase_16 AC 1 ms
5,248 KB
testcase_17 AC 2 ms
5,248 KB
testcase_18 AC 3 ms
5,248 KB
testcase_19 AC 3 ms
5,248 KB
testcase_20 AC 1 ms
5,248 KB
testcase_21 AC 1 ms
5,248 KB
testcase_22 AC 2 ms
5,248 KB
testcase_23 AC 3 ms
5,248 KB
testcase_24 AC 1 ms
5,248 KB
testcase_25 AC 1 ms
5,248 KB
testcase_26 AC 2 ms
5,248 KB
testcase_27 AC 3 ms
5,248 KB
権限があれば一括ダウンロードができます

ソースコード

diff #

#include <iostream>

template<uint32_t M>
struct montgomery_modint {
  using Self = montgomery_modint<M>;
  using i32 = int32_t;
  using u32 = uint32_t;
  using u64 = uint64_t;

  static constexpr u32 get_r() {
    u32 res = M;
    for(int i = 0; i < 4; i++) res *= 2 - M * res;
    return res;
  }
  static constexpr u32 reduce(u64 a) {
    return (a + u64(u32(a) * u32(-r)) * M) >> 32;
  }

  static constexpr u32 r = get_r();
  static constexpr u32 n2 = -u64(M) % M;

  u32 a;

  constexpr montgomery_modint() : a(0) {}
  constexpr montgomery_modint(int64_t a) : a(reduce(u64(a % M + M) * n2)) {}

  constexpr u32 val() const {
    u32 res = reduce(a);
    return res >= M ? res - M : res;
  }
  constexpr Self pow(u64 r) const {
    Self ans(1); Self aa = *this;
    while(r) { if(r & 1) ans *= aa; aa *= aa; r >>= 1; }
    return ans;
  }
  constexpr Self inv() const { return this->pow(M - 2); }
  constexpr Self& operator+=(const Self& r) {
    if(i32(a += r.a - 2 * M) < 0) a += 2 * M;
    return *this;
  }
  constexpr Self& operator-=(const Self& r) {
    if(i32(a -= r.a) < 0) a += 2 * M;
    return *this;
  }
  constexpr Self& operator*=(const Self& r) {
    a = reduce(u64(a) * r.a);
    return *this;
  }
  constexpr Self& operator/=(const Self& r) {
    *this *= r.inv();
    return *this;
  }
  constexpr Self operator+(const Self r) const { return Self(*this) += r; }
  constexpr Self operator-(const Self r) const { return Self(*this) -= r; }
  constexpr Self operator-() const { return Self() - Self(*this); }
  constexpr Self operator*(const Self r) const { return Self(*this) *= r; }
  constexpr Self operator/(const Self r) const { return Self(*this) /= r; }
  constexpr bool operator==(const Self& r) const {
    return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a);
  }
  constexpr bool operator!=(const Self& r) const {
    return (a >= M ? a - M : a) == (r.a >= M ? r.a - M : r.a);
  }
};

template<uint32_t M>
std::ostream& operator<<(std::ostream& os, const montgomery_modint<M>& m) {
  return os << m.val();
}
template<uint32_t M>
std::istream& operator>>(std::istream& is, montgomery_modint<M>& m) {
  int64_t t;
  is >> t;
  m = montgomery_modint<M>(t);
  return is;
}

template<uint32_t mod>
using modint = montgomery_modint<mod>;


#include <vector>
using namespace std;
using i64 = long long;
constexpr i64 NTT_PRIMES[][2] = {
    {1224736769, 3}, // 2^24 * 73 + 1,
    {1053818881, 7}, // 2^20 * 3 * 5 * 67 + 1
    {1051721729, 6}, // 2^20 * 17 * 59 + 1
    {1045430273, 3}, // 2^20 * 997 + 1
    {1012924417, 5}, // 2^21 * 3 * 7 * 23 + 1
    {1007681537, 3}, // 2^20 * 31^2 + 1
    {1004535809, 3}, // 2^21 * 479 + 1
    {998244353, 3},  // 2^23 * 7 * 17 + 1
    {985661441, 3},  // 2^22 * 5 * 47 + 1
    {976224257, 3},  // 2^20 * 7^2 * 19 + 1
    {975175681, 17}, // 2^21 * 3 * 5 * 31 + 1
    {962592769, 7},  // 2^21 * 3^3 * 17 + 1
    {950009857, 7},  // 2^21 * 4 * 151 + 1
    {943718401, 7},  // 2^22 * 3^2 * 5^2 + 1
    {935329793, 3},  // 2^22 * 223 + 1
    {924844033, 5},  // 2^21 * 3^2 * 7^2 + 1
    {469762049, 3},  // 2^26 * 7 + 1
    {167772161, 3},  // 2^25 * 5 + 1
};

template<const i64 mod, const i64 primitive>
vector<modint<mod>> number_theoretic_transform4(vector<modint<mod>> a) {
  i64 n = a.size();
  vector<modint<mod>> b(a.size());
  auto unit_i = modint<mod>(primitive).pow((mod - 1) / 4);
  for(i64 s = 1, m = n; s < n; s <<= 1, m >>= 1) {
    if(m == 2) {
      for(i64 j = 0;j < s;j++) {
        auto x = a[j + 0];
        auto y = a[j + s];
        b[j + 0] = x + y;
        b[j + s] = x - y;
      }
    }
    else {
      modint<mod> zi1 = 1;
      modint<mod> zi2 = 1;
      modint<mod> zi3 = 1;
      i64 m1 = m >> 2;
      i64 m2 = m >> 1;
      i64 m3 = m1 | m2;
      modint<mod> zeta = modint<mod>(primitive).pow((mod - 1) / m);
      for(i64 i = 0;i < m1;i++) {
        for(i64 j = 0;j < s;j++) {
          auto w = a[j + s * (i + 0)];
          auto x = a[j + s * (i + m1)];
          auto y = a[j + s * (i + m2)];
          auto z = a[j + s * (i + m3)];
          auto wy1 = w + y;
          auto wy2 = w - y;
          auto xz1 = x + z;
          auto xz2 = (x - z) * unit_i;
          b[j + s * (4 * i + 0)] =  wy1 + xz1;
          b[j + s * (4 * i + 1)] = (wy2 + xz2) * zi1;
          b[j + s * (4 * i + 2)] = (wy1 - xz1) * zi2;
          b[j + s * (4 * i + 3)] = (wy2 - xz2) * zi3;
        }
        zi1 = zi1 * zeta;
        zi2 = zi1 * zi1;
        zi3 = zi1 * zi2;
      }
      s <<= 1;
      m >>= 1;
    }
    swap(a, b);
  }
  return a;
}

template<const i64 mod, const i64 primitive>
vector<modint<mod>> inverse_number_theoretic_transform4(vector<modint<mod>> a) {
  i64 n = a.size();
  vector<modint<mod>> b(a.size());
  auto unit_i = modint<mod>(primitive).pow((mod - 1) / 4).inv();
  i64 s = n;
  i64 m = 1;
  if(__builtin_ctzll(n) & 1) {
    s >>= 1;
    m <<= 1;
    for(i64 j = 0;j < s;j++) {
      auto x = a[j + 0];
      auto y = a[j + s];
      b[j + 0] = x + y;
      b[j + s] = x - y;
    }
    swap(a, b);
  }
  for(; s >>= 2, m <<= 2, s >= 1;) {
    {
      modint<mod> zi1 = 1;
      modint<mod> zi2 = 1;
      modint<mod> zi3 = 1;
      i64 m1 = m >> 2;
      i64 m2 = m >> 1;
      i64 m3 = m1 | m2;
      modint<mod> zeta = modint<mod>(primitive).pow((mod - 1) / m).inv();
      for(i64 i = 0;i < m1;i++) {
        for(i64 j = 0;j < s;j++) {
          auto w = a[j + s * (4 * i + 0)];
          auto x = a[j + s * (4 * i + 1)] * zi1;
          auto y = a[j + s * (4 * i + 2)] * zi2;
          auto z = a[j + s * (4 * i + 3)] * zi3;
          auto wy1 = w + y;
          auto wy2 = x + z;
          auto xz1 = w - y;
          auto xz2 = (x - z) * unit_i;
          b[j + s * (i + 0)]  = wy1 + wy2;
          b[j + s * (i + m1)] = xz1 + xz2;
          b[j + s * (i + m2)] = wy1 - wy2;
          b[j + s * (i + m3)] = xz1 - xz2;
        }
        zi1 = zi1 * zeta;
        zi2 = zi1 * zi1;
        zi3 = zi1 * zi2;
      }
    }
    swap(a, b);
  }
  auto inv_n = modint<mod>(n).pow(mod - 2);
  for(int i = 0;i < n;i++) a[i] *= inv_n;
  return a;
}


template<const i64 mod, const i64 primitive>
struct fps_ntt_multiply {
  using fps_type = modint<mod>;
  using conv_type = modint<mod>;
  static std::vector<conv_type> dft(std::vector<fps_type> arr) {
    return number_theoretic_transform4<mod, primitive>(std::move(arr));
  }
  static std::vector<fps_type> idft(std::vector<conv_type> arr) {
    return inverse_number_theoretic_transform4<mod, primitive>(std::move(arr));
  }
  static std::vector<conv_type> multiply(std::vector<conv_type> a, const std::vector<conv_type>& b) {
    for(int i = 0;i < a.size();i++) a[i] *= b[i];
    return a;
  }
  static std::vector<conv_type> self_multiply(std::vector<conv_type> a) {
    for(int i = 0;i < a.size();i++) a[i] *= a[i];
    return a;
  }
};


#include <vector>
using i64 = long long;

std::size_t bound_pow2(std::size_t sz) {
  return 1ll << (__lg(sz - 1) + 1);
}

template<class T, class fps_multiply>
struct FPS {
  std::vector<T> coef;

  FPS(std::vector<T> arr): coef(std::move(arr)) {}
  size_t size() const { return coef.size(); }
  void bound_resize() {
    this->coef.resize(bound_pow2(this->size()));
  }
  const T& operator[](int i) const {
    return coef[i];
  }
  T & operator[](int i) { return coef[i]; }
  FPS pre(int n) const {
    std::vector<T> nex(n, T(0));
    for(int i = 0;i < coef.size() && i < n; i++) nex[i] = coef[i];
    return FPS(nex);
  }
  
  // F(0) must not be 0
  FPS inv() const {
    FPS g = FPS(std::vector<T>{ T(1) / (*this)[0] });
    i64 n = bound_pow2(this->size());
    for(int i = 1;i < n;i <<= 1) {
      g = g.pre(i << 1);
      auto gdft = fps_multiply::dft(g.coef);
      auto e = fps_multiply::idft(
          fps_multiply::multiply(
            fps_multiply::dft(this->pre(i << 1).coef), gdft
            )
          );
      for(int j = 0;j < i;j++) {
        e[j] = T(0);
        e[j + i] = e[j + i] * T(-1);
      }
      auto f = fps_multiply::idft(
          fps_multiply::multiply(
            fps_multiply::dft(e), std::move(gdft)
            )
          );
      for(int j = 0;j < i;j++) {
        f[j] = g[j];
      }
      g.coef = std::move(f);
    }
    return g.pre(n);
  }

  FPS diff() const {
    FPS res(std::vector<T>(this->size() - 1, T(0)));
    for(i64 i = 1;i < this->size();i++) res[i - 1] = coef[i] * T(i);
    return res;
  }

  FPS integral() const {
    FPS res(std::vector<T>(this->size() + 1, T(0)));
    for(i64 i = 0;i < this->size();i++) res[i + 1] = coef[i] / T(i + 1);
    return res;
  }

  // F(0) must be 0
  FPS log() const {
    return (this->diff() * this->inv()).integral().pre(this->size());
  }

  FPS exp() const {
    FPS f(std::vector<T>{ T(1) });
    FPS g = *this;
    g.bound_resize();
    g[0] += T(1);
    for(i64 i = 1;i < size();i <<= 1 ) {
      f = (f * (g.pre(i << 1) - f.pre(i << 1).log())).pre(i << 1);
    }
    return f;
  }

  FPS& operator+=(const FPS& rhs) {
    i64 n = std::max(this->size(), rhs.size());
    this->coef.resize(n, T(0));
    for(int i = 0;i < rhs.size();i++) this->coef[i] += rhs[i];
    return *this;
  }
  FPS& operator-=(const FPS& rhs) {
    i64 n = std::max(this->size(), rhs.size());
    this->coef.resize(n, T(0));
    for(int i = 0;i < rhs.size();i++) this->coef[i] -= rhs[i];
    return *this;
  }
  
  FPS operator+(const FPS& rhs) const {
    return (*this) += rhs;
  }
  FPS operator-(const FPS& rhs) const {
    return (*this) -= rhs;
  }
  FPS operator*(const FPS& rhs) {
    i64 m = this->size() + rhs.size() - 1;
    i64 n = bound_pow2(m);
    auto res = fps_multiply::idft(
        fps_multiply::multiply(
          fps_multiply::dft(this->pre(n).coef), fps_multiply::dft(rhs.pre(n).coef)
          )
        );
    res.resize(m);
    return res;
  }

};



#include <vector>
using i64 = long long;

template<class F, class FM>
std::vector<F> fast_kitamasa(std::vector<F> c, i64 n) {
  using fps = FPS<F, FM>;
  i64 k = c.size() - 1;
  fps cf(std::move(c));
  fps ic = cf.inv().pre(k - 1);
  
  i64 c_len = bound_pow2(k - 1 + cf.size() - 1);
  i64 ic_len = bound_pow2(k - 1 + ic.size() - 1);
  auto dc = FM::dft(cf.pre(c_len).coef);
  auto dic = FM::dft(ic.pre(ic_len).coef);

  i64 b_len = bound_pow2(k);
  std::vector<F> b(b_len, F(0));
  b[k - 1] = F(1);
  i64 m = bound_pow2(n);
  while(m) {
    b.resize(b_len * 2, F(0));
    auto bt = FM::dft(std::move(b));
    bt = FM::self_multiply(std::move(bt));
    fps beta(FM::idft(std::move(bt)));
    
    //  let q = (beta.clone().pre(k - 1) * ic.clone()).pre(k - 1);

    auto dbeta = FM::dft(beta.pre(k - 1).pre(ic_len).coef);
    auto q = FM::idft(FM::multiply(std::move(dbeta), dic));
    q.resize(c_len);
    for(int i = k - 1; i < ic_len; i++) q[i] = F(0);

    //beta -= cf * q;
    fps cfq(FM::idft(FM::multiply(FM::dft(std::move(q)), dc)));
    cfq.coef.resize(k - 1 + cf.size() - 1);
    beta -= cfq;

    b.assign(b_len * 2, F(0));
    for(int i = k - 1; i < 2 * k - 1; i++) {
      b[i - (k - 1)] = beta[i];
    }
    if(m & n) {
      F freq = b[0];
      for(int i = 0; i < k - 1; i++) {
        b[i] = b[i + 1] + freq * cf[i + 1];
      }
      b[k - 1] = freq * cf[k];
    }
    m >>= 1;
  }
  b.resize(k);
  return b;
}

#include <vector>

template<class F>
std::vector<F> berlekamp_massey(const std::vector<F>& s) {
  int n = s.size();
  std::vector<F> b { F(1) };
  std::vector<F> c { F(1) };
  F y(1);
  int shift = 0;
  for(int len = 0; len < n; len++) {
    shift++;
    F x(0);
    for(int i = 0; i < c.size(); i++) {
      x += c[i] * s[len - i];
    }
    if(x == F(0)) { continue; }
    std::vector<F> old_c = c;
    F freq = x / y; c.resize(std::max(c.size(), b.size() + shift), F(0));
    for(int i = 0; i < b.size(); i++) {
      c[i + shift] -= freq * b[i];
    }
    if(old_c.size() < c.size()) {
      b = std::move(old_c);
      y = x;
      shift = 0;
    }
  }
  std::vector<F> ans(c.size() - 1);
  for(int i = 1; i < c.size(); i++) {
    ans[i - 1] = -c[i];
  }
  return ans;
}

#include <vector>

template<class F>
std::vector<F> find_minimal_polynomial(const std::vector<F>& a) {
  std::vector<F> c = berlekamp_massey(a);
  c.insert(c.begin(), -F(1));
  return c;
}

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_vector(int n, const std::vector<std::vector<F>>& a, NonZeroRandGen rnd) {
  std::vector<F> u(n);
  for(int i = 0; i < n; i++) u[i] = rnd();
  std::vector<F> b(a.size(), F(0));
  for(int i = 0; i < a.size(); i++) {
    for(int j = 0; j < n; j++) {
      b[i] += a[i][j] * u[j];
    }
  }
  return find_minimal_polynomial(b);
}

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_dense_matrix_pow_b(const std::vector<std::vector<F>>& a, std::vector<F> b, NonZeroRandGen rnd) {
  int n = a.size();
  std::vector<F> bf;

  std::vector<F> u(n);
  for(int i = 0; i < n; i++) u[i] = rnd();

  std::vector<F> c(n * 2);
  for(int i = 0; i < 2 * n; i++) {
    for(int j = 0; j < n; j++) {
      c[i] += b[j] * u[j];
    }
    if(i + 1 < 2 * n) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
        for(int k = 0; k < n; k++) {
          b[j] += a[j][k] * bf[k];
        }
      }
    }
  }
  return find_minimal_polynomial(c);
}

// fast for dense matrix
template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_dense_matrix_pow(const std::vector<std::vector<F>>& a, NonZeroRandGen rnd) {
  int n = a.size();
  std::vector<F> b(n);
  for(int i = 0; i < n; i++) b[i] = rnd();
  return find_minimal_polynomial_from_dense_matrix_pow_b(a, std::move(b), rnd);
}

#include <tuple>

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_sparse_matrix_pow_b(const std::vector<std::tuple<int, int, F>>& a, std::vector<F> b, int n, NonZeroRandGen rnd) {
  std::vector<F> bf;

  std::vector<F> u(n);
  for(int i = 0; i < n; i++) u[i] = rnd();

  std::vector<F> c(n * 2);
  for(int i = 0; i < 2 * n; i++) {
    for(int j = 0; j < n; j++) {
      c[i] += b[j] * u[j];
    }
    if(i + 1 < 2 * n) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
      }
      for(auto& [j, k, v]: a) {
        b[j] += v * bf[k];
      }
    }
  }
  return find_minimal_polynomial(c);
}

template<class F, class NonZeroRandGen>
std::vector<F> find_minimal_polynomial_from_sparse_matrix_pow(const std::vector<std::tuple<int, int, F>>& a, int n, NonZeroRandGen rnd) {
  std::vector<F> b(n);
  for(int i = 0; i < n; i++) b[i] = rnd();
  return find_minimal_polynomial_from_sparse_matrix_pow_b(a, std::move(b), n, rnd);
}

template<class F, class FM, class NonZeroRandGen>
std::vector<F> bbla_dense_matrix_pow(const std::vector<std::vector<F>>& a, std::vector<F> b, long long r, NonZeroRandGen rnd) {
  int n = a.size();
  auto c = find_minimal_polynomial_from_dense_matrix_pow_b(a, b, rnd);
  auto d = fast_kitamasa<F, FM>(std::move(c), r);
  std::vector<F> ans(n);
  std::vector<F> bf;
  for(int i = 0; i < d.size(); i++) {
    for(int j = 0; j < n; j++) {
      ans[j] += d[d.size() - 1 - i] * b[j];
    }
    if(i + 1 < d.size()) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
        for(int k = 0; k < n; k++) {
          b[j] += a[j][k] * bf[k];
        }
      }
    }
  }
  return ans;
}

template<class F, class FM, class NonZeroRandGen>
std::vector<F> bbla_sparse_matrix_pow(const std::vector<tuple<int, int, F>>& a, std::vector<F> b, long long r, NonZeroRandGen rnd) {
  int n = b.size();
  auto c = find_minimal_polynomial_from_sparse_matrix_pow_b(a, b, n, rnd);
  auto d = fast_kitamasa<F, FM>(std::move(c), r);
  std::vector<F> ans(n);
  std::vector<F> bf;
  for(int i = 0; i < d.size(); i++) {
    for(int j = 0; j < n; j++) {
      ans[j] += d[d.size() - 1 - i] * b[j];
    }
    if(i + 1 < d.size()) {
      bf = b;
      for(int j = 0; j < n; j++) {
        b[j] = F(0);
      }
      for(auto& [j, k, v]: a) {
        b[j] += v * bf[k];
      }
    }
  }
  return ans;
}

#include <random>

using fp = modint<998244353>;

constexpr fp li[] {fp(863303859),fp(339390315),fp(979599571),fp(199714654),fp(301357173),fp(400865054),fp(770490863),fp(356820773),fp(894972774),fp(967324291),fp(166944925),fp(654393003),fp(304140790),fp(20830991),fp(847528850),fp(156976274),fp(547185625),fp(621028859),fp(844422675),fp(326938261),fp(902391593),fp(397750615),fp(433030622),fp(244536178),fp(375895659),fp(902822037),fp(909826671),fp(718623378),fp(114518526),fp(347491628),fp(80068807),fp(269866676),fp(671564431),fp(352910149),fp(451609918),fp(726700835),fp(167580922),fp(40120952),fp(642222159),fp(738419087),fp(588793680),fp(624451442),fp(642541577),fp(314409044),fp(57883979),fp(87180399),fp(285373494),fp(813291789),fp(936336163),fp(285787459),fp(651754948),fp(899691323),fp(35048108),fp(297454293),fp(291950455),fp(377885604),fp(722386864),fp(676329426),fp(149479947),fp(13963991),fp(757101185),fp(940125946),fp(83989251),fp(685331496),fp(83323677),fp(260532830),fp(926688801),fp(262319285),fp(693808883),fp(944267560),fp(462457327),fp(206130177),fp(995396560),fp(577713216),fp(365155728),fp(579784548),fp(693449315),fp(8335689),fp(473361466),fp(41379181),fp(162849967),fp(365513965),fp(744881959),fp(925032280),fp(754083979),fp(696698057),fp(738556167),fp(5650271),fp(199816497),fp(947029884),fp(42818675),fp(856515444),fp(16191429),fp(297068619),fp(134930396),fp(375623842),fp(606645976),fp(246526416),fp(497437038),fp(928693228),fp(946582662),fp(862847352),fp(358513338),fp(463498435),fp(519603693),fp(991973628),fp(937271185),fp(275295290),fp(866218055),fp(536205886),fp(769241787),fp(322515550),fp(463584342),fp(747948043),fp(671365378),fp(373253000),fp(77410578),fp(387956371),fp(379285769),fp(696147192),fp(788593759),fp(70240204),fp(174654106),fp(435442880),fp(558134876)};

int main() {
  using multi = fps_ntt_multiply<998244353, 3>;
  cin.tie(nullptr);
  std::ios::sync_with_stdio(false);

  i64 N, k;
  cin >> N >> k;

  std::vector<std::vector<int>> a(k * k * k, std::vector<int>(k * k * k));
  std::vector<fp> b(k * k * k);
  b[0] = fp(1);
  for(int i = 0;i < k;i++) {
    for(int j = 0;j < k;j++) {
      for(int l = 0;l < k;l++) {
        a[(i+1)%k*k*k+j*k+l][i*k*k+j*k+l]++;
        a[i*k*k+(j+i)%k*k+l][i*k*k+j*k+l]++;
        a[i*k*k+j*k+(l+j)%k][i*k*k+j*k+l]++;
      }
    }
  }
  std::vector<fp> l { fp(0), fp(1), fp(2) };
  std::vector<std::tuple<int, int, fp>> A;
  A.reserve(370);
  int sum = 0;
  for(int i = 0; i < a.size(); i++) {
    for(int j = 0; j < a.size(); j++) {
      if(a[i][j]) {
        A.push_back( { i, j, l[a[i][j]] } );
      }
    }
  }
  int cnt = 0;
  auto rnd = [&]() {
    return li[cnt++];
  };
  auto res = bbla_sparse_matrix_pow<fp, multi>(A, b, N, rnd);
  fp ans;
  for(int i = 0; i < k; i++) {
    for(int j = 0; j < k; j++) {
      ans += res[i * k * k + j * k];
    }
  }
  cout << ans << "\n";
}

0