#754353 (C++23) No.1907 DETERMINATION

提出ソース

結果

問題	No.1907 DETERMINATION
コンテスト
ユーザー	kiyoshi0205
提出日時	2022-04-15 22:34:06
言語	C++23 (gcc 15.2.0 + boost 1.89.0)
結果	TLE
実行時間	-
コード長	20,852 bytes
記録	記録タグの例: 初AC / ショートコード / 純ショートコード / 純主流ショートコード / 最速実行時間
コンパイル時間	4,050 ms
コンパイル使用メモリ	378,280 KB
実行使用メモリ	22,144 KB
最終ジャッジ日時	2024-12-25 01:34:04
合計ジャッジ時間	228,978 ms
ジャッジサーバーID （参考情報）	judge3 / judge2

このコードへのチャレンジ
（要ログイン）

ファイルパターン	結果
sample	AC * 4
other	AC * 21 TLE * 42

権限があれば一括ダウンロードができます

コンパイルメッセージ

main.cpp:557:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  557 | montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:549:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  549 | montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:541:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  541 | montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:530:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  530 | my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:525:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  525 | my256_mullo_epu32(const __m256i &a, const __m256i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:518:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  518 | montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:511:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  511 | montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:503:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  503 | montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:492:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  492 | my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:487:1: warning: 'always_inline' function might not be inlinable [-Wattributes]
  487 | my128_mullo_epu32(const __m128i &a, const __m128i &b) {
      | ^~~~~~~~~~~~~~~~~

ソースコード

raw source code

// Borrowed library code -- thank you.

#define PROBLEM "https://judge.yosupo.jp/problem/matrix_det"


#pragma region kyopro_template
#define Nyaan_template
#include <immintrin.h>
#include <bits/stdc++.h>
// Short aliases for common member names.
#define pb push_back
#define eb emplace_back
#define fi first
#define se second
// Range-for over v, binding each element to x by mutable reference.
#define each(x, v) for (auto &x : v)
// Expands to the two arguments "(v).begin(), (v).end()".
#define all(v) (v).begin(), (v).end()
// Size of v converted to int.
#define sz(v) ((int)(v).size())
// memset over sizeof(a) bytes; presumably intended for true arrays only
// (sizeof of a pointer would give the pointer size).
#define mem(a, val) memset(a, val, sizeof(a))
// Declare the listed int variables, then read them with in().
#define ini(...)   \
  int __VA_ARGS__; \
  in(__VA_ARGS__)
// Declare the listed long long variables, then read them with in().
#define inl(...)         \
  long long __VA_ARGS__; \
  in(__VA_ARGS__)
// Declare the listed string variables, then read them with in().
#define ins(...)      \
  string __VA_ARGS__; \
  in(__VA_ARGS__)
// Declare the listed char variables, then read them with in().
#define inc(...)    \
  char __VA_ARGS__; \
  in(__VA_ARGS__)
// Read s[i], t[i] for every index of s (t is indexed over the same range).
#define in2(s, t)                           \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i]);                         \
  }
// Read s[i], t[i], u[i] for every index of s.
#define in3(s, t, u)                        \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i]);                   \
  }
// Read s[i], t[i], u[i], v[i] for every index of s.
#define in4(s, t, u, v)                     \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i], v[i]);             \
  }
// Counting loops; the index is always a long long.
//   rep  : i = 0 .. N-1        repr : i = N-1 .. 0
//   rep1 : i = 1 .. N          repr1: i = N .. 1
//   reg  : i = a .. b-1
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
// Print the arguments with out() and return from the enclosing void function.
#define die(...)      \
  do {                \
    out(__VA_ARGS__); \
    return;           \
  } while (0)
// The whole template relies on unqualified standard-library names.
using namespace std;
// Shorthand type aliases used throughout the template.
using ll = long long;
template <class T>
using V = vector<T>;
using vi = vector<int>;
using vl = vector<long long>;
using vvi = vector<vector<int>>;
using vd = V<double>;
using vs = V<string>;
using vvl = vector<vector<long long>>;
using P = pair<long long, long long>;
using vp = vector<P>;
using pii = pair<int, int>;
using vpi = vector<pair<int, int>>;
// Sentinel "infinity" values; the debug printers render these as "inf".
constexpr int inf = 1001001001;
constexpr long long infLL = (1LL << 61) - 1;
// Lowers x to y when y is strictly smaller.
// Returns true exactly when x was changed.
template <typename T, typename U>
inline bool amin(T &x, U y) {
  if (!(y < x)) return false;
  x = y;
  return true;
}
// Raises x to y when y is strictly larger.
// Returns true exactly when x was changed.
template <typename T, typename U>
inline bool amax(T &x, U y) {
  if (!(x < y)) return false;
  x = y;
  return true;
}
// Writes a pair as "first second" (single space, no trailing newline).
template <typename T, typename U>
ostream &operator<<(ostream &os, const pair<T, U> &p) {
  return os << p.first << " " << p.second;
}
// Reads a pair as two consecutive extractions: first, then second.
template <typename T, typename U>
istream &operator>>(istream &is, pair<T, U> &p) {
  is >> p.first;
  is >> p.second;
  return is;
}
// Writes the elements of v separated by single spaces (nothing for an empty
// vector, no trailing separator or newline).
template <typename T>
ostream &operator<<(ostream &os, const vector<T> &v) {
  bool first = true;
  for (const auto &elem : v) {
    os << (first ? "" : " ") << elem;
    first = false;
  }
  return os;
}
// Fills the existing elements of v from the stream, front to back.
// The vector is not resized: exactly v.size() values are extracted.
template <typename T>
istream &operator>>(istream &is, vector<T> &v) {
  for (auto it = v.begin(); it != v.end(); ++it) is >> *it;
  return is;
}
// Reading nothing is a no-op.
void in() {}
// Reads every argument from cin, left to right.
template <typename T, class... U>
void in(T &t, U &... u) {
  cin >> t;
  (cin >> ... >> u);
}
// With no arguments: prints just the line terminator.
void out() { cout << "\n"; }
// Prints every argument to cout separated by single spaces, then "\n".
template <typename T, class... U>
void out(const T &t, const U &... u) {
  cout << t;
  ((cout << " " << u), ...);
  out();
}

// Debug tracing helpers. Everything below is compiled only when NyaanDebug is
// defined; otherwise trc/trca/trcc expand to nothing (see the #else branch).
#ifdef NyaanDebug
// trc(exprs...): print the expression text followed by the values to cerr.
#define trc(...)                   \
  do {                             \
    cerr << #__VA_ARGS__ << " = "; \
    dbg_out(__VA_ARGS__);          \
  } while (0)
// trca(v, N): print the first N elements of the indexable object v.
#define trca(v, N)       \
  do {                   \
    cerr << #v << " = "; \
    array_out(v, N);     \
  } while (0)
// trcc(v): print every element of the iterable v inside braces.
#define trcc(v)                             \
  do {                                      \
    cerr << #v << " = {";                   \
    each(x, v) { cerr << " " << x << ","; } \
    cerr << "}" << endl;                    \
  } while (0)
// Generic fallback: stream the value to cerr as-is.
template <typename T>
void _cout(const T &c) {
  cerr << c;
}
// Integer overloads: the sentinel values 1001001001 and (1LL << 61) - 1 (and
// their negations, for signed types) are shown as "inf" / "-inf".
void _cout(const int &c) {
  if (c == 1001001001)
    cerr << "inf";
  else if (c == -1001001001)
    cerr << "-inf";
  else
    cerr << c;
}
void _cout(const unsigned int &c) {
  if (c == 1001001001)
    cerr << "inf";
  else
    cerr << c;
}
void _cout(const long long &c) {
  if (c == 1001001001 || c == (1LL << 61) - 1)
    cerr << "inf";
  else if (c == -1001001001 || c == -((1LL << 61) - 1))
    cerr << "-inf";
  else
    cerr << c;
}
void _cout(const unsigned long long &c) {
  if (c == 1001001001 || c == (1LL << 61) - 1)
    cerr << "inf";
  else
    cerr << c;
}
// Pair: printed as "{ first, second } ".
template <typename T, typename U>
void _cout(const pair<T, U> &p) {
  cerr << "{ ";
  _cout(p.fi);
  cerr << ", ";
  _cout(p.se);
  cerr << " } ";
}
// Vector: printed as "{ e0, e1, ... } ", each element through _cout.
template <typename T>
void _cout(const vector<T> &v) {
  int s = v.size();
  cerr << "{ ";
  for (int i = 0; i < s; i++) {
    cerr << (i ? ", " : "");
    _cout(v[i]);
  }
  cerr << " } ";
}
// Vector of vectors: one inner vector per line, wrapped in "[ ... ] ".
template <typename T>
void _cout(const vector<vector<T>> &v) {
  cerr << "[ ";
  for (const auto &x : v) {
    cerr << endl;
    _cout(x);
    cerr << ", ";
  }
  cerr << endl << " ] ";
}
// End of the argument list: finish the line (endl also flushes cerr).
void dbg_out() { cerr << endl; }
// Print all arguments separated by ", " and finish the line.
template <typename T, class... U>
void dbg_out(const T &t, const U &... u) {
  _cout(t);
  if (sizeof...(u)) cerr << ", ";
  dbg_out(u...);
}
// Print the first s elements of v as "{ ... } " followed by a newline.
template <typename T>
void array_out(const T &v, int s) {
  cerr << "{ ";
  for (int i = 0; i < s; i++) {
    cerr << (i ? ", " : "");
    _cout(v[i]);
  }
  cerr << " } " << endl;
}
// Print the first H rows of v, W elements each, using the 1-D overload.
template <typename T>
void array_out(const T &v, int H, int W) {
  cerr << "[ ";
  for (int i = 0; i < H; i++) {
    cerr << (i ? ", " : "");
    array_out(v[i], W);
  }
  cerr << " ] " << endl;
}
#else
// Release build: tracing macros compile away completely.
#define trc(...)
#define trca(...)
#define trcc(...)
#endif

// Number of set bits in a.
inline int popcnt(unsigned long long a) {
  const int ones = __builtin_popcountll(a);
  return ones;
}
// Index of the lowest set bit of a. The underlying builtin is undefined for
// a == 0, so callers must pass a non-zero value.
inline int lsb(unsigned long long a) {
  const int trailing_zeros = __builtin_ctzll(a);
  return trailing_zeros;
}
// Index of the highest set bit of a. The underlying builtin is undefined for
// a == 0, so callers must pass a non-zero value.
inline int msb(unsigned long long a) {
  const int leading_zeros = __builtin_clzll(a);
  return 63 - leading_zeros;
}
// Returns bit i of a (0 or 1).
template <typename T>
inline int getbit(T a, int i) {
  const auto shifted = a >> i;
  return shifted & 1;
}
// Sets bit i of a in place.
template <typename T>
inline void setbit(T &a, int i) {
  const long long mask = 1LL << i;
  a |= mask;
}
// Clears bit i of a in place.
template <typename T>
inline void delbit(T &a, int i) {
  const long long keep = ~(1LL << i);
  a &= keep;
}
// Index of the first element of the sorted vector v that is not less than a
// (v.size() when there is none).
template <typename T>
int lb(const vector<T> &v, const T &a) {
  const auto pos = lower_bound(v.begin(), v.end(), a);
  return pos - v.begin();
}
// Index of the first element of the sorted vector v that is greater than a
// (v.size() when there is none).
template <typename T>
int ub(const vector<T> &v, const T &a) {
  const auto pos = upper_bound(v.begin(), v.end(), a);
  return pos - v.begin();
}
// Half-open range test: returns 1 when a <= x < b, otherwise 0.
template <typename T>
int btw(T a, T x, T b) {
  if (!(a <= x)) return 0;
  return (x < b) ? 1 : 0;
}
// Integer division rounded up, computed as (a + b - 1) / b. The formula only
// rounds towards +infinity for non-negative a and positive b.
template <typename T, typename U>
T ceil(T a, U b) {
  const auto numerator = a + b - 1;
  return numerator / b;
}
// Returns 10^n by binary exponentiation.
//
// The base is squared only while exponent bits remain. Squaring it once more
// after the final multiply overflowed long long for n in 16..18, which is
// undefined behaviour and made e.g. `constexpr auto x = TEN(18);` ill-formed.
// A negative n yields 1; the previous `while (n)` loop never terminated for
// negative input. Results for n > 18 do not fit in long long.
constexpr long long TEN(int n) {
  long long ret = 1, x = 10;
  while (n > 0) {
    if (n & 1) ret *= x;
    n >>= 1;
    if (n > 0) x *= x;
  }
  return ret;
}
// Prefix sums: returns ret of size v.size() + 1 where ret[0] is a
// value-initialised T and ret[i + 1] = ret[i] + v[i].
template <typename T>
vector<T> mkrui(const vector<T> &v) {
  vector<T> ret(v.size() + 1);
  int pos = 0;
  for (const T &value : v) {
    ret[pos + 1] = ret[pos] + value;
    ++pos;
  }
  return ret;
}
// Returns a sorted copy of v with adjacent duplicates removed.
template <typename T>
vector<T> mkuni(const vector<T> &v) {
  vector<T> ret(v.begin(), v.end());
  sort(begin(ret), end(ret));
  const auto tail = unique(begin(ret), end(ret));
  ret.erase(tail, end(ret));
  return ret;
}
// Returns the indices 0..N-1 sorted with comparator f.
template <typename F>
vector<int> mkord(int N, F f) {
  vector<int> ord(N);
  for (int k = 0; k < N; ++k) ord[k] = k;
  sort(ord.begin(), ord.end(), f);
  return ord;
}
// Returns the vector {0, 1, ..., N - 1} with element type T.
template <typename T = int>
vector<T> mkiota(int N) {
  vector<T> ret(N);
  for (int k = 0; k < N; ++k) ret[k] = k;
  return ret;
}
// Inverse of an index mapping: the result satisfies inv[v[i]] = i. Every v[i]
// must be a valid index into a vector of size v.size(); this is not checked.
template <typename T>
vector<int> mkinv(vector<T> &v) {
  const int count = (int)v.size();
  vector<int> inv(count);
  int position = 0;
  while (position < count) {
    inv[v[position]] = position;
    ++position;
  }
  return inv;
}

// Configures the standard streams once, when the global instance below is
// constructed: cin is untied from cout, C stdio synchronisation is turned off,
// and cout / cerr print floating-point values in fixed notation with 15 / 7
// digits after the decimal point.
struct IoSetupNya {
  IoSetupNya() {
    ios::sync_with_stdio(false);
    cin.tie(nullptr);
    cout.setf(ios::fixed, ios::floatfield);
    cout.precision(15);
    cerr.setf(ios::fixed, ios::floatfield);
    cerr.precision(7);
  }
} iosetupnya;

// The actual program logic lives in solve(), defined at the end of the file.
void solve();
// Entry point: runs solve() once.
int main() { solve(); }

#pragma endregion
using namespace std;

// Buffered reader/writer on top of fread/fwrite.
//
// Input is consumed from ibuf[pil, pir); output is staged in obuf[0, por) and
// written out by flush(), which is also registered with atexit().
namespace fastio {
static constexpr int SZ = 1 << 17;
char ibuf[SZ], obuf[SZ];
int pil = 0, pir = 0, por = 0;

// Lookup table with the four-digit, zero-padded decimal text of 0..9999
// (entry i occupies num[4 * i .. 4 * i + 3]); built at compile time.
struct Pre {
  char num[40000];
  constexpr Pre() : num() {
    for (int i = 0; i < 10000; i++) {
      int n = i;
      for (int j = 3; j >= 0; j--) {
        num[i * 4 + j] = n % 10 + '0';
        n /= 10;
      }
    }
  }
} constexpr pre;

// Moves the unread tail of ibuf to the front and refills the rest from stdin.
inline void load() {
  // The cursor can run past the end of the data when a token is terminated by
  // end-of-input; treat that as "nothing left" instead of a negative length.
  const int remaining = pir > pil ? pir - pil : 0;
  // Source and destination may overlap, so memmove (not memcpy) is required.
  memmove(ibuf, ibuf + pil, remaining);
  pir = remaining +
        static_cast<int>(fread(ibuf + remaining, 1, SZ - remaining, stdin));
  pil = 0;
}
// Writes the staged output to stdout and empties the output buffer.
inline void flush() {
  fwrite(obuf, 1, por, stdout);
  por = 0;
}

// Reads the next raw byte (no whitespace skipping), refilling the buffer when
// it has been fully consumed.
inline void rd(char& c) {
  if (pil >= pir) load();
  c = ibuf[pil++];
}
// Reads a decimal integer with an optional leading '-'. Bytes below '-' are
// skipped first; the first byte below '0' after the digits ends the number and
// is consumed. The buffer is refilled when fewer than 32 unread bytes remain,
// so a token is never split across a refill.
template <typename T>
inline void rd(T& x) {
  if (pil + 32 > pir) load();
  char c;
  do
    c = ibuf[pil++];
  while (c < '-');
  bool minus = 0;
  if (c == '-') {
    minus = 1;
    c = ibuf[pil++];
  }
  x = 0;
  while (c >= '0') {
    x = x * 10 + (c & 15);
    c = ibuf[pil++];
  }
  if (minus) x = -x;
}

// Appends one raw byte. Does not check for space itself; the integer overload
// flushes early enough to leave room for separators written in between.
inline void wt(char c) { obuf[por++] = c; }
// Appends the decimal text of an integer, four digits at a time via the
// lookup table.
template <typename T>
inline void wt(T x) {
  if (por > SZ - 32) flush();
  if (!x) {
    obuf[por++] = '0';
    return;
  }
  if (x < 0) {
    obuf[por++] = '-';
    x = -x;
  }
  // Low-order groups of four digits are collected right-to-left in buf.
  int i = 12;
  char buf[16];
  while (x >= 10000) {
    memcpy(buf + i, pre.num + (x % 10000) * 4, 4);
    x /= 10000;
    i -= 4;
  }
  // The leading group is written without zero padding (d = its digit count).
  int d = x < 100 ? (x < 10 ? 1 : 2) : (x < 1000 ? 3 : 4);
  memcpy(obuf + por, pre.num + x * 4 + 4 - d, d);
  por += d;
  memcpy(obuf + por, buf + i + 4, 12 - i);
  por += 12 - i;
}

// Registers flush() to run at program exit so staged output is not lost.
struct Dummy {
  Dummy() { atexit(flush); }
} dummy;

}  // namespace fastio
using fastio::rd;
using fastio::wt;
using namespace std;

// Modular integer for an odd modulus below 2^30, stored in Montgomery form.
//
// The stored word `a` is kept only partially reduced (add/sub fold by 2 * mod,
// comparisons and get() perform the final reduction), which is why the
// modulus must stay below 2^30.
template <uint32_t mod>
struct LazyMontgomeryModInt {
  using mint = LazyMontgomeryModInt;
  using i32 = int32_t;
  using u32 = uint32_t;
  using u64 = uint64_t;

  // Inverse of mod modulo 2^32, found by repeated refinement (the
  // static_assert below checks r * mod == 1 in 32-bit arithmetic).
  static constexpr u32 get_r() {
    u32 inv = mod;
    i32 rounds = 4;
    while (rounds-- > 0) inv *= 2 - mod * inv;
    return inv;
  }

  static constexpr u32 r = get_r();
  // 2^64 mod `mod`: multiplying by it and reducing converts into Montgomery
  // form.
  static constexpr u32 n2 = -u64(mod) % mod;
  static_assert(r * mod == 1, "invalid, r * mod != 1");
  static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
  static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");

  // Montgomery representation of the value.
  u32 a;

  constexpr LazyMontgomeryModInt() : a(0) {}
  // Accepts any signed 64-bit value; negative inputs are wrapped into range
  // before conversion.
  constexpr LazyMontgomeryModInt(const int64_t &b)
      : a(reduce(u64(b % mod + mod) * n2)) {}

  // Montgomery reduction: returns a value congruent to b / 2^32 modulo mod.
  static constexpr u32 reduce(const u64 &b) {
    const u32 q = u32(b) * u32(-r);
    return (b + u64(q) * mod) >> 32;
  }

  constexpr mint &operator+=(const mint &b) {
    a += b.a - 2 * mod;
    // A "negative" 32-bit result means the sum was below 2 * mod: undo.
    if (i32(a) < 0) a += 2 * mod;
    return *this;
  }

  constexpr mint &operator-=(const mint &b) {
    a -= b.a;
    if (i32(a) < 0) a += 2 * mod;
    return *this;
  }

  constexpr mint &operator*=(const mint &b) {
    const u64 wide = u64(a) * b.a;
    a = reduce(wide);
    return *this;
  }

  constexpr mint &operator/=(const mint &b) {
    const mint inv = b.inverse();
    *this *= inv;
    return *this;
  }

  constexpr mint operator+(const mint &b) const {
    mint sum(*this);
    sum += b;
    return sum;
  }
  constexpr mint operator-(const mint &b) const {
    mint diff(*this);
    diff -= b;
    return diff;
  }
  constexpr mint operator*(const mint &b) const {
    mint prod(*this);
    prod *= b;
    return prod;
  }
  constexpr mint operator/(const mint &b) const {
    mint quot(*this);
    quot /= b;
    return quot;
  }
  // Comparisons fold both stored words below mod first, because the lazy
  // representation admits two encodings per residue.
  constexpr bool operator==(const mint &b) const {
    const u32 lhs = a >= mod ? a - mod : a;
    const u32 rhs = b.a >= mod ? b.a - mod : b.a;
    return lhs == rhs;
  }
  constexpr bool operator!=(const mint &b) const {
    const u32 lhs = a >= mod ? a - mod : a;
    const u32 rhs = b.a >= mod ? b.a - mod : b.a;
    return lhs != rhs;
  }
  constexpr mint operator-() const {
    mint negated;
    negated -= *this;
    return negated;
  }

  // Binary exponentiation.
  constexpr mint pow(u64 n) const {
    mint ret(1), mul(*this);
    for (; n > 0; n >>= 1) {
      if (n & 1) ret *= mul;
      mul *= mul;
    }
    return ret;
  }

  // Inverse computed as this^(mod - 2); the result is only an inverse when
  // mod is prime (the inverse of zero comes out as zero).
  constexpr mint inverse() const { return pow(mod - 2); }

  friend ostream &operator<<(ostream &os, const mint &b) {
    os << b.get();
    return os;
  }

  friend istream &operator>>(istream &is, mint &b) {
    int64_t raw;
    is >> raw;
    b = LazyMontgomeryModInt<mod>(raw);
    return is;
  }

  // Canonical representative in [0, mod).
  constexpr u32 get() const {
    const u32 ret = reduce(a);
    if (ret >= mod) return ret - mod;
    return ret;
  }

  static constexpr u32 get_mod() { return mod; }
};
using namespace std;

using namespace std;

// Lane-wise low 32 bits of the products a[i] * b[i] (four 32-bit lanes).
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("sse4.2"))) __attribute__((always_inline)) inline __m128i
my128_mullo_epu32(const __m128i &a, const __m128i &b) {
  return _mm_mullo_epi32(a, b);
}

// Lane-wise high 32 bits of the unsigned 64-bit products a[i] * b[i].
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("sse4.2"))) __attribute__((always_inline)) inline __m128i
my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
  // _mm_mul_epu32 multiplies only the even lanes, so the odd lanes are first
  // copied into even positions (shuffle 0xF5 selects lanes 1,1,3,3).
  __m128i a13 = _mm_shuffle_epi32(a, 0xF5);
  __m128i b13 = _mm_shuffle_epi32(b, 0xF5);
  __m128i prod02 = _mm_mul_epu32(a, b);
  __m128i prod13 = _mm_mul_epu32(a13, b13);
  // Interleave the four 64-bit products and keep their upper halves in lane
  // order 0,1,2,3.
  __m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13),
                                    _mm_unpackhi_epi32(prod02, prod13));
  return prod;
}

// Lane-wise Montgomery product of four 32-bit lanes:
//   hi(a * b) + m1 - hi(lo(lo(a * b) * r) * m1)
// where m1 holds the modulus in every lane and r is expected to hold the
// inverse of the modulus modulo 2^32 (the `r` constant of
// LazyMontgomeryModInt). The result is congruent to a * b / 2^32 and is not
// fully reduced.
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("sse4.2"))) __attribute__((always_inline)) inline __m128i
montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
                   const __m128i &m1) {
  return _mm_sub_epi32(
      _mm_add_epi32(my128_mulhi_epu32(a, b), m1),
      my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1));
}

// Lane-wise lazy modular addition: computes a + b - m2 and adds m2 back in
// every lane where that difference is negative as a signed 32-bit value.
// m2 holds twice the modulus in every lane, m0 holds zero.
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("sse4.2"))) __attribute__((always_inline)) inline __m128i
montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
                   const __m128i &m0) {
  __m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2);
  // cmpgt(0, ret) is an all-ones mask exactly in the negative lanes.
  return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
}

// Lane-wise lazy modular subtraction: computes a - b and adds m2 in every
// lane where the difference is negative as a signed 32-bit value.
// m2 holds twice the modulus in every lane, m0 holds zero.
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("sse4.2"))) __attribute__((always_inline)) inline __m128i
montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
                   const __m128i &m0) {
  __m128i ret = _mm_sub_epi32(a, b);
  // cmpgt(0, ret) is an all-ones mask exactly in the negative lanes.
  return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
}

// Lane-wise low 32 bits of the products a[i] * b[i] (eight 32-bit lanes).
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("avx2"))) __attribute__((always_inline)) inline __m256i
my256_mullo_epu32(const __m256i &a, const __m256i &b) {
  return _mm256_mullo_epi32(a, b);
}

// Lane-wise high 32 bits of the unsigned 64-bit products a[i] * b[i]
// (eight 32-bit lanes).
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("avx2"))) __attribute__((always_inline)) inline __m256i
my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
  // _mm256_mul_epu32 multiplies only the even lanes, so the odd lanes are
  // first copied into even positions (shuffle 0xF5 selects lanes 1,1,3,3 of
  // each 128-bit half).
  __m256i a13 = _mm256_shuffle_epi32(a, 0xF5);
  __m256i b13 = _mm256_shuffle_epi32(b, 0xF5);
  __m256i prod02 = _mm256_mul_epu32(a, b);
  __m256i prod13 = _mm256_mul_epu32(a13, b13);
  // Interleave the 64-bit products and keep their upper halves; the unpack
  // instructions work within each 128-bit half, which preserves lane order.
  __m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13),
                                       _mm256_unpackhi_epi32(prod02, prod13));
  return prod;
}

// Lane-wise Montgomery product of eight 32-bit lanes:
//   hi(a * b) + m1 - hi(lo(lo(a * b) * r) * m1)
// where m1 holds the modulus in every lane and r is expected to hold the
// inverse of the modulus modulo 2^32 (the `r` constant of
// LazyMontgomeryModInt). The result is congruent to a * b / 2^32 and is not
// fully reduced.
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("avx2"))) __attribute__((always_inline)) inline __m256i
montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
                   const __m256i &m1) {
  return _mm256_sub_epi32(
      _mm256_add_epi32(my256_mulhi_epu32(a, b), m1),
      my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1));
}

// Lane-wise lazy modular addition on eight 32-bit lanes: computes a + b - m2
// and adds m2 back in every lane where that difference is negative as a
// signed 32-bit value. m2 holds twice the modulus in every lane, m0 holds
// zero.
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("avx2"))) __attribute__((always_inline)) inline __m256i
montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
                   const __m256i &m0) {
  __m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);
  // cmpgt(0, ret) is an all-ones mask exactly in the negative lanes.
  return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
                          ret);
}

// Lane-wise lazy modular subtraction on eight 32-bit lanes: computes a - b
// and adds m2 in every lane where the difference is negative as a signed
// 32-bit value. m2 holds twice the modulus in every lane, m0 holds zero.
// Declared `inline` so that always_inline is honoured without the
// "-Wattributes: might not be inlinable" warning.
__attribute__((target("avx2"))) __attribute__((always_inline)) inline __m256i
montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
                   const __m256i &m0) {
  __m256i ret = _mm256_sub_epi32(a, b);
  // cmpgt(0, ret) is an all-ones mask exactly in the negative lanes.
  return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
                          ret);
}
// AVX2-accelerated Gaussian elimination over a Montgomery modint type.
//
// All routines share one static 4096 x 4096 scratch matrix, so they are not
// reentrant and the contents are overwritten by every call. `mint` must be a
// 32-bit Montgomery type exposing `.a`, `r`, `get_mod()`, `get()`,
// `inverse()` and operator* (LazyMontgomeryModInt in this file).
namespace Gauss {
// 64-byte aligned so that every row chunk starting at a multiple of eight
// columns can be accessed with aligned 256-bit loads and stores.
uint32_t a_buf_[4096][4096] __attribute__((aligned(64)));

// Eliminates a copy of `m` inside the shared scratch matrix.
//
// LinearEquation == false: forward elimination only (upper-triangular form);
//   returns {0, 0} as soon as a column has no pivot, i.e. the matrix is
//   singular.
// LinearEquation == true: the last column is treated as the right-hand side,
//   pivot rows are scaled to 1 and eliminated from every other row (reduced
//   row echelon form); columns without a pivot are skipped.
//
// return value: (rank, (-1) ^ (number of swap time))
// An empty matrix yields {0, 1}.
template <typename mint>
__attribute__((target("avx2"))) pair<int, int> GaussianElimination(
    const vector<vector<mint>> &m, int LinearEquation = false) {
  mint(&a)[4096][4096] = *reinterpret_cast<mint(*)[4096][4096]>(a_buf_);
  if (m.empty()) return {0, 1};
  int H = m.size(), W = m[0].size(), rank = 0;
  // The scratch buffer has a fixed capacity.
  assert(H <= 4096 && W <= 4096);
  int det = 1;
  for (int i = 0; i < H; i++)
    for (int j = 0; j < W; j++) a[i][j].a = m[i][j].a;

  __m256i r = _mm256_set1_epi32(mint::r);
  __m256i m0 = _mm256_set1_epi32(0);
  __m256i m1 = _mm256_set1_epi32(mint::get_mod());
  __m256i m2 = _mm256_set1_epi32(mint::get_mod() << 1);

  for (int j = 0; j < (LinearEquation ? (W - 1) : W); j++) {
    // find basis
    if (rank == H) break;
    int idx = -1;
    for (int i = rank; i < H; i++) {
      if (a[i][j].get() != 0) {
        idx = i;
        break;
      }
    }
    if (idx == -1) {
      if (LinearEquation)
        continue;
      else
        return {0, 0};
    }

    // swap
    if (rank != idx) {
      det = -det;
      // Columns left of j are already zero in both rows.
      for (int l = j; l < W; l++) swap(a[rank][l], a[idx][l]);
    }

    // normalize
    if (LinearEquation) {
      if (a[rank][j].get() != 1) {
        mint coeff = a[rank][j].inverse();
        __m256i COEFF = _mm256_set1_epi32(coeff.a);
        // Start at the aligned chunk containing column j; the chunk may run
        // past W, which stays inside the 4096-wide scratch row.
        for (int i = j / 8 * 8; i < W; i += 8) {
          __m256i R = _mm256_load_si256((__m256i *)(a[rank] + i));
          __m256i RmulC = montgomery_mul_256(R, COEFF, r, m1);
          _mm256_store_si256((__m256i *)(a[rank] + i), RmulC);
        }
      }
    }

    // elimination
    // The pivot does not change while rows are eliminated, so its inverse is
    // computed once per column instead of once per row.
    const mint pivot_inv = a[rank][j].inverse();
    for (int k = (LinearEquation ? 0 : rank + 1); k < H; k++) {
      if (k == rank) continue;
      // Test the entry in the pivot COLUMN j. Indexing with `rank` here
      // skipped rows incorrectly once an earlier column had no pivot
      // (rank < j), which is possible in LinearEquation mode.
      if (a[k][j].get() != 0) {
        mint coeff = a[k][j] * pivot_inv;
        __m256i COEFF = _mm256_set1_epi32(coeff.a);
        for (int i = j / 8 * 8; i < W; i += 8) {
          __m256i R = _mm256_load_si256((__m256i *)(a[rank] + i));
          __m256i K = _mm256_load_si256((__m256i *)(a[k] + i));
          __m256i RmulC = montgomery_mul_256(R, COEFF, r, m1);
          __m256i KmnsR = montgomery_sub_256(K, RmulC, m2, m0);
          _mm256_store_si256((__m256i *)(a[k] + i), KmnsR);
        }
      }
    }
    rank++;
  }
  return {rank, det};
}

// calculate determinant
// Returns 0 for a singular matrix and 1 for an empty one; otherwise the sign
// from the row swaps times the product of the diagonal of the eliminated
// scratch matrix.
template <typename mint>
mint determinant(const vector<vector<mint>> &mat) {
  mint(&a)[4096][4096] = *reinterpret_cast<mint(*)[4096][4096]>(a_buf_);
  auto p = GaussianElimination(mat);
  if (p.first != (int)mat.size()) return mint(0);
  mint det = p.second;
  for (int i = 0; i < (int)mat.size(); i++) det *= a[i][i];
  return det;
}

// return V<V<mint>>
// 0 column ... one of solutions
// 1 ~ (W - rank) column ... bases
// if not exist, return empty vector
template <typename mint>
vector<vector<mint>> LinearEquation(vector<vector<mint>> A, vector<mint> B) {
  int H = A.size(), W = H ? (int)A[0].size() : 0;
  for (int i = 0; i < H; i++) A[i].push_back(B[i]);

  auto p = GaussianElimination(A, true);

  mint(&a)[4096][4096] = *reinterpret_cast<mint(*)[4096][4096]>(a_buf_);
  int rank = p.first;

  // check if solutions exist
  for (int i = rank; i < H; ++i)
    if (a[i][W] != 0) return vector<vector<mint>>{};

  vector<vector<mint>> res(1, vector<mint>(W));
  vector<int> pivot(W, -1);
  // Row i of the reduced matrix has its pivot at the first non-zero column;
  // that unknown takes the value of the right-hand side.
  for (int i = 0, j = 0; i < rank; ++i) {
    while (a[i][j] == 0) ++j;
    res[0][j] = a[i][W], pivot[j] = i;
  }
  // Every free column j contributes one basis vector of the kernel.
  for (int j = 0; j < W; ++j) {
    if (pivot[j] == -1) {
      vector<mint> x(W);
      x[j] = -1;
      for (int k = 0; k < j; ++k)
        if (pivot[k] != -1) x[k] = a[pivot[k]][j];
      res.push_back(x);
    }
  }
  return res;
}

}  // namespace Gauss
using namespace Gauss;



// Modular integer type used by the solver (modulus 998244353) and a vector of
// such values.
using mint = LazyMontgomeryModInt<998244353>;
using vm = vector<mint>;





// In-place Gaussian elimination on `a` (scalar version, works on vectors).
//
// pivot_end:   only columns [0, pivot_end) are searched for pivots; -1 means
//              every column.
// diagonalize: when true, pivot rows are scaled to 1 and eliminated from all
//              other rows (reduced row echelon form); otherwise only the rows
//              below the pivot are eliminated.
//
// Returns (rank, det), where det is the product of the pivots with the sign
// of the row swaps, forced to 0 as soon as a searched column has no pivot.
// An empty matrix returns (0, 1).
template <typename mint>
std::pair<int, mint> GaussElimination(vector<vector<mint>> &a,
                                      int pivot_end = -1,
                                      bool diagonalize = false) {
  // a[0] must not be touched when there are no rows.
  if (a.empty()) return make_pair(0, mint(1));
  int H = a.size(), W = a[0].size();
  int rank = 0, je = pivot_end;
  if (je == -1) je = W;
  mint det = 1;
  for (int j = 0; j < je; j++) {
    int idx = -1;
    for (int i = rank; i < H; i++) {
      if (a[i][j] != mint(0)) {
        idx = i;
        break;
      }
    }
    if (idx == -1) {
      det = 0;
      continue;
    }
    if (rank != idx) {
      det = -det;
      swap(a[rank], a[idx]);
    }
    det *= a[rank][j];
    if (diagonalize && a[rank][j] != mint(1)) {
      mint coeff = a[rank][j].inverse();
      for (int k = j; k < W; k++) a[rank][k] *= coeff;
    }
    // The pivot is fixed from here on, so invert it once per column rather
    // than once per eliminated row.
    const mint pivot_inv = a[rank][j].inverse();
    int is = diagonalize ? 0 : rank + 1;
    for (int i = is; i < H; i++) {
      if (i == rank) continue;
      if (a[i][j] != mint(0)) {
        const mint coeff = a[i][j] * pivot_inv;
        for (int k = j; k < W; k++) a[i][k] -= a[rank][k] * coeff;
      }
    }
    rank++;
  }
  return make_pair(rank, det);
}
#line 4 "matrix/inverse-matrix.hpp"

// Inverts the square matrix `a` by reducing the augmented matrix [a | I] with
// GaussElimination. Returns an empty matrix when `a` is singular.
// Asserts that `a` is non-empty and that its first row has N entries.
template <typename mint>
vector<vector<mint>> inverse_matrix(const vector<vector<mint>>& a) {
  const int N = a.size();
  assert(N > 0);
  assert(N == (int)a[0].size());

  // Left half: a copy of a. Right half: the identity.
  vector<vector<mint>> m(N, vector<mint>(2 * N));
  for (int row = 0; row < N; ++row) {
    for (size_t col = 0; col < a[row].size(); ++col) m[row][col] = a[row][col];
    m[row][N + row] = 1;
  }

  const auto reduced = GaussElimination(m, N, true);
  if (reduced.first != N) return {};

  // After full reduction the right half holds the inverse.
  vector<vector<mint>> b(N);
  for (int row = 0; row < N; ++row) {
    b[row].assign(m[row].begin() + N, m[row].end());
  }
  return b;
}



void solve() {
  int N;
  rd(N);
  V<vm> a(N, vm(N)),b(N,vm(N));
  int buf;
  rep(i, N) rep(j, N) {
    rd(buf);
    a[i][j] = buf;
  }
  rep(i, N) rep(j, N) {
    rd(buf);
    b[i][j] = buf;
  }

  vector<vector<mint>> calc(N+1,vector<mint>(N+1));
  vector<mint> res(N+1);
  rep(x,N+1){
    calc[x][0]=1;
    rep(i,N)calc[x][i+1]=calc[x][i]*(x+1);
    rep(i,N)rep(j,N)a[i][j]+=b[i][j];
    res[x]=Gauss::determinant(a).get();
  }
  auto tmp=inverse_matrix(calc);
  rep(i,N+1){
    mint ans=0;
    rep(j,N+1)ans+=tmp[i][j]*res[j];
    cout<<ans<<'\n';
  }
}

yukicoder

結果

コンパイルメッセージ

ソースコード