結果
| 問題 |
No.1783 Remix Sum
|
| コンテスト | |
| ユーザー |
|
| 提出日時 | 2021-12-12 03:11:46 |
| 言語 | C++17 (gcc 13.3.0 + boost 1.87.0) |
| 結果 |
MLE
|
| 実行時間 | - |
| コード長 | 58,794 bytes |
| コンパイル時間 | 3,875 ms |
| コンパイル使用メモリ | 333,180 KB |
| 最終ジャッジ日時 | 2025-01-26 08:42:04 |
|
ジャッジサーバーID (参考情報) |
judge3 / judge2 |
(要ログイン)
| ファイルパターン | 結果 |
|---|---|
| sample | AC * 4 |
| other | AC * 34 WA * 15 TLE * 22 MLE * 5 |
ソースコード
#include <bits/stdc++.h>
#include <immintrin.h>
/**
* ライブラリはNyaanさんのライブラリを拝借しています
* https://nyaannyaan.github.io/library/ntt/multivariate-multiplication.hpp
*/
#pragma region Header
// Short names for the built-in arithmetic types used throughout this file.
using i32 = int;
using u32 = unsigned int;
using i64 = long long;
using u64 = unsigned long long;
// 128-bit integer types (compiler extensions, not standard C++).
using i128 = __int128_t;
using u128 = __uint128_t;
using f64 = double;
using f80 = long double;
// Compiler-extension floating type.
using f128 = __float128;
/// Literal suffix `_i32`: converts an integer literal to `i32`.
constexpr i32 operator"" _i32(u64 v)
{
    return static_cast<i32>(v);
}
constexpr i32 operator"" _u32(u64 v)
{
return v;
}
/// Literal suffix `_i64`: converts an integer literal to `i64`.
constexpr i64 operator"" _i64(u64 v)
{
    return static_cast<i64>(v);
}
/// Literal suffix `_u64`: yields the integer literal as a `u64`.
constexpr u64 operator"" _u64(u64 v)
{
    return u64{v};
}
/// Literal suffix `_f64`: converts a floating literal to `f64`.
constexpr f64 operator"" _f64(f80 v)
{
    return static_cast<f64>(v);
}
/// Literal suffix `_f80`: yields the floating literal as an `f80`.
constexpr f80 operator"" _f80(f80 v)
{
    return f80{v};
}
// Short aliases for standard-library stream and string types.
using Istream = std::istream;
using Ostream = std::ostream;
using Str = std::string;
// Comparator aliases (std::less / std::greater).
template<typename T>
using Lt = std::less<T>;
template<typename T>
using Gt = std::greater<T>;
template<typename T>
using IList = std::initializer_list<T>;
template<int n>
using BSet = std::bitset<n>;
template<typename T1, typename T2>
using Pair = std::pair<T1, T2>;
template<typename... Ts>
using Tup = std::tuple<Ts...>;
template<typename T, int N>
using Arr = std::array<T, N>;
// Container aliases; the parameter packs are forwarded unchanged, so custom
// comparators, hashers and allocators can still be supplied.
template<typename... Ts>
using Deq = std::deque<Ts...>;
template<typename... Ts>
using Set = std::set<Ts...>;
template<typename... Ts>
using MSet = std::multiset<Ts...>;
template<typename... Ts>
using USet = std::unordered_set<Ts...>;
template<typename... Ts>
using UMSet = std::unordered_multiset<Ts...>;
template<typename... Ts>
using Map = std::map<Ts...>;
template<typename... Ts>
using MMap = std::multimap<Ts...>;
template<typename... Ts>
using UMap = std::unordered_map<Ts...>;
template<typename... Ts>
using UMMap = std::unordered_multimap<Ts...>;
template<typename... Ts>
using Vec = std::vector<Ts...>;
template<typename... Ts>
using Stack = std::stack<Ts...>;
template<typename... Ts>
using Queue = std::queue<Ts...>;
// Priority queues: MaxHeap pops the largest element first, MinHeap (ordered
// by std::greater) pops the smallest first.
template<typename T>
using MaxHeap = std::priority_queue<T>;
template<typename T>
using MinHeap = std::priority_queue<T, Vec<T>, Gt<T>>;
// std::chrono duration aliases.
using NSec = std::chrono::nanoseconds;
using USec = std::chrono::microseconds;
using MSec = std::chrono::milliseconds;
using Sec = std::chrono::seconds;
/// Value of std::numeric_limits<T>::min().
template<typename T>
constexpr T LIMMIN = std::numeric_limits<T>::min();
/// Value of std::numeric_limits<T>::max().
template<typename T>
constexpr T LIMMAX = std::numeric_limits<T>::max();
/// A large sentinel, (LIMMAX<T> - 1) / 2, so that adding two of them does
/// not exceed LIMMAX<T>.
template<typename T>
constexpr T INF = (LIMMAX<T> - 1) / 2;
/// Pi as a T.
///
/// The constant is written as a `long double` literal so that
/// `PI<long double>` keeps the extra digits; a plain `double` literal would
/// round to double precision before the conversion to T.
template<typename T>
constexpr T PI = T{3.141592653589793238462643383279502884L};
/// Returns 10 raised to the n-th power as a T, computed recursively.
/// n is expected to be non-negative.
template<typename T = u64>
constexpr T TEN(const int n)
{
    if (n == 0) { return T{1}; }
    return TEN<T>(n - 1) * T{10};
}
/// Writes a signed 128-bit integer to `os` in decimal.
///
/// The magnitude is taken in the unsigned domain, so the most negative
/// value is printed correctly instead of overflowing on negation.
Ostream& operator<<(Ostream& os, i128 v)
{
    const bool minus = (v < 0);
    // Unsigned negation is well defined for every input, including the minimum.
    u128 magnitude = static_cast<u128>(v);
    if (minus) { magnitude = -magnitude; }
    Str ans;
    do {
        ans.push_back(static_cast<char>('0' + static_cast<int>(magnitude % 10)));
        magnitude /= 10;
    } while (magnitude != 0);
    std::reverse(ans.begin(), ans.end());
    return os << (minus ? "-" : "") << ans;
}
/// Writes an unsigned 128-bit integer to `os` in decimal.
Ostream& operator<<(Ostream& os, u128 v)
{
    Str digits;
    // Collect digits least-significant first; zero still yields one digit.
    do {
        digits.push_back('0' + v % 10);
        v /= 10;
    } while (v);
    std::reverse(digits.begin(), digits.end());
    return os << digits;
}
/// Lowers `a` to `b` when `b` is strictly smaller.
/// @return true if `a` was changed.
template<typename T>
bool chmin(T& a, const T& b)
{
    const bool improved = a > b;
    if (improved) { a = b; }
    return improved;
}
/// Raises `a` to `b` when `b` is strictly larger.
/// @return true if `a` was changed.
template<typename T>
bool chmax(T& a, const T& b)
{
    const bool improved = a < b;
    if (improved) { a = b; }
    return improved;
}
/// Integer division rounding toward negative infinity.
/// The divisor is first made positive by negating both operands.
template<typename T>
constexpr T floorDiv(T x, T y)
{
    if (y < T{}) {
        x = -x;
        y = -y;
    }
    if (x >= T{}) { return x / y; }
    // Negative numerator: shift down so that truncation rounds toward -inf.
    return (x - y + 1) / y;
}
/// Integer division rounding toward positive infinity.
/// The divisor is first made positive by negating both operands.
template<typename T>
constexpr T ceilDiv(T x, T y)
{
    if (y < T{}) {
        x = -x;
        y = -y;
    }
    if (x >= T{}) {
        // Non-negative numerator: shift up so that truncation rounds toward +inf.
        return (x + y - 1) / y;
    }
    return x / y;
}
/// Computes v^n modulo `mod` by binary exponentiation.
/// Intermediate products are formed in T before reduction, so T must be
/// wide enough to hold them.
template<typename T, typename I>
constexpr T modPower(T v, I n, T mod)
{
    T acc = 1 % mod;
    while (n > 0) {
        if (n % 2 == 1) {
            acc *= v;
            acc %= mod;
        }
        n >>= 1;
        v *= v;
        v %= mod;
    }
    return acc;
}
/// Computes v^n by binary exponentiation, starting from T(1).
template<typename T, typename I>
constexpr T power(T v, I n)
{
    T result = 1;
    while (n > 0) {
        if (n % 2 == 1) { result *= v; }
        n >>= 1;
        v *= v;
    }
    return result;
}
/// Computes e * v^n by binary exponentiation, where `e` is the starting
/// value (typically the multiplicative identity of T).
template<typename T, typename I>
constexpr T power(T v, I n, const T& e)
{
    T result = e;
    while (n > 0) {
        if (n % 2 == 1) { result *= v; }
        n >>= 1;
        v *= v;
    }
    return result;
}
/// Appends the elements of `vs2` to `vs1`.
///
/// Returns a reference to `vs1` (not a copy of it), so the call no longer
/// duplicates the whole vector. Appending a vector to itself is handled
/// explicitly, because passing iterators into the destination vector to
/// `insert` is not permitted.
template<typename T>
Vec<T>& operator+=(Vec<T>& vs1, const Vec<T>& vs2)
{
    if (&vs1 == &vs2) {
        const auto count = vs1.size();
        // Reserve first so the push_backs below never reallocate.
        vs1.reserve(2 * count);
        std::copy_n(vs1.begin(), count, std::back_inserter(vs1));
    } else {
        vs1.insert(vs1.end(), vs2.begin(), vs2.end());
    }
    return vs1;
}
/// Returns a new vector holding the elements of `vs1` followed by `vs2`.
template<typename T>
Vec<T> operator+(const Vec<T>& vs1, const Vec<T>& vs2)
{
    Vec<T> joined(vs1);
    joined += vs2;
    return joined;
}
/// Assigns `v` to every leaf of a (possibly nested) container.
/// Recursion stops at the first level where V is convertible to the
/// element type; that level is assigned directly.
template<typename Vs, typename V>
void fillAll(Vs& arr, const V& v)
{
    if constexpr (!std::is_convertible_v<V, Vs>) {
        for (auto& inner : arr) {
            fillAll(inner, v);
        }
    } else {
        arr = v;
    }
}
/// Sorts the whole range `vs` in ascending order.
template<typename Vs>
void sortAll(Vs& vs)
{
    auto first = std::begin(vs);
    auto last = std::end(vs);
    std::sort(first, last);
}
/// Sorts the whole range `vs` using the ordering given by `comp`.
template<typename Vs, typename C>
void sortAll(Vs& vs, C comp)
{
    auto first = std::begin(vs);
    auto last = std::end(vs);
    std::sort(first, last, comp);
}
/// Reverses the whole range `vs` in place.
template<typename Vs>
void reverseAll(Vs& vs)
{
    auto first = std::begin(vs);
    auto last = std::end(vs);
    std::reverse(first, last);
}
/// Sums every leaf of a (possibly nested) container as a V.
/// A value convertible to V is returned as-is; anything else is iterated
/// and summed recursively.
template<typename V, typename Vs>
V sumAll(const Vs& vs)
{
    if constexpr (!std::is_convertible_v<Vs, V>) {
        V total = 0;
        for (const auto& item : vs) {
            total += sumAll<V>(item);
        }
        return total;
    } else {
        return static_cast<V>(vs);
    }
}
/// Index of the first smallest element of `vs`.
template<typename Vs>
int minInd(const Vs& vs)
{
    const auto first = std::begin(vs);
    return std::min_element(first, std::end(vs)) - first;
}
/// Index of the first largest element of `vs`.
template<typename Vs>
int maxInd(const Vs& vs)
{
    const auto first = std::begin(vs);
    return std::max_element(first, std::end(vs)) - first;
}
/// Index of the first element of the sorted range `vs` that is not less
/// than `v` (the range size if there is none).
template<typename Vs, typename V>
int lbInd(const Vs& vs, const V& v)
{
    const auto first = std::begin(vs);
    return std::lower_bound(first, std::end(vs), v) - first;
}
/// Index of the first element of the sorted range `vs` that is greater
/// than `v` (the range size if there is none).
template<typename Vs, typename V>
int ubInd(const Vs& vs, const V& v)
{
    const auto first = std::begin(vs);
    return std::upper_bound(first, std::end(vs), v) - first;
}
/// Builds a vector of `n` elements by calling `gen()` once per element,
/// in order.
template<typename T, typename F>
Vec<T> genVec(int n, F gen)
{
    Vec<T> out;
    std::generate_n(std::back_inserter(out), n, gen);
    return out;
}
/// Returns {offset, offset + 1, ..., offset + n - 1}.
Vec<int> iotaVec(int n, int offset = 0)
{
    Vec<int> seq(n);
    int next = offset;
    for (int& slot : seq) {
        slot = next;
        ++next;
    }
    return seq;
}
/// Number of set bits in `v`.
constexpr int popcount(const u64 v)
{
    if (v == 0) { return 0; }
    return __builtin_popcountll(v);
}
/// Bit width of `v`: one plus the index of its highest set bit, or 0 for 0.
constexpr int log2p1(const u64 v)
{
    // __builtin_clzll is undefined for 0, so that case is handled first.
    if (v == 0) { return 0; }
    return 64 - __builtin_clzll(v);
}
/// One plus the index of the lowest set bit of `v` (the value returned by
/// __builtin_ffsll, which is 0 when no bit is set).
constexpr int lsbp1(const u64 v)
{
    const int position = __builtin_ffsll(v);
    return position;
}
/// Ceiling of log2(v) for v >= 1, computed as the bit width of v - 1;
/// returns 0 for v == 0.
constexpr int clog(const u64 v)
{
    if (v == 0) { return 0; }
    return log2p1(v - 1);
}
/// Smallest power of two that is >= v, or 0 when that power would need
/// bit 64 (i.e. does not fit in a u64).
constexpr u64 ceil2(const u64 v)
{
    const int exponent = clog(v);
    if (exponent == 64) { return 0_u64; }
    return 1_u64 << exponent;
}
/// Largest power of two that is <= v, or 0 for v == 0.
constexpr u64 floor2(const u64 v)
{
    if (v == 0) { return 0_u64; }
    return 1_u64 << (log2p1(v) - 1);
}
/// True when `v` has exactly one bit set.
constexpr bool ispow2(const u64 v)
{
    if (v == 0) { return false; }
    // Clearing the lowest set bit leaves zero only for a single-bit value.
    return (v & (v - 1)) == 0;
}
constexpr bool btest(const u64 mask, const int ind)
{
return (mask >> ind) & 1_u64;
}
/// Fixed-point wrapper that lets a callable refer to itself.
///
/// The wrapped callable receives the wrapper as its first argument, so a
/// lambda can recurse by calling that argument with the remaining ones.
template<typename F>
class Fix : public F
{
public:
    Fix(F&& fn) : F{std::move(fn)} {}

    /// Calls the wrapped callable, passing this wrapper first.
    template<typename... Args>
    auto operator()(Args&&... args) const
    {
        const Fix& self = *this;
        return F::operator()(self, std::forward<Args>(args)...);
    }
};
/// Integer range usable in range-based for loops.
///
/// Iterates start, start + step, ... and stops before reaching `end`
/// (exclusive); `step` may be negative but must not be zero.
class irange
{
private:
    /// Iterator: a counter plus the amount added on each increment.
    struct itr
    {
        itr(i64 start = 0, i64 step = 1) : m_cnt{start}, m_step{step} {}

        // Only the counter is compared; the range constructor guarantees
        // that the end counter is reached exactly.
        bool operator!=(const itr& it) const { return m_cnt != it.m_cnt; }

        int operator*() { return m_cnt; }

        itr& operator++()
        {
            m_cnt += m_step;
            return *this;
        }

        i64 m_cnt, m_step;
    };

    i64 m_start, m_end, m_step;

public:
    irange(i64 start, i64 end, i64 step = 1)
    {
        assert(step != 0);
        const bool ascending = step > 0;
        const i64 stride = std::abs(step);
        const i64 lo = (ascending ? start : end);
        const i64 hi = (ascending ? end : start);
        // Number of steps: ceil((hi - lo) / stride), or zero for an empty range.
        int count = 0;
        if (lo < hi) {
            count = (hi - lo) / stride + ((hi - lo) % stride ? 1 : 0);
        }
        m_start = start;
        // Snap the end onto a multiple of step so that != terminates.
        m_end = start + step * count;
        m_step = step;
    }

    itr begin() const { return itr{m_start, m_step}; }

    itr end() const { return itr{m_end, m_step}; }
};
/// Range 0, 1, ..., end - 1.
irange rep(int end)
{
    const irange forward(0, end, 1);
    return forward;
}
/// Range rend - 1, rend - 2, ..., 0.
irange per(int rend)
{
    const irange backward(rend - 1, -1, -1);
    return backward;
}
#pragma COMMENT("[REFS] Xoshiro: https://prng.di.unimi.it")
namespace xoshiro_impl {
/// State of the seeding generator; the Xoshiro constructors overwrite it
/// with their seed before drawing from next().
u64 x;
/// Advances `x` by a fixed odd constant and returns a bit-mixed copy of it.
u64 next()
{
    x += 0x9e3779b97f4a7c15;
    uint64_t mixed = x;
    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111eb;
    mixed ^= mixed >> 31;
    return mixed;
}
} // namespace xoshiro_impl
/// 32-bit pseudo-random generator with four words of state.
/// Exposes result_type / min / max / operator(), so it can be passed to the
/// standard <random> distributions.
class Xoshiro32
{
public:
    using result_type = u32;
    using T = result_type;

    /// Seeds the four state words from successive xoshiro_impl::next()
    /// outputs (each truncated to 32 bits).
    Xoshiro32(T seed = 0)
    {
        xoshiro_impl::x = seed;
        for (T& word : m_state) {
            word = xoshiro_impl::next();
        }
    }

    static constexpr T min() { return LIMMIN<T>; }

    static constexpr T max() { return LIMMAX<T>; }

    T operator()() { return next(); }

private:
    /// Rotates `x` left by `k` bits.
    static constexpr T rotl(const T x, int k)
    {
        return (x << k) | (x >> (32 - k));
    }

    /// Produces the next output and advances the state.
    T next()
    {
        const T output = rotl(m_state[1] * 5, 7) * 9;
        const T shifted = m_state[1] << 9;
        // The update order matters: each line reads words changed above it.
        m_state[2] ^= m_state[0];
        m_state[3] ^= m_state[1];
        m_state[1] ^= m_state[2];
        m_state[0] ^= m_state[3];
        m_state[2] ^= shifted;
        m_state[3] = rotl(m_state[3], 11);
        return output;
    }

    T m_state[4];
};
/// 64-bit pseudo-random generator with four words of state.
/// Exposes result_type / min / max / operator(), so it can be passed to the
/// standard <random> distributions.
class Xoshiro64
{
public:
    using result_type = u64;
    using T = result_type;

    /// Seeds the four state words from successive xoshiro_impl::next()
    /// outputs.
    Xoshiro64(T seed = 0)
    {
        xoshiro_impl::x = seed;
        for (T& word : m_state) {
            word = xoshiro_impl::next();
        }
    }

    static constexpr T min() { return LIMMIN<T>; }

    static constexpr T max() { return LIMMAX<T>; }

    T operator()() { return next(); }

private:
    /// Rotates `x` left by `k` bits.
    static constexpr T rotl(const T x, int k)
    {
        return (x << k) | (x >> (64 - k));
    }

    /// Produces the next output and advances the state.
    T next()
    {
        const T output = rotl(m_state[1] * 5, 7) * 9;
        const T shifted = m_state[1] << 17;
        // The update order matters: each line reads words changed above it.
        m_state[2] ^= m_state[0];
        m_state[3] ^= m_state[1];
        m_state[1] ^= m_state[2];
        m_state[0] ^= m_state[3];
        m_state[2] ^= shifted;
        m_state[3] = rotl(m_state[3], 45);
        return output;
    }

    T m_state[4];
};
/// Convenience wrapper around a random engine `Rng`.
/// Adds helpers that draw uniformly distributed integers, pairs and
/// (nested) vectors from the wrapped engine.
template<typename Rng>
class RNG
{
public:
    using result_type = typename Rng::result_type;
    using T = result_type;

    static constexpr T min() { return Rng::min(); }

    static constexpr T max() { return Rng::max(); }

    /// Seeds the engine from std::random_device.
    RNG() : RNG(std::random_device{}()) {}

    /// Seeds the engine with `seed`.
    RNG(T seed) : m_rng(seed) {}

    /// Raw engine output.
    T operator()() { return m_rng(); }

    /// One integer drawn uniformly from [min, max].
    template<typename U>
    U val(U min, U max)
    {
        std::uniform_int_distribution<U> dist(min, max);
        return dist(m_rng);
    }

    /// Two draws from [min, max], returned as (smaller, larger).
    template<typename U>
    Pair<U, U> pair(U min, U max)
    {
        const U first = val<U>(min, max);
        const U second = val<U>(min, max);
        return std::minmax({first, second});
    }

    /// `n` independent draws from [min, max].
    template<typename U>
    Vec<U> vec(int n, U min, U max)
    {
        auto draw = [&]() { return val<U>(min, max); };
        return genVec<U>(n, draw);
    }

    /// `n` rows, each holding `m` independent draws from [min, max].
    template<typename U>
    Vec<Vec<U>> vvec(int n, int m, U min, U max)
    {
        auto draw_row = [&]() { return vec<U>(m, min, max); };
        return genVec<Vec<U>>(n, draw_row);
    }

private:
    Rng m_rng;
};
// Global generator instances. Each is default-constructed, which seeds it
// from std::random_device, so values differ from run to run.
RNG<std::mt19937> rng;
RNG<std::mt19937_64> rng64;
RNG<Xoshiro32> rng_xo;
RNG<Xoshiro64> rng_xo64;
/// Whitespace-delimited input reader built on operator>>.
class Scanner
{
public:
    /// Binds to `is` (std::cin by default), unties it from its tied output
    /// stream and turns off synchronisation with C stdio.
    Scanner(Istream& is = std::cin) : m_is{is}
    {
        m_is.tie(nullptr);
        std::ios_base::sync_with_stdio(false);
    }

    /// Reads one value of type T.
    template<typename T>
    T val()
    {
        T parsed;
        m_is >> parsed;
        return parsed;
    }

    /// Reads one value and subtracts `offset` (e.g. 1-based to 0-based).
    template<typename T>
    T val(T offset)
    {
        const T raw = val<T>();
        return raw - offset;
    }

    /// Reads `n` values.
    template<typename T>
    Vec<T> vec(int n)
    {
        auto read_one = [&]() { return val<T>(); };
        return genVec<T>(n, read_one);
    }

    /// Reads `n` values, subtracting `offset` from each.
    template<typename T>
    Vec<T> vec(int n, T offset)
    {
        auto read_one = [&]() { return val<T>(offset); };
        return genVec<T>(n, read_one);
    }

    /// Reads an n-by-m table, row by row.
    template<typename T>
    Vec<Vec<T>> vvec(int n, int m)
    {
        auto read_row = [&]() { return vec<T>(m); };
        return genVec<Vec<T>>(n, read_row);
    }

    /// Reads an n-by-m table, subtracting `offset` from each entry.
    template<typename T>
    Vec<Vec<T>> vvec(int n, int m, const T offset)
    {
        auto read_row = [&]() { return vec<T>(m, offset); };
        return genVec<Vec<T>>(n, read_row);
    }

    /// Reads one value per type in Args, in order, as a tuple.
    template<typename... Args>
    auto tup()
    {
        // Braced initialisation evaluates the reads left to right.
        return Tup<Args...>{val<Args>()...};
    }

    /// Reads one value per offset, in order, subtracting the matching offset.
    template<typename... Args>
    auto tup(const Args&... offsets)
    {
        // Braced initialisation evaluates the reads left to right.
        return Tup<Args...>{val<Args>(offsets)...};
    }

private:
    Istream& m_is;
};
// Global reader bound to std::cin (the Scanner default).
Scanner in;
/// Output helper built on operator<<.
///
/// Arguments are separated by single spaces; a vector prints its elements
/// separated by spaces and a vector of vectors prints one row per line.
/// Calling any of the print functions with no arguments is allowed and
/// prints nothing before the terminator (previously it failed to compile).
class Printer
{
public:
    /// Binds to `os` (std::cout by default) and selects fixed notation with
    /// 15 digits after the decimal point.
    Printer(Ostream& os = std::cout) : m_os{os}
    {
        m_os << std::fixed << std::setprecision(15);
    }

    /// Prints the arguments without a trailing newline. Always returns 0.
    template<typename... Args>
    int operator()(const Args&... args)
    {
        dump(args...);
        return 0;
    }

    /// Prints the arguments followed by '\n'. Always returns 0.
    template<typename... Args>
    int ln(const Args&... args)
    {
        dump(args...), m_os << '\n';
        return 0;
    }

    /// Prints the arguments followed by std::endl, which also flushes.
    /// Always returns 0.
    template<typename... Args>
    int el(const Args&... args)
    {
        dump(args...), m_os << std::endl;
        return 0;
    }

private:
    /// Nothing to print: makes the zero-argument calls well formed.
    void dump() {}

    template<typename T>
    void dump(const T& v)
    {
        m_os << v;
    }

    template<typename T>
    void dump(const Vec<T>& vs)
    {
        for (const int i : rep(vs.size())) {
            m_os << (i ? " " : ""), dump(vs[i]);
        }
    }

    template<typename T>
    void dump(const Vec<Vec<T>>& vss)
    {
        for (const int i : rep(vss.size())) {
            m_os << (i ? "\n" : ""), dump(vss[i]);
        }
    }

    /// Two or more arguments: print the first, a space, then the rest.
    template<typename T, typename... Ts>
    int dump(const T& v, const Ts&... args)
    {
        dump(v), m_os << ' ', dump(args...);
        return 0;
    }

    Ostream& m_os;
};
// Global printer bound to std::cout (the Printer default).
Printer out;
#pragma endregion
/// Per 32-bit lane: the low 32 bits of a * b.
__attribute__((target("sse4.2"))) inline __m128i
my128_mullo_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i low_halves = _mm_mullo_epi32(a, b);
    return low_halves;
}
/// Per 32-bit lane: the high 32 bits of the unsigned 64-bit product a * b.
__attribute__((target("sse4.2"))) inline __m128i
my128_mulhi_epu32(const __m128i& a, const __m128i& b)
{
    // _mm_mul_epu32 multiplies lanes 0 and 2 only, so the odd lanes are
    // first copied into the even positions for a second multiply.
    const __m128i a_odd = _mm_shuffle_epi32(a, 0xF5);
    const __m128i b_odd = _mm_shuffle_epi32(b, 0xF5);
    const __m128i even_products = _mm_mul_epu32(a, b);
    const __m128i odd_products = _mm_mul_epu32(a_odd, b_odd);
    // Interleave the 64-bit products and keep only their upper halves.
    const __m128i lanes01 = _mm_unpacklo_epi32(even_products, odd_products);
    const __m128i lanes23 = _mm_unpackhi_epi32(even_products, odd_products);
    return _mm_unpackhi_epi64(lanes01, lanes23);
}
/// Per 32-bit lane: hi(a * b) + m1 - hi(lo(lo(a * b) * r) * m1), where hi/lo
/// are the upper/lower 32 bits of a product.
/// NOTE(review): callers pass the modulus as m1 and mint::r as r, which looks
/// like a Montgomery reduction of a * b; the result range is not established
/// here — confirm against the mint definition.
__attribute__((target("sse4.2"))) inline __m128i
montgomery_mul_128(const __m128i& a,
                   const __m128i& b,
                   const __m128i& r,
                   const __m128i& m1)
{
    const __m128i product_hi = my128_mulhi_epu32(a, b);
    const __m128i product_lo = my128_mullo_epu32(a, b);
    const __m128i factor = my128_mullo_epu32(product_lo, r);
    const __m128i correction = my128_mulhi_epu32(factor, m1);
    return _mm_sub_epi32(_mm_add_epi32(product_hi, m1), correction);
}
/// Per 32-bit lane: a + b - m2, with m2 added back when that is negative
/// (signed compare against m0, which callers set to zero).
__attribute__((target("sse4.2"))) inline __m128i
montgomery_add_128(const __m128i& a,
                   const __m128i& b,
                   const __m128i& m2,
                   const __m128i& m0)
{
    const __m128i shifted = _mm_sub_epi32(_mm_add_epi32(a, b), m2);
    const __m128i went_negative = _mm_cmpgt_epi32(m0, shifted);
    const __m128i fixup = _mm_and_si128(went_negative, m2);
    return _mm_add_epi32(fixup, shifted);
}
/// Per 32-bit lane: a - b, with m2 added when that is negative (signed
/// compare against m0, which callers set to zero).
__attribute__((target("sse4.2"))) inline __m128i
montgomery_sub_128(const __m128i& a,
                   const __m128i& b,
                   const __m128i& m2,
                   const __m128i& m0)
{
    const __m128i difference = _mm_sub_epi32(a, b);
    const __m128i went_negative = _mm_cmpgt_epi32(m0, difference);
    const __m128i fixup = _mm_and_si128(went_negative, m2);
    return _mm_add_epi32(fixup, difference);
}
/// Per 32-bit lane: the low 32 bits of a * b.
__attribute__((target("avx2"))) inline __m256i
my256_mullo_epu32(const __m256i& a, const __m256i& b)
{
    const __m256i low_halves = _mm256_mullo_epi32(a, b);
    return low_halves;
}
/// Per 32-bit lane: the high 32 bits of the unsigned 64-bit product a * b.
__attribute__((target("avx2"))) inline __m256i
my256_mulhi_epu32(const __m256i& a, const __m256i& b)
{
    // _mm256_mul_epu32 multiplies the even lanes only, so the odd lanes are
    // first copied into the even positions for a second multiply.
    const __m256i a_odd = _mm256_shuffle_epi32(a, 0xF5);
    const __m256i b_odd = _mm256_shuffle_epi32(b, 0xF5);
    const __m256i even_products = _mm256_mul_epu32(a, b);
    const __m256i odd_products = _mm256_mul_epu32(a_odd, b_odd);
    // Interleave the 64-bit products and keep only their upper halves.
    const __m256i lower_pairs = _mm256_unpacklo_epi32(even_products, odd_products);
    const __m256i upper_pairs = _mm256_unpackhi_epi32(even_products, odd_products);
    return _mm256_unpackhi_epi64(lower_pairs, upper_pairs);
}
/// Per 32-bit lane: hi(a * b) + m1 - hi(lo(lo(a * b) * r) * m1), where hi/lo
/// are the upper/lower 32 bits of a product.
/// NOTE(review): callers pass the modulus as m1 and mint::r as r, which looks
/// like a Montgomery reduction of a * b; the result range is not established
/// here — confirm against the mint definition.
__attribute__((target("avx2"))) inline __m256i
montgomery_mul_256(const __m256i& a,
                   const __m256i& b,
                   const __m256i& r,
                   const __m256i& m1)
{
    const __m256i product_hi = my256_mulhi_epu32(a, b);
    const __m256i product_lo = my256_mullo_epu32(a, b);
    const __m256i factor = my256_mullo_epu32(product_lo, r);
    const __m256i correction = my256_mulhi_epu32(factor, m1);
    return _mm256_sub_epi32(_mm256_add_epi32(product_hi, m1), correction);
}
/// Per 32-bit lane: a + b - m2, with m2 added back when that is negative
/// (signed compare against m0, which callers set to zero).
__attribute__((target("avx2"))) inline __m256i
montgomery_add_256(const __m256i& a,
                   const __m256i& b,
                   const __m256i& m2,
                   const __m256i& m0)
{
    const __m256i shifted = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);
    const __m256i went_negative = _mm256_cmpgt_epi32(m0, shifted);
    const __m256i fixup = _mm256_and_si256(went_negative, m2);
    return _mm256_add_epi32(fixup, shifted);
}
/// Per 32-bit lane: a - b, with m2 added when that is negative (signed
/// compare against m0, which callers set to zero).
__attribute__((target("avx2"))) inline __m256i
montgomery_sub_256(const __m256i& a,
                   const __m256i& b,
                   const __m256i& m2,
                   const __m256i& m0)
{
    const __m256i difference = _mm256_sub_epi32(a, b);
    const __m256i went_negative = _mm256_cmpgt_epi32(m0, difference);
    const __m256i fixup = _mm256_and_si256(went_negative, m2);
    return _mm256_add_epi32(fixup, difference);
}
namespace ntt_inner {
using u64 = uint64_t;

/// Finds the smallest g >= 2 such that g^((mod - 1) / p) != 1 (mod `mod`)
/// for every prime factor p of mod - 1; returns 1 for mod == 2.
/// For a prime modulus this g is a primitive root.
constexpr uint32_t get_pr(uint32_t mod)
{
    if (mod == 2) { return 1; }

    // Collect the distinct prime factors of mod - 1 by trial division.
    u64 prime_factors[32] = {};
    int factor_count = 0;
    u64 rest = mod - 1;
    for (u64 p = 2; p * p <= rest; ++p) {
        if (rest % p != 0) { continue; }
        prime_factors[factor_count++] = p;
        do {
            rest /= p;
        } while (rest % p == 0);
    }
    if (rest != 1) { prime_factors[factor_count++] = rest; }

    // Try candidates in increasing order until one passes every check.
    for (uint32_t candidate = 2;; ++candidate) {
        bool passes = true;
        for (int i = 0; i < factor_count && passes; ++i) {
            u64 base = candidate;
            u64 exponent = (mod - 1) / prime_factors[i];
            u64 acc = 1;
            for (; exponent != 0; exponent >>= 1) {
                if (exponent & 1) { acc = acc * base % mod; }
                base = base * base % mod;
            }
            if (acc == 1) { passes = false; }
        }
        if (passes) { return candidate; }
    }
}

/// Capacity, in 32-bit words, of each scratch buffer below.
constexpr int SZ_FFT_BUF = 1 << 23;
/// Statically allocated, 64-byte aligned scratch buffers.
uint32_t _buf1[SZ_FFT_BUF] __attribute__((aligned(64)));
uint32_t _buf2[SZ_FFT_BUF] __attribute__((aligned(64)));
} // namespace ntt_inner
template<typename mint>
struct NTT
{
static constexpr uint32_t mod = mint::get_mod();
static constexpr uint32_t pr = ntt_inner::get_pr(mint::get_mod());
static constexpr int level = __builtin_ctzll(mod - 1);
mint dw[level], dy[level];
mint *buf1, *buf2;
constexpr NTT()
{
setwy(level);
union raw_cast
{
mint dat;
uint32_t _;
};
buf1 = &(((raw_cast*)(ntt_inner::_buf1))->dat);
buf2 = &(((raw_cast*)(ntt_inner::_buf2))->dat);
}
constexpr void setwy(int k)
{
mint w[level], y[level];
w[k - 1] = mint(pr).pow((mod - 1) / (1 << k));
y[k - 1] = w[k - 1].inverse();
for (int i = k - 2; i > 0; --i)
w[i] = w[i + 1] * w[i + 1], y[i] = y[i + 1] * y[i + 1];
dw[0] = dy[0] = w[1] * w[1];
dw[1] = w[1], dy[1] = y[1], dw[2] = w[2], dy[2] = y[2];
for (int i = 3; i < k; ++i) {
dw[i] = dw[i - 1] * y[i - 2] * w[i];
dy[i] = dy[i - 1] * w[i - 2] * y[i];
}
}
__attribute__((target("avx2"))) void ntt(mint* a, int n)
{
int k = n ? __builtin_ctz(n) : 0;
if (k == 0) return;
if (k == 1) {
mint a1 = a[1];
a[1] = a[0] - a[1];
a[0] = a[0] + a1;
return;
}
if (k & 1) {
int v = 1 << (k - 1);
if (v < 8) {
for (int j = 0; j < v; ++j) {
mint ajv = a[j + v];
a[j + v] = a[j] - ajv;
a[j] += ajv;
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
int j0 = 0;
int j1 = v;
for (; j0 < v; j0 += 8, j1 += 8) {
__m256i T0 = _mm256_loadu_si256((__m256i*)(a + j0));
__m256i T1 = _mm256_loadu_si256((__m256i*)(a + j1));
__m256i naj = montgomery_add_256(T0, T1, m2, m0);
__m256i najv = montgomery_sub_256(T0, T1, m2, m0);
_mm256_storeu_si256((__m256i*)(a + j0), naj);
_mm256_storeu_si256((__m256i*)(a + j1), najv);
}
}
}
int u = 1 << (2 + (k & 1));
int v = 1 << (k - 2 - (k & 1));
mint one = mint(1);
mint imag = dw[1];
while (v) {
if (v == 1) {
mint ww = one, xx = one, wx = one;
for (int jh = 0; jh < u;) {
ww = xx * xx, wx = ww * xx;
mint t0 = a[jh + 0], t1 = a[jh + 1] * xx;
mint t2 = a[jh + 2] * ww, t3 = a[jh + 3] * wx;
mint t0p2 = t0 + t2, t1p3 = t1 + t3;
mint t0m2 = t0 - t2, t1m3 = (t1 - t3) * imag;
a[jh + 0] = t0p2 + t1p3, a[jh + 1] = t0p2 - t1p3;
a[jh + 2] = t0m2 + t1m3, a[jh + 3] = t0m2 - t1m3;
xx *= dw[__builtin_ctz((jh += 4))];
}
} else if (v == 4) {
const __m128i m0 = _mm_set1_epi32(0);
const __m128i m1 = _mm_set1_epi32(mod);
const __m128i m2 = _mm_set1_epi32(mod + mod);
const __m128i r = _mm_set1_epi32(mint::r);
const __m128i Imag = _mm_set1_epi32(imag.a);
mint ww = one, xx = one, wx = one;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = v;
for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0
= _mm_loadu_si128((__m128i*)(a + j0));
const __m128i T1
= _mm_loadu_si128((__m128i*)(a + j1));
const __m128i T2
= _mm_loadu_si128((__m128i*)(a + j2));
const __m128i T3
= _mm_loadu_si128((__m128i*)(a + j3));
const __m128i T0P2
= montgomery_add_128(T0, T2, m2, m0);
const __m128i T1P3
= montgomery_add_128(T1, T3, m2, m0);
const __m128i T0M2
= montgomery_sub_128(T0, T2, m2, m0);
const __m128i T1M3 = montgomery_mul_128(
montgomery_sub_128(T1, T3, m2, m0),
Imag,
r,
m1);
_mm_storeu_si128(
(__m128i*)(a + j0),
montgomery_add_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j1),
montgomery_sub_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j2),
montgomery_add_128(T0M2, T1M3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j3),
montgomery_sub_128(T0M2, T1M3, m2, m0));
}
} else {
ww = xx * xx, wx = ww * xx;
const __m128i WW = _mm_set1_epi32(ww.a);
const __m128i WX = _mm_set1_epi32(wx.a);
const __m128i XX = _mm_set1_epi32(xx.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0
= _mm_loadu_si128((__m128i*)(a + j0));
const __m128i T1
= _mm_loadu_si128((__m128i*)(a + j1));
const __m128i T2
= _mm_loadu_si128((__m128i*)(a + j2));
const __m128i T3
= _mm_loadu_si128((__m128i*)(a + j3));
const __m128i MT1
= montgomery_mul_128(T1, XX, r, m1);
const __m128i MT2
= montgomery_mul_128(T2, WW, r, m1);
const __m128i MT3
= montgomery_mul_128(T3, WX, r, m1);
const __m128i T0P2
= montgomery_add_128(T0, MT2, m2, m0);
const __m128i T1P3
= montgomery_add_128(MT1, MT3, m2, m0);
const __m128i T0M2
= montgomery_sub_128(T0, MT2, m2, m0);
const __m128i T1M3 = montgomery_mul_128(
montgomery_sub_128(MT1, MT3, m2, m0),
Imag,
r,
m1);
_mm_storeu_si128(
(__m128i*)(a + j0),
montgomery_add_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j1),
montgomery_sub_128(T0P2, T1P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j2),
montgomery_add_128(T0M2, T1M3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j3),
montgomery_sub_128(T0M2, T1M3, m2, m0));
}
}
xx *= dw[__builtin_ctz((jh += 4))];
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i Imag = _mm256_set1_epi32(imag.a);
mint ww = one, xx = one, wx = one;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = v;
for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0
= _mm256_loadu_si256((__m256i*)(a + j0));
const __m256i T1
= _mm256_loadu_si256((__m256i*)(a + j1));
const __m256i T2
= _mm256_loadu_si256((__m256i*)(a + j2));
const __m256i T3
= _mm256_loadu_si256((__m256i*)(a + j3));
const __m256i T0P2
= montgomery_add_256(T0, T2, m2, m0);
const __m256i T1P3
= montgomery_add_256(T1, T3, m2, m0);
const __m256i T0M2
= montgomery_sub_256(T0, T2, m2, m0);
const __m256i T1M3 = montgomery_mul_256(
montgomery_sub_256(T1, T3, m2, m0),
Imag,
r,
m1);
_mm256_storeu_si256(
(__m256i*)(a + j0),
montgomery_add_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j1),
montgomery_sub_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j2),
montgomery_add_256(T0M2, T1M3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j3),
montgomery_sub_256(T0M2, T1M3, m2, m0));
}
} else {
ww = xx * xx, wx = ww * xx;
const __m256i WW = _mm256_set1_epi32(ww.a);
const __m256i WX = _mm256_set1_epi32(wx.a);
const __m256i XX = _mm256_set1_epi32(xx.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0
= _mm256_loadu_si256((__m256i*)(a + j0));
const __m256i T1
= _mm256_loadu_si256((__m256i*)(a + j1));
const __m256i T2
= _mm256_loadu_si256((__m256i*)(a + j2));
const __m256i T3
= _mm256_loadu_si256((__m256i*)(a + j3));
const __m256i MT1
= montgomery_mul_256(T1, XX, r, m1);
const __m256i MT2
= montgomery_mul_256(T2, WW, r, m1);
const __m256i MT3
= montgomery_mul_256(T3, WX, r, m1);
const __m256i T0P2
= montgomery_add_256(T0, MT2, m2, m0);
const __m256i T1P3
= montgomery_add_256(MT1, MT3, m2, m0);
const __m256i T0M2
= montgomery_sub_256(T0, MT2, m2, m0);
const __m256i T1M3 = montgomery_mul_256(
montgomery_sub_256(MT1, MT3, m2, m0),
Imag,
r,
m1);
_mm256_storeu_si256(
(__m256i*)(a + j0),
montgomery_add_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j1),
montgomery_sub_256(T0P2, T1P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j2),
montgomery_add_256(T0M2, T1M3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j3),
montgomery_sub_256(T0M2, T1M3, m2, m0));
}
}
xx *= dw[__builtin_ctz((jh += 4))];
}
}
u <<= 2;
v >>= 2;
}
}
__attribute__((target("avx2"))) void
intt(mint* a, int n, int normalize = true)
{
int k = n ? __builtin_ctz(n) : 0;
if (k == 0) return;
if (k == 1) {
mint a1 = a[1];
a[1] = a[0] - a[1];
a[0] = a[0] + a1;
if (normalize) {
a[0] *= mint(2).inverse();
a[1] *= mint(2).inverse();
}
return;
}
int u = 1 << (k - 2);
int v = 1;
mint one = mint(1);
mint imag = dy[1];
while (u) {
if (v == 1) {
mint ww = one, xx = one, yy = one;
u <<= 2;
for (int jh = 0; jh < u;) {
ww = xx * xx, yy = xx * imag;
mint t0 = a[jh + 0], t1 = a[jh + 1];
mint t2 = a[jh + 2], t3 = a[jh + 3];
mint t0p1 = t0 + t1, t2p3 = t2 + t3;
mint t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
a[jh + 0] = t0p1 + t2p3, a[jh + 2] = (t0p1 - t2p3) * ww;
a[jh + 1] = t0m1 + t2m3, a[jh + 3] = (t0m1 - t2m3) * ww;
xx *= dy[__builtin_ctz(jh += 4)];
}
} else if (v == 4) {
const __m128i m0 = _mm_set1_epi32(0);
const __m128i m1 = _mm_set1_epi32(mod);
const __m128i m2 = _mm_set1_epi32(mod + mod);
const __m128i r = _mm_set1_epi32(mint::r);
const __m128i Imag = _mm_set1_epi32(imag.a);
mint ww = one, xx = one, yy = one;
u <<= 2;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = v + v;
int j3 = j2 + v;
for (; j0 < v; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0
= _mm_loadu_si128((__m128i*)(a + j0));
const __m128i T1
= _mm_loadu_si128((__m128i*)(a + j1));
const __m128i T2
= _mm_loadu_si128((__m128i*)(a + j2));
const __m128i T3
= _mm_loadu_si128((__m128i*)(a + j3));
const __m128i T0P1
= montgomery_add_128(T0, T1, m2, m0);
const __m128i T2P3
= montgomery_add_128(T2, T3, m2, m0);
const __m128i T0M1
= montgomery_sub_128(T0, T1, m2, m0);
const __m128i T2M3 = montgomery_mul_128(
montgomery_sub_128(T2, T3, m2, m0),
Imag,
r,
m1);
_mm_storeu_si128(
(__m128i*)(a + j0),
montgomery_add_128(T0P1, T2P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j2),
montgomery_sub_128(T0P1, T2P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j1),
montgomery_add_128(T0M1, T2M3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j3),
montgomery_sub_128(T0M1, T2M3, m2, m0));
}
} else {
ww = xx * xx, yy = xx * imag;
const __m128i WW = _mm_set1_epi32(ww.a);
const __m128i XX = _mm_set1_epi32(xx.a);
const __m128i YY = _mm_set1_epi32(yy.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
const __m128i T0
= _mm_loadu_si128((__m128i*)(a + j0));
const __m128i T1
= _mm_loadu_si128((__m128i*)(a + j1));
const __m128i T2
= _mm_loadu_si128((__m128i*)(a + j2));
const __m128i T3
= _mm_loadu_si128((__m128i*)(a + j3));
const __m128i T0P1
= montgomery_add_128(T0, T1, m2, m0);
const __m128i T2P3
= montgomery_add_128(T2, T3, m2, m0);
const __m128i T0M1 = montgomery_mul_128(
montgomery_sub_128(T0, T1, m2, m0), XX, r, m1);
__m128i T2M3 = montgomery_mul_128(
montgomery_sub_128(T2, T3, m2, m0), YY, r, m1);
_mm_storeu_si128(
(__m128i*)(a + j0),
montgomery_add_128(T0P1, T2P3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j2),
montgomery_mul_128(
montgomery_sub_128(T0P1, T2P3, m2, m0),
WW,
r,
m1));
_mm_storeu_si128(
(__m128i*)(a + j1),
montgomery_add_128(T0M1, T2M3, m2, m0));
_mm_storeu_si128(
(__m128i*)(a + j3),
montgomery_mul_128(
montgomery_sub_128(T0M1, T2M3, m2, m0),
WW,
r,
m1));
}
}
xx *= dy[__builtin_ctz(jh += 4)];
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i Imag = _mm256_set1_epi32(imag.a);
mint ww = one, xx = one, yy = one;
u <<= 2;
for (int jh = 0; jh < u;) {
if (jh == 0) {
int j0 = 0;
int j1 = v;
int j2 = v + v;
int j3 = j2 + v;
for (; j0 < v; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0
= _mm256_loadu_si256((__m256i*)(a + j0));
const __m256i T1
= _mm256_loadu_si256((__m256i*)(a + j1));
const __m256i T2
= _mm256_loadu_si256((__m256i*)(a + j2));
const __m256i T3
= _mm256_loadu_si256((__m256i*)(a + j3));
const __m256i T0P1
= montgomery_add_256(T0, T1, m2, m0);
const __m256i T2P3
= montgomery_add_256(T2, T3, m2, m0);
const __m256i T0M1
= montgomery_sub_256(T0, T1, m2, m0);
const __m256i T2M3 = montgomery_mul_256(
montgomery_sub_256(T2, T3, m2, m0),
Imag,
r,
m1);
_mm256_storeu_si256(
(__m256i*)(a + j0),
montgomery_add_256(T0P1, T2P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j2),
montgomery_sub_256(T0P1, T2P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j1),
montgomery_add_256(T0M1, T2M3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j3),
montgomery_sub_256(T0M1, T2M3, m2, m0));
}
} else {
ww = xx * xx, yy = xx * imag;
const __m256i WW = _mm256_set1_epi32(ww.a);
const __m256i XX = _mm256_set1_epi32(xx.a);
const __m256i YY = _mm256_set1_epi32(yy.a);
int j0 = jh * v;
int j1 = j0 + v;
int j2 = j1 + v;
int j3 = j2 + v;
int je = j1;
for (; j0 < je; j0 += 8, j1 += 8, j2 += 8, j3 += 8) {
const __m256i T0
= _mm256_loadu_si256((__m256i*)(a + j0));
const __m256i T1
= _mm256_loadu_si256((__m256i*)(a + j1));
const __m256i T2
= _mm256_loadu_si256((__m256i*)(a + j2));
const __m256i T3
= _mm256_loadu_si256((__m256i*)(a + j3));
const __m256i T0P1
= montgomery_add_256(T0, T1, m2, m0);
const __m256i T2P3
= montgomery_add_256(T2, T3, m2, m0);
const __m256i T0M1 = montgomery_mul_256(
montgomery_sub_256(T0, T1, m2, m0), XX, r, m1);
const __m256i T2M3 = montgomery_mul_256(
montgomery_sub_256(T2, T3, m2, m0), YY, r, m1);
_mm256_storeu_si256(
(__m256i*)(a + j0),
montgomery_add_256(T0P1, T2P3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j2),
montgomery_mul_256(
montgomery_sub_256(T0P1, T2P3, m2, m0),
WW,
r,
m1));
_mm256_storeu_si256(
(__m256i*)(a + j1),
montgomery_add_256(T0M1, T2M3, m2, m0));
_mm256_storeu_si256(
(__m256i*)(a + j3),
montgomery_mul_256(
montgomery_sub_256(T0M1, T2M3, m2, m0),
WW,
r,
m1));
}
}
xx *= dy[__builtin_ctz(jh += 4)];
}
}
u >>= 4;
v <<= 2;
}
if (k & 1) {
v = 1 << (k - 1);
if (v < 8) {
for (int j = 0; j < v; ++j) {
mint ajv = a[j] - a[j + v];
a[j] += a[j + v];
a[j + v] = ajv;
}
} else {
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m2 = _mm256_set1_epi32(mod + mod);
int j0 = 0;
int j1 = v;
for (; j0 < v; j0 += 8, j1 += 8) {
const __m256i T0 = _mm256_loadu_si256((__m256i*)(a + j0));
const __m256i T1 = _mm256_loadu_si256((__m256i*)(a + j1));
__m256i naj = montgomery_add_256(T0, T1, m2, m0);
__m256i najv = montgomery_sub_256(T0, T1, m2, m0);
_mm256_storeu_si256((__m256i*)(a + j0), naj);
_mm256_storeu_si256((__m256i*)(a + j1), najv);
}
}
}
if (normalize) {
mint invn = mint(n).inverse();
for (int i = 0; i < n; i++)
a[i] *= invn;
}
}
__attribute__((target("avx2"))) void
inplace_multiply(int l1, int l2, int zero_padding = true)
{
int l = l1 + l2 - 1;
int M = 4;
while (M < l)
M <<= 1;
if (zero_padding) {
for (int i = l1; i < M; i++)
ntt_inner::_buf1[i] = 0;
for (int i = l2; i < M; i++)
ntt_inner::_buf2[i] = 0;
}
const __m256i m0 = _mm256_set1_epi32(0);
const __m256i m1 = _mm256_set1_epi32(mod);
const __m256i r = _mm256_set1_epi32(mint::r);
const __m256i N2 = _mm256_set1_epi32(mint::n2);
for (int i = 0; i < l1; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i*)(ntt_inner::_buf1 + i));
__m256i b = montgomery_mul_256(a, N2, r, m1);
_mm256_storeu_si256((__m256i*)(ntt_inner::_buf1 + i), b);
}
for (int i = 0; i < l2; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i*)(ntt_inner::_buf2 + i));
__m256i b = montgomery_mul_256(a, N2, r, m1);
_mm256_storeu_si256((__m256i*)(ntt_inner::_buf2 + i), b);
}
ntt(buf1, M);
ntt(buf2, M);
for (int i = 0; i < M; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i*)(ntt_inner::_buf1 + i));
__m256i b = _mm256_loadu_si256((__m256i*)(ntt_inner::_buf2 + i));
__m256i c = montgomery_mul_256(a, b, r, m1);
_mm256_storeu_si256((__m256i*)(ntt_inner::_buf1 + i), c);
}
intt(buf1, M, false);
const __m256i INVM = _mm256_set1_epi32((mint(M).inverse()).a);
for (int i = 0; i < l; i += 8) {
__m256i a = _mm256_loadu_si256((__m256i*)(ntt_inner::_buf1 + i));
__m256i b = montgomery_mul_256(a, INVM, r, m1);
__m256i c = my256_mulhi_epu32(my256_mullo_epu32(b, r), m1);
__m256i d = _mm256_and_si256(_mm256_cmpgt_epi32(c, m0), m1);
__m256i e = _mm256_sub_epi32(d, c);
_mm256_storeu_si256((__m256i*)(ntt_inner::_buf1 + i), e);
}
}
void ntt(Vec<mint>& a)
{
int M = (int)a.size();
for (int i = 0; i < M; i++)
buf1[i].a = a[i].a;
ntt(buf1, M);
for (int i = 0; i < M; i++)
a[i].a = buf1[i].a;
}
void intt(Vec<mint>& a)
{
int M = (int)a.size();
for (int i = 0; i < M; i++)
buf1[i].a = a[i].a;
intt(buf1, M, true);
for (int i = 0; i < M; i++)
a[i].a = buf1[i].a;
}
// Polynomial product of `a` and `b`.
//
// Returns the a.size() + b.size() - 1 coefficients of a * b, or an empty
// vector when either operand is empty.
//  - If the shorter operand has at most 40 coefficients the product is
//    computed by the schoolbook double loop.
//  - Otherwise both operands are zero-padded to a power-of-two length M >= l
//    in the scratch buffers buf1 / buf2, transformed, multiplied pointwise,
//    transformed back and scaled by the inverse of M.
Vec<mint> multiply(const Vec<mint>& a, const Vec<mint>& b)
{
    // A product with an empty polynomial is empty.  The previous `&&` test
    // only caught the case where BOTH operands were empty; with exactly one
    // empty operand it fell through and returned size-1 zeros, disagreeing
    // with FormalPowerSeries::operator*=, which yields an empty series.
    if (a.empty() || b.empty()) return Vec<mint>{};
    int l = a.size() + b.size() - 1;
    if (std::min<int>(a.size(), b.size()) <= 40) {
        Vec<mint> s(l);
        for (int i = 0; i < (int)a.size(); ++i)
            for (int j = 0; j < (int)b.size(); ++j)
                s[i + j] += a[i] * b[j];
        return s;
    }
    assert(l <= ntt_inner::SZ_FFT_BUF);
    // Smallest power of two that is >= l (and at least 4).
    int M = 4;
    while (M < l)
        M <<= 1;
    // Load the operands into the scratch buffers and zero the padding.
    for (int i = 0; i < (int)a.size(); ++i)
        buf1[i].a = a[i].a;
    for (int i = (int)a.size(); i < M; ++i)
        buf1[i].a = 0;
    for (int i = 0; i < (int)b.size(); ++i)
        buf2[i].a = b[i].a;
    for (int i = (int)b.size(); i < M; ++i)
        buf2[i].a = 0;
    ntt(buf1, M);
    ntt(buf2, M);
    // Pointwise product on the raw Montgomery words.
    for (int i = 0; i < M; ++i)
        buf1[i].a = mint::reduce(uint64_t(buf1[i].a) * buf2[i].a);
    // The inverse transform is asked not to normalize (third argument false);
    // the 1/M factor is applied below while copying out the first l terms.
    intt(buf1, M, false);
    Vec<mint> s(l);
    mint invm = mint(M).inverse();
    for (int i = 0; i < l; ++i)
        s[i] = buf1[i] * invm;
    return s;
}
// Extends a length-M transform stored in `a` to length 2M.
// The first M entries are left untouched.  The new upper half is obtained by
// inverse-transforming a copy, multiplying coefficient i by zeta^i (zeta is
// pr raised to (mod - 1) / (2M)), and transforming forward again.
void ntt_doubling(Vec<mint>& a)
{
    const int half = static_cast<int>(a.size());
    for (int idx = 0; idx < half; ++idx) {
        buf1[idx].a = a[idx].a;
    }
    intt(buf1, half);

    const mint zeta = mint(pr).pow((mint::get_mod() - 1) / (half << 1));
    mint twist = 1;
    for (int idx = 0; idx < half; ++idx) {
        buf1[idx] *= twist;
        twist *= zeta;
    }

    ntt(buf1, half);
    a.resize(2 * half);
    for (int idx = 0; idx < half; ++idx) {
        a[half + idx].a = buf1[idx].a;
    }
}
};
// Coefficient vector of a polynomial / truncated power series over `mint`.
// Index i holds the coefficient of x^i.  Element-wise arithmetic is defined
// inline; everything that needs a transform (series product, ntt, intt, ...)
// is only declared here and defined outside the class.
template<typename mint>
struct FormalPowerSeries : Vec<mint>
{
    using Vec<mint>::Vec;
    using FPS = FormalPowerSeries;

    // --- compound assignment -------------------------------------------

    // Coefficient-wise addition; grows *this when r is longer.
    FPS& operator+=(const FPS& r)
    {
        const int rhs_len = static_cast<int>(r.size());
        if (this->size() < r.size()) this->resize(r.size());
        for (int idx = 0; idx < rhs_len; ++idx) {
            (*this)[idx] += r[idx];
        }
        return *this;
    }
    // Adds a constant to the x^0 coefficient (creating it if necessary).
    FPS& operator+=(const mint& r)
    {
        if (this->empty()) this->resize(1);
        (*this)[0] += r;
        return *this;
    }
    // Coefficient-wise subtraction; grows *this when r is longer.
    FPS& operator-=(const FPS& r)
    {
        const int rhs_len = static_cast<int>(r.size());
        if (this->size() < r.size()) this->resize(r.size());
        for (int idx = 0; idx < rhs_len; ++idx) {
            (*this)[idx] -= r[idx];
        }
        return *this;
    }
    // Subtracts a constant from the x^0 coefficient (creating it if necessary).
    FPS& operator-=(const mint& r)
    {
        if (this->empty()) this->resize(1);
        (*this)[0] -= r;
        return *this;
    }
    // Scales every coefficient by v.
    FPS& operator*=(const mint& v)
    {
        for (auto& coef : *this) {
            coef *= v;
        }
        return *this;
    }

    // --- binary operators: copy the left operand, then apply op= --------

    FPS operator+(const FPS& r) const
    {
        FPS result(*this);
        result += r;
        return result;
    }
    FPS operator+(const mint& v) const
    {
        FPS result(*this);
        result += v;
        return result;
    }
    FPS operator-(const FPS& r) const
    {
        FPS result(*this);
        result -= r;
        return result;
    }
    FPS operator-(const mint& v) const
    {
        FPS result(*this);
        result -= v;
        return result;
    }
    FPS operator*(const FPS& r) const
    {
        FPS result(*this);
        result *= r;
        return result;
    }
    FPS operator*(const mint& v) const
    {
        FPS result(*this);
        result *= v;
        return result;
    }
    // Negates every coefficient.
    FPS operator-() const
    {
        FPS negated(*this);
        for (auto& coef : negated) {
            coef = -coef;
        }
        return negated;
    }

    // Drops trailing coefficients that compare equal to zero.
    void shrink()
    {
        while (!this->empty() && this->back() == mint(0)) {
            this->pop_back();
        }
    }

    // --- transform-backed operations (defined out of class) -------------

    static void* ntt_ptr;  // type-erased transform engine, see set_fft()
    static void set_fft();
    FPS& operator*=(const FPS& r);
    void ntt();
    void intt();
    void ntt_doubling();
    static int ntt_pr();
    FPS inv(int deg = -1) const;
    FPS exp(int deg = -1) const;
};
// Out-of-class definition of the static engine pointer.  It starts out null;
// set_fft() stores a heap-allocated NTT<mint> here on first use, and the
// transform-backed members cast it back with static_cast<NTT<mint>*>.
template<typename mint>
void* FormalPowerSeries<mint>::ntt_ptr = nullptr;
/**
* @brief Polynomial / formal power series library
* @docs docs/fps/formal-power-series.md
*/
// Makes sure the shared NTT<mint> engine exists.
// The engine is created once, on the first call, and is never released.
template<typename mint>
void FormalPowerSeries<mint>::set_fft()
{
    if (ntt_ptr) {
        return;
    }
    ntt_ptr = new NTT<mint>;
}
// Series product, in place.
// An empty operand on either side yields an empty series; otherwise the work
// is delegated to NTT<mint>::multiply and the result replaces *this.
template<typename mint>
FormalPowerSeries<mint>&
FormalPowerSeries<mint>::operator*=(const FormalPowerSeries<mint>& r)
{
    const bool has_empty_factor = this->empty() || r.empty();
    if (has_empty_factor) {
        this->clear();
    } else {
        set_fft();
        auto* engine = static_cast<NTT<mint>*>(ntt_ptr);
        auto product = engine->multiply(*this, r);
        *this = FormalPowerSeries<mint>(product.begin(), product.end());
    }
    return *this;
}
// Forward transform of this series, in place, via the shared engine.
template<typename mint>
void FormalPowerSeries<mint>::ntt()
{
    set_fft();
    auto* engine = static_cast<NTT<mint>*>(ntt_ptr);
    engine->ntt(*this);
}
// Inverse transform of this series, in place, via the shared engine.
template<typename mint>
void FormalPowerSeries<mint>::intt()
{
    set_fft();
    auto* engine = static_cast<NTT<mint>*>(ntt_ptr);
    engine->intt(*this);
}
// Doubles the length of an already-transformed series via the shared engine.
template<typename mint>
void FormalPowerSeries<mint>::ntt_doubling()
{
    set_fft();
    auto* engine = static_cast<NTT<mint>*>(ntt_ptr);
    engine->ntt_doubling(*this);
}
// Returns the `pr` member of the shared engine (the base from which
// NTT::ntt_doubling derives its root of unity).
template<typename mint>
int FormalPowerSeries<mint>::ntt_pr()
{
    set_fft();
    auto* engine = static_cast<NTT<mint>*>(ntt_ptr);
    return engine->pr;
}
/**
* @brief FPS library for NTT-friendly moduli
* @docs docs/fps/ntt-friendly-fps.md
*/
// Multivariate multiplication of `f` and `g`, whose indices are mixed-radix
// encodings with the radices listed in `base`.
//
// Reads f[i] and g[i] for 0 <= i < f.size() and returns a series of the same
// length.  With an empty `base` the result is the single product f[0] * g[0].
//
// Outline, exactly as implemented below:
//   1. Every index i gets a class chi(i) in [0, s), s = base.size(), built
//      from the running quotients of i by base[0], base[1], ... base[s - 2].
//   2. f and g are split by class into s series of length W (the smallest
//      power of two >= 2n), and every part is transformed.
//   3. At each transform position the s x s class products are accumulated
//      cyclically (class (i + j) mod s).
//   4. After the inverse transforms, output i is read from class chi(i).
template<typename fps>
fps multivariate_multiplication(const fps& f,
                                const fps& g,
                                const Vec<int>& base)
{
    const int n = f.size();
    const int dims = base.size();
    if (dims == 0) {
        return fps{f[0] * g[0]};
    }

    int W = 1;
    while (W < 2 * n) {
        W <<= 1;
    }

    // Step 1: class of every index.
    Vec<int> chi(n);
    for (int idx = 0; idx < n; ++idx) {
        int quotient = idx;
        int total = 0;
        for (int d = 0; d + 1 < dims; ++d) {
            quotient /= base[d];
            total += quotient;
        }
        chi[idx] = total % dims;
    }

    // Step 2: split by class and transform.
    Vec<fps> F(dims, fps(W)), G(dims, fps(W));
    for (int idx = 0; idx < n; ++idx) {
        const int cls = chi[idx];
        F[cls][idx] = f[idx];
        G[cls][idx] = g[idx];
    }
    for (auto& part : F) {
        part.ntt();
    }
    for (auto& part : G) {
        part.ntt();
    }

    // Step 3: cyclic accumulation over the class index, one position at a time.
    fps acc(dims);
    for (int pos = 0; pos < W; ++pos) {
        std::fill(acc.begin(), acc.end(), typename fps::value_type());
        for (int i = 0; i < dims; ++i) {
            for (int j = 0; j < dims; ++j) {
                int slot = i + j;
                if (slot >= dims) {
                    slot -= dims;
                }
                acc[slot] += F[i][pos] * G[j][pos];
            }
        }
        for (int i = 0; i < dims; ++i) {
            F[i][pos] = acc[i];
        }
    }

    // Step 4: back to coefficients and pick the matching class per index.
    for (auto& part : F) {
        part.intt();
    }
    fps h(n);
    for (int idx = 0; idx < n; ++idx) {
        h[idx] = F[chi[idx]][idx];
    }
    return h;
}
/**
* @brief Multivariate Multiplication
* @docs docs/ntt/multivariate-multiplication.md
*/
// Integers modulo the compile-time odd constant `mod` (< 2^30), stored in
// Montgomery form with R = 2^32.
//
// "Lazy": the stored word `a` is not always fully reduced.  operator+= and
// operator-= correct by 2 * mod rather than mod, and the comparison operators
// subtract `mod` once from each side before comparing.
template<uint32_t mod>
struct LazyMontgomeryModInt
{
    using mint = LazyMontgomeryModInt;
    using i32 = int32_t;
    using u32 = uint32_t;
    using u64 = uint64_t;

    // Inverse of mod modulo 2^32, by repeated Newton steps.
    static constexpr u32 get_r()
    {
        u32 inv = mod;
        for (i32 step = 0; step != 4; ++step) {
            inv *= 2 - mod * inv;
        }
        return inv;
    }

    static constexpr u32 r = get_r();           // r * mod == 1 (mod 2^32)
    static constexpr u32 n2 = -u64(mod) % mod;  // 2^64 mod `mod`

    static_assert(r * mod == 1, "invalid, r * mod != 1");
    static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
    static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");

    u32 a;  // Montgomery representation of the value

    constexpr LazyMontgomeryModInt() : a(0) {}

    // Converts an ordinary (possibly negative) integer into Montgomery form.
    constexpr LazyMontgomeryModInt(const int64_t& b)
        : a(reduce(u64(b % mod + mod) * n2))
    {
    }

    // Montgomery reduction of a 64-bit word.
    static constexpr u32 reduce(const u64& b)
    {
        const u32 q = u32(b) * u32(-r);
        return (b + u64(q) * mod) >> 32;
    }

    constexpr mint& operator+=(const mint& b)
    {
        a += b.a - 2 * mod;
        if (i32(a) < 0) {
            a += 2 * mod;
        }
        return *this;
    }

    constexpr mint& operator-=(const mint& b)
    {
        a -= b.a;
        if (i32(a) < 0) {
            a += 2 * mod;
        }
        return *this;
    }

    constexpr mint& operator*=(const mint& b)
    {
        a = reduce(u64(a) * b.a);
        return *this;
    }

    constexpr mint& operator/=(const mint& b)
    {
        return *this *= b.inverse();
    }

    constexpr mint operator+(const mint& b) const
    {
        mint res(*this);
        res += b;
        return res;
    }

    constexpr mint operator-(const mint& b) const
    {
        mint res(*this);
        res -= b;
        return res;
    }

    constexpr mint operator*(const mint& b) const
    {
        mint res(*this);
        res *= b;
        return res;
    }

    constexpr mint operator/(const mint& b) const
    {
        mint res(*this);
        res /= b;
        return res;
    }

    // Both sides are brought below `mod` before the words are compared.
    constexpr bool operator==(const mint& b) const
    {
        const u32 lhs = (a >= mod ? a - mod : a);
        const u32 rhs = (b.a >= mod ? b.a - mod : b.a);
        return lhs == rhs;
    }

    constexpr bool operator!=(const mint& b) const
    {
        return !(*this == b);
    }

    constexpr mint operator-() const
    {
        mint res = mint();
        res -= *this;
        return res;
    }

    // Square-and-multiply exponentiation.
    constexpr mint pow(u64 n) const
    {
        mint acc(1);
        mint base(*this);
        for (; n > 0; n >>= 1) {
            if (n & 1) {
                acc *= base;
            }
            base *= base;
        }
        return acc;
    }

    // Computed as the (mod - 2)-th power.
    constexpr mint inverse() const
    {
        return pow(mod - 2);
    }

    friend Ostream& operator<<(Ostream& os, const mint& b)
    {
        os << b.get();
        return os;
    }

    friend Istream& operator>>(Istream& is, mint& b)
    {
        int64_t raw;
        is >> raw;
        b = LazyMontgomeryModInt<mod>(raw);
        return is;
    }

    // Ordinary representative in [0, mod).
    constexpr u32 get() const
    {
        const u32 plain = reduce(a);
        if (plain >= mod) {
            return plain - mod;
        }
        return plain;
    }

    static constexpr u32 get_mod()
    {
        return mod;
    }
};
// Modulus used for all arithmetic in main().  MOD - 1 = 115 * 2^20, so
// MOD - 1 is divisible by 2^20 but not by any larger power of two.
constexpr u32 MOD = (1 << 20) * 115 + 1; // 120586241
// Scalar and series types specialised for MOD.
using mint = LazyMontgomeryModInt<MOD>;
using fps = FormalPowerSeries<mint>;
// Reads N, K, M, T and N integers, raises their indicator series to the M-th
// power under `mul` (defined below), and prints one coefficient for each of
// the 10^K decimal indices.
//
// Index layout: a K-digit decimal value is re-encoded digit by digit in a
// mixed radix, radix 10 for the T lowest digit positions and radix 20 for
// the remaining K - T.  After every product each upper digit d is folded
// back to d % 10.
int main()
{
    const auto [N, K, M, T] = in.tup<int, int, i64, int>();
    const auto as = in.vec<int>(N);
    // Fail fast on inputs the code below cannot handle: p10s[T] and
    // p20s[K - T] would index out of range, and power() would recurse
    // forever for M < 1.
    assert(0 <= T && T <= K);
    assert(M >= 1);

    // Radix of every digit position, lowest digit first.
    Vec<int> ns(K);
    for (int i : rep(K)) {
        if (i < T) {
            ns[i] = 10;
        } else {
            ns[i] = 20;
        }
    }
    // Powers of 10 and of 20.
    Vec<int> p10s(K + 1, 1);
    Vec<int> p20s(K + 1, 1);
    for (int i : rep(K)) {
        p10s[i + 1] = p10s[i] * 10;
        p20s[i + 1] = p20s[i] * 20;
    }
    const int B1 = p10s[T];      // number of lower (radix-10) index values
    const int B2 = p20s[K - T];  // number of upper (radix-20) index values

    // Decimal value D -> mixed-radix index X with the same digits.
    auto d2x = [&, K = K, T = T](int D) -> int {
        int X = 0;
        int B = 1;
        for (int i : rep(K)) {
            const int dig = (i < T ? 10 : 20);
            X += (D % 10) * B;
            D /= 10;
            B *= dig;
        }
        return X;
    };
    // Mixed-radix index X -> {true, decimal value}, or {false, 0} when some
    // digit of X is 10 or more and therefore has no decimal counterpart.
    auto x2d = [&, K = K, T = T](int X) -> Pair<bool, int> {
        int D = 0;
        for (int i : rep(K)) {
            const int dig = (i < T ? 10 : 20);
            if (X % dig >= 10) { return {false, 0}; }
            D += p10s[i] * (X % dig);
            X /= dig;
        }
        return {true, D};
    };
    // Product of two series followed by the fold of the upper digits.
    auto mul = [&, K = K, T = T](const fps& f, const fps& g) {
        // multivariate_multiplication transforms at the smallest power of
        // two >= 2 * f.size().  A transform of that length needs a root of
        // unity of that order, which exists only when the length divides
        // MOD - 1 = 115 * 2^20 (MOD is assumed prime).  Longer transforms
        // would silently produce wrong coefficients, so stop here instead.
        int ntt_len = 1;
        while (ntt_len < 2 * (int)f.size())
            ntt_len *= 2;
        assert((MOD - 1) % static_cast<u32>(ntt_len) == 0);

        auto h = multivariate_multiplication(f, g, ns);
        for (int n2 : rep(B2)) {
            // nn2 = n2 with every radix-20 digit d replaced by d % 10.
            int tmp = n2;
            int nn2 = 0;
            for (int i : rep(K - T)) {
                int d = tmp % 20;
                tmp /= 20;
                nn2 += (d % 10) * p20s[i];
            }
            if (n2 == nn2) { continue; }
            // Move the whole lower block of n2 onto the block of nn2.
            for (int n1 : rep(B1)) {
                h[nn2 * B1 + n1] += h[n2 * B1 + n1];
                h[n2 * B1 + n1] = 0;
            }
        }
        return h;
    };
    // f raised to the M-th power under `mul`, by binary exponentiation.
    // Requires M >= 1 (asserted above).
    auto power = Fix([&](auto dfs, const fps& f, const i64 M) -> fps {
        if (M == 1) {
            return f;
        } else if (M % 2 == 0) {
            return dfs(mul(f, f), M / 2);
        } else {
            return mul(dfs(f, M - 1), f);
        }
    });

    // Indicator series of the input values (with multiplicity).
    fps f(B2 * B1, 0);
    for (int i : rep(N)) {
        f[d2x(as[i])] += 1;
    }
    Vec<mint> ans(p10s[K]);
    const auto dp = power(f, M);
    // Translate mixed-radix indices back to decimal ones.
    for (int i : rep(B1 * B2)) {
        const auto [b, j] = x2d(i);
        if (b) { ans[j] += dp[i]; }
    }
    for (int i : rep(p10s[K])) {
        out.ln(ans[i]);
    }
    return 0;
}