#pragma region opt #pragma GCC optimize("O3") #pragma GCC optimize("fast-math") #pragma GCC optimize("unroll-loops") #pragma GCC target("avx2") #pragma endregion opt #pragma region header #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #pragma endregion header #pragma region type /* signed integer */ typedef int8_t i8; typedef int16_t i16; typedef int32_t i32; typedef int64_t i64; typedef __int128_t i128; /* unsigned integer */ typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32; typedef uint64_t u64; typedef __uint128_t u128; /* floating point number */ typedef float f32; typedef double f64; typedef long double f80; #pragma endregion type #pragma region macro #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define SWAP(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b))) #define POPCNT32(a) __builtin_popcount((a)) #define POPCNT64(a) __builtin_popcountll((a)) #define CTZ32(a) __builtin_ctz((a)) #define CLZ32(a) __builtin_clz((a)) #define CTZ64(a) __builtin_ctzll((a)) #define CLZ64(a) __builtin_clzll((a)) #define HAS_SINGLE_BIT32(a) (__builtin_popcount((a)) == (1)) #define HAS_SINGLE_BIT64(a) (__builtin_popcountll((a)) == (1)) #define MSB32(a) ((31) - __builtin_clz((a))) #define MSB64(a) ((63) - __builtin_clzll((a))) #define BIT_WIDTH32(a) ((a) ? ((32) - __builtin_clz((a))) : (0)) #define BIT_WIDTH64(a) ((a) ? ((64) - __builtin_clzll((a))) : (0)) #define LSBit(a) ((a) & (-(a))) #define CLSBit(a) ((a) & ((a) - (1))) #define BIT_CEIL32(a) ((!(a)) ? (1) : ((POPCNT32(a)) == (1) ? ((1u) << ((31) - CLZ32((a)))) : ((1u) << ((32) - CLZ32(a))))) #define BIT_CEIL64(a) ((!(a)) ? (1) : ((POPCNT64(a)) == (1) ? ((1ull) << ((63) - CLZ64((a)))) : ((1ull) << ((64) - CLZ64(a))))) #define BIT_FLOOR32(a) ((!(a)) ? (0) : ((1u) << ((31) - CLZ32((a))))) #define BIT_FLOOR64(a) ((!(a)) ? (0) : ((1ull) << ((63) - CLZ64((a))))) #define _ROTL64(x, s) (((x) << ((s) % (64))) | (((x) >> ((64) - ((s) % (64)))))) #define _ROTR64(x, s) (((x) >> ((s) % (64))) | (((x) << ((64) - ((s) % (64)))))) #define ROTL64(x, s) (((s) == (0)) ? (x) : ((((i128)(s)) < (0)) ? (_ROTR64((x), -(s))) : (_ROTL64((x), (s))))) #define ROTR64(x, s) (((s) == (0)) ? (x) : ((((i128)(s)) < (0)) ? (_ROTL64((x), -(s))) : (_ROTR64((x), (s))))) #pragma endregion macro #pragma region io static inline int read_int(void) { // -2147483648 ~ 2147483647 (> 10 ^ 9) int c, x = 0, f = 1; while (c = getchar_unlocked(), c < 48 || c > 57) if (c == 45) f = -f; while (47 < c && c < 58) { x = x * 10 + c - 48; c = getchar_unlocked(); } return f * x; } static inline i32 in_i32(void) { // -2147483648 ~ 2147483647 (> 10 ^ 9) i32 c, x = 0, f = 1; while (c = getchar_unlocked(), c < 48 || c > 57) if (c == 45) f = -f; while (47 < c && c < 58) { x = x * 10 + c - 48; c = getchar_unlocked(); } return f * x; } static inline u32 in_u32(void) { // 0 ~ 4294967295 (> 10 ^ 9) u32 c, x = 0; while (c = getchar_unlocked(), c < 48 || c > 57); while (47 < c && c < 58) { x = x * 10 + c - 48; c = getchar_unlocked(); } return x; } static inline i64 in_i64(void) { // -9223372036854775808 ~ 9223372036854775807 (> 10 ^ 18) i64 c, x = 0, f = 1; while (c = getchar_unlocked(), c < 48 || c > 57) if (c == 45) f = -f; while (47 < c && c < 58) { x = x * 10 + c - 48; c = getchar_unlocked(); } return f * x; } static inline u64 in_u64(void) { // 0 ~ 18446744073709551615 (> 10 ^ 19) u64 c, x = 0; while (c = getchar_unlocked(), c < 48 || c > 57); while (47 < c && c < 58) { x = x * 10 + c - 48; c = getchar_unlocked(); } return x; } static inline void write_int_inner(int x) { if (x >= 10) write_int_inner(x / 10); putchar_unlocked(x - x / 10 * 10 + 48); } static inline void write_int(int x) { if (x < 0) { putchar_unlocked('-'); x = -x; } write_int_inner(x); } static inline void out_i32_inner(i32 x) { if (x >= 10) out_i32_inner(x / 10); putchar_unlocked(x - x / 10 * 10 + 48); } static inline void out_i32(i32 x) { if (x < 0) { putchar_unlocked('-'); x = -x; } out_i32_inner(x); } static inline void out_u32(u32 x) { if (x >= 10) out_u32(x / 10); putchar_unlocked(x - x / 10 * 10 + 48); } static inline void out_i64_inner(i64 x) { if (x >= 10) out_i64_inner(x / 10); putchar_unlocked(x - x / 10 * 10 + 48); } static inline void out_i64(i64 x) { if (x < 0) { putchar_unlocked('-'); x = -x; } out_i64_inner(x); } static inline void out_u64(u64 x) { if (x >= 10) out_u64(x / 10); putchar_unlocked(x - x / 10 * 10 + 48); } static inline void NL(void) { putchar_unlocked('\n'); } static inline void SP(void) { putchar_unlocked(' '); } #pragma endregion io #pragma region dump void dump_int(int x) { fprintf(stderr, "\033[1;31m%d\033[0m\n", x); } void dump_i32(i32 x) { fprintf(stderr, "\033[1;32m%d\033[0m\n", x); } void dump_i64(i64 x) { fprintf(stderr, "\033[1;33m%ld\033[0m\n", x); } void dump_u32(u32 x) { fprintf(stderr, "\033[1;34m%u\033[0m\n", x); } void dump_u64(u64 x) { fprintf(stderr, "\033[1;35m%lu\033[0m\n", x); } void dump_int_array(int *a, int a_len) { fprintf(stderr, "\033[1;31m"); for (int i = 0; i < a_len; i++) { if (i == a_len - 1) fprintf(stderr, "%d\n", a[i]); else fprintf(stderr, "%d ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_i32_array(i32 *a, int a_len) { fprintf(stderr, "\033[1;32m"); for (int i = 0; i < a_len; i++) { if (i == a_len - 1) fprintf(stderr, "%d\n", a[i]); else fprintf(stderr, "%d ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_i64_array(i64 *a, int a_len) { fprintf(stderr, "\033[1;33m"); for (int i = 0; i < a_len; i++) { if (i == a_len - 1) fprintf(stderr, "%ld\n", a[i]); else fprintf(stderr, "%ld ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_u32_array(u32 *a, int a_len) { fprintf(stderr, "\033[1;34m"); for (int i = 0; i < a_len; i++) { if (i == a_len - 1) fprintf(stderr, "%u\n", a[i]); else fprintf(stderr, "%u ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_u64_array(u64 *a, int a_len) { fprintf(stderr, "\033[1;35m"); for (int i = 0; i < a_len; i++) { if (i == a_len - 1) fprintf(stderr, "%lu\n", a[i]); else fprintf(stderr, "%lu ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_int_array_range(int *a, int a_len, int l, int r) { if (r < l || r <= 0 || l >= a_len) return; if (l < 0) l = 0; if (r >= a_len) r = a_len - 1; fprintf(stderr, "\033[1;31m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%d\n", a[i]); else fprintf(stderr, "%d ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_i32_array_range(i32 *a, int a_len, int l, int r) { if (r < l || r <= 0 || l >= a_len) return; if (l < 0) l = 0; if (r >= a_len) r = a_len - 1; fprintf(stderr, "\033[1;32m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%d\n", a[i]); else fprintf(stderr, "%d ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_i64_array_range(i64 *a, int a_len, int l, int r) { if (r < l || r <= 0 || l >= a_len) return; if (l < 0) l = 0; if (r >= a_len) r = a_len - 1; fprintf(stderr, "\033[1;33m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%ld\n", a[i]); else fprintf(stderr, "%ld ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_u32_array_range(u32 *a, int a_len, int l, int r) { if (r < l || r <= 0 || l >= a_len) return; if (l < 0) l = 0; if (r >= a_len) r = a_len - 1; fprintf(stderr, "\033[1;34m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%u\n", a[i]); else fprintf(stderr, "%u ", a[i]); } fprintf(stderr, "\033[0m"); } void dump_u64_array_range(u64 *a, int a_len, int l, int r) { if (r < l || r <= 0 || l >= a_len) return; if (l < 0) l = 0; if (r >= a_len) r = a_len - 1; fprintf(stderr, "\033[1;35m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%lu\n", a[i]); else fprintf(stderr, "%lu ", a[i]); } fprintf(stderr, "\033[0m"); } void printb(u64 v) { u64 mask = (u64)1 << (sizeof(v) * CHAR_BIT - 1); do putchar(mask & v ? '1' : '0'); while (mask >>= 1); } void putb(u64 v) { putchar('0'), putchar('b'), printb(v), putchar('\n'); } #pragma endregion dump #pragma region montgomery32 typedef uint32_t m32; m32 _one_m32(u32 mod) { return -1u % mod + 1; } m32 _r2_m32(u32 mod) { return (u64)(i64)-1 % mod + 1; } m32 _inv_m32(u32 mod) { u32 u = 1, v = 0, x = 1u << 31; for (int i = 0; i < 32; i++) { if (u & 1) u = (u + mod) >> 1, v = (v >> 1) + x; else u >>= 1, v >>= 1; } return -v; } m32 _reduce_m32(u64 a, m32 inv, u32 mod) { i64 z = (a >> 32) - ((((u32)a * inv) * (u64)mod) >> 32); return z < 0 ? z + mod : z; } m32 to_m32(u32 a, m32 r2, m32 inv, u32 mod) { return _reduce_m32((u64)a * r2, inv, mod); } u32 from_m32(m32 A, m32 inv, u32 mod) { m32 t = _reduce_m32((u64)A, inv, mod) - mod; return t + (mod & -(t >> 31u)); } m32 add_m32(m32 A, m32 B, u32 mod2) { // assert(mod2 == (mod << 1)); A += B - mod2; A += mod2 & -(A >> 31u); return A; } m32 sub_m32(m32 A, m32 B, u32 mod2) { // assert(mod2 == (mod << 1)); A -= B; A += mod2 & -(A >> 31u); return A; } m32 min_m32(m32 A, u32 mod2) { // assert(mod2 == (mod << 1)); return sub_m32(0u, A, mod2); } m32 mul_m32(m32 A, m32 B, m32 inv, u32 mod) { return _reduce_m32((u64)A * B, inv, mod); } m32 pow_m32(m32 A, i64 n, m32 inv, u32 mod) { m32 ret = _one_m32(mod); while (n > 0) { if (n & 1) ret = mul_m32(ret, A, inv, mod); A = mul_m32(A, A, inv, mod); n >>= 1; } return ret; } m32 inv_m32(m32 A, m32 inv, u32 mod) { return pow_m32(A, (i64)mod - 2, inv, mod); } m32 div_m32(m32 A, m32 B, m32 inv, u32 mod) { /* assert(is_prime(mod)); */ return mul_m32(A, inv_m32(B, inv, mod), inv, mod); } m32 in_m32(m32 r2, m32 inv, u32 mod) { u32 c, a = 0; while (c = getchar_unlocked(), c < 48 || c > 57); while (47 < c && c < 58) { a = a * 10 + c - 48; c = getchar_unlocked(); } return to_m32(a, r2, inv, mod); } void out_m32(m32 A, m32 inv, u32 mod) { u32 a = from_m32(A, inv, mod); out_u32(a); } void dump_m32(m32 x, m32 inv, u32 mod) { fprintf(stderr, "\033[1;34m%u\033[0m\n", from_m32(x, inv, mod)); } void dump_m32_array(m32 *x, int x_len, m32 inv, u32 mod) { fprintf(stderr, "\033[1;37m"); for (int i = 0; i < x_len; i++) { if (i == x_len - 1) fprintf(stderr, "%u\n", x[i]); else fprintf(stderr, "%u ", x[i]); } fprintf(stderr, "\033[0m"); fprintf(stderr, "\033[1;33m"); for (int i = 0; i < x_len; i++) { if (i == x_len - 1) fprintf(stderr, "%u\n", from_m32(x[i], inv, mod)); else fprintf(stderr, "%u ", from_m32(x[i], inv, mod)); } fprintf(stderr, "\033[0m"); } void dump_m32_array_range(m32 *x, int x_len, int l, int r, m32 inv, u32 mod) { if (r < l || r <= 0 || l >= x_len) return; if (l < 0) l = 0; if (r >= x_len) r = x_len - 1; fprintf(stderr, "\033[1;31m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%u\n", x[i]); else fprintf(stderr, "%u ", x[i]); } fprintf(stderr, "\033[0m"); fprintf(stderr, "\033[1;35m"); for (int i = l; i <= r; i++) { if (i == r) fprintf(stderr, "%u\n", from_m32(x[i], inv, mod)); else fprintf(stderr, "%u ", from_m32(x[i], inv, mod)); } fprintf(stderr, "\033[0m"); } #pragma endregion montgomery32 #pragma region ntt1000000007 const u32 pr1 = 3u; const u32 pr2 = 6u; const u32 pr3 = 7u; const u32 m1 = 1045430273u; const u32 m1_d = 2090860546u; const m32 m1_r2 = 798281873u; const m32 m1_inv = 3249537025u; const m32 m1_one = 113246204u; const u32 m2 = 1051721729u; const u32 m2_d = 2103443458u; const m32 m2_r2 = 748646691u; const m32 m2_inv = 3243245569u; const m32 m2_one = 88080380u; const u32 m3 = 1053818881u; const u32 m3_d = 2107637762u; const m32 m3_r2 = 159648582u; const m32 m3_inv = 3241148417u; const m32 m3_one = 79691772u; const m32 sum_e1[] = { 885458361u, 900924659u, 962059782u, 415593171u, 1042963020u, 670679244u, 541100730u, 791232404u, 1017138063u, 1009790060u, 862479644u, 14783221u, 244867424u, 431174616u, 403070125u, 826977693u, 130509512u, 234433839u, 583350782u }; const m32 sum_e2[] = { 467413740u, 850027207u, 242082110u, 469704687u, 798875242u, 948071666u, 405859818u, 794843055u, 771520840u, 176008702u, 644573208u, 715864305u, 237085256u, 357934180u, 654888890u, 961560509u, 740357787u, 329663983u, 679393590u }; const m32 sum_e3[] = { 882957072u, 871538451u, 673579336u, 73537272u, 425574675u, 697950553u, 938595210u, 127556594u, 879998874u, 1019737367u, 353383985u, 36465421u, 901066455u, 93757427u, 979102987u, 501430327u, 40143827u, 92524426u, 698947874u }; const m32 sum_ie1[] = { 159971912u, 22701698u, 360170016u, 9164511u, 967212496u, 707952636u, 928400066u, 788102537u, 248599354u, 954265785u, 539282025u, 749013941u, 66727014u, 835563346u, 909028729u, 532555753u, 488995841u, 863413679u, 575546441u }; const m32 sum_ie2[] = { 584307989u, 414819672u, 727414907u, 1042290158u, 763418088u, 582659108u, 106169741u, 486228312u, 106468254u, 164241005u, 856035592u, 117294265u, 988587910u, 117912639u, 562619904u, 608072619u, 298409394u, 523681378u, 94283612u }; const m32 sum_ie3[] = { 170861809u, 516220135u, 927824564u, 603681864u, 586404698u, 720550924u, 639641177u, 182451670u, 257278119u, 437349354u, 600413897u, 648372120u, 399254828u, 182276433u, 483901237u, 893329600u, 390180935u, 919465749u, 333795344u }; void ntt1(m32 *A, int A_len) { int h = 0; while (A_len > (1 << h)) h++; for (int ph = 1; ph <= h; ph++) { int w = 1 << (ph - 1); int p = 1 << (h - ph); m32 now = m1_one; for (int s = 0; s < w; s++) { int offset = s << (h - ph + 1); for (int i = 0; i < p; i++) { m32 l = A[i + offset]; m32 r = mul_m32(A[i + offset + p], now, m1_inv, m1); A[i + offset] = add_m32(l, r, m1_d); A[i + offset + p] = sub_m32(l, r, m1_d); } now = mul_m32(now, sum_e1[CTZ32(~s)], m1_inv, m1); } } } void ntt2(m32 *A, int A_len) { int h = 0; while (A_len > (1 << h)) h++; for (int ph = 1; ph <= h; ph++) { int w = 1 << (ph - 1); int p = 1 << (h - ph); m32 now = m2_one; for (int s = 0; s < w; s++) { int offset = s << (h - ph + 1); for (int i = 0; i < p; i++) { m32 l = A[i + offset]; m32 r = mul_m32(A[i + offset + p], now, m2_inv, m2); A[i + offset] = add_m32(l, r, m2_d); A[i + offset + p] = sub_m32(l, r, m2_d); } now = mul_m32(now, sum_e2[CTZ32(~s)], m2_inv, m2); } } } void ntt3(m32 *A, int A_len) { int h = 0; while (A_len > (1 << h)) h++; for (int ph = 1; ph <= h; ph++) { int w = 1 << (ph - 1); int p = 1 << (h - ph); m32 now = m3_one; for (int s = 0; s < w; s++) { int offset = s << (h - ph + 1); for (int i = 0; i < p; i++) { m32 l = A[i + offset]; m32 r = mul_m32(A[i + offset + p], now, m3_inv, m3); A[i + offset] = add_m32(l, r, m3_d); A[i + offset + p] = sub_m32(l, r, m3_d); } now = mul_m32(now, sum_e3[CTZ32(~s)], m3_inv, m3); } } } void intt1(m32 *A, int A_len) { int h = 0; while (A_len > (1 << h)) h++; for (int ph = h; ph >= 1; ph--) { int w = 1 << (ph - 1); int p = 1 << (h - ph); m32 inow = m1_one; for (int s = 0; s < w; s++) { int offset = s << (h - ph + 1); for (int i = 0; i < p; i++) { m32 l = A[i + offset]; m32 r = A[i + offset + p]; A[i + offset] = add_m32(l, r, m1_d); A[i + offset + p] = mul_m32(sub_m32(l, r, m1_d), inow, m1_inv, m1); } inow = mul_m32(inow, sum_ie1[CTZ32(~s)], m1_inv, m1); } } m32 inv2t = inv_m32(to_m32(A_len, m1_r2, m1_inv, m1), m1_inv, m1); for (int i = 0; i < A_len; i++) A[i] = from_m32(mul_m32(A[i], inv2t, m1_inv, m1), m1_inv, m1); } void intt2(m32 *A, int A_len) { int h = 0; while (A_len > (1 << h)) h++; for (int ph = h; ph >= 1; ph--) { int w = 1 << (ph - 1); int p = 1 << (h - ph); m32 inow = m2_one; for (int s = 0; s < w; s++) { int offset = s << (h - ph + 1); for (int i = 0; i < p; i++) { m32 l = A[i + offset]; m32 r = A[i + offset + p]; A[i + offset] = add_m32(l, r, m2_d); A[i + offset + p] = mul_m32(sub_m32(l, r, m2_d), inow, m2_inv, m2); } inow = mul_m32(inow, sum_ie2[CTZ32(~s)], m2_inv, m2); } } m32 inv2t = inv_m32(to_m32(A_len, m2_r2, m2_inv, m2), m2_inv, m2); for (int i = 0; i < A_len; i++) A[i] = from_m32(mul_m32(A[i], inv2t, m2_inv, m2), m2_inv, m2); } void intt3(m32 *A, int A_len) { int h = 0; while (A_len > (1 << h)) h++; for (int ph = h; ph >= 1; ph--) { int w = 1 << (ph - 1); int p = 1 << (h - ph); m32 inow = m3_one; for (int s = 0; s < w; s++) { int offset = s << (h - ph + 1); for (int i = 0; i < p; i++) { m32 l = A[i + offset]; m32 r = A[i + offset + p]; A[i + offset] = add_m32(l, r, m3_d); A[i + offset + p] = mul_m32(sub_m32(l, r, m3_d), inow, m3_inv, m3); } inow = mul_m32(inow, sum_ie3[CTZ32(~s)], m3_inv, m3); } } m32 inv2t = inv_m32(to_m32(A_len, m3_r2, m3_inv, m3), m3_inv, m3); for (int i = 0; i < A_len; i++) A[i] = from_m32(mul_m32(A[i], inv2t, m3_inv, m3), m3_inv, m3); } u32 *convolute1(m32 *A, int A_len, m32 *B, int B_len) { int C_len = BIT_CEIL32(A_len + B_len - 1); m32 *C = (m32 *)calloc(C_len, sizeof(m32)); m32 *D = (m32 *)calloc(C_len, sizeof(m32)); #ifdef LOCAL if (C == NULL || D == NULL) exit(EXIT_FAILURE); #endif memcpy(C, A, sizeof(m32) * A_len); memcpy(D, B, sizeof(m32) * B_len); ntt1(C, C_len); ntt1(D, C_len); for (int i = 0; i < C_len; i++) C[i] = mul_m32(C[i], D[i], m1_inv, m1); free(D); intt1(C, C_len); return C; } u32 *convolute2(m32 *A, int A_len, m32 *B, int B_len) { int C_len = BIT_CEIL32(A_len + B_len - 1); m32 *C = (m32 *)calloc(C_len, sizeof(m32)); m32 *D = (m32 *)calloc(C_len, sizeof(m32)); #ifdef LOCAL if (C == NULL || D == NULL) exit(EXIT_FAILURE); #endif memcpy(C, A, sizeof(m32) * A_len); memcpy(D, B, sizeof(m32) * B_len); ntt2(C, C_len); ntt2(D, C_len); for (int i = 0; i < C_len; i++) C[i] = mul_m32(C[i], D[i], m2_inv, m2); free(D); intt2(C, C_len); return C; } u32 *convolute3(m32 *A, int A_len, m32 *B, int B_len) { int C_len = BIT_CEIL32(A_len + B_len - 1); m32 *C = (m32 *)calloc(C_len, sizeof(m32)); m32 *D = (m32 *)calloc(C_len, sizeof(m32)); #ifdef LOCAL if (C == NULL || D == NULL) exit(EXIT_FAILURE); #endif memcpy(C, A, sizeof(m32) * A_len); memcpy(D, B, sizeof(m32) * B_len); ntt3(C, C_len); ntt3(D, C_len); for (int i = 0; i < C_len; i++) C[i] = mul_m32(C[i], D[i], m3_inv, m3); free(D); intt3(C, C_len); return C; } u32 *arbitrary_mod_convolute(u32 *a, int a_len, u32 *b, int b_len, u32 mod) { u32 m0 = mod; u32 m0_d = m0 << 1; m32 m0_r2 = _r2_m32(m0); m32 m0_inv = _inv_m32(m0); m32 *A1 = (m32 *)calloc(a_len, sizeof(m32)); m32 *A2 = (m32 *)calloc(a_len, sizeof(m32)); m32 *A3 = (m32 *)calloc(a_len, sizeof(m32)); m32 *B1 = (m32 *)calloc(b_len, sizeof(m32)); m32 *B2 = (m32 *)calloc(b_len, sizeof(m32)); m32 *B3 = (m32 *)calloc(b_len, sizeof(m32)); #ifdef LOCAL if (A1 == NULL || A2 == NULL || A3 == NULL || B1 == NULL || B2 == NULL || B3 == NULL)) { exit(EXIT_FAILURE); } #endif for (int i = 0; i < a_len; i++) { A1[i] = to_m32(a[i], m1_r2, m1_inv, m1); A2[i] = to_m32(a[i], m2_r2, m2_inv, m2); A3[i] = to_m32(a[i], m3_r2, m3_inv, m3); } for (int i = 0; i < b_len; i++) { B1[i] = to_m32(b[i], m1_r2, m1_inv, m1); B2[i] = to_m32(b[i], m2_r2, m2_inv, m2); B3[i] = to_m32(b[i], m3_r2, m3_inv, m3); } u32 *x = convolute1(A1, a_len, B1, b_len); u32 *y = convolute2(A2, a_len, B2, b_len); u32 *z = convolute3(A3, a_len, B3, b_len); free(A1); free(A2); free(A3); free(B1); free(B2); free(B3); m32 m1_inv_m2 = inv_m32(to_m32(m1, m2_r2, m2_inv, m2), m2_inv, m2); m32 m12_inv_m3 = inv_m32(mul_m32(to_m32(m1, m3_r2, m3_inv, m3), to_m32(m2, m3_r2, m3_inv, m3), m3_inv, m3), m3_inv, m3); m32 m1_m3 = to_m32(m1, m3_r2, m3_inv, m3); m32 m1_m0 = to_m32(m1, m0_r2, m0_inv, m0); m32 m12_m0 = mul_m32(to_m32(m1, m0_r2, m0_inv, m0), to_m32(m2, m0_r2, m0_inv, m0), m0_inv, m0); u32 *ret = (u32 *)calloc(a_len + b_len - 1, sizeof(u32)); #ifdef LOCAL if (ret == NULL) exit(EXIT_FAILURE); #endif for (u32 i = 0; i < a_len + b_len - 1; i++) { m32 xi_m2 = to_m32(x[i], m2_r2, m2_inv, m2); m32 yi_m2 = to_m32(y[i], m2_r2, m2_inv, m2); m32 zi_m3 = to_m32(z[i], m3_r2, m3_inv, m3); m32 xi_m3 = to_m32(x[i], m3_r2, m3_inv, m3); u32 v1 = from_m32(mul_m32(sub_m32(yi_m2, xi_m2, m2_d), m1_inv_m2, m2_inv, m2), m2_inv, m2); m32 v1_m3 = to_m32(v1, m3_r2, m3_inv, m3); u32 v2 = from_m32(mul_m32(sub_m32(zi_m3, add_m32(xi_m3, mul_m32(m1_m3, v1_m3, m3_inv, m3), m3_d), m3_d), m12_inv_m3, m3_inv, m3), m3_inv, m3); m32 v2_m0 = to_m32(v2, m0_r2, m0_inv, m0); m32 xi_m0 = to_m32(x[i], m0_r2, m0_inv, m0); m32 v1_m0 = to_m32(v1, m0_r2, m0_inv, m0); ret[i] = from_m32(add_m32(add_m32(xi_m0, mul_m32(m1_m0, v1_m0, m0_inv, m0), m0_d), mul_m32(m12_m0, v2_m0, m0_inv, m0), m0_d), m0_inv, m0); } free(x); free(y); free(z); return ret; } #pragma endregion ntt1000000007 #pragma region sample point shift m32 _fact[1<<20]; m32 _inv_fact[1<<20]; m32 _inv_table[1<<20]; void pre_fact(int n, m32 r2, m32 inv, u32 mod) { _fact[0] = 1; for (int i = 0; i <= n + 1; i++) _fact[i + 1] = mul_m32(_fact[i], to_m32(i + 1, r2, inv, mod), inv, mod); _inv_fact[n + 2] = inv_m32(_fact[n + 2], inv, mod); for (int i = n + 2; i > 0; i--) _inv_fact[i - 1] = mul_m32(_inv_fact[i], to_m32(i, r2, inv, mod), inv, mod); for (int i = 1; i <= n + 1; i++) _inv_table[i] = mul_m32(_inv_fact[i], _fact[i - 1], inv, mod); } m32 *sample_point_shift(m32 *A, int A_len, m32 m, m32 r2, m32 inv, u32 mod) { u32 *f = (m32 *)calloc(A_len, sizeof(m32)); u32 *g = (m32 *)calloc((A_len << 1) - 1, sizeof(m32)); #ifdef LOCAL if (F == NULL || G == NULL || f == NULL || g == NULL) exit(EXIT_FAILURE); #endif for (int i = 0; i < A_len; i++) { f[i] = mul_m32(mul_m32(A[i], _inv_fact[i], inv, mod), _inv_fact[A_len - 1 - i], inv, mod); if ((A_len - 1 - i) & 1) f[i] = min_m32(f[i], mod << 1); f[i] = from_m32(f[i], inv, mod); } m32 d_m = to_m32(A_len - 1, r2, inv, mod); for (int i = 0; i < (A_len << 1) - 1; i++) { m32 i_m = to_m32(i, r2, inv, mod); g[i] = inv_m32(add_m32(sub_m32(m, d_m, mod << 1), i_m, mod << 1), inv, mod); g[i] = from_m32(g[i], inv, mod); } u32 *h = arbitrary_mod_convolute(f, A_len, g, (A_len << 1) - 1, mod); m32 coef = _one_m32(mod); for (int i = 0; i < (A_len << 1) - 1; i++) h[i] = to_m32(h[i], r2, inv, mod); for (int i = 0; i < A_len; i++) { m32 i_m = to_m32(i, r2, inv, mod); m32 mdi = add_m32(sub_m32(m, d_m, mod << 1), i_m, mod << 1); coef = mul_m32(coef, mdi, inv, mod); } for (int i = 0; i < A_len; i++) { m32 i1 = to_m32(i + 1, r2, inv, mod); h[i + A_len - 1] = mul_m32(h[i + A_len - 1], coef, inv, mod); coef = mul_m32(coef, mul_m32(add_m32(m, i1, mod << 1), g[i], inv, mod), inv, mod); } for (int i = A_len - 1; i < (A_len << 1) - 1; i++) f[i - A_len + 1] = h[i]; free(g); free(h); return f; } m32 factorial_mod(i64 n, m32 r2, m32 inv, u32 mod) { if (n <= 1) return _one_m32(mod); if (n >= mod) return 0; i64 v = 1; while (v * v < n) v <<= 1; m32 v_m = to_m32(v, r2, inv, mod); m32 iv_m = inv_m32(v_m, inv, mod); m32 *G = (m32 *)calloc(v + 1, sizeof(m32)); G[0] = _one_m32(mod); G[1] = to_m32(v + 1, r2, inv, mod); for (i64 d = 1; d != v; d <<= 1) { m32 d_m = to_m32(d, r2, inv, mod); m32 *g1 = sample_point_shift(G, d + 1, mul_m32(d_m, iv_m, inv, mod), r2, inv, mod); m32 *g2 = sample_point_shift(G, d + 1, mul_m32(add_m32(mul_m32(d_m, v_m, inv, mod), v_m, mod << 1), iv_m, inv, mod), r2, inv, mod); m32 *g3 = sample_point_shift(G, d + 1, mul_m32(add_m32(add_m32(mul_m32(d_m, v_m, inv, mod), d_m, mod << 1), v_m, mod << 1), iv_m, inv, mod), r2, inv, mod); for (int i = 0; i <= d; i++) { G[i] = mul_m32(G[i], g1[i], inv, mod); g2[i] = mul_m32(g2[i], g3[i], inv, mod); } for (int i = 0; i < d; i++) G[d + i] = g2[i]; } m32 ret = _one_m32(mod); i64 i = 0; while (i + v <= n) ret = mul_m32(ret, G[i / v], inv, mod), i += v; while (i < n) ret = mul_m32(ret, to_m32(++i, r2, inv, mod), inv, mod); return ret; } #pragma endregion sample point shift void Main(void) { i64 n = in_i64(); u32 mod = 1000000007u; m32 r2 = _r2_m32(mod); m32 inv = _inv_m32(mod); out_m32(factorial_mod(n, r2, inv, mod), inv, mod); NL(); } int main(void) { Main(); return 0; }