#pragma region opt
#pragma GCC target("avx2")
#pragma GCC optimize("O3")
#pragma endregion opt

#pragma region header
#define _GNU_SOURCE
#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#pragma endregion header

#pragma region type
/* signed integer */
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
typedef __int128_t i128;
/* unsigned integer */
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef __uint128_t u128;
/* floating point number */
typedef float f32;
typedef double f64;
typedef long double f80;
#pragma endregion type

#pragma region macro
/* NOTE: MIN/MAX evaluate their arguments more than once. */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
/* XOR swap: integer lvalues only; zeroes the value if a and b are the same object. */
#define SWAP(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b)))
#define POPCNT32(a) __builtin_popcount((a))
#define POPCNT64(a) __builtin_popcountll((a))
/* The CTZ/CLZ/MSB macros are undefined for a == 0 (GCC builtin contract). */
#define CTZ32(a) __builtin_ctz((a))
#define CLZ32(a) __builtin_clz((a))
#define CTZ64(a) __builtin_ctzll((a))
#define CLZ64(a) __builtin_clzll((a))
#define HAS_SINGLE_BIT32(a) (__builtin_popcount((a)) == (1))
#define HAS_SINGLE_BIT64(a) (__builtin_popcountll((a)) == (1))
#define MSB32(a) ((31) - __builtin_clz((a)))
#define MSB64(a) ((63) - __builtin_clzll((a)))
#define BIT_WIDTH32(a) ((a) ? ((32) - __builtin_clz((a))) : (0))
#define BIT_WIDTH64(a) ((a) ? ((64) - __builtin_clzll((a))) : (0))
#define LSBit(a) ((a) & (-(a)))
#define CLSBit(a) ((a) & ((a) - (1)))
/* Smallest power of two >= a (1 for a == 0). The result must fit in the type. */
#define BIT_CEIL32(a) ((!(a)) ? (1) : ((POPCNT32(a)) == (1) ? ((1u) << ((31) - CLZ32((a)))) : ((1u) << ((32) - CLZ32(a)))))
#define BIT_CEIL64(a) ((!(a)) ? (1) : ((POPCNT64(a)) == (1) ? ((1ull) << ((63) - CLZ64((a)))) : ((1ull) << ((64) - CLZ64(a)))))
/* Largest power of two <= a (0 for a == 0). */
#define BIT_FLOOR32(a) ((!(a)) ? (0) : ((1u) << ((31) - CLZ32((a)))))
#define BIT_FLOOR64(a) ((!(a)) ? (0) : ((1ull) << ((63) - CLZ64((a)))))
/*
 * Rotations. The complementary shift count is reduced modulo the width so a
 * rotation by a non-zero multiple of the width never shifts by the full
 * width (which would be undefined behaviour).
 */
#define _ROTL32(x, s) (((x) << ((s) % (32))) | (((x) >> (((32) - ((s) % (32))) % (32)))))
#define _ROTR32(x, s) (((x) >> ((s) % (32))) | (((x) << (((32) - ((s) % (32))) % (32)))))
#define ROTL32(x, s) (((s) == (0)) ? (x) : ((((i64)(s)) < (0)) ? (_ROTR32((x), -(s))) : (_ROTL32((x), (s)))))
#define ROTR32(x, s) (((s) == (0)) ? (x) : ((((i64)(s)) < (0)) ? (_ROTL32((x), -(s))) : (_ROTR32((x), (s)))))
#define _ROTL64(x, s) (((x) << ((s) % (64))) | (((x) >> (((64) - ((s) % (64))) % (64)))))
#define _ROTR64(x, s) (((x) >> ((s) % (64))) | (((x) << (((64) - ((s) % (64))) % (64)))))
#define ROTL64(x, s) (((s) == (0)) ? (x) : ((((i128)(s)) < (0)) ? (_ROTR64((x), -(s))) : (_ROTL64((x), (s)))))
#define ROTR64(x, s) (((s) == (0)) ? (x) : ((((i128)(s)) < (0)) ? (_ROTL64((x), -(s))) : (_ROTR64((x), (s)))))
#pragma endregion macro

#pragma region alloc
/*
 * calloc() that never returns NULL: a zero-sized request is rounded up to one
 * element and an allocation failure terminates the program.
 */
static void *xcalloc(size_t count, size_t size) {
    void *p = calloc(count ? count : 1, size);
    if (p == NULL) {
        fputs("out of memory\n", stderr);
        exit(EXIT_FAILURE);
    }
    return p;
}
#pragma endregion alloc

#pragma region io
/*
 * Consumes input up to and including the first decimal digit and returns that
 * digit character, or EOF if the input ends first. Every '-' seen on the way
 * flips *sign (when sign is not NULL).
 */
static int skip_to_digit(int *sign) {
    int ch;
    while ((ch = getchar_unlocked()) != EOF && (ch < '0' || ch > '9')) {
        if (ch == '-' && sign != NULL) *sign = -*sign;
    }
    return ch;
}

/*
 * Accumulates the run of decimal digits that starts with `ch` (already read)
 * and consumes the first character after the run. Arithmetic wraps mod 2^64.
 * Returns 0 if `ch` is not a digit (e.g. EOF).
 */
static u64 read_digits(int ch) {
    u64 x = 0;
    while ('0' <= ch && ch <= '9') {
        x = x * 10 + (u64)(ch - '0');
        ch = getchar_unlocked();
    }
    return x;
}

/* Reads a signed decimal integer; returns 0 at EOF. */
int read_int(void) { // -2147483648 ~ 2147483647 (> 10 ^ 9)
    int sign = 1;
    /* The magnitude is built unsigned so that INT_MIN parses without signed overflow. */
    const unsigned mag = (unsigned)read_digits(skip_to_digit(&sign));
    return (int)(sign < 0 ? 0u - mag : mag);
}

/* Reads a signed 32-bit decimal integer; returns 0 at EOF. */
i32 in_i32(void) { // -2147483648 ~ 2147483647 (> 10 ^ 9)
    int sign = 1;
    const u32 mag = (u32)read_digits(skip_to_digit(&sign));
    return (i32)(sign < 0 ? 0u - mag : mag);
}

/* Reads an unsigned 32-bit decimal integer; returns 0 at EOF. */
u32 in_u32(void) { // 0 ~ 4294967295 (> 10 ^ 9)
    return (u32)read_digits(skip_to_digit(NULL));
}

/* Reads a signed 64-bit decimal integer; returns 0 at EOF. */
i64 in_i64(void) { // -9223372036854775808 ~ 9223372036854775807 (> 10 ^ 18)
    int sign = 1;
    const u64 mag = read_digits(skip_to_digit(&sign));
    return (i64)(sign < 0 ? 0ull - mag : mag);
}

/* Reads an unsigned 64-bit decimal integer; returns 0 at EOF. */
u64 in_u64(void) { // 0 ~ 18446744073709551615 (> 10 ^ 19)
    return read_digits(skip_to_digit(NULL));
}

/* Writes x in decimal without any separator. */
void out_u64(u64 x) {
    char digits[20]; /* 2^64 - 1 has 20 decimal digits */
    int len = 0;
    do {
        digits[len++] = (char)('0' + (int)(x % 10));
        x /= 10;
    } while (x != 0);
    while (len > 0) putchar_unlocked(digits[--len]);
}

void out_u32(u32 x) { out_u64(x); }

/* The signed writers negate in unsigned arithmetic so the minimum value is handled. */
void out_i64(i64 x) {
    if (x < 0) {
        putchar_unlocked('-');
        out_u64(0ull - (u64)x);
    } else {
        out_u64((u64)x);
    }
}

void out_i32(i32 x) { out_i64(x); }

void write_int(int x) { out_i64(x); }

void NL(void) { putchar_unlocked('\n'); }

void SP(void) { putchar_unlocked(' '); }

/* The *_array writers print the elements separated by single spaces, then a newline. */
void write_int_array(int *a, int a_len) {
    for (int i = 0; i < a_len; i++) {
        if (i) SP();
        write_int(a[i]);
    }
    NL();
}

void out_i32_array(i32 *a, int a_len) {
    for (int i = 0; i < a_len; i++) {
        if (i) SP();
        out_i32(a[i]);
    }
    NL();
}

void out_u32_array(u32 *a, int a_len) {
    for (int i = 0; i < a_len; i++) {
        if (i) SP();
        out_u32(a[i]);
    }
    NL();
}

void out_i64_array(i64 *a, int a_len) {
    for (int i = 0; i < a_len; i++) {
        if (i) SP();
        out_i64(a[i]);
    }
    NL();
}

void out_u64_array(u64 *a, int a_len) {
    for (int i = 0; i < a_len; i++) {
        if (i) SP();
        out_u64(a[i]);
    }
    NL();
}
#pragma endregion io

#pragma region m32
/*
 * Montgomery arithmetic with R = 2^32.
 *
 * An m32 holds a * R mod `mod`. Every function takes the modulus explicitly
 * together with the constants derived from it:
 *   inv : mod^-1 modulo 2^32   (see _inv_m32)
 *   r2  : R^2 modulo mod       (see _r2_m32)
 * The modulus must be odd and below 2^31 (the reduction relies on the sign
 * bit of a 32-bit difference).
 */
typedef uint32_t m32;

/* Montgomery form of 1, i.e. 2^32 mod `mod`. */
m32 _one_m32(u32 mod) { return (u32)-1u % mod + 1; }

/* R^2 = 2^64 mod `mod`, the factor that converts a residue to Montgomery form. */
m32 _r2_m32(u32 mod) { return (m32)((u64)(i64)-1 % mod + 1); }

/* mod^-1 modulo 2^32 by Newton iteration (the number of correct bits doubles each round). */
m32 _inv_m32(u32 mod) {
    u32 inv = mod;
    for (int i = 0; i < 4; ++i) inv *= 2 - inv * mod;
    return inv;
}

/* Montgomery reduction: returns a / R mod `mod` for a < mod * 2^32. */
m32 _reduce_m32(u64 a, m32 inv, u32 mod) {
    u32 y = (u32)(a >> 32) - (u32)(((u64)((u32)a * inv) * mod) >> 32);
    return (i32)y < 0 ? y + mod : y;
}

/* Plain integer -> Montgomery form. */
m32 to_m32(u32 a, m32 r2, m32 inv, u32 mod) { return _reduce_m32((u64)a * r2, inv, mod); }

/* Montgomery form -> plain residue in [0, mod). */
u32 from_m32(m32 A, m32 inv, u32 mod) { return _reduce_m32(A, inv, mod); }

m32 add_m32(m32 A, m32 B, u32 mod) { return A + B >= mod ? A + B - mod : A + B; }

m32 sub_m32(m32 A, m32 B, u32 mod) { return A >= B ? A - B : mod + A - B; }

/* Additive inverse ("minus"). */
m32 min_m32(m32 A, u32 mod) { return sub_m32(0u, A, mod); }

m32 mul_m32(m32 A, m32 B, m32 inv, u32 mod) { return _reduce_m32((u64)A * B, inv, mod); }

/* A^n by binary exponentiation; yields 1 for n <= 0. */
m32 pow_m32(m32 A, i32 n, m32 inv, u32 mod) {
    m32 ret = _one_m32(mod);
    while (n > 0) {
        if (n & 1) ret = mul_m32(ret, A, inv, mod);
        A = mul_m32(A, A, inv, mod);
        n >>= 1;
    }
    return ret;
}

/* Multiplicative inverse via Fermat's little theorem; `mod` must be prime. */
m32 inv_m32(m32 A, m32 inv, u32 mod) { return pow_m32(A, (i32)mod - 2, inv, mod); }

m32 div_m32(m32 A, m32 B, m32 inv, u32 mod) {
    /* assert(is_prime(mod)); */
    return mul_m32(A, inv_m32(B, inv, mod), inv, mod);
}

/* Reads an unsigned decimal integer (wrapping mod 2^32) and returns it in Montgomery form. */
m32 in_m32(m32 r2, m32 inv, u32 mod) {
    return to_m32((u32)read_digits(skip_to_digit(NULL)), r2, inv, mod);
}

/* Writes the plain residue represented by A. */
void out_m32(m32 A, m32 inv, u32 mod) { out_u32(from_m32(A, inv, mod)); }
#pragma endregion m32

#pragma region ntt
/*
 * Number-theoretic transforms over three primes. The per-prime entry points
 * (nttK / inttK / convoluteK) are thin wrappers around the shared helpers
 * below; all values are in Montgomery form for the respective prime.
 *
 * The transform length must be a power of two. The root tables bound the
 * usable length: `gs`/`igs` are indexed up to log2(length) - 1.
 */

/* Smallest h with 2^h >= len. */
static inline int ntt_log2_ceil(int len) {
    int h = 0;
    while (len > (1 << h)) h++;
    return h;
}

/* In-place forward transform of A[0 .. A_len). */
static inline void ntt_forward(m32 *A, int A_len, u32 p, m32 p_inv, m32 p_one, const m32 *gs) {
    const int h = ntt_log2_ceil(A_len);
    for (int ph = 1; ph <= h; ph++) {
        const int w = 1 << (ph - 1);
        const int half = 1 << (h - ph);
        m32 now = p_one;
        for (int s = 0; s < w; s++) {
            m32 *lo = A + (s << (h - ph + 1));
            m32 *hi = lo + half;
            for (int i = 0; i < half; i++) {
                const m32 l = lo[i];
                const m32 r = mul_m32(hi[i], now, p_inv, p);
                lo[i] = add_m32(l, r, p);
                hi[i] = sub_m32(l, r, p);
            }
            /* Step the twiddle factor to the one for block s + 1. */
            now = mul_m32(now, gs[CTZ32(~s)], p_inv, p);
        }
    }
}

/* In-place inverse transform of A[0 .. A_len), including the 1 / A_len scaling. */
static inline void ntt_inverse(m32 *A, int A_len, u32 p, m32 p_r2, m32 p_inv, m32 p_one, const m32 *igs) {
    const int h = ntt_log2_ceil(A_len);
    for (int ph = h; ph >= 1; ph--) {
        const int w = 1 << (ph - 1);
        const int half = 1 << (h - ph);
        m32 inow = p_one;
        for (int s = 0; s < w; s++) {
            m32 *lo = A + (s << (h - ph + 1));
            m32 *hi = lo + half;
            for (int i = 0; i < half; i++) {
                const m32 l = lo[i];
                const m32 r = hi[i];
                lo[i] = add_m32(l, r, p);
                hi[i] = mul_m32(sub_m32(l, r, p), inow, p_inv, p);
            }
            inow = mul_m32(inow, igs[CTZ32(~s)], p_inv, p);
        }
    }
    const m32 inv_len = inv_m32(to_m32((u32)A_len, p_r2, p_inv, p), p_inv, p);
    for (int i = 0; i < A_len; i++) A[i] = mul_m32(A[i], inv_len, p_inv, p);
}

/*
 * Linear convolution of A and B using the given transform pair.
 * Returns a freshly allocated array (caller frees) whose first
 * A_len + B_len - 1 entries are the product coefficients; the array is padded
 * to the next power of two. If either input is empty, a single zero element
 * is returned.
 */
static m32 *ntt_convolute(const m32 *A, int A_len, const m32 *B, int B_len,
                          void (*forward)(m32 *, int), void (*inverse)(m32 *, int),
                          u32 p, m32 p_inv) {
    if (A_len <= 0 || B_len <= 0) return (m32 *)xcalloc(1, sizeof(m32));
    const int ret_len = (int)BIT_CEIL32(A_len + B_len - 1);
    m32 *C = (m32 *)xcalloc((size_t)ret_len, sizeof(m32));
    m32 *D = (m32 *)xcalloc((size_t)ret_len, sizeof(m32));
    memcpy(C, A, sizeof(m32) * (size_t)A_len);
    memcpy(D, B, sizeof(m32) * (size_t)B_len);
    forward(C, ret_len);
    forward(D, ret_len);
    for (int i = 0; i < ret_len; i++) C[i] = mul_m32(C[i], D[i], p_inv, p);
    free(D);
    inverse(C, ret_len);
    return C;
}

#pragma region ntt1
const u32 m1 = 998244353u;
const m32 m1_r2 = 932051910u;
const m32 m1_inv = 3296722945u;
const m32 m1_one = 301989884u;
const m32 m1_rev = 696254469u;
const m32 m1_gs[] = {
    691295370, 307583142, 566821959, 878217029, 375146819, 138254384,
    500602490, 79119218, 790898700, 978335284, 651424567, 308706579,
    723000027, 474797508, 683394121, 44141573, 536892010, 945865189,
    175417726, 536169764, 831722880, 721458245
};
const m32 m1_igs[] = {
    306948983, 888603487, 138723248, 65668869, 842568658, 953245971,
    195169681, 118717521, 792052763, 828450244, 908724728, 218560432,
    628507989, 248210924, 566568154, 6285593, 82571768, 49985074,
    225413092, 349167278, 61514562, 763211248
};

void ntt1(m32 *A, int A_len) { ntt_forward(A, A_len, m1, m1_inv, m1_one, m1_gs); }

void intt1(m32 *A, int A_len) { ntt_inverse(A, A_len, m1, m1_r2, m1_inv, m1_one, m1_igs); }

m32 *convolute1(m32 *A, int A_len, m32 *B, int B_len) {
    return ntt_convolute(A, A_len, B, B_len, ntt1, intt1, m1, m1_inv);
}
#pragma endregion ntt1

#pragma region ntt2
const u32 m2 = 985661441u;
const m32 m2_r2 = 616455619u;
const m32 m2_inv = 3309305857u;
const m32 m2_one = 352321532u;
const m32 m2_rev = 633339909u;
const m32 m2_gs[] = {
    969414155, 240156868, 716651500, 728800531, 977177032, 47314842,
    240475723, 876076444, 626710676, 365360170, 808202916, 560909592,
    755542104, 303317332, 75348256, 259192271, 882296372, 620044766,
    876870197, 256206930, 761331788
};
const m32 m2_igs[] = {
    16247286, 67104299, 325946810, 44505332, 582782266, 729124870,
    724673072, 173952869, 594582867, 76943556, 66752559, 892797276,
    469283465, 123325105, 933929770, 911329874, 741246559, 905930185,
    828158135, 9523962, 198022420
};

void ntt2(m32 *A, int A_len) { ntt_forward(A, A_len, m2, m2_inv, m2_one, m2_gs); }

void intt2(m32 *A, int A_len) { ntt_inverse(A, A_len, m2, m2_r2, m2_inv, m2_one, m2_igs); }

m32 *convolute2(m32 *A, int A_len, m32 *B, int B_len) {
    return ntt_convolute(A, A_len, B, B_len, ntt2, intt2, m2, m2_inv);
}
#pragma endregion ntt2

#pragma region ntt3
const u32 m3 = 943718401u;
const m32 m3_r2 = 917135855u;
const m32 m3_inv = 3351248897u;
const m32 m3_one = 520093692u;
const m32 m3_rev = 423624709u;
const m32 m3_gs[] = {
    125689310, 401270432, 193546243, 204233475, 765072983, 793690592,
    598110941, 560814539, 323055569, 635997590, 661263945, 671645950,
    596439462, 577210208, 667936112, 172603057, 698142776, 3390265,
    400541812, 419143563, 100582761
};
const m32 m3_igs[] = {
    818029091, 177917178, 278610320, 675646939, 629165784, 803573782,
    552038920, 685763768, 343497720, 610893888, 604907871, 366961343,
    132493990, 882172703, 730481417, 529389095, 864269596, 777879390,
    446333578, 468025435, 879098724
};

void ntt3(m32 *A, int A_len) { ntt_forward(A, A_len, m3, m3_inv, m3_one, m3_gs); }

void intt3(m32 *A, int A_len) { ntt_inverse(A, A_len, m3, m3_r2, m3_inv, m3_one, m3_igs); }

m32 *convolute3(m32 *A, int A_len, m32 *B, int B_len) {
    return ntt_convolute(A, A_len, B, B_len, ntt3, intt3, m3, m3_inv);
}
#pragma endregion ntt3
#pragma endregion ntt

#pragma region convolute mod1000000007
const u32 mod = 1000000007u;
const m32 r2 = 582344008u;
const m32 inv = 2068349879u;
const m32 one = 294967268u;

/* dst[i] = Montgomery form (for prime p) of src[i] mod p. */
static void to_residues(m32 *dst, const u32 *src, int len, u32 p, m32 p_r2, m32 p_inv) {
    for (int i = 0; i < len; i++) dst[i] = to_m32(src[i] % p, p_r2, p_inv, p);
}

/*
 * Convolution of two sequences of PLAIN integers, reduced modulo 1000000007.
 *
 * The product is computed modulo the three NTT primes and recombined with
 * Garner's algorithm. Inputs and output are ordinary residues, not
 * Montgomery-form values.
 *
 * Returns a freshly allocated array of a_len + b_len - 1 entries (caller
 * frees). If either input is empty, a single zero element is returned.
 */
u32 *convolute_mod1000000007(u32 *a, int a_len, u32 *b, int b_len) {
    if (a_len <= 0 || b_len <= 0) return (u32 *)xcalloc(1, sizeof(u32));
    const int out_len = a_len + b_len - 1;

    m32 *A = (m32 *)xcalloc((size_t)a_len, sizeof(m32));
    m32 *B = (m32 *)xcalloc((size_t)b_len, sizeof(m32));

    to_residues(A, a, a_len, m1, m1_r2, m1_inv);
    to_residues(B, b, b_len, m1, m1_r2, m1_inv);
    m32 *C1 = convolute1(A, a_len, B, b_len);

    to_residues(A, a, a_len, m2, m2_r2, m2_inv);
    to_residues(B, b, b_len, m2, m2_r2, m2_inv);
    m32 *C2 = convolute2(A, a_len, B, b_len);

    to_residues(A, a, a_len, m3, m3_r2, m3_inv);
    to_residues(B, b, b_len, m3, m3_r2, m3_inv);
    m32 *C3 = convolute3(A, a_len, B, b_len);

    free(A);
    free(B);

    u32 *ret = (u32 *)xcalloc((size_t)out_len, sizeof(u32));

    /* Garner constants: m1^-1 (mod m2), (m1*m2)^-1 (mod m3), and m1, m1*m2 (mod 1e9+7). */
    const m32 m1_inv_m2 = inv_m32(to_m32(m1, m2_r2, m2_inv, m2), m2_inv, m2);
    const m32 m12_inv_m3 = inv_m32(
        mul_m32(to_m32(m1, m3_r2, m3_inv, m3), to_m32(m2, m3_r2, m3_inv, m3), m3_inv, m3),
        m3_inv, m3);
    const m32 m1_m3 = to_m32(m1, m3_r2, m3_inv, m3);
    const m32 m1_m0 = to_m32(m1, r2, inv, mod);
    const m32 m12_m0 = mul_m32(to_m32(m1, r2, inv, mod), to_m32(m2, r2, inv, mod), inv, mod);

    for (int i = 0; i < out_len; ++i) {
        /* Residues of the true coefficient modulo m1, m2, m3. */
        const u32 xi = from_m32(C1[i], m1_inv, m1);
        const u32 yi = from_m32(C2[i], m2_inv, m2);
        const u32 zi = from_m32(C3[i], m3_inv, m3);

        /* coefficient = xi + m1 * v1 + m1 * m2 * v2 with v1 in [0, m2), v2 in [0, m3). */
        const m32 xi_m2 = to_m32(xi, m2_r2, m2_inv, m2);
        const m32 yi_m2 = to_m32(yi, m2_r2, m2_inv, m2);
        const u32 v1 = from_m32(mul_m32(sub_m32(yi_m2, xi_m2, m2), m1_inv_m2, m2_inv, m2), m2_inv, m2);

        const m32 zi_m3 = to_m32(zi, m3_r2, m3_inv, m3);
        const m32 xi_m3 = to_m32(xi, m3_r2, m3_inv, m3);
        const m32 v1_m3 = to_m32(v1, m3_r2, m3_inv, m3);
        const m32 partial_m3 = add_m32(xi_m3, mul_m32(m1_m3, v1_m3, m3_inv, m3), m3);
        const u32 v2 = from_m32(mul_m32(sub_m32(zi_m3, partial_m3, m3), m12_inv_m3, m3_inv, m3), m3_inv, m3);

        const m32 xi_m0 = to_m32(xi, r2, inv, mod);
        const m32 v1_m0 = to_m32(v1, r2, inv, mod);
        const m32 v2_m0 = to_m32(v2, r2, inv, mod);
        const m32 sum_m0 = add_m32(
            add_m32(xi_m0, mul_m32(m1_m0, v1_m0, inv, mod), mod),
            mul_m32(m12_m0, v2_m0, inv, mod), mod);
        ret[i] = from_m32(sum_m0, inv, mod);
    }

    free(C1);
    free(C2);
    free(C3);
    return ret;
}
#pragma endregion convolute mod1000000007

#pragma region sample_point_shift
/* Factorial tables modulo 1000000007, in Montgomery form; filled by pre_fact(). */
m32 _fact[1 << 20];
m32 _inv_fact[1 << 20];
m32 _inv_table[1 << 20];

/*
 * Fills _fact[0 .. n+2], _inv_fact[0 .. n+2] and _inv_table[1 .. n+1]
 * (_inv_table[i] = 1 / i). n is clamped so that every index stays inside the
 * 2^20-entry tables.
 */
void pre_fact(int n) {
    const int max_n = (1 << 20) - 3;
    if (n < 0) n = 0;
    if (n > max_n) n = max_n;
    _fact[0] = one;
    for (int i = 0; i <= n + 1; i++)
        _fact[i + 1] = mul_m32(_fact[i], to_m32((u32)(i + 1), r2, inv, mod), inv, mod);
    _inv_fact[n + 2] = inv_m32(_fact[n + 2], inv, mod);
    for (int i = n + 2; i > 0; i--)
        _inv_fact[i - 1] = mul_m32(_inv_fact[i], to_m32((u32)i, r2, inv, mod), inv, mod);
    for (int i = 1; i <= n + 1; i++)
        _inv_table[i] = mul_m32(_inv_fact[i], _fact[i - 1], inv, mod);
}

/*
 * Shift of sample points (Lagrange interpolation on 0, 1, ..., d).
 *
 * A[0 .. A_len) holds f(0), ..., f(d) for a polynomial f of degree <= d,
 * d = A_len - 1, modulo 1000000007 in Montgomery form. `c` is the shift, also
 * in Montgomery form. Returns a freshly allocated array (caller frees) with
 * f(c), f(c + 1), ..., f(c + d) in Montgomery form.
 *
 * Uses
 *   f(c + k) = prod_{j=0..d} (c + k - j)
 *              * sum_i f(i) * (-1)^(d-i) / (i! (d-i)!) * 1 / (c + k - i)
 * where the inner sum is a convolution.
 *
 * Preconditions: pre_fact() has filled _inv_fact[0 .. d], and none of
 * c - d, ..., c + d is 0 modulo 1000000007 (their inverses are needed).
 */
m32 *sample_point_shift(m32 *A, int A_len, m32 c) {
    if (A_len <= 0) return (m32 *)xcalloc(1, sizeof(m32));
    const int d = A_len - 1;
    const int g_len = (A_len << 1) - 1;

    u32 *f_raw = (u32 *)xcalloc((size_t)A_len, sizeof(u32)); /* plain residues for the convolution */
    u32 *g_raw = (u32 *)xcalloc((size_t)g_len, sizeof(u32));
    m32 *g_inv = (m32 *)xcalloc((size_t)g_len, sizeof(m32)); /* 1 / (c - d + j), Montgomery form */
    m32 *ret = (m32 *)xcalloc((size_t)A_len, sizeof(m32));

    for (int i = 0; i < A_len; i++) {
        m32 fi = mul_m32(mul_m32(A[i], _inv_fact[i], inv, mod), _inv_fact[d - i], inv, mod);
        if ((d - i) & 1) fi = min_m32(fi, mod);
        /* The convolution works on plain residues, so leave Montgomery form here. */
        f_raw[i] = from_m32(fi, inv, mod);
    }

    /*
     * point runs over c - d, c - d + 1, ..., c + d. All of this arithmetic
     * stays in Montgomery form: integers are converted with to_m32 and
     * stepped by adding `one`, never mixed in as raw values.
     */
    m32 point = sub_m32(c, to_m32((u32)d, r2, inv, mod), mod);
    m32 coef = one; /* becomes prod_{j=0..d} (c - d + j) */
    for (int j = 0; j < g_len; j++) {
        if (j < A_len) coef = mul_m32(coef, point, inv, mod);
        g_inv[j] = inv_m32(point, inv, mod);
        g_raw[j] = from_m32(g_inv[j], inv, mod);
        point = add_m32(point, one, mod);
    }

    u32 *h = convolute_mod1000000007(f_raw, A_len, g_raw, g_len);

    m32 upper = add_m32(c, one, mod); /* c + k + 1 */
    for (int k = 0; k < A_len; k++) {
        ret[k] = mul_m32(to_m32(h[d + k], r2, inv, mod), coef, inv, mod);
        /* prod_j (c + k + 1 - j) = prod_j (c + k - j) * (c + k + 1) / (c + k - d) */
        coef = mul_m32(coef, mul_m32(upper, g_inv[k], inv, mod), inv, mod);
        upper = add_m32(upper, one, mod);
    }

    free(h);
    free(g_inv);
    free(g_raw);
    free(f_raw);
    return ret;
}
#pragma endregion sample_point_shift

#pragma region factorial mod
/*
 * n! modulo 1000000007, returned in Montgomery form.
 *
 * With v = 2^15, let g_d(x) = prod_{i=1..d} (v * x + i). Starting from the
 * samples g_1(0), g_1(1), the table is doubled with
 *   g_{2d}(x) = g_d(x) * g_d(x + d / v)
 * until d = v, so that G[k] = g_v(k) = (k*v + 1) * ... * (k*v + v). n! is the
 * product of the full blocks G[0 .. n / v) and the remaining < v factors.
 *
 * Precondition: pre_fact(m) was called with m >= v / 2.
 */
m32 factorial_mod(u64 n) {
    if (n <= 1) return one;
    if (n >= mod) return 0;

    const i64 v = 32768;
    const m32 iv = inv_m32(to_m32((u32)v, r2, inv, mod), inv, mod);

    m32 *G = (m32 *)xcalloc((size_t)v + 1, sizeof(m32));
    G[0] = one;
    G[1] = to_m32((u32)v + 1, r2, inv, mod);

    /* Invariant: G[0 .. d] = g_d(0 .. d). The loop ends with d == v (v * v > mod). */
    for (i64 d = 1; d < v; d <<= 1) {
        const int len = (int)d + 1;
        /* g_d at x + d/v, at x + (d + 1), and at x + (d + 1) + d/v, for x = 0 .. d. */
        m32 *G1 = sample_point_shift(G, len, mul_m32(to_m32((u32)d, r2, inv, mod), iv, inv, mod));
        m32 *G2 = sample_point_shift(G, len, mul_m32(to_m32((u32)(d * v + v), r2, inv, mod), iv, inv, mod));
        m32 *G3 = sample_point_shift(G, len, mul_m32(to_m32((u32)(d * v + d + v), r2, inv, mod), iv, inv, mod));
        for (int i = 0; i <= d; i++) {
            G[i] = mul_m32(G[i], G1[i], inv, mod);   /* g_2d(i)         */
            G2[i] = mul_m32(G2[i], G3[i], inv, mod); /* g_2d(d + 1 + i) */
        }
        /* G2[i] is the sample at d + 1 + i, so it is stored after the d + 1 existing samples. */
        for (int i = 0; i < d; i++) G[d + 1 + i] = G2[i];
        free(G1);
        free(G2);
        free(G3);
    }

    m32 ret = one;
    u64 i = 0;
    while (i + (u64)v <= n) {
        ret = mul_m32(ret, G[i / (u64)v], inv, mod);
        i += (u64)v;
    }
    while (i < n) {
        i++;
        ret = mul_m32(ret, to_m32((u32)i, r2, inv, mod), inv, mod);
    }

    free(G);
    return ret;
}
#pragma endregion factorial mod

/* Reads n from standard input and prints n! mod 1000000007. */
void Main(void) {
    pre_fact(1 << 19);
    out_m32(factorial_mod(in_u64()), inv, mod);
    NL();
}

int main(void) {
    Main();
    return 0;
}