結果
| 問題 | No.502 階乗を計算するだけ |
| コンテスト | |
| ユーザー |
nonamae
|
| 提出日時 | 2021-10-25 21:42:04 |
| 言語 | C (gcc 13.3.0) |
| 結果 |
WA
|
| 実行時間 | - |
| コード長 | 24,998 bytes |
| コンパイル時間 | 2,575 ms |
| コンパイル使用メモリ | 81,252 KB |
| 実行使用メモリ | 6,824 KB |
| 最終ジャッジ日時 | 2024-10-03 19:43:29 |
| 合計ジャッジ時間 | 5,267 ms |
|
ジャッジサーバーID (参考情報) |
judge4 / judge5 |
(要ログイン)
| ファイルパターン | 結果 |
|---|---|
| other | AC * 12 WA * 40 |
ソースコード
#pragma region opt
#pragma GCC optimize("O3")
#pragma GCC optimize("fast-math")
#pragma GCC optimize("unroll-loops")
#pragma GCC target("avx2")
#pragma endregion opt
#pragma region header
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <time.h>
#pragma endregion header
#pragma region type
/* signed integer */
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
typedef __int128_t i128;
/* unsigned integer */
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef __uint128_t u128;
/* floating point number */
typedef float f32;
typedef double f64;
typedef long double f80;
#pragma endregion type
#pragma region macro
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define SWAP(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b)))
#define POPCNT32(a) __builtin_popcount((a))
#define POPCNT64(a) __builtin_popcountll((a))
#define CTZ32(a) __builtin_ctz((a))
#define CLZ32(a) __builtin_clz((a))
#define CTZ64(a) __builtin_ctzll((a))
#define CLZ64(a) __builtin_clzll((a))
#define HAS_SINGLE_BIT32(a) (__builtin_popcount((a)) == (1))
#define HAS_SINGLE_BIT64(a) (__builtin_popcountll((a)) == (1))
#define MSB32(a) ((31) - __builtin_clz((a)))
#define MSB64(a) ((63) - __builtin_clzll((a)))
#define BIT_WIDTH32(a) ((a) ? ((32) - __builtin_clz((a))) : (0))
#define BIT_WIDTH64(a) ((a) ? ((64) - __builtin_clzll((a))) : (0))
#define LSBit(a) ((a) & (-(a)))
#define CLSBit(a) ((a) & ((a) - (1)))
#define BIT_CEIL32(a) ((!(a)) ? (1) : ((POPCNT32(a)) == (1) ? ((1u) << ((31) - CLZ32((a)))) : ((1u) << ((32) - CLZ32(a)))))
#define BIT_CEIL64(a) ((!(a)) ? (1) : ((POPCNT64(a)) == (1) ? ((1ull) << ((63) - CLZ64((a)))) : ((1ull) << ((64) - CLZ64(a)))))
#define BIT_FLOOR32(a) ((!(a)) ? (0) : ((1u) << ((31) - CLZ32((a)))))
#define BIT_FLOOR64(a) ((!(a)) ? (0) : ((1ull) << ((63) - CLZ64((a)))))
#define _ROTL64(x, s) (((x) << ((s) % (64))) | (((x) >> ((64) - ((s) % (64))))))
#define _ROTR64(x, s) (((x) >> ((s) % (64))) | (((x) << ((64) - ((s) % (64))))))
#define ROTL64(x, s) (((s) == (0)) ? (x) : ((((i128)(s)) < (0)) ? (_ROTR64((x), -(s))) : (_ROTL64((x), (s)))))
#define ROTR64(x, s) (((s) == (0)) ? (x) : ((((i128)(s)) < (0)) ? (_ROTL64((x), -(s))) : (_ROTR64((x), (s)))))
#pragma endregion macro
#pragma region io
static inline int read_int(void) {
// -2147483648 ~ 2147483647 (> 10 ^ 9)
int c, x = 0, f = 1;
while (c = getchar_unlocked(), c < 48 || c > 57) if (c == 45) f = -f;
while (47 < c && c < 58) {
x = x * 10 + c - 48;
c = getchar_unlocked();
}
return f * x;
}
static inline i32 in_i32(void) {
// -2147483648 ~ 2147483647 (> 10 ^ 9)
i32 c, x = 0, f = 1;
while (c = getchar_unlocked(), c < 48 || c > 57) if (c == 45) f = -f;
while (47 < c && c < 58) {
x = x * 10 + c - 48;
c = getchar_unlocked();
}
return f * x;
}
static inline u32 in_u32(void) {
// 0 ~ 4294967295 (> 10 ^ 9)
u32 c, x = 0;
while (c = getchar_unlocked(), c < 48 || c > 57);
while (47 < c && c < 58) {
x = x * 10 + c - 48;
c = getchar_unlocked();
}
return x;
}
static inline i64 in_i64(void) {
// -9223372036854775808 ~ 9223372036854775807 (> 10 ^ 18)
i64 c, x = 0, f = 1;
while (c = getchar_unlocked(), c < 48 || c > 57) if (c == 45) f = -f;
while (47 < c && c < 58) {
x = x * 10 + c - 48;
c = getchar_unlocked();
}
return f * x;
}
static inline u64 in_u64(void) {
// 0 ~ 18446744073709551615 (> 10 ^ 19)
u64 c, x = 0;
while (c = getchar_unlocked(), c < 48 || c > 57);
while (47 < c && c < 58) {
x = x * 10 + c - 48;
c = getchar_unlocked();
}
return x;
}
static inline void write_int_inner(int x) {
if (x >= 10) write_int_inner(x / 10);
putchar_unlocked(x - x / 10 * 10 + 48);
}
static inline void write_int(int x) {
if (x < 0) {
putchar_unlocked('-');
x = -x;
}
write_int_inner(x);
}
static inline void out_i32_inner(i32 x) {
if (x >= 10) out_i32_inner(x / 10);
putchar_unlocked(x - x / 10 * 10 + 48);
}
static inline void out_i32(i32 x) {
if (x < 0) {
putchar_unlocked('-');
x = -x;
}
out_i32_inner(x);
}
static inline void out_u32(u32 x) {
if (x >= 10) out_u32(x / 10);
putchar_unlocked(x - x / 10 * 10 + 48);
}
static inline void out_i64_inner(i64 x) {
if (x >= 10) out_i64_inner(x / 10);
putchar_unlocked(x - x / 10 * 10 + 48);
}
static inline void out_i64(i64 x) {
if (x < 0) {
putchar_unlocked('-');
x = -x;
}
out_i64_inner(x);
}
static inline void out_u64(u64 x) {
if (x >= 10) out_u64(x / 10);
putchar_unlocked(x - x / 10 * 10 + 48);
}
static inline void NL(void) { putchar_unlocked('\n'); }
static inline void SP(void) { putchar_unlocked(' '); }
#pragma endregion io
#pragma region dump
void dump_int(int x) { fprintf(stderr, "\033[1;31m%d\033[0m\n", x); }
void dump_i32(i32 x) { fprintf(stderr, "\033[1;32m%d\033[0m\n", x); }
void dump_i64(i64 x) { fprintf(stderr, "\033[1;33m%ld\033[0m\n", x); }
void dump_u32(u32 x) { fprintf(stderr, "\033[1;34m%u\033[0m\n", x); }
void dump_u64(u64 x) { fprintf(stderr, "\033[1;35m%lu\033[0m\n", x); }
void dump_int_array(int *a, int a_len) {
fprintf(stderr, "\033[1;31m");
for (int i = 0; i < a_len; i++) {
if (i == a_len - 1) fprintf(stderr, "%d\n", a[i]);
else fprintf(stderr, "%d ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_i32_array(i32 *a, int a_len) {
fprintf(stderr, "\033[1;32m");
for (int i = 0; i < a_len; i++) {
if (i == a_len - 1) fprintf(stderr, "%d\n", a[i]);
else fprintf(stderr, "%d ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_i64_array(i64 *a, int a_len) {
fprintf(stderr, "\033[1;33m");
for (int i = 0; i < a_len; i++) {
if (i == a_len - 1) fprintf(stderr, "%ld\n", a[i]);
else fprintf(stderr, "%ld ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_u32_array(u32 *a, int a_len) {
fprintf(stderr, "\033[1;34m");
for (int i = 0; i < a_len; i++) {
if (i == a_len - 1) fprintf(stderr, "%u\n", a[i]);
else fprintf(stderr, "%u ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_u64_array(u64 *a, int a_len) {
fprintf(stderr, "\033[1;35m");
for (int i = 0; i < a_len; i++) {
if (i == a_len - 1) fprintf(stderr, "%lu\n", a[i]);
else fprintf(stderr, "%lu ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_int_array_range(int *a, int a_len, int l, int r) {
if (r < l || r <= 0 || l >= a_len) return;
if (l < 0) l = 0;
if (r >= a_len) r = a_len - 1;
fprintf(stderr, "\033[1;31m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%d\n", a[i]);
else fprintf(stderr, "%d ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_i32_array_range(i32 *a, int a_len, int l, int r) {
if (r < l || r <= 0 || l >= a_len) return;
if (l < 0) l = 0;
if (r >= a_len) r = a_len - 1;
fprintf(stderr, "\033[1;32m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%d\n", a[i]);
else fprintf(stderr, "%d ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_i64_array_range(i64 *a, int a_len, int l, int r) {
if (r < l || r <= 0 || l >= a_len) return;
if (l < 0) l = 0;
if (r >= a_len) r = a_len - 1;
fprintf(stderr, "\033[1;33m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%ld\n", a[i]);
else fprintf(stderr, "%ld ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_u32_array_range(u32 *a, int a_len, int l, int r) {
if (r < l || r <= 0 || l >= a_len) return;
if (l < 0) l = 0;
if (r >= a_len) r = a_len - 1;
fprintf(stderr, "\033[1;34m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%u\n", a[i]);
else fprintf(stderr, "%u ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void dump_u64_array_range(u64 *a, int a_len, int l, int r) {
if (r < l || r <= 0 || l >= a_len) return;
if (l < 0) l = 0;
if (r >= a_len) r = a_len - 1;
fprintf(stderr, "\033[1;35m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%lu\n", a[i]);
else fprintf(stderr, "%lu ", a[i]);
}
fprintf(stderr, "\033[0m");
}
void printb(u64 v) {
u64 mask = (u64)1 << (sizeof(v) * CHAR_BIT - 1);
do
putchar(mask & v ? '1' : '0');
while (mask >>= 1);
}
void putb(u64 v) { putchar('0'), putchar('b'), printb(v), putchar('\n'); }
#pragma endregion dump
#pragma region montgomery32
typedef uint32_t m32;
m32 _one_m32(u32 mod) {
return -1u % mod + 1;
}
m32 _r2_m32(u32 mod) {
return (u64)(i64)-1 % mod + 1;
}
m32 _inv_m32(u32 mod) {
u32 u = 1, v = 0, x = 1u << 31;
for (int i = 0; i < 32; i++) {
if (u & 1) u = (u + mod) >> 1, v = (v >> 1) + x;
else u >>= 1, v >>= 1;
}
return -v;
}
m32 _reduce_m32(u64 a, m32 inv, u32 mod) {
i64 z = (a >> 32) - ((((u32)a * inv) * (u64)mod) >> 32);
return z < 0 ? z + mod : z;
}
m32 to_m32(u32 a, m32 r2, m32 inv, u32 mod) { return _reduce_m32((u64)a * r2, inv, mod); }
u32 from_m32(m32 A, m32 inv, u32 mod) {
m32 t = _reduce_m32((u64)A, inv, mod) - mod;
return t + (mod & -(t >> 31u));
}
m32 add_m32(m32 A, m32 B, u32 mod2) {
// assert(mod2 == (mod << 1));
A += B - mod2;
A += mod2 & -(A >> 31u);
return A;
}
m32 sub_m32(m32 A, m32 B, u32 mod2) {
// assert(mod2 == (mod << 1));
A -= B;
A += mod2 & -(A >> 31u);
return A;
}
m32 min_m32(m32 A, u32 mod2) {
// assert(mod2 == (mod << 1));
return sub_m32(0u, A, mod2);
}
m32 mul_m32(m32 A, m32 B, m32 inv, u32 mod) { return _reduce_m32((u64)A * B, inv, mod); }
m32 pow_m32(m32 A, i64 n, m32 inv, u32 mod) {
m32 ret = _one_m32(mod);
while (n > 0) {
if (n & 1) ret = mul_m32(ret, A, inv, mod);
A = mul_m32(A, A, inv, mod);
n >>= 1;
}
return ret;
}
m32 inv_m32(m32 A, m32 inv, u32 mod) { return pow_m32(A, (i64)mod - 2, inv, mod); }
m32 div_m32(m32 A, m32 B, m32 inv, u32 mod) {
/* assert(is_prime(mod)); */
return mul_m32(A, inv_m32(B, inv, mod), inv, mod);
}
m32 in_m32(m32 r2, m32 inv, u32 mod) {
u32 c, a = 0;
while (c = getchar_unlocked(), c < 48 || c > 57);
while (47 < c && c < 58) {
a = a * 10 + c - 48;
c = getchar_unlocked();
}
return to_m32(a, r2, inv, mod);
}
void out_m32(m32 A, m32 inv, u32 mod) {
u32 a = from_m32(A, inv, mod);
out_u32(a);
}
void dump_m32(m32 x, m32 inv, u32 mod) { fprintf(stderr, "\033[1;34m%u\033[0m\n", from_m32(x, inv, mod)); }
void dump_m32_array(m32 *x, int x_len, m32 inv, u32 mod) {
fprintf(stderr, "\033[1;37m");
for (int i = 0; i < x_len; i++) {
if (i == x_len - 1) fprintf(stderr, "%u\n", x[i]);
else fprintf(stderr, "%u ", x[i]);
}
fprintf(stderr, "\033[0m");
fprintf(stderr, "\033[1;33m");
for (int i = 0; i < x_len; i++) {
if (i == x_len - 1) fprintf(stderr, "%u\n", from_m32(x[i], inv, mod));
else fprintf(stderr, "%u ", from_m32(x[i], inv, mod));
}
fprintf(stderr, "\033[0m");
}
void dump_m32_array_range(m32 *x, int x_len, int l, int r, m32 inv, u32 mod) {
if (r < l || r <= 0 || l >= x_len) return;
if (l < 0) l = 0;
if (r >= x_len) r = x_len - 1;
fprintf(stderr, "\033[1;31m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%u\n", x[i]);
else fprintf(stderr, "%u ", x[i]);
}
fprintf(stderr, "\033[0m");
fprintf(stderr, "\033[1;35m");
for (int i = l; i <= r; i++) {
if (i == r) fprintf(stderr, "%u\n", from_m32(x[i], inv, mod));
else fprintf(stderr, "%u ", from_m32(x[i], inv, mod));
}
fprintf(stderr, "\033[0m");
}
#pragma endregion montgomery32
#pragma region ntt1000000007
const u32 pr1 = 3u;
const u32 pr2 = 6u;
const u32 pr3 = 7u;
const u32 m1 = 1045430273u; const u32 m1_d = 2090860546u; const m32 m1_r2 = 798281873u; const m32 m1_inv = 3249537025u; const m32 m1_one = 113246204u;
const u32 m2 = 1051721729u; const u32 m2_d = 2103443458u; const m32 m2_r2 = 748646691u; const m32 m2_inv = 3243245569u; const m32 m2_one = 88080380u;
const u32 m3 = 1053818881u; const u32 m3_d = 2107637762u; const m32 m3_r2 = 159648582u; const m32 m3_inv = 3241148417u; const m32 m3_one = 79691772u;
const m32 sum_e1[] = { 885458361u, 900924659u, 962059782u, 415593171u, 1042963020u, 670679244u, 541100730u, 791232404u, 1017138063u, 1009790060u, 862479644u, 14783221u, 244867424u, 431174616u, 403070125u, 826977693u, 130509512u, 234433839u, 583350782u };
const m32 sum_e2[] = { 467413740u, 850027207u, 242082110u, 469704687u, 798875242u, 948071666u, 405859818u, 794843055u, 771520840u, 176008702u, 644573208u, 715864305u, 237085256u, 357934180u, 654888890u, 961560509u, 740357787u, 329663983u, 679393590u };
const m32 sum_e3[] = { 882957072u, 871538451u, 673579336u, 73537272u, 425574675u, 697950553u, 938595210u, 127556594u, 879998874u, 1019737367u, 353383985u, 36465421u, 901066455u, 93757427u, 979102987u, 501430327u, 40143827u, 92524426u, 698947874u };
const m32 sum_ie1[] = { 159971912u, 22701698u, 360170016u, 9164511u, 967212496u, 707952636u, 928400066u, 788102537u, 248599354u, 954265785u, 539282025u, 749013941u, 66727014u, 835563346u, 909028729u, 532555753u, 488995841u, 863413679u, 575546441u };
const m32 sum_ie2[] = { 584307989u, 414819672u, 727414907u, 1042290158u, 763418088u, 582659108u, 106169741u, 486228312u, 106468254u, 164241005u, 856035592u, 117294265u, 988587910u, 117912639u, 562619904u, 608072619u, 298409394u, 523681378u, 94283612u };
const m32 sum_ie3[] = { 170861809u, 516220135u, 927824564u, 603681864u, 586404698u, 720550924u, 639641177u, 182451670u, 257278119u, 437349354u, 600413897u, 648372120u, 399254828u, 182276433u, 483901237u, 893329600u, 390180935u, 919465749u, 333795344u };
void ntt1(m32 *A, int A_len) {
int h = 0;
while (A_len > (1 << h)) h++;
for (int ph = 1; ph <= h; ph++) {
int w = 1 << (ph - 1);
int p = 1 << (h - ph);
m32 now = m1_one;
for (int s = 0; s < w; s++) {
int offset = s << (h - ph + 1);
for (int i = 0; i < p; i++) {
m32 l = A[i + offset];
m32 r = mul_m32(A[i + offset + p], now, m1_inv, m1);
A[i + offset] = add_m32(l, r, m1_d);
A[i + offset + p] = sub_m32(l, r, m1_d);
}
now = mul_m32(now, sum_e1[CTZ32(~s)], m1_inv, m1);
}
}
}
void ntt2(m32 *A, int A_len) {
int h = 0;
while (A_len > (1 << h)) h++;
for (int ph = 1; ph <= h; ph++) {
int w = 1 << (ph - 1);
int p = 1 << (h - ph);
m32 now = m2_one;
for (int s = 0; s < w; s++) {
int offset = s << (h - ph + 1);
for (int i = 0; i < p; i++) {
m32 l = A[i + offset];
m32 r = mul_m32(A[i + offset + p], now, m2_inv, m2);
A[i + offset] = add_m32(l, r, m2_d);
A[i + offset + p] = sub_m32(l, r, m2_d);
}
now = mul_m32(now, sum_e2[CTZ32(~s)], m2_inv, m2);
}
}
}
void ntt3(m32 *A, int A_len) {
int h = 0;
while (A_len > (1 << h)) h++;
for (int ph = 1; ph <= h; ph++) {
int w = 1 << (ph - 1);
int p = 1 << (h - ph);
m32 now = m3_one;
for (int s = 0; s < w; s++) {
int offset = s << (h - ph + 1);
for (int i = 0; i < p; i++) {
m32 l = A[i + offset];
m32 r = mul_m32(A[i + offset + p], now, m3_inv, m3);
A[i + offset] = add_m32(l, r, m3_d);
A[i + offset + p] = sub_m32(l, r, m3_d);
}
now = mul_m32(now, sum_e3[CTZ32(~s)], m3_inv, m3);
}
}
}
void intt1(m32 *A, int A_len) {
int h = 0;
while (A_len > (1 << h)) h++;
for (int ph = h; ph >= 1; ph--) {
int w = 1 << (ph - 1);
int p = 1 << (h - ph);
m32 inow = m1_one;
for (int s = 0; s < w; s++) {
int offset = s << (h - ph + 1);
for (int i = 0; i < p; i++) {
m32 l = A[i + offset];
m32 r = A[i + offset + p];
A[i + offset] = add_m32(l, r, m1_d);
A[i + offset + p] = mul_m32(sub_m32(l, r, m1_d), inow, m1_inv, m1);
}
inow = mul_m32(inow, sum_ie1[CTZ32(~s)], m1_inv, m1);
}
}
m32 inv2t = inv_m32(to_m32(A_len, m1_r2, m1_inv, m1), m1_inv, m1);
for (int i = 0; i < A_len; i++) A[i] = from_m32(mul_m32(A[i], inv2t, m1_inv, m1), m1_inv, m1);
}
void intt2(m32 *A, int A_len) {
int h = 0;
while (A_len > (1 << h)) h++;
for (int ph = h; ph >= 1; ph--) {
int w = 1 << (ph - 1);
int p = 1 << (h - ph);
m32 inow = m2_one;
for (int s = 0; s < w; s++) {
int offset = s << (h - ph + 1);
for (int i = 0; i < p; i++) {
m32 l = A[i + offset];
m32 r = A[i + offset + p];
A[i + offset] = add_m32(l, r, m2_d);
A[i + offset + p] = mul_m32(sub_m32(l, r, m2_d), inow, m2_inv, m2);
}
inow = mul_m32(inow, sum_ie2[CTZ32(~s)], m2_inv, m2);
}
}
m32 inv2t = inv_m32(to_m32(A_len, m2_r2, m2_inv, m2), m2_inv, m2);
for (int i = 0; i < A_len; i++) A[i] = from_m32(mul_m32(A[i], inv2t, m2_inv, m2), m2_inv, m2);
}
void intt3(m32 *A, int A_len) {
int h = 0;
while (A_len > (1 << h)) h++;
for (int ph = h; ph >= 1; ph--) {
int w = 1 << (ph - 1);
int p = 1 << (h - ph);
m32 inow = m3_one;
for (int s = 0; s < w; s++) {
int offset = s << (h - ph + 1);
for (int i = 0; i < p; i++) {
m32 l = A[i + offset];
m32 r = A[i + offset + p];
A[i + offset] = add_m32(l, r, m3_d);
A[i + offset + p] = mul_m32(sub_m32(l, r, m3_d), inow, m3_inv, m3);
}
inow = mul_m32(inow, sum_ie3[CTZ32(~s)], m3_inv, m3);
}
}
m32 inv2t = inv_m32(to_m32(A_len, m3_r2, m3_inv, m3), m3_inv, m3);
for (int i = 0; i < A_len; i++) A[i] = from_m32(mul_m32(A[i], inv2t, m3_inv, m3), m3_inv, m3);
}
u32 *convolute1(m32 *A, int A_len, m32 *B, int B_len) {
int C_len = BIT_CEIL32(A_len + B_len - 1);
m32 *C = (m32 *)calloc(C_len, sizeof(m32));
m32 *D = (m32 *)calloc(C_len, sizeof(m32));
#ifdef LOCAL
if (C == NULL || D == NULL) exit(EXIT_FAILURE);
#endif
memcpy(C, A, sizeof(m32) * A_len);
memcpy(D, B, sizeof(m32) * B_len);
ntt1(C, C_len);
ntt1(D, C_len);
for (int i = 0; i < C_len; i++) C[i] = mul_m32(C[i], D[i], m1_inv, m1);
free(D);
intt1(C, C_len);
return C;
}
u32 *convolute2(m32 *A, int A_len, m32 *B, int B_len) {
int C_len = BIT_CEIL32(A_len + B_len - 1);
m32 *C = (m32 *)calloc(C_len, sizeof(m32));
m32 *D = (m32 *)calloc(C_len, sizeof(m32));
#ifdef LOCAL
if (C == NULL || D == NULL) exit(EXIT_FAILURE);
#endif
memcpy(C, A, sizeof(m32) * A_len);
memcpy(D, B, sizeof(m32) * B_len);
ntt2(C, C_len);
ntt2(D, C_len);
for (int i = 0; i < C_len; i++) C[i] = mul_m32(C[i], D[i], m2_inv, m2);
free(D);
intt2(C, C_len);
return C;
}
u32 *convolute3(m32 *A, int A_len, m32 *B, int B_len) {
int C_len = BIT_CEIL32(A_len + B_len - 1);
m32 *C = (m32 *)calloc(C_len, sizeof(m32));
m32 *D = (m32 *)calloc(C_len, sizeof(m32));
#ifdef LOCAL
if (C == NULL || D == NULL) exit(EXIT_FAILURE);
#endif
memcpy(C, A, sizeof(m32) * A_len);
memcpy(D, B, sizeof(m32) * B_len);
ntt3(C, C_len);
ntt3(D, C_len);
for (int i = 0; i < C_len; i++) C[i] = mul_m32(C[i], D[i], m3_inv, m3);
free(D);
intt3(C, C_len);
return C;
}
u32 *arbitrary_mod_convolute(u32 *a, int a_len, u32 *b, int b_len, u32 mod) {
u32 m0 = mod;
u32 m0_d = m0 << 1;
m32 m0_r2 = _r2_m32(m0);
m32 m0_inv = _inv_m32(m0);
m32 *A1 = (m32 *)calloc(a_len, sizeof(m32));
m32 *A2 = (m32 *)calloc(a_len, sizeof(m32));
m32 *A3 = (m32 *)calloc(a_len, sizeof(m32));
m32 *B1 = (m32 *)calloc(b_len, sizeof(m32));
m32 *B2 = (m32 *)calloc(b_len, sizeof(m32));
m32 *B3 = (m32 *)calloc(b_len, sizeof(m32));
#ifdef LOCAL
if (A1 == NULL || A2 == NULL || A3 == NULL || B1 == NULL || B2 == NULL || B3 == NULL)) { exit(EXIT_FAILURE); }
#endif
for (int i = 0; i < a_len; i++) {
A1[i] = to_m32(a[i], m1_r2, m1_inv, m1);
A2[i] = to_m32(a[i], m2_r2, m2_inv, m2);
A3[i] = to_m32(a[i], m3_r2, m3_inv, m3);
}
for (int i = 0; i < b_len; i++) {
B1[i] = to_m32(b[i], m1_r2, m1_inv, m1);
B2[i] = to_m32(b[i], m2_r2, m2_inv, m2);
B3[i] = to_m32(b[i], m3_r2, m3_inv, m3);
}
u32 *x = convolute1(A1, a_len, B1, b_len);
u32 *y = convolute2(A2, a_len, B2, b_len);
u32 *z = convolute3(A3, a_len, B3, b_len);
free(A1);
free(A2);
free(A3);
free(B1);
free(B2);
free(B3);
m32 m1_inv_m2 = inv_m32(to_m32(m1, m2_r2, m2_inv, m2), m2_inv, m2);
m32 m12_inv_m3 = inv_m32(mul_m32(to_m32(m1, m3_r2, m3_inv, m3), to_m32(m2, m3_r2, m3_inv, m3), m3_inv, m3), m3_inv, m3);
m32 m1_m3 = to_m32(m1, m3_r2, m3_inv, m3);
m32 m1_m0 = to_m32(m1, m0_r2, m0_inv, m0);
m32 m12_m0 = mul_m32(to_m32(m1, m0_r2, m0_inv, m0), to_m32(m2, m0_r2, m0_inv, m0), m0_inv, m0);
u32 *ret = (u32 *)calloc(a_len + b_len - 1, sizeof(u32));
#ifdef LOCAL
if (ret == NULL) exit(EXIT_FAILURE);
#endif
for (u32 i = 0; i < a_len + b_len - 1; i++) {
m32 xi_m2 = to_m32(x[i], m2_r2, m2_inv, m2);
m32 yi_m2 = to_m32(y[i], m2_r2, m2_inv, m2);
m32 zi_m3 = to_m32(z[i], m3_r2, m3_inv, m3);
m32 xi_m3 = to_m32(x[i], m3_r2, m3_inv, m3);
u32 v1 = from_m32(mul_m32(sub_m32(yi_m2, xi_m2, m2_d), m1_inv_m2, m2_inv, m2), m2_inv, m2);
m32 v1_m3 = to_m32(v1, m3_r2, m3_inv, m3);
u32 v2 = from_m32(mul_m32(sub_m32(zi_m3, add_m32(xi_m3, mul_m32(m1_m3, v1_m3, m3_inv, m3), m3_d), m3_d), m12_inv_m3, m3_inv, m3), m3_inv, m3);
m32 v2_m0 = to_m32(v2, m0_r2, m0_inv, m0);
m32 xi_m0 = to_m32(x[i], m0_r2, m0_inv, m0);
m32 v1_m0 = to_m32(v1, m0_r2, m0_inv, m0);
ret[i] = from_m32(add_m32(add_m32(xi_m0, mul_m32(m1_m0, v1_m0, m0_inv, m0), m0_d), mul_m32(m12_m0, v2_m0, m0_inv, m0), m0_d), m0_inv, m0);
}
free(x);
free(y);
free(z);
return ret;
}
#pragma endregion ntt1000000007
#pragma region sample point shift
m32 _fact[1<<20];
m32 _inv_fact[1<<20];
m32 _inv_table[1<<20];
void pre_fact(int n, m32 r2, m32 inv, u32 mod) {
_fact[0] = 1;
for (int i = 0; i <= n + 1; i++) _fact[i + 1] = mul_m32(_fact[i], to_m32(i + 1, r2, inv, mod), inv, mod);
_inv_fact[n + 2] = inv_m32(_fact[n + 2], inv, mod);
for (int i = n + 2; i > 0; i--) _inv_fact[i - 1] = mul_m32(_inv_fact[i], to_m32(i, r2, inv, mod), inv, mod);
for (int i = 1; i <= n + 1; i++) _inv_table[i] = mul_m32(_inv_fact[i], _fact[i - 1], inv, mod);
}
m32 *sample_point_shift(m32 *A, int A_len, m32 m, m32 r2, m32 inv, u32 mod) {
u32 *f = (m32 *)calloc(A_len, sizeof(m32));
u32 *g = (m32 *)calloc((A_len << 1) - 1, sizeof(m32));
#ifdef LOCAL
if (F == NULL || G == NULL || f == NULL || g == NULL) exit(EXIT_FAILURE);
#endif
for (int i = 0; i < A_len; i++) {
f[i] = mul_m32(mul_m32(A[i], _inv_fact[i], inv, mod), _inv_fact[A_len - 1 - i], inv, mod);
if ((A_len - 1 - i) & 1) f[i] = min_m32(f[i], mod << 1);
f[i] = from_m32(f[i], inv, mod);
}
m32 d_m = to_m32(A_len - 1, r2, inv, mod);
for (int i = 0; i < (A_len << 1) - 1; i++) {
m32 i_m = to_m32(i, r2, inv, mod);
g[i] = inv_m32(add_m32(sub_m32(m, d_m, mod << 1), i_m, mod << 1), inv, mod);
g[i] = from_m32(g[i], inv, mod);
}
u32 *h = arbitrary_mod_convolute(f, A_len, g, (A_len << 1) - 1, mod);
m32 coef = _one_m32(mod);
for (int i = 0; i < (A_len << 1) - 1; i++) h[i] = to_m32(h[i], r2, inv, mod);
for (int i = 0; i < A_len; i++) {
m32 i_m = to_m32(i, r2, inv, mod);
m32 mdi = add_m32(sub_m32(m, d_m, mod << 1), i_m, mod << 1);
coef = mul_m32(coef, mdi, inv, mod);
}
for (int i = 0; i < A_len; i++) {
m32 i1 = to_m32(i + 1, r2, inv, mod);
h[i + A_len - 1] = mul_m32(h[i + A_len - 1], coef, inv, mod);
coef = mul_m32(coef, mul_m32(add_m32(m, i1, mod << 1), g[i], inv, mod), inv, mod);
}
for (int i = A_len - 1; i < (A_len << 1) - 1; i++) f[i - A_len + 1] = h[i];
free(g);
free(h);
return f;
}
m32 factorial_mod(i64 n, m32 r2, m32 inv, u32 mod) {
if (n <= 1) return _one_m32(mod);
if (n >= mod) return 0;
i64 v = 1;
while (v * v < n) v <<= 1;
m32 iv = inv_m32(to_m32(v, r2, inv, mod), inv, mod);
m32 *G = (m32 *)calloc(v + 1, sizeof(m32));
G[0] = _one_m32(mod);
G[1] = to_m32(v + 1, r2, inv, mod);
for (i64 d = 1; d != v; d <<= 1) {
m32 d_m = to_m32(d, r2, inv, mod);
m32 *g1 = sample_point_shift(G, d + 1, mul_m32(d_m, iv, inv, mod), r2, inv, mod);
m32 *g2 = sample_point_shift(G, d + 1, mul_m32(add_m32(mul_m32(d_m, v, inv, mod), v, mod << 1), iv, inv, mod), r2, inv, mod);
m32 *g3 = sample_point_shift(G, d + 1, mul_m32(add_m32(add_m32(mul_m32(d_m, v, inv, mod), d_m, mod << 1), v, mod << 1), iv, inv, mod), r2, inv, mod);
for (int i = 0; i <= d; i++) {
G[i] = mul_m32(G[i], g1[i], inv, mod);
g2[i] = mul_m32(g2[i], g3[i], inv, mod);
}
for (int i = 0; i < d; i++) G[d + i] = g2[i];
}
m32 ret = _one_m32(mod);
i64 i = 0;
while (i + v <= n) ret = mul_m32(ret, G[i / v], inv, mod), i += v;
while (i < n) ret = mul_m32(ret, to_m32(++i, r2, inv, mod), inv, mod);
return ret;
}
#pragma endregion sample point shift
void Main(void) {
i64 n = in_i64();
u32 mod = 1000000007u;
m32 r2 = _r2_m32(mod);
m32 inv = _inv_m32(mod);
out_m32(factorial_mod(n, r2, inv, mod), inv, mod);
NL();
}
int main(void) {
Main();
return 0;
}
nonamae