#include // #include using namespace std; using namespace std; template struct LazyMontgomeryModInt { using mint = LazyMontgomeryModInt; using i32 = int32_t; using u32 = uint32_t; using u64 = uint64_t; static constexpr u32 get_r() { u32 ret = mod; for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret; return ret; } static constexpr u32 r = get_r(); static constexpr u32 n2 = -u64(mod) % mod; static_assert(r * mod == 1, "invalid, r * mod != 1"); static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30"); static_assert((mod & 1) == 1, "invalid, mod % 2 == 0"); u32 a; constexpr LazyMontgomeryModInt() : a(0) {} constexpr LazyMontgomeryModInt(const int64_t &b) : a(reduce(u64(b % mod + mod) * n2)){}; static constexpr u32 reduce(const u64 &b) { return (b + u64(u32(b) * u32(-r)) * mod) >> 32; } constexpr mint &operator+=(const mint &b) { if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod; return *this; } constexpr mint &operator-=(const mint &b) { if (i32(a -= b.a) < 0) a += 2 * mod; return *this; } constexpr mint &operator*=(const mint &b) { a = reduce(u64(a) * b.a); return *this; } constexpr mint &operator/=(const mint &b) { *this *= b.inverse(); return *this; } constexpr mint operator+(const mint &b) const { return mint(*this) += b; } constexpr mint operator-(const mint &b) const { return mint(*this) -= b; } constexpr mint operator*(const mint &b) const { return mint(*this) *= b; } constexpr mint operator/(const mint &b) const { return mint(*this) /= b; } constexpr bool operator==(const mint &b) const { return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a); } constexpr bool operator!=(const mint &b) const { return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a); } constexpr mint operator-() const { return mint() - mint(*this); } constexpr mint pow(u64 n) const { mint ret(1), mul(*this); while (n > 0) { if (n & 1) ret *= mul; mul *= mul; n >>= 1; } return ret; } constexpr mint inverse() const { return pow(mod - 2); } friend ostream &operator<<(ostream &os, const mint &b) { return os << b.get(); } friend istream &operator>>(istream &is, mint &b) { int64_t t; is >> t; b = LazyMontgomeryModInt(t); return (is); } constexpr u32 get() const { u32 ret = reduce(a); return ret >= mod ? ret - mod : ret; } static constexpr u32 get_mod() { return mod; } }; using namespace std; __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i my128_mullo_epu32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i my128_mulhi_epu32(const __m128i &a, const __m128i &b) { __m128i a13 = _mm_shuffle_epi32(a, 0xF5); __m128i b13 = _mm_shuffle_epi32(b, 0xF5); __m128i prod02 = _mm_mul_epu32(a, b); __m128i prod13 = _mm_mul_epu32(a13, b13); __m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13), _mm_unpackhi_epi32(prod02, prod13)); return prod; } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r, const __m128i &m1) { return _mm_sub_epi32( _mm_add_epi32(my128_mulhi_epu32(a, b), m1), my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1)); } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2, const __m128i &m0) { __m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2); return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret); } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2, const __m128i &m0) { __m128i ret = _mm_sub_epi32(a, b); return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i my256_mullo_epu32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i my256_mulhi_epu32(const __m256i &a, const __m256i &b) { __m256i a13 = _mm256_shuffle_epi32(a, 0xF5); __m256i b13 = _mm256_shuffle_epi32(b, 0xF5); __m256i prod02 = _mm256_mul_epu32(a, b); __m256i prod13 = _mm256_mul_epu32(a13, b13); __m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13), _mm256_unpackhi_epi32(prod02, prod13)); return prod; } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r, const __m256i &m1) { return _mm256_sub_epi32( _mm256_add_epi32(my256_mulhi_epu32(a, b), m1), my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1)); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2, const __m256i &m0) { __m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2); return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2), ret); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2, const __m256i &m0) { __m256i ret = _mm256_sub_epi32(a, b); return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2), ret); } constexpr int MOD = 1000000007; template int naive_optimized(int MAX) { mint eight = 8; mint a[8] = {1, 1, 1, 1, 1, 1, 1, 1}; mint b[8] = {1, 2, 3, 4, 5, 6, 7, 8}; for (int i = 1; i <= (MAX / 8); i++) { a[0] *= b[0]; b[0] += eight; a[1] *= b[1]; b[1] += eight; a[2] *= b[2]; b[2] += eight; a[3] *= b[3]; b[3] += eight; a[4] *= b[4]; b[4] += eight; a[5] *= b[5]; b[5] += eight; a[6] *= b[6]; b[6] += eight; a[7] *= b[7]; b[7] += eight; } mint ret = 1; for (int i = 0; i < 8; i++) ret *= a[i]; return ret.get(); } int r[32] __attribute__((aligned(64))); uint32_t res2[32] __attribute__((aligned(64))); __attribute__((target("avx2"))) int optimized_with_simd_2(int MAX) { using mint = LazyMontgomeryModInt<1000000007>; using u64 = uint64_t; for (int i = 0; i < 32; i++) r[i] = mint::reduce(u64(i + 1) * mint::n2); __m256i A0 = _mm256_set1_epi32(r[0]); __m256i A1 = _mm256_set1_epi32(r[0]); __m256i A2 = _mm256_set1_epi32(r[0]); __m256i A3 = _mm256_set1_epi32(r[0]); __m256i B0 = _mm256_loadu_si256((__m256i *)(r + 0)); __m256i B1 = _mm256_loadu_si256((__m256i *)(r + 8)); __m256i B2 = _mm256_loadu_si256((__m256i *)(r + 16)); __m256i B3 = _mm256_loadu_si256((__m256i *)(r + 24)); const __m256i EI = _mm256_set1_epi32(mint::get_mod() * 2 - r[31]); const __m256i R = _mm256_set1_epi32(mint::r); const __m256i M0 = _mm256_set1_epi32(0); const __m256i M1 = _mm256_set1_epi32(mint::get_mod()); const __m256i M2 = _mm256_set1_epi32(mint::get_mod() * 2); for (int i = 0; i < MAX / 32; ++i) { A0 = montgomery_mul_256(A0, B0, R, M1); A1 = montgomery_mul_256(A1, B1, R, M1); A2 = montgomery_mul_256(A2, B2, R, M1); A3 = montgomery_mul_256(A3, B3, R, M1); B0 = montgomery_sub_256(B0, EI, M2, M0); B1 = montgomery_sub_256(B1, EI, M2, M0); B2 = montgomery_sub_256(B2, EI, M2, M0); B3 = montgomery_sub_256(B3, EI, M2, M0); } _mm256_storeu_si256((__m256i *)(res2 + 0), A0); _mm256_storeu_si256((__m256i *)(res2 + 8), A1); _mm256_storeu_si256((__m256i *)(res2 + 16), A2); _mm256_storeu_si256((__m256i *)(res2 + 24), A3); mint ret = 1; for (int i = 0; i < 32; i++) ret *= *(reinterpret_cast(res2 + i)); return ret.get(); } using namespace std; namespace fastio { static constexpr int SZ = 1 << 17; char ibuf[SZ], obuf[SZ]; int pil = 0, pir = 0, por = 0; struct Pre { char num[40000]; constexpr Pre() : num() { for (int i = 0; i < 10000; i++) { int n = i; for (int j = 3; j >= 0; j--) { num[i * 4 + j] = n % 10 + '0'; n /= 10; } } } } constexpr pre; inline void load() { memcpy(ibuf, ibuf + pil, pir - pil); pir = pir - pil + fread(ibuf + pir - pil, 1, SZ - pir + pil, stdin); pil = 0; } inline void flush() { fwrite(obuf, 1, por, stdout); por = 0; } inline void rd(char& c) { c = ibuf[pil++]; } template inline void rd(T& x) { if (pil + 32 > pir) load(); char c; do c = ibuf[pil++]; while (c < '-'); bool minus = 0; if (c == '-') { minus = 1; c = ibuf[pil++]; } x = 0; while (c >= '0') { x = x * 10 + (c & 15); c = ibuf[pil++]; } if (minus) x = -x; } inline void rd() {} template inline void rd(Head& head, Tail&... tail) { rd(head); rd(tail...); } inline void wt(char c) { obuf[por++] = c; } template inline void wt(T x) { if (por > SZ - 32) flush(); if (!x) { obuf[por++] = '0'; return; } if (x < 0) { obuf[por++] = '-'; x = -x; } int i = 12; char buf[16]; while (x >= 10000) { memcpy(buf + i, pre.num + (x % 10000) * 4, 4); x /= 10000; i -= 4; } int d = x < 100 ? (x < 10 ? 1 : 2) : (x < 1000 ? 3 : 4); memcpy(obuf + por, pre.num + x * 4 + 4 - d, d); por += d; memcpy(obuf + por, buf + i + 4, 12 - i); por += 12 - i; } inline void wt() {} template inline void wt(Head head, Tail... tail) { wt(head); wt(tail...); } template inline void wtn(T x){ wt(x, '\n'); } struct Dummy { Dummy() { atexit(flush); } } dummy; } // namespace fastio using fastio::rd; using fastio::wt; using fastio::wtn; using namespace std; struct Timer { chrono::high_resolution_clock::time_point st; Timer() { reset(); } void reset() { st = chrono::high_resolution_clock::now(); } chrono::milliseconds::rep elapsed() { auto ed = chrono::high_resolution_clock::now(); return chrono::duration_cast(ed - st).count(); } }; int main() { using mint = LazyMontgomeryModInt; long long N; rd(N); if(N >= MOD) { wtn('0'); return 0; } if(N < 32){ mint ans = 1; for(int i = 1; i <= N; i++) ans *= i; wtn(ans.get()); return 0; } mint ans = optimized_with_simd_2(N / 32 * 32); for (int i = N / 32 * 32 + 1; i <= N; i++) ans *= i; wtn(ans.get()); }