#define NDEBUG #include #include using namespace std; template struct LazyMontgomeryModInt { using mint = LazyMontgomeryModInt; using i32 = int32_t; using u32 = uint32_t; using u64 = uint64_t; static constexpr u32 get_r() { u32 ret = mod; for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret; return ret; } static constexpr u32 r = get_r(); static constexpr u32 n2 = -u64(mod) % mod; static_assert(r * mod == 1, "invalid, r * mod != 1"); static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30"); static_assert((mod & 1) == 1, "invalid, mod % 2 == 0"); u32 a; constexpr LazyMontgomeryModInt() : a(0) {} constexpr LazyMontgomeryModInt(const int64_t &b) : a(reduce(u64(b % mod + mod) * n2)){}; static constexpr u32 reduce(const u64 &b) { return (b + u64(u32(b) * u32(-r)) * mod) >> 32; } constexpr mint &operator+=(const mint &b) { if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod; return *this; } constexpr mint &operator-=(const mint &b) { if (i32(a -= b.a) < 0) a += 2 * mod; return *this; } constexpr mint &operator*=(const mint &b) { a = reduce(u64(a) * b.a); return *this; } constexpr mint &operator/=(const mint &b) { *this *= b.inverse(); return *this; } constexpr mint operator+(const mint &b) const { return mint(*this) += b; } constexpr mint operator-(const mint &b) const { return mint(*this) -= b; } constexpr mint operator*(const mint &b) const { return mint(*this) *= b; } constexpr mint operator/(const mint &b) const { return mint(*this) /= b; } constexpr bool operator==(const mint &b) const { return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a); } constexpr bool operator!=(const mint &b) const { return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a); } constexpr mint operator-() const { return mint() - mint(*this); } constexpr mint pow(u64 n) const { mint ret(1), mul(*this); while (n > 0) { if (n & 1) ret *= mul; mul *= mul; n >>= 1; } return ret; } constexpr mint inverse() const { return pow(mod - 2); } friend ostream &operator<<(ostream &os, const mint &b) { return os << b.get(); } friend istream &operator>>(istream &is, mint &b) { int64_t t; is >> t; b = LazyMontgomeryModInt(t); return (is); } constexpr u32 get() const { u32 ret = reduce(a); return ret >= mod ? ret - mod : ret; } static constexpr u32 get_mod() { return mod; } }; using mint = LazyMontgomeryModInt<1000000007>; #include __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i my128_mullo_epu32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i my128_mulhi_epu32(const __m128i &a, const __m128i &b) { __m128i a13 = _mm_shuffle_epi32(a, 0xF5); __m128i b13 = _mm_shuffle_epi32(b, 0xF5); __m128i prod02 = _mm_mul_epu32(a, b); __m128i prod13 = _mm_mul_epu32(a13, b13); __m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13), _mm_unpackhi_epi32(prod02, prod13)); return prod; } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r, const __m128i &m1) { return _mm_sub_epi32( _mm_add_epi32(my128_mulhi_epu32(a, b), m1), my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1)); } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2, const __m128i &m0) { __m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2); return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret); } __attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2, const __m128i &m0) { __m128i ret = _mm_sub_epi32(a, b); return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i my256_mullo_epu32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i my256_mulhi_epu32(const __m256i &a, const __m256i &b) { __m256i a13 = _mm256_shuffle_epi32(a, 0xF5); __m256i b13 = _mm256_shuffle_epi32(b, 0xF5); __m256i prod02 = _mm256_mul_epu32(a, b); __m256i prod13 = _mm256_mul_epu32(a13, b13); __m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13), _mm256_unpackhi_epi32(prod02, prod13)); return prod; } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r, const __m256i &m1) { return _mm256_sub_epi32( _mm256_add_epi32(my256_mulhi_epu32(a, b), m1), my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1)); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2, const __m256i &m0) { __m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2); return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2), ret); } __attribute__((target("avx2"))) __attribute__((always_inline)) __m256i montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2, const __m256i &m0) { __m256i ret = _mm256_sub_epi32(a, b); return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2), ret); } int r[32] __attribute__((aligned(64))); uint32_t res2[32] __attribute__((aligned(64))); __attribute__((target("avx2"), optimize("O3, unroll-loops"))) mint optimized_with_simd(int MAX) { using u64 = uint64_t; for (int i = 0; i < 32; i++) r[i] = mint::reduce(u64(i + 1) * mint::n2); __m256i A0 = _mm256_set1_epi32(r[0]); __m256i A1 = _mm256_set1_epi32(r[0]); __m256i A2 = _mm256_set1_epi32(r[0]); __m256i A3 = _mm256_set1_epi32(r[0]); __m256i B0 = _mm256_loadu_si256((__m256i *)(r + 0)); __m256i B1 = _mm256_loadu_si256((__m256i *)(r + 8)); __m256i B2 = _mm256_loadu_si256((__m256i *)(r + 16)); __m256i B3 = _mm256_loadu_si256((__m256i *)(r + 24)); const __m256i EI = _mm256_set1_epi32(mint::get_mod() * 2 - r[31]); const __m256i R = _mm256_set1_epi32(mint::r); const __m256i M0 = _mm256_set1_epi32(0); const __m256i M1 = _mm256_set1_epi32(mint::get_mod()); const __m256i M2 = _mm256_set1_epi32(mint::get_mod() * 2); int i = 1; for (; i + 32 <= MAX; i += 32) { A0 = montgomery_mul_256(A0, B0, R, M1); A1 = montgomery_mul_256(A1, B1, R, M1); A2 = montgomery_mul_256(A2, B2, R, M1); A3 = montgomery_mul_256(A3, B3, R, M1); B0 = montgomery_sub_256(B0, EI, M2, M0); B1 = montgomery_sub_256(B1, EI, M2, M0); B2 = montgomery_sub_256(B2, EI, M2, M0); B3 = montgomery_sub_256(B3, EI, M2, M0); } _mm256_storeu_si256((__m256i *)(res2 + 0), A0); _mm256_storeu_si256((__m256i *)(res2 + 8), A1); _mm256_storeu_si256((__m256i *)(res2 + 16), A2); _mm256_storeu_si256((__m256i *)(res2 + 24), A3); mint ret = 1; for (int j = 0; j < 32; j++) ret *= *(reinterpret_cast(res2 + j)); while (i <= MAX) ret *= i++; return ret; } mint fast_prime_factorial(uint32_t n) { if (n >= mint::get_mod()) return 0; if (n * 2 <= mint::get_mod()) return optimized_with_simd(n); mint fac = optimized_with_simd(mint::get_mod() - 1 - n); if (n % 2 == 0) fac = -fac; fac = fac.inverse(); return fac; } int main() { long long n; cin >> n; if (n >= (long long)mint::get_mod()) { cout << 0 << endl; return 0; } cout << fast_prime_factorial(n) << endl; }