結果
問題 | No.502 階乗を計算するだけ |
ユーザー |
|
提出日時 | 2020-08-12 09:56:21 |
言語 | C++17 (gcc 13.3.0 + boost 1.87.0) |
結果 |
AC
|
実行時間 | 468 ms / 1,000 ms |
コード長 | 10,829 bytes |
コンパイル時間 | 3,847 ms |
コンパイル使用メモリ | 306,548 KB |
最終ジャッジ日時 | 2025-01-12 21:13:45 |
ジャッジサーバーID (参考情報) |
judge4 / judge5 |
(要ログイン)
ファイルパターン | 結果 |
---|---|
other | AC * 52 |
コンパイルメッセージ
main.cpp:174:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 174 | montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2, | ^~~~~~~~~~~~~~~~~~ main.cpp:166:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 166 | montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2, | ^~~~~~~~~~~~~~~~~~ main.cpp:158:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 158 | montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r, | ^~~~~~~~~~~~~~~~~~ main.cpp:147:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 147 | my256_mulhi_epu32(const __m256i &a, const __m256i &b) { | ^~~~~~~~~~~~~~~~~ main.cpp:142:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 142 | my256_mullo_epu32(const __m256i &a, const __m256i &b) { | ^~~~~~~~~~~~~~~~~ main.cpp:135:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 135 | montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2, | ^~~~~~~~~~~~~~~~~~ main.cpp:128:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 128 | montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2, | ^~~~~~~~~~~~~~~~~~ main.cpp:120:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 120 | montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r, | ^~~~~~~~~~~~~~~~~~ main.cpp:109:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 109 | my128_mulhi_epu32(const __m128i &a, const __m128i &b) { | ^~~~~~~~~~~~~~~~~ main.cpp:104:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes] 104 | my128_mullo_epu32(const __m128i &a, const __m128i &b) { | ^~~~~~~~~~~~~~~~~
ソースコード
#include <immintrin.h>//#include <bits/stdc++.h>using namespace std;using namespace std;template <uint32_t mod>struct LazyMontgomeryModInt {using mint = LazyMontgomeryModInt;using i32 = int32_t;using u32 = uint32_t;using u64 = uint64_t;static constexpr u32 get_r() {u32 ret = mod;for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;return ret;}static constexpr u32 r = get_r();static constexpr u32 n2 = -u64(mod) % mod;static_assert(r * mod == 1, "invalid, r * mod != 1");static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");u32 a;constexpr LazyMontgomeryModInt() : a(0) {}constexpr LazyMontgomeryModInt(const int64_t &b): a(reduce(u64(b % mod + mod) * n2)){};static constexpr u32 reduce(const u64 &b) {return (b + u64(u32(b) * u32(-r)) * mod) >> 32;}constexpr mint &operator+=(const mint &b) {if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;return *this;}constexpr mint &operator-=(const mint &b) {if (i32(a -= b.a) < 0) a += 2 * mod;return *this;}constexpr mint &operator*=(const mint &b) {a = reduce(u64(a) * b.a);return *this;}constexpr mint &operator/=(const mint &b) {*this *= b.inverse();return *this;}constexpr mint operator+(const mint &b) const { return mint(*this) += b; }constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }constexpr bool operator==(const mint &b) const {return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);}constexpr bool operator!=(const mint &b) const {return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);}constexpr mint operator-() const { return mint() - mint(*this); }constexpr mint pow(u64 n) const {mint ret(1), mul(*this);while (n > 0) {if (n & 1) ret *= mul;mul *= mul;n >>= 1;}return ret;}constexpr mint inverse() const { return pow(mod - 2); }friend ostream &operator<<(ostream &os, const mint &b) {return os << b.get();}friend istream &operator>>(istream &is, mint &b) {int64_t t;is >> t;b = LazyMontgomeryModInt<mod>(t);return (is);}constexpr u32 get() const {u32 ret = reduce(a);return ret >= mod ? ret - mod : ret;}static constexpr u32 get_mod() { return mod; }};using namespace std;__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128imy128_mullo_epu32(const __m128i &a, const __m128i &b) {return _mm_mullo_epi32(a, b);}__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128imy128_mulhi_epu32(const __m128i &a, const __m128i &b) {__m128i a13 = _mm_shuffle_epi32(a, 0xF5);__m128i b13 = _mm_shuffle_epi32(b, 0xF5);__m128i prod02 = _mm_mul_epu32(a, b);__m128i prod13 = _mm_mul_epu32(a13, b13);__m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13),_mm_unpackhi_epi32(prod02, prod13));return prod;}__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128imontgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,const __m128i &m1) {return _mm_sub_epi32(_mm_add_epi32(my128_mulhi_epu32(a, b), m1),my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1));}__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128imontgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,const __m128i &m0) {__m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2);return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);}__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128imontgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,const __m128i &m0) {__m128i ret = _mm_sub_epi32(a, b);return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);}__attribute__((target("avx2"))) __attribute__((always_inline)) __m256imy256_mullo_epu32(const __m256i &a, const __m256i &b) {return _mm256_mullo_epi32(a, b);}__attribute__((target("avx2"))) __attribute__((always_inline)) __m256imy256_mulhi_epu32(const __m256i &a, const __m256i &b) {__m256i a13 = _mm256_shuffle_epi32(a, 0xF5);__m256i b13 = _mm256_shuffle_epi32(b, 0xF5);__m256i prod02 = _mm256_mul_epu32(a, b);__m256i prod13 = _mm256_mul_epu32(a13, b13);__m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13),_mm256_unpackhi_epi32(prod02, prod13));return prod;}__attribute__((target("avx2"))) __attribute__((always_inline)) __m256imontgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,const __m256i &m1) {return _mm256_sub_epi32(_mm256_add_epi32(my256_mulhi_epu32(a, b), m1),my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1));}__attribute__((target("avx2"))) __attribute__((always_inline)) __m256imontgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,const __m256i &m0) {__m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),ret);}__attribute__((target("avx2"))) __attribute__((always_inline)) __m256imontgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,const __m256i &m0) {__m256i ret = _mm256_sub_epi32(a, b);return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),ret);}constexpr int MOD = 1000000007;template <typename mint>int naive_optimized(int MAX) {mint eight = 8;mint a[8] = {1, 1, 1, 1, 1, 1, 1, 1};mint b[8] = {1, 2, 3, 4, 5, 6, 7, 8};for (int i = 1; i <= (MAX / 8); i++) {a[0] *= b[0];b[0] += eight;a[1] *= b[1];b[1] += eight;a[2] *= b[2];b[2] += eight;a[3] *= b[3];b[3] += eight;a[4] *= b[4];b[4] += eight;a[5] *= b[5];b[5] += eight;a[6] *= b[6];b[6] += eight;a[7] *= b[7];b[7] += eight;}mint ret = 1;for (int i = 0; i < 8; i++) ret *= a[i];return ret.get();}int r[32] __attribute__((aligned(64)));uint32_t res2[32] __attribute__((aligned(64)));__attribute__((target("avx2"))) int optimized_with_simd_2(int MAX) {using mint = LazyMontgomeryModInt<1000000007>;using u64 = uint64_t;for (int i = 0; i < 32; i++) r[i] = mint::reduce(u64(i + 1) * mint::n2);__m256i A0 = _mm256_set1_epi32(r[0]);__m256i A1 = _mm256_set1_epi32(r[0]);__m256i A2 = _mm256_set1_epi32(r[0]);__m256i A3 = _mm256_set1_epi32(r[0]);__m256i B0 = _mm256_loadu_si256((__m256i *)(r + 0));__m256i B1 = _mm256_loadu_si256((__m256i *)(r + 8));__m256i B2 = _mm256_loadu_si256((__m256i *)(r + 16));__m256i B3 = _mm256_loadu_si256((__m256i *)(r + 24));const __m256i EI = _mm256_set1_epi32(mint::get_mod() * 2 - r[31]);const __m256i R = _mm256_set1_epi32(mint::r);const __m256i M0 = _mm256_set1_epi32(0);const __m256i M1 = _mm256_set1_epi32(mint::get_mod());const __m256i M2 = _mm256_set1_epi32(mint::get_mod() * 2);for (int i = 0; i < MAX / 32; ++i) {A0 = montgomery_mul_256(A0, B0, R, M1);A1 = montgomery_mul_256(A1, B1, R, M1);A2 = montgomery_mul_256(A2, B2, R, M1);A3 = montgomery_mul_256(A3, B3, R, M1);B0 = montgomery_sub_256(B0, EI, M2, M0);B1 = montgomery_sub_256(B1, EI, M2, M0);B2 = montgomery_sub_256(B2, EI, M2, M0);B3 = montgomery_sub_256(B3, EI, M2, M0);}_mm256_storeu_si256((__m256i *)(res2 + 0), A0);_mm256_storeu_si256((__m256i *)(res2 + 8), A1);_mm256_storeu_si256((__m256i *)(res2 + 16), A2);_mm256_storeu_si256((__m256i *)(res2 + 24), A3);mint ret = 1;for (int i = 0; i < 32; i++) ret *= *(reinterpret_cast<mint *>(res2 + i));return ret.get();}using namespace std;namespace fastio {static constexpr int SZ = 1 << 17;char ibuf[SZ], obuf[SZ];int pil = 0, pir = 0, por = 0;struct Pre {char num[40000];constexpr Pre() : num() {for (int i = 0; i < 10000; i++) {int n = i;for (int j = 3; j >= 0; j--) {num[i * 4 + j] = n % 10 + '0';n /= 10;}}}} constexpr pre;inline void load() {memcpy(ibuf, ibuf + pil, pir - pil);pir = pir - pil + fread(ibuf + pir - pil, 1, SZ - pir + pil, stdin);pil = 0;}inline void flush() {fwrite(obuf, 1, por, stdout);por = 0;}inline void rd(char& c) { c = ibuf[pil++]; }template <typename T>inline void rd(T& x) {if (pil + 32 > pir) load();char c;doc = ibuf[pil++];while (c < '-');bool minus = 0;if (c == '-') {minus = 1;c = ibuf[pil++];}x = 0;while (c >= '0') {x = x * 10 + (c & 15);c = ibuf[pil++];}if (minus) x = -x;}inline void rd() {}template <typename Head, typename... Tail>inline void rd(Head& head, Tail&... tail) {rd(head);rd(tail...);}inline void wt(char c) { obuf[por++] = c; }template <typename T>inline void wt(T x) {if (por > SZ - 32) flush();if (!x) {obuf[por++] = '0';return;}if (x < 0) {obuf[por++] = '-';x = -x;}int i = 12;char buf[16];while (x >= 10000) {memcpy(buf + i, pre.num + (x % 10000) * 4, 4);x /= 10000;i -= 4;}int d = x < 100 ? (x < 10 ? 1 : 2) : (x < 1000 ? 3 : 4);memcpy(obuf + por, pre.num + x * 4 + 4 - d, d);por += d;memcpy(obuf + por, buf + i + 4, 12 - i);por += 12 - i;}inline void wt() {}template <typename Head, typename... Tail>inline void wt(Head head, Tail... tail) {wt(head);wt(tail...);}template<typename T>inline void wtn(T x){wt(x, '\n');}struct Dummy {Dummy() { atexit(flush); }} dummy;} // namespace fastiousing fastio::rd;using fastio::wt;using fastio::wtn;using namespace std;struct Timer {chrono::high_resolution_clock::time_point st;Timer() { reset(); }void reset() { st = chrono::high_resolution_clock::now(); }chrono::milliseconds::rep elapsed() {auto ed = chrono::high_resolution_clock::now();return chrono::duration_cast<chrono::milliseconds>(ed - st).count();}};int main() {using mint = LazyMontgomeryModInt<MOD>;long long N;rd(N);if(N >= MOD) {wtn('0');return 0;}if(N < 32){mint ans = 1;for(int i = 1; i <= N; i++) ans *= i;wtn(ans.get());return 0;}mint ans = optimized_with_simd_2(N / 32 * 32);for (int i = N / 32 * 32 + 1; i <= N; i++) ans *= i;wtn(ans.get());}