結果

問題 No.502 階乗を計算するだけ
ユーザー NyaanNyaanNyaanNyaan
提出日時 2020-08-12 09:56:21
言語 C++17
(gcc 13.3.0 + boost 1.87.0)
結果
AC  
実行時間 468 ms / 1,000 ms
コード長 10,829 bytes
コンパイル時間 3,847 ms
コンパイル使用メモリ 306,548 KB
最終ジャッジ日時 2025-01-12 21:13:45
ジャッジサーバーID
(参考情報)
judge4 / judge5
このコードへのチャレンジ
(要ログイン)
ファイルパターン 結果
other AC * 52
権限があれば一括ダウンロードができます
コンパイルメッセージ
main.cpp:174:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  174 | montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:166:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  166 | montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:158:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  158 | montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:147:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  147 | my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:142:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  142 | my256_mullo_epu32(const __m256i &a, const __m256i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:135:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  135 | montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:128:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  128 | montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:120:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  120 | montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
      | ^~~~~~~~~~~~~~~~~~
main.cpp:109:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  109 | my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
      | ^~~~~~~~~~~~~~~~~
main.cpp:104:1: warning: ‘always_inline’ function might not be inlinable [-Wattributes]
  104 | my128_mullo_epu32(const __m128i &a, const __m128i &b) {
      | ^~~~~~~~~~~~~~~~~

ソースコード

diff #
プレゼンテーションモードにする

#include <immintrin.h>
//
#include <bits/stdc++.h>
using namespace std;
using namespace std;
template <uint32_t mod>
struct LazyMontgomeryModInt {
using mint = LazyMontgomeryModInt;
using i32 = int32_t;
using u32 = uint32_t;
using u64 = uint64_t;
static constexpr u32 get_r() {
u32 ret = mod;
for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
return ret;
}
static constexpr u32 r = get_r();
static constexpr u32 n2 = -u64(mod) % mod;
static_assert(r * mod == 1, "invalid, r * mod != 1");
static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");
u32 a;
constexpr LazyMontgomeryModInt() : a(0) {}
constexpr LazyMontgomeryModInt(const int64_t &b)
: a(reduce(u64(b % mod + mod) * n2)){};
static constexpr u32 reduce(const u64 &b) {
return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
}
constexpr mint &operator+=(const mint &b) {
if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
return *this;
}
constexpr mint &operator-=(const mint &b) {
if (i32(a -= b.a) < 0) a += 2 * mod;
return *this;
}
constexpr mint &operator*=(const mint &b) {
a = reduce(u64(a) * b.a);
return *this;
}
constexpr mint &operator/=(const mint &b) {
*this *= b.inverse();
return *this;
}
constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
constexpr bool operator==(const mint &b) const {
return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
}
constexpr bool operator!=(const mint &b) const {
return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
}
constexpr mint operator-() const { return mint() - mint(*this); }
constexpr mint pow(u64 n) const {
mint ret(1), mul(*this);
while (n > 0) {
if (n & 1) ret *= mul;
mul *= mul;
n >>= 1;
}
return ret;
}
constexpr mint inverse() const { return pow(mod - 2); }
friend ostream &operator<<(ostream &os, const mint &b) {
return os << b.get();
}
friend istream &operator>>(istream &is, mint &b) {
int64_t t;
is >> t;
b = LazyMontgomeryModInt<mod>(t);
return (is);
}
constexpr u32 get() const {
u32 ret = reduce(a);
return ret >= mod ? ret - mod : ret;
}
static constexpr u32 get_mod() { return mod; }
};
using namespace std;
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
my128_mullo_epu32(const __m128i &a, const __m128i &b) {
return _mm_mullo_epi32(a, b);
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
my128_mulhi_epu32(const __m128i &a, const __m128i &b) {
__m128i a13 = _mm_shuffle_epi32(a, 0xF5);
__m128i b13 = _mm_shuffle_epi32(b, 0xF5);
__m128i prod02 = _mm_mul_epu32(a, b);
__m128i prod13 = _mm_mul_epu32(a13, b13);
__m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13),
_mm_unpackhi_epi32(prod02, prod13));
return prod;
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_mul_128(const __m128i &a, const __m128i &b, const __m128i &r,
const __m128i &m1) {
return _mm_sub_epi32(
_mm_add_epi32(my128_mulhi_epu32(a, b), m1),
my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1));
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_add_128(const __m128i &a, const __m128i &b, const __m128i &m2,
const __m128i &m0) {
__m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2);
return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
}
__attribute__((target("sse4.2"))) __attribute__((always_inline)) __m128i
montgomery_sub_128(const __m128i &a, const __m128i &b, const __m128i &m2,
const __m128i &m0) {
__m128i ret = _mm_sub_epi32(a, b);
return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
my256_mullo_epu32(const __m256i &a, const __m256i &b) {
return _mm256_mullo_epi32(a, b);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
my256_mulhi_epu32(const __m256i &a, const __m256i &b) {
__m256i a13 = _mm256_shuffle_epi32(a, 0xF5);
__m256i b13 = _mm256_shuffle_epi32(b, 0xF5);
__m256i prod02 = _mm256_mul_epu32(a, b);
__m256i prod13 = _mm256_mul_epu32(a13, b13);
__m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13),
_mm256_unpackhi_epi32(prod02, prod13));
return prod;
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_mul_256(const __m256i &a, const __m256i &b, const __m256i &r,
const __m256i &m1) {
return _mm256_sub_epi32(
_mm256_add_epi32(my256_mulhi_epu32(a, b), m1),
my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1));
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_add_256(const __m256i &a, const __m256i &b, const __m256i &m2,
const __m256i &m0) {
__m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);
return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
ret);
}
__attribute__((target("avx2"))) __attribute__((always_inline)) __m256i
montgomery_sub_256(const __m256i &a, const __m256i &b, const __m256i &m2,
const __m256i &m0) {
__m256i ret = _mm256_sub_epi32(a, b);
return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
ret);
}
constexpr int MOD = 1000000007;
template <typename mint>
int naive_optimized(int MAX) {
mint eight = 8;
mint a[8] = {1, 1, 1, 1, 1, 1, 1, 1};
mint b[8] = {1, 2, 3, 4, 5, 6, 7, 8};
for (int i = 1; i <= (MAX / 8); i++) {
a[0] *= b[0];
b[0] += eight;
a[1] *= b[1];
b[1] += eight;
a[2] *= b[2];
b[2] += eight;
a[3] *= b[3];
b[3] += eight;
a[4] *= b[4];
b[4] += eight;
a[5] *= b[5];
b[5] += eight;
a[6] *= b[6];
b[6] += eight;
a[7] *= b[7];
b[7] += eight;
}
mint ret = 1;
for (int i = 0; i < 8; i++) ret *= a[i];
return ret.get();
}
int r[32] __attribute__((aligned(64)));
uint32_t res2[32] __attribute__((aligned(64)));
__attribute__((target("avx2"))) int optimized_with_simd_2(int MAX) {
using mint = LazyMontgomeryModInt<1000000007>;
using u64 = uint64_t;
for (int i = 0; i < 32; i++) r[i] = mint::reduce(u64(i + 1) * mint::n2);
__m256i A0 = _mm256_set1_epi32(r[0]);
__m256i A1 = _mm256_set1_epi32(r[0]);
__m256i A2 = _mm256_set1_epi32(r[0]);
__m256i A3 = _mm256_set1_epi32(r[0]);
__m256i B0 = _mm256_loadu_si256((__m256i *)(r + 0));
__m256i B1 = _mm256_loadu_si256((__m256i *)(r + 8));
__m256i B2 = _mm256_loadu_si256((__m256i *)(r + 16));
__m256i B3 = _mm256_loadu_si256((__m256i *)(r + 24));
const __m256i EI = _mm256_set1_epi32(mint::get_mod() * 2 - r[31]);
const __m256i R = _mm256_set1_epi32(mint::r);
const __m256i M0 = _mm256_set1_epi32(0);
const __m256i M1 = _mm256_set1_epi32(mint::get_mod());
const __m256i M2 = _mm256_set1_epi32(mint::get_mod() * 2);
for (int i = 0; i < MAX / 32; ++i) {
A0 = montgomery_mul_256(A0, B0, R, M1);
A1 = montgomery_mul_256(A1, B1, R, M1);
A2 = montgomery_mul_256(A2, B2, R, M1);
A3 = montgomery_mul_256(A3, B3, R, M1);
B0 = montgomery_sub_256(B0, EI, M2, M0);
B1 = montgomery_sub_256(B1, EI, M2, M0);
B2 = montgomery_sub_256(B2, EI, M2, M0);
B3 = montgomery_sub_256(B3, EI, M2, M0);
}
_mm256_storeu_si256((__m256i *)(res2 + 0), A0);
_mm256_storeu_si256((__m256i *)(res2 + 8), A1);
_mm256_storeu_si256((__m256i *)(res2 + 16), A2);
_mm256_storeu_si256((__m256i *)(res2 + 24), A3);
mint ret = 1;
for (int i = 0; i < 32; i++) ret *= *(reinterpret_cast<mint *>(res2 + i));
return ret.get();
}
using namespace std;
namespace fastio {
static constexpr int SZ = 1 << 17;
char ibuf[SZ], obuf[SZ];
int pil = 0, pir = 0, por = 0;
struct Pre {
char num[40000];
constexpr Pre() : num() {
for (int i = 0; i < 10000; i++) {
int n = i;
for (int j = 3; j >= 0; j--) {
num[i * 4 + j] = n % 10 + '0';
n /= 10;
}
}
}
} constexpr pre;
inline void load() {
memcpy(ibuf, ibuf + pil, pir - pil);
pir = pir - pil + fread(ibuf + pir - pil, 1, SZ - pir + pil, stdin);
pil = 0;
}
inline void flush() {
fwrite(obuf, 1, por, stdout);
por = 0;
}
inline void rd(char& c) { c = ibuf[pil++]; }
template <typename T>
inline void rd(T& x) {
if (pil + 32 > pir) load();
char c;
do
c = ibuf[pil++];
while (c < '-');
bool minus = 0;
if (c == '-') {
minus = 1;
c = ibuf[pil++];
}
x = 0;
while (c >= '0') {
x = x * 10 + (c & 15);
c = ibuf[pil++];
}
if (minus) x = -x;
}
inline void rd() {}
template <typename Head, typename... Tail>
inline void rd(Head& head, Tail&... tail) {
rd(head);
rd(tail...);
}
inline void wt(char c) { obuf[por++] = c; }
template <typename T>
inline void wt(T x) {
if (por > SZ - 32) flush();
if (!x) {
obuf[por++] = '0';
return;
}
if (x < 0) {
obuf[por++] = '-';
x = -x;
}
int i = 12;
char buf[16];
while (x >= 10000) {
memcpy(buf + i, pre.num + (x % 10000) * 4, 4);
x /= 10000;
i -= 4;
}
int d = x < 100 ? (x < 10 ? 1 : 2) : (x < 1000 ? 3 : 4);
memcpy(obuf + por, pre.num + x * 4 + 4 - d, d);
por += d;
memcpy(obuf + por, buf + i + 4, 12 - i);
por += 12 - i;
}
inline void wt() {}
template <typename Head, typename... Tail>
inline void wt(Head head, Tail... tail) {
wt(head);
wt(tail...);
}
template<typename T>
inline void wtn(T x){
wt(x, '\n');
}
struct Dummy {
Dummy() { atexit(flush); }
} dummy;
} // namespace fastio
using fastio::rd;
using fastio::wt;
using fastio::wtn;
using namespace std;
struct Timer {
chrono::high_resolution_clock::time_point st;
Timer() { reset(); }
void reset() { st = chrono::high_resolution_clock::now(); }
chrono::milliseconds::rep elapsed() {
auto ed = chrono::high_resolution_clock::now();
return chrono::duration_cast<chrono::milliseconds>(ed - st).count();
}
};
int main() {
using mint = LazyMontgomeryModInt<MOD>;
long long N;
rd(N);
if(N >= MOD) {
wtn('0');
return 0;
}
if(N < 32){
mint ans = 1;
for(int i = 1; i <= N; i++) ans *= i;
wtn(ans.get());
return 0;
}
mint ans = optimized_with_simd_2(N / 32 * 32);
for (int i = N / 32 * 32 + 1; i <= N; i++) ans *= i;
wtn(ans.get());
}
הההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההההה
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
0