// #pragma GCC optimize("O3") #include <vector> #pragma GCC target("sse4.1") #include <smmintrin.h> // SSE4.1 #include "bits/stdc++.h" using namespace std; #define rep(i,a,b) for(int i=(a); i<(b); ++i) const int N = 5e4+10, M = 5e6+69420; short T[N], a[N]; unsigned short cnts[M]; int main(){ cin.tie(NULL),cin.sync_with_stdio(false); int n,m; cin >> n >> m; rep(i,0,n) { int x; cin >> x; a[i] = x; } int bigPos = n; int bigNum = 0; for(int num = 1; num <= n; ++num){ // process old solution except large thing for(int ii = 0; ii < 1024; ii += 8){ cnts[ii] = 0; cnts[ii+1] = 0; cnts[ii+2] = 0; cnts[ii+3] = 0; cnts[ii+4] = 0; cnts[ii+5] = 0; cnts[ii+6] = 0; cnts[ii+7] = 0; } int ii1 = bigPos + 1; // Process 4 elements at a time for (; ii1 + 8 <= n; ii1 += 8) { __m128i vecT = _mm_loadu_si128((__m128i*)&T[ii1]); // Load 4 `T` values __m128i vecA = _mm_loadu_si128((__m128i*)&a[ii1]); // Load 4 `a` values vecT = _mm_add_epi16(vecT, vecA); // T[ii] += a[ii] _mm_storeu_si128((__m128i*)&T[ii1], vecT); // Store updated T } // Handle remaining elements for (; ii1 < n; ++ii1) { T[ii1] += a[ii1]; } // handle bignum if (bigPos < n) bigNum += a[bigPos]; // int iid = bigPos+1; for(; iid + 8 <= n; iid += 8){ cnts[T[iid]]++; cnts[T[iid+1]]++; cnts[T[iid+2]]++; cnts[T[iid+3]]++; cnts[T[iid+4]]++; cnts[T[iid+5]]++; cnts[T[iid+6]]++; cnts[T[iid+7]]++; } for(; iid < n; ++iid){ cnts[T[iid]]++; } if (bigPos < n and bigNum < 1024) cnts[bigNum]++; // count number of -1's we skipped int numMinOne = num - (n-bigPos); cnts[0] += numMinOne - 1; // calculate limit of which ones we take short lim = 0; int need = (num-1)/2, left = m; while(cnts[lim] < need){ need -= cnts[lim]; left -= cnts[lim]*lim; ++lim; } left -= lim*need; if (left>=0){ // case > lim verwijderen int ii = bigPos+1; __m128i vvec = _mm_set1_epi16(lim); // Broadcast v for (; ii + 8 <= n; ii += 8) { __m128i tvals = _mm_loadu_si128((__m128i*)&T[ii]); // Load T[i] values __m128i mask = _mm_cmpgt_epi16(tvals, vvec); // Mask where T[i] > v (0xFFFF or 0x0000) __m128i new_vals = _mm_andnot_si128(mask, tvals); // T[i] & ~mask _mm_storeu_si128((__m128i*)&T[ii], new_vals); // Store back results } for(;ii<n;++ii){ if (T[ii] > lim) T[ii] = 0; } // handle bignum if (bigPos<n and bigNum > lim){ bigNum =0; } int remnum = cnts[lim] - need; // how many we remove if (remnum and bigPos<n and bigNum == lim){ --remnum; bigNum = 0; } for(int i = bigPos+1; remnum; ++i){ int msk = (T[i] == lim); remnum -= msk; T[i] &= msk - 1; } // possible // for(int ii = n-1; ii >= bigPos; --ii){ // if (T[ii] > lim or (T[ii] == lim and --need < 0)) T[ii] = 0; // } // ii = bigPos; // for(;ii+4<n; ii+=4){ // __m128i mask = _mm_xor_si128(_mm_cmpgt_epi16(a, b), _mm_set1_epi16(-1)); // } // ii = n - 1; // for (; ii >= bigPos; --ii) { // int cmp1 = T[ii] > lim; // 1 if true, 0 if false // int cmp3 = (T[ii] == lim); // need -= cmp3; // int cmp2 = cmp3 & (need < 0); // 1 if T[ii] == lim and need < 0 // T[ii] &= -!(cmp1 | cmp2); // If cmp1 or cmp2 is true, set T[ii] to 0 // } // set all -1's to 0 // memset((T+n-num), 0, 4*numMinOne); NOT NEEDED, MEM IS ALREADY 0 // since we found a set, bigPos is now the current pirate // set old bigNum in the right position T[bigPos] = bigNum; // create new bigNum bigPos = n - num; bigNum = left; }else{ // impossible int ii = bigPos+1; // Process 4 elements at a time for (; ii + 8 <= n; ii += 8) { __m128i vecT = _mm_loadu_si128((__m128i*)&T[ii]); // Load 4 `T` values __m128i vecA = _mm_loadu_si128((__m128i*)&a[ii]); // Load 4 `a` values vecT = _mm_sub_epi16(vecT, vecA); // T[ii] -= a[ii] _mm_storeu_si128((__m128i*)&T[ii], vecT); // Store updated T } // Handle remaining elements for (; ii < n; ++ii) { T[ii] -= a[ii]; } // handle bignum if (bigPos < n){ bigNum -= a[bigPos]; } } } rep(i,0,bigPos) cout << "-1 "; cout << bigNum; rep(i,bigPos+1,n) cout << ' ' << T[i]; cout << '\n'; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | // #pragma GCC optimize("O3") #include <vector> #pragma GCC target("sse4.1") #include <smmintrin.h> // SSE4.1 #include "bits/stdc++.h" using namespace std; #define rep(i,a,b) for(int i=(a); i<(b); ++i) const int N = 5e4+10, M = 5e6+69420; short T[N], a[N]; unsigned short cnts[M]; int main(){ cin.tie(NULL),cin.sync_with_stdio(false); int n,m; cin >> n >> m; rep(i,0,n) { int x; cin >> x; a[i] = x; } int bigPos = n; int bigNum = 0; for(int num = 1; num <= n; ++num){ // process old solution except large thing for(int ii = 0; ii < 1024; ii += 8){ cnts[ii] = 0; cnts[ii+1] = 0; cnts[ii+2] = 0; cnts[ii+3] = 0; cnts[ii+4] = 0; cnts[ii+5] = 0; cnts[ii+6] = 0; cnts[ii+7] = 0; } int ii1 = bigPos + 1; // Process 4 elements at a time for (; ii1 + 8 <= n; ii1 += 8) { __m128i vecT = _mm_loadu_si128((__m128i*)&T[ii1]); // Load 4 `T` values __m128i vecA = _mm_loadu_si128((__m128i*)&a[ii1]); // Load 4 `a` values vecT = _mm_add_epi16(vecT, vecA); // T[ii] += a[ii] _mm_storeu_si128((__m128i*)&T[ii1], vecT); // Store updated T } // Handle remaining elements for (; ii1 < n; ++ii1) { T[ii1] += a[ii1]; } // handle bignum if (bigPos < n) bigNum += a[bigPos]; // int iid = bigPos+1; for(; iid + 8 <= n; iid += 8){ cnts[T[iid]]++; cnts[T[iid+1]]++; cnts[T[iid+2]]++; cnts[T[iid+3]]++; cnts[T[iid+4]]++; cnts[T[iid+5]]++; cnts[T[iid+6]]++; cnts[T[iid+7]]++; } for(; iid < n; ++iid){ cnts[T[iid]]++; } if (bigPos < n and bigNum < 1024) cnts[bigNum]++; // count number of -1's we skipped int numMinOne = num - (n-bigPos); cnts[0] += numMinOne - 1; // calculate limit of which ones we take short lim = 0; int need = (num-1)/2, left = m; while(cnts[lim] < need){ need -= cnts[lim]; left -= cnts[lim]*lim; ++lim; } left -= lim*need; if (left>=0){ // case > lim verwijderen int ii = bigPos+1; __m128i vvec = _mm_set1_epi16(lim); // Broadcast v for (; ii + 8 <= n; ii += 8) { __m128i tvals = _mm_loadu_si128((__m128i*)&T[ii]); // Load T[i] values __m128i mask = _mm_cmpgt_epi16(tvals, vvec); // Mask where T[i] > v (0xFFFF or 0x0000) __m128i new_vals = _mm_andnot_si128(mask, tvals); // T[i] & ~mask _mm_storeu_si128((__m128i*)&T[ii], new_vals); // Store back results } for(;ii<n;++ii){ if (T[ii] > lim) T[ii] = 0; } // handle bignum if (bigPos<n and bigNum > lim){ bigNum =0; } int remnum = cnts[lim] - need; // how many we remove if (remnum and bigPos<n and bigNum == lim){ --remnum; bigNum = 0; } for(int i = bigPos+1; remnum; ++i){ int msk = (T[i] == lim); remnum -= msk; T[i] &= msk - 1; } // possible // for(int ii = n-1; ii >= bigPos; --ii){ // if (T[ii] > lim or (T[ii] == lim and --need < 0)) T[ii] = 0; // } // ii = bigPos; // for(;ii+4<n; ii+=4){ // __m128i mask = _mm_xor_si128(_mm_cmpgt_epi16(a, b), _mm_set1_epi16(-1)); // } // ii = n - 1; // for (; ii >= bigPos; --ii) { // int cmp1 = T[ii] > lim; // 1 if true, 0 if false // int cmp3 = (T[ii] == lim); // need -= cmp3; // int cmp2 = cmp3 & (need < 0); // 1 if T[ii] == lim and need < 0 // T[ii] &= -!(cmp1 | cmp2); // If cmp1 or cmp2 is true, set T[ii] to 0 // } // set all -1's to 0 // memset((T+n-num), 0, 4*numMinOne); NOT NEEDED, MEM IS ALREADY 0 // since we found a set, bigPos is now the current pirate // set old bigNum in the right position T[bigPos] = bigNum; // create new bigNum bigPos = n - num; bigNum = left; }else{ // impossible int ii = bigPos+1; // Process 4 elements at a time for (; ii + 8 <= n; ii += 8) { __m128i vecT = _mm_loadu_si128((__m128i*)&T[ii]); // Load 4 `T` values __m128i vecA = _mm_loadu_si128((__m128i*)&a[ii]); // Load 4 `a` values vecT = _mm_sub_epi16(vecT, vecA); // T[ii] -= a[ii] _mm_storeu_si128((__m128i*)&T[ii], vecT); // Store updated T } // Handle remaining elements for (; ii < n; ++ii) { T[ii] -= a[ii]; } // handle bignum if (bigPos < n){ bigNum -= a[bigPos]; } } } rep(i,0,bigPos) cout << "-1 "; cout << bigNum; rep(i,bigPos+1,n) cout << ' ' << T[i]; cout << '\n'; } |