
Testing latest pari + WASM + node.js... and it works?! Wow.

License: GPL3
/* Copyright (C) 2016  The PARI group.

This file is part of the PARI/GP package.

PARI/GP is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version. It is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY WHATSOEVER.

Check the License for details. You should have received a copy of it, along
with the package; see the file 'COPYING'. If not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */

#include "pari.h"
#include "paripriv.h"

/**********************************************************************/
/***                                                                ***/
/***                     Public prime table                         ***/
/***                                                                ***/
/**********************************************************************/

static ulong _maxprime = 0;
static ulong diffptrlen;

/* Building/Rebuilding the diffptr table. The actual work is done by the
 * following two subroutines; the user entry point is the function
 * initprimes() below. initprimes1() is the old algorithm, called when
 * maxnum (size) is moderate. Must be called after pari_init_stack() */
static void
initprimes1(ulong size, long *lenp, ulong *lastp, byteptr p1)
{
  pari_sp av = avma;
  long k;
  byteptr q, r, s, p = (byteptr)stack_calloc(size+2), fin = p + size;

  for (r=q=p,k=1; r<=fin; )
  {
    do { r+=k; k+=2; r+=k; } while (*++q);
    for (s=r; s<=fin; s+=k) *s = 1;
  }
  r = p1; *r++ = 2; *r++ = 1; /* 2 and 3 */
  for (s=q=p+1; ; s=q)
  {
    do q++; while (*q);
    if (q > fin) break;
    *r++ = (unsigned char) ((q-s) << 1);
  }
  *r++ = 0;
  *lenp = r - p1;
  *lastp = ((s - p) << 1) + 1;
  set_avma(av);
}
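
/* Editor's sketch (illustrative addition, not part of PARI): how to read the
 * difference table initprimes1() fills. The table starts with the value 2
 * (the first prime), then one byte per gap between consecutive primes,
 * 0-terminated; NEXT_PRIME_VIADIFF (paripriv.h) advances a prime along
 * these gaps. Assumes diffptr has been built by initprimetable(). */
#if 0
static void
example_walk_diffptr(void)
{
  byteptr d = diffptr; /* global gap table */
  ulong p = 0;
  while (*d && p < 30)
  { /* p runs through 2, 3, 5, 7, ... */
    NEXT_PRIME_VIADIFF(p, d);
    pari_printf("%lu ", p);
  }
}
#endif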

/* Timing in ms (Athlon/850; reports 512K of secondary cache; looks
   like there is 64K of quicker cache too).

     arena |   30m     100m    300m    1000m   2000m  <-- primelimit
     =================================================
     16K      1.1053  1.1407  1.2589  1.4368  1.6086
     24K      1.0000  1.0625  1.1320  1.2443  1.3095
     32K      1.0000  1.0469  1.0761  1.1336  1.1776
     48K      1.0000  1.0000  1.0254  1.0445  1.0546
     50K      1.0000  1.0000  1.0152  1.0345  1.0464
     52K      1.0000  1.0000  1.0203  1.0273  1.0362
     54K      1.0000  1.0000  1.0812  1.0216  1.0281
     56K      1.0526  1.0000  1.0051  1.0144  1.0205
     58K      1.0000  1.0000  1.0000  1.0086  1.0123
     60K      0.9473  0.9844  1.0051  1.0014  1.0055
     62K      1.0000  0.9844  0.9949  0.9971  0.9993
     64K      1.0000  1.0000  1.0000  1.0000  1.0000
     66K      1.2632  1.2187  1.2183  1.2055  1.1953
     68K      1.4211  1.4844  1.4721  1.4425  1.4188
     70K      1.7368  1.7188  1.7107  1.6767  1.6421
     72K      1.9474  1.9531  1.9594  1.9023  1.8573
     74K      2.2105  2.1875  2.1827  2.1207  2.0650
     76K      2.4211  2.4219  2.4010  2.3305  2.2644
     78K      2.5789  2.6250  2.6091  2.5330  2.4571
     80K      2.8421  2.8125  2.8223  2.7213  2.6380
     84K      3.1053  3.1875  3.1776  3.0819  2.9802
     88K      3.5263  3.5312  3.5228  3.4124  3.2992
     92K      3.7895  3.8438  3.8375  3.7213  3.5971
     96K      4.0000  4.1093  4.1218  3.9986  3.9659
     112K     4.3684  4.5781  4.5787  4.4583  4.6115
     128K     4.7368  4.8750  4.9188  4.8075  4.8997
     192K     5.5263  5.7188  5.8020  5.6911  5.7064
     256K     6.0000  6.2187  6.3045  6.1954  6.1033
     384K     6.7368  6.9531  7.0405  6.9181  6.7912
     512K     7.3158  7.5156  7.6294  7.5000  7.4654
     768K     9.1579  9.4531  9.6395  9.5014  9.1075
     1024K   10.368  10.7497 10.9999 10.878  10.8201
     1536K   12.579  13.3124 13.7660 13.747  13.4739
     2048K   13.737  14.4839 15.0509 15.151  15.1282
     3076K   14.789  15.5780 16.2993 16.513  16.3365

   Now the same numbers relative to the model

   (1 + 0.36*sqrt(primelimit)/arena) * (arena <= 64 ? 1.05 : (arena-64)**0.38)

   [SLOW2_IN_ROOTS = 0.36, ALPHA = 0.38]

     arena |   30m     100m    300m    1000m   2000m  <-- primelimit
     =================================================
     16K      1.014   0.9835  0.9942  0.9889  1.004
     24K      0.9526  0.9758  0.9861  0.9942  0.981
     32K      0.971   0.9939  0.9884  0.9849  0.9806
     48K      0.9902  0.9825  0.996   0.9945  0.9885
     50K      0.9917  0.9853  0.9906  0.9926  0.9907
     52K      0.9932  0.9878  0.9999  0.9928  0.9903
     54K      0.9945  0.9902  1.064   0.9939  0.9913
     56K      1.048   0.9924  0.9925  0.993   0.9921
     58K      0.9969  0.9945  0.9909  0.9932  0.9918
     60K      0.9455  0.9809  0.9992  0.9915  0.9923
     62K      0.9991  0.9827  0.9921  0.9924  0.9929
     64K      1       1       1       1       1
     66K      1.02    0.9849  0.9857  0.9772  0.9704
     68K      0.8827  0.9232  0.9176  0.9025  0.8903
     70K      0.9255  0.9177  0.9162  0.9029  0.8881
     72K      0.9309  0.936   0.9429  0.9219  0.9052
     74K      0.9715  0.9644  0.967   0.9477  0.9292
     76K      0.9935  0.9975  0.9946  0.9751  0.9552
     78K      0.9987  1.021   1.021   1.003   0.9819
     80K      1.047   1.041   1.052   1.027   1.006
     84K      1.052   1.086   1.092   1.075   1.053
     88K      1.116   1.125   1.133   1.117   1.096
     92K      1.132   1.156   1.167   1.155   1.134
     96K      1.137   1.177   1.195   1.185   1.196
     112K     1.067   1.13    1.148   1.15    1.217
     128K     1.04    1.083   1.113   1.124   1.178
     192K     0.9368  0.985   1.025   1.051   1.095
     256K     0.8741  0.9224  0.9619  0.995   1.024
     384K     0.8103  0.8533  0.8917  0.9282  0.9568
     512K     0.7753  0.8135  0.8537  0.892   0.935
     768K     0.8184  0.8638  0.9121  0.9586  0.9705
     1024K    0.8241  0.8741  0.927   0.979   1.03
     1536K    0.8505  0.9212  0.9882  1.056   1.096
     2048K    0.8294  0.8954  0.9655  1.041   1.102

*/

#ifndef SLOW2_IN_ROOTS
/* SLOW2_IN_ROOTS below 3: some slowdown starts to be noticeable
 * when things fit into the cache on Sparc.
 * The choice of 2.6 gives a slowdown of 1-2% on UltraSparcII,
 * but makes calculations for "maximum" of 436273009
 * fit into 256K cache (still common for some architectures).
 *
 * One may change it when small caches become uncommon, but the gain
 * is not going to be very noticeable... */
# ifdef i386 /* gcc defines this? */
#  define SLOW2_IN_ROOTS 0.36
# else
#  define SLOW2_IN_ROOTS 2.6
# endif
#endif
#ifndef CACHE_ARENA
# ifdef i386 /* gcc defines this? */
   /* Due to smaller SLOW2_IN_ROOTS, a smaller arena is OK; fit L1 cache */
#  define CACHE_ARENA (63 * 1024UL)  /* No slowdown even with 64K L1 cache */
# else
#  define CACHE_ARENA (200 * 1024UL) /* No slowdown even with 256K L2 cache */
# endif
#endif

#define CACHE_ALPHA  (0.38)  /* Cache performance model parameter */
#define CACHE_CUTOFF (0.018) /* Cache performance is not smooth here */

static double slow2_in_roots = SLOW2_IN_ROOTS;

typedef struct {
  ulong arena;
  double power;
  double cutoff;
} cache_model_t;

static cache_model_t cache_model = { CACHE_ARENA, CACHE_ALPHA, CACHE_CUTOFF };

/* Assume that some calculation requires a chunk of memory to be
   accessed often in a more or less random fashion (as in sieving).
   Assume that the calculation can be done in steps by subdividing the
   chunk into smaller subchunks (arenas) and treating them
   separately. Assume that the overhead of subdivision is equivalent
   to the number of arenas.

   Find an optimal size of the arena taking into account the overhead
   of subdivision, and the overhead of the arena not fitting into the
   cache. Assume that an arena of size slow2_in_roots slows down the
   calculation 2x (compared to very big arenas, when cache hits do
   not matter). Since cache performance varies wildly with
   architecture, load, and weather (especially with cache coloring
   enabled), use an idealized cache model based on the benchmarks above.

   Assume that an independent region of FIXED_TO_CACHE bytes is accessed
   very often concurrently with the arena access.
 */
static ulong
good_arena_size(ulong slow2_size, ulong total, ulong fixed_to_cache,
                cache_model_t *cache_model)
{
  ulong asize, cache_arena = cache_model->arena;
  double Xmin, Xmax, A, B, C1, C2, D, V;
  double alpha = cache_model->power, cut_off = cache_model->cutoff;

  /* Estimated relative slowdown,
     with overhead = max((fixed_to_cache+arena)/cache_arena - 1, 0):

     1 + slow2_size/arena due to initialization overhead;

     max(1, 4.63 * overhead^0.38) due to footprint > cache size.

     [The latter is hard to substantiate theoretically, but this
     function describes the benchmarks pretty closely; it does not hurt that
     one can minimize it explicitly too ;-). The switch between
     different choices of max() happens when overhead=0.018.]

     Thus the problem is minimizing (1 + slow2_size/arena)*overhead**0.38.
     This boils down to F=((X+A)/(X+B))X^alpha, X=overhead,
     B = (1 - fixed_to_cache/cache_arena), A = B + slow2_size/cache_arena,
     alpha = 0.38, and X>=0.018, X>-B.

     We need to find the rightmost root of (X+A)*(X+B) - (A-B)X/alpha to the
     right of 0.018 (if such exists and is below Xmax). Then we manually
     check the remaining region [0, 0.018].

     Since we cannot trust the purely-experimental cache-hit slowdown
     function, as a sanity check always prefer fitting into the
     cache (or "almost fitting") if the F-law predicts that the larger
     value of the arena provides less than 10% speedup.
   */

  /* The simplest case: we fit into cache */
  asize = cache_arena - fixed_to_cache;
  if (total <= asize) return total;
  /* The simple case: fitting into cache doesn't slow us down more than 10% */
  if (asize > 10 * slow2_size) return asize;
  /* Slowdown of not fitting into cache is significant. Try to optimize.
     Do not be afraid to spend some time on optimization - in trivial
     cases we do not reach this point; any gain we get should
     compensate the time spent on optimization. */

  B = (1 - ((double)fixed_to_cache)/cache_arena);
  A = B + ((double)slow2_size)/cache_arena;
  C2 = A*B;
  C1 = (A + B - 1/alpha*(A - B))/2;
  D = C1*C1 - C2;
  if (D > 0)
    V = cut_off*cut_off + 2*C1*cut_off + C2; /* Value at CUT_OFF */
  else
    V = 0; /* Pacify the warning */
  Xmin = cut_off;
  Xmax = ((double)total - fixed_to_cache)/cache_arena; /* Two candidates */

  if ( D <= 0 || (V >= 0 && C1 + cut_off >= 0) ) /* slowdown increasing */
    Xmax = cut_off; /* Only one candidate */
  else if (V >= 0 && /* slowdown concave down */
           ((Xmax + C1) <= 0 || (Xmax*Xmax + 2*C1*Xmax + C2) <= 0))
    /* DO NOTHING */; /* Keep both candidates */
  else if (V <= 0 && (Xmax*Xmax + 2*C1*Xmax + C2) <= 0) /* slowdown decreasing */
    Xmin = cut_off; /* Only one candidate */
  else /* Now we know: 2 roots, the largest is in CUT_OFF..Xmax */
    Xmax = sqrt(D) - C1;
  if (Xmax != Xmin) { /* Xmin == CUT_OFF; check which one is better */
    double v1 = (cut_off + A)/(cut_off + B);
    double v2 = 2.33 * (Xmax + A)/(Xmax + B) * pow(Xmax, alpha);

    if (1.1 * v2 >= v1) /* Prefer fitting into the cache if slowdown < 10% */
      V = v1;
    else
    { Xmin = Xmax; V = v2; }
  } else if (B > 0) /* We need V */
    V = 2.33 * (Xmin + A)/(Xmin + B) * pow(Xmin, alpha);
  if (B > 0 && 1.1 * V > A/B) /* Now Xmin is the minimum. Compare with 0 */
    Xmin = 0;

  asize = (ulong)((1 + Xmin)*cache_arena - fixed_to_cache);
  if (asize > total) asize = total; /* May happen due to approximations */
  return asize;
}
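
/* Editor's sketch (illustrative addition, not part of PARI): the cache model
 * that good_arena_size() minimizes, written out directly as
 * F(X) = ((X+A)/(X+B)) * X^alpha with X = overhead, following the comment
 * above. Useful for cross-checking the root selection by brute force; names
 * and signature are illustrative. */
#if 0
static double
example_cache_slowdown(double X, double slow2_size, double fixed_to_cache,
                       double cache_arena, double alpha)
{
  double B = 1 - fixed_to_cache/cache_arena;
  double A = B + slow2_size/cache_arena;
  return (X + A)/(X + B) * pow(X, alpha); /* relative slowdown at overhead X */
}
#endif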

/* Use as in
    install(set_optimize,lLDG)          \\ Through some M too?
    set_optimize(2,1) \\ disable dependence on limit
    \\ 1: how much cache usable, 2: slowdown of setup, 3: alpha, 4: cutoff
    \\ 2,3,4 are in units of 0.001

    { time_primes_arena(ar,limit) =     \\ ar = arena size in K
        set_optimize(1,floor(ar*1024));
        default(primelimit, 200 000);   \\ 100000 results in *larger* malloc()!
        gettime;
        default(primelimit, floor(limit));
        if(ar >= 1, ar=floor(ar));
        print("arena "ar"K => "gettime"ms");
    }
*/
long
set_optimize(long what, GEN g)
{
  long ret = 0;

  switch (what) {
  case 1:
    ret = (long)cache_model.arena;
    break;
  case 2:
    ret = (long)(slow2_in_roots * 1000);
    break;
  case 3:
    ret = (long)(cache_model.power * 1000);
    break;
  case 4:
    ret = (long)(cache_model.cutoff * 1000);
    break;
  default:
    pari_err_BUG("set_optimize");
    break;
  }
  if (g != NULL) {
    ulong val = itou(g);

    switch (what) {
    case 1: cache_model.arena = val; break;
    case 2: slow2_in_roots = (double)val / 1000.; break;
    case 3: cache_model.power = (double)val / 1000.; break;
    case 4: cache_model.cutoff = (double)val / 1000.; break;
    }
  }
  return ret;
}
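
/* Editor's sketch (illustrative addition, not part of PARI): calling
 * set_optimize() from C. Slot 1 is the usable cache size in bytes; g = NULL
 * only reads the current value, a t_INT argument also sets it (utoi() builds
 * a t_INT from a ulong). */
#if 0
static void
example_set_optimize(void)
{
  long arena = set_optimize(1, NULL);  /* read the current cache arena */
  set_optimize(1, utoi(96 * 1024UL));  /* pretend we have a 96K cache */
  set_optimize(1, utoi((ulong)arena)); /* restore the old value */
}
#endif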

/* s is odd; prime differences (starting from 5-3=2) start at known_primes[2],
   terminated by a 0 byte. Checks n odd numbers starting at 'start', setting
   bytes starting at data to 0 (prime) or 1 (composite) */
static void
sieve_chunk(byteptr known_primes, ulong s, byteptr data, ulong n)
{
  ulong p, cnt = n-1, start = s, delta = 1;
  byteptr q;

  memset(data, 0, n);
  start >>= 1; /* (start - 1)/2 */
  start += n; /* Corresponds to the end */
  /* data corresponds to start, q runs over primediffs */
  for (q = known_primes + 1, p = 3; delta; delta = *++q, p += delta)
  { /* first odd number >= start > p and divisible by p
       = last odd number <= start + 2p - 2 and 0 (mod p)
       = p + last number <= start + p - 2 and 0 (mod 2p)
       = p + start+p-2 - (start+p-2) % 2p
       = start + 2(p - 1 - ((start-1)/2 + (p-1)/2) % p). */
    long off = cnt - ((start+(p>>1)) % p);
    while (off >= 0) { data[off] = 1; off -= p; }
  }
}
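
/* Editor's sketch (illustrative addition, not part of PARI): how the output
 * of sieve_chunk() maps back to integers. After sieve_chunk(p1, s, data, n)
 * with s odd, data[j] describes the odd number s + 2*j (j = 0..n-1); a 0
 * byte means no prime from the table divides it. This reading of the code
 * is the editor's. */
#if 0
static void
example_print_chunk_survivors(byteptr p1, ulong s, ulong n)
{
  byteptr data = (byteptr)pari_malloc(n);
  ulong j;
  sieve_chunk(p1, s, data, n);
  for (j = 0; j < n; j++)
    if (!data[j]) pari_printf("%lu ", s + 2*j); /* unmarked by table primes */
  pari_free(data);
}
#endif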

/* assume maxnum <= 436273289 < 2^29 */
static void
initprimes0(ulong maxnum, long *lenp, ulong *lastp, byteptr p1)
{
  pari_sp av = avma, bot = pari_mainstack->bot;
  long alloced, psize;
  byteptr q, end, p, end1, plast, curdiff;
  ulong last, remains, curlow, rootnum, asize;
  ulong prime_above;
  byteptr p_prime_above;

  maxnum |= 1; /* make it odd. */
  /* base case */
  if (maxnum < 1ul<<17) { initprimes1(maxnum>>1, lenp, lastp, p1); return; }

  /* Checked to be enough up to 40e6, attained at 155893 */
  rootnum = usqrt(maxnum) | 1;
  initprimes1(rootnum>>1, &psize, &last, p1);
  end1 = p1 + psize - 1;
  remains = (maxnum - last) >> 1; /* number of odd numbers to check */

  /* we access the primes array of psize too; but we access it consecutively,
   * thus we do not include it in fixed_to_cache */
  asize = good_arena_size((ulong)(rootnum * slow2_in_roots), remains+1, 0,
                          &cache_model) - 1;
  /* enough room on the stack ? */
  alloced = (((byteptr)avma) <= ((byteptr)bot) + asize);
  if (alloced)
    p = (byteptr)pari_malloc(asize+1);
  else
    p = (byteptr)stack_malloc(asize+1);
  end = p + asize; /* the 0 sentinel goes at end. */
  curlow = last + 2; /* First candidate: we know primes up to last (odd). */
  curdiff = end1;

  /* During each iteration p..end-1 represents a range of odd
     numbers. plast is a pointer which represents the last prime seen;
     it may point before p..end-1. */
  plast = p - 1;
  p_prime_above = p1 + 2;
  prime_above = 3;
  while (remains)
  { /* cycle over arenas; performance not crucial */
    unsigned char was_delta;
    if (asize > remains) { asize = remains; end = p + asize; }
    /* Fake the upper limit appropriate for the given arena */
    while (prime_above*prime_above <= curlow + (asize << 1) && *p_prime_above)
      prime_above += *p_prime_above++;
    was_delta = *p_prime_above;
    *p_prime_above = 0; /* sentinel for sieve_chunk */
    sieve_chunk(p1, curlow, p, asize);
    *p_prime_above = was_delta; /* restore */

    p[asize] = 0; /* sentinel */
    for (q = p; ; plast = q++)
    { /* q runs over addresses corresponding to primes */
      while (*q) q++; /* use sentinel at end */
      if (q >= end) break;
      *curdiff++ = (unsigned char)(q-plast) << 1; /* < 255 for q < 436273291 */
    }
    plast -= asize;
    remains -= asize;
    curlow += (asize<<1);
  }
  last = curlow - ((p - plast) << 1);
  *curdiff++ = 0; /* sentinel */
  *lenp = curdiff - p1;
  *lastp = last;
  if (alloced) pari_free(p); else set_avma(av);
}

ulong
maxprime(void) { return diffptr ? _maxprime : 0; }
ulong
maxprimeN(void) { return diffptr ? diffptrlen-1 : 0; }

void
maxprime_check(ulong c) { if (_maxprime < c) pari_err_MAXPRIME(c); }

/* We ensure 65302 <= maxnum <= 436273289: the LHS ensures modular functions
 * have enough fast primes to work, the RHS ensures that p_{n+1} - p_n < 255
 * (N.B. an even RHS would be incorrect since initprimes0 would make it odd,
 * thereby increasing it by 1) */
byteptr
initprimes(ulong maxnum, long *lenp, ulong *lastp)
{
  byteptr t;
  if (maxnum < 65537)
    maxnum = 65537;
  else if (maxnum > 436273289)
    maxnum = 436273289;
  t = (byteptr)pari_malloc((size_t) (1.09 * maxnum/log((double)maxnum)) + 146);
  initprimes0(maxnum, lenp, lastp, t);
  return (byteptr)pari_realloc(t, *lenp);
}
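
/* Editor's note (illustrative addition, not part of PARI): the allocation
 * above is one gap byte per prime, so it needs about pi(maxnum) bytes; by
 * the Prime Number Theorem pi(x) ~ x/log(x), and the 1.09 factor plus 146
 * bytes of slack cover the range allowed here, with pari_realloc() trimming
 * the excess. A quick empirical check: */
#if 0
static void
example_check_table_size(ulong maxnum)
{
  size_t alloc = (size_t)(1.09 * maxnum/log((double)maxnum)) + 146;
  long len; ulong last;
  byteptr t = initprimes(maxnum, &len, &last);
  pari_printf("allocated %lu, used %ld bytes\n", (ulong)alloc, len);
  pari_free(t);
}
#endif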

void
initprimetable(ulong maxnum)
{
  long len;
  ulong last;
  byteptr p = initprimes(maxnum, &len, &last), old = diffptr;
  diffptrlen = minss(diffptrlen, len);
  _maxprime = minss(_maxprime, last); /* protect against ^C */
  diffptr = p; diffptrlen = len; _maxprime = last;
  if (old) free(old);
}

/* all init_primepointer_xx routines set *pd to the corresponding place
 * in the prime table */
/* smallest p >= a */
ulong
init_primepointer_geq(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  return p;
}
/* largest p < a */
ulong
init_primepointer_lt(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  PREC_PRIME_VIADIFF(p, *pd);
  return p;
}
/* largest p <= a */
ulong
init_primepointer_leq(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  if (p != a) PREC_PRIME_VIADIFF(p, *pd);
  return p;
}
/* smallest p > a */
ulong
init_primepointer_gt(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  if (p == a) NEXT_PRIME_VIADIFF(p, *pd);
  return p;
}
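
/* Editor's sketch (illustrative addition, not part of PARI): stepping through
 * the prime table from a given bound with an init_primepointer_* helper and
 * NEXT_PRIME_VIADIFF. Assumes the table covers the range (see maxprime()). */
#if 0
static void
example_primes_from(ulong a)
{
  byteptr d;
  ulong i, p = init_primepointer_geq(a, &d); /* smallest prime >= a */
  for (i = 0; i < 10 && p; i++)
  {
    pari_printf("%lu ", p);
    if (!*d) break; /* the 0 byte terminates the table */
    NEXT_PRIME_VIADIFF(p, d);
  }
}
#endif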

/**********************************************************************/
/***                                                                ***/
/***                          forprime                              ***/
/***                                                                ***/
/**********************************************************************/

/* return a good chunk size for the sieve, 16 | chunk + 2 */
static ulong
optimize_chunk(ulong a, ulong b)
{
  /* TODO: Optimize size (surely < 512k to stay in L2 cache, but not so large
   * as to force recalculating too often). */
  ulong chunk = 0x80000UL;
  ulong tmp = (b - a) / chunk + 1;

  if (tmp == 1)
    chunk = b - a + 16;
  else
    chunk = (b - a) / tmp + 15;
  /* ensure 16 | chunk + 2 */
  return (((chunk + 2)>>4)<<4) - 2;
}
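
/* Editor's sketch (illustrative addition, not part of PARI): the invariant
 * optimize_chunk() guarantees. Rounding chunk+2 down to a multiple of 16 and
 * subtracting 2 means 16 always divides chunk + 2, which the sieve layout
 * below (8 odd numbers, i.e. 16 integers, per byte) relies on. */
#if 0
#include <assert.h>
static void
example_chunk_invariant(void)
{
  ulong a, b = 1000000UL;
  for (a = 2; a < b; a += 99991)
    assert((optimize_chunk(a, b) + 2) % 16 == 0); /* 16 | chunk + 2 */
}
#endif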

static void
sieve_init(forprime_t *T, ulong a, ulong b)
{
  T->sieveb = b;
  T->chunk = optimize_chunk(a, b);
  /* >> 1 [only odds] + 3 [convert from bits to bytes] */
  T->isieve = (unsigned char*)stack_malloc(((T->chunk+2) >> 4) + 1);
  T->cache[0] = 0;
  T->a = a;
  T->end = minuu(a + T->chunk, b);
  T->pos = T->maxpos = 0;
}

enum {PRST_none, PRST_diffptr, PRST_sieve, PRST_unextprime, PRST_nextprime};

static void
u_forprime_set_prime_table(forprime_t *T, ulong a)
{
  T->strategy = PRST_diffptr;
  if (a < 3)
  {
    T->p = 0;
    T->d = diffptr;
  }
  else
    T->p = init_primepointer_lt(a, &T->d);
}

/* Set p so that p + q is the smallest integer = c (mod q) and > the original
 * p. Assume 0 < c < q. Set p = 0 on overflow */
static void
arith_set(forprime_t *T)
{
  ulong r = T->p % T->q; /* 0 <= r <= min(p, q-1) */
  pari_sp av = avma;
  GEN d = adduu(T->p - r, T->c);
  if (T->c > r) d = subiu(d, T->q);
  /* d = c mod q, d = c > r? p-r+c-q: p-r+c, so that
   * d <= p and d+q = c>r? p-r+c : p-r+c+q > p */
  T->p = itou_or_0(d); set_avma(av); /* d = 0 is impossible */
}

/* run through primes in the arithmetic progression = c (mod q) */
static int
u_forprime_sieve_arith_init(forprime_t *T, struct pari_sieve *psieve,
                            ulong a, ulong b, ulong c, ulong q)
{
  ulong maxp, maxp2;
  if (!odd(b) && b > 2) b--;
  if (a > b || b < 2)
  {
    T->strategy = PRST_diffptr; /* paranoia */
    T->p = 0; /* empty */
    T->b = 0; /* empty */
    T->d = diffptr;
    return 0;
  }
  maxp = maxprime();
  if (q != 1)
  {
    c %= q;
    if (ugcd(c,q) != 1) { a = maxuu(a,c); b = minuu(b,c); }
    if (odd(q) && (a > 2 || c != 2))
    { /* only *odd* primes. If a <= c = 2, then p = 2 must be included :-( */
      if (!odd(c)) c += q;
      q <<= 1;
    }
  }
  T->q = q;
  T->c = c;
  T->strategy = PRST_none; /* unknown */
  T->psieve = psieve; /* unused for now */
  T->isieve = NULL; /* unused for now */
  T->b = b;
  if (maxp >= b) { /* [a,b] \subset prime table */
    u_forprime_set_prime_table(T, a);
    return 1;
  }
  /* b > maxp */
  if (a >= maxp)
  {
    T->p = a - 1;
    if (T->q > 1) arith_set(T);
  }
  else
    u_forprime_set_prime_table(T, a);

  maxp2 = (maxp & HIGHMASK)? 0 : maxp*maxp;
  /* FIXME: should sieve as well if q != 1, adapt sieve code */
  if (q != 1 || (maxp2 && maxp2 <= a)
             || T->b - maxuu(a,maxp) < maxp / expu(b))
  { if (T->strategy == PRST_none) T->strategy = PRST_unextprime; }
  else
  { /* worth sieving */
#ifdef LONG_IS_64BIT
    const ulong UPRIME_MAX = 18446744073709551557UL;
#else
    const ulong UPRIME_MAX = 4294967291UL;
#endif
    ulong sieveb;
    if (b > UPRIME_MAX) b = UPRIME_MAX;
    sieveb = b;
    if (maxp2 && maxp2 < b) sieveb = maxp2;
    if (T->strategy == PRST_none) T->strategy = PRST_sieve;
    sieve_init(T, maxuu(maxp+2, a), sieveb);
  }
  return 1;
}

int
u_forprime_arith_init(forprime_t *T, ulong a, ulong b, ulong c, ulong q)
{ return u_forprime_sieve_arith_init(T, NULL, a, b, c, q); }

/* will run through primes in [a,b] */
int
u_forprime_init(forprime_t *T, ulong a, ulong b)
{ return u_forprime_arith_init(T, a,b, 0,1); }
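
/* Editor's sketch (illustrative addition, not part of PARI): the canonical
 * use of the word-sized iterator. u_forprime_init() picks a strategy (prime
 * table, sieve, or unextprime); u_forprime_next() returns successive primes,
 * then 0 when the range is exhausted. */
#if 0
static void
example_sum_primes(ulong a, ulong b)
{
  forprime_t T;
  ulong p, s = 0;
  if (!u_forprime_init(&T, a, b)) return; /* empty range */
  while ((p = u_forprime_next(&T))) s += p;
  pari_printf("sum of primes in [%lu,%lu] = %lu\n", a, b, s);
}
#endif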

/* will run through primes in [a,b] */
static int
u_forprime_sieve_init(forprime_t *T, struct pari_sieve *s, ulong b)
{ return u_forprime_sieve_arith_init(T, s, s->start, b, s->c, s->q); }

/* now only run through primes <= c; assume c <= b above */
void
u_forprime_restrict(forprime_t *T, ulong c) { T->b = c; }

/* b = NULL: loop forever */
int
forprimestep_init(forprime_t *T, GEN a, GEN b, GEN q)
{
  long lb;
  a = gceil(a); if (typ(a) != t_INT) pari_err_TYPE("forprime_init",a);
  if (signe(a) <= 0) a = gen_1;
  if (b && typ(b) != t_INFINITY)
  {
    b = gfloor(b);
    if (typ(b) != t_INT) pari_err_TYPE("forprime_init",b);
    if (signe(b) < 0 || cmpii(a,b) > 0)
    {
      T->strategy = PRST_nextprime; /* paranoia */
      T->bb = T->pp = gen_0; return 0;
    }
    lb = lgefint(b);
    T->bb = b;
  }
  else if (!b || inf_get_sign(b) > 0)
  {
    lb = lgefint(a) + 4;
    T->bb = NULL;
  }
  else /* b == -oo */
  {
    T->strategy = PRST_nextprime; /* paranoia */
    T->bb = T->pp = gen_0; return 0;
  }
  T->pp = cgeti(lb);
  T->c = 0;
  T->q = 1;
  /* a, b are positive integers, a <= b */
  if (q)
  {
    switch(typ(q))
    {
      case t_INT: break;
      case t_INTMOD: a = addii(a, modii(subii(gel(q,2),a), gel(q,1)));
                     q = gel(q,1); break;
      default: pari_err_TYPE("forprimestep_init",q);
    }
    if (signe(q) <= 0) pari_err_TYPE("forprimestep_init (q <= 0)",q);
    if (equali1(q)) q = NULL;
    else
    {
      T->q = itou(q);
      T->c = umodiu(a, T->q);
    }
  }
  if (lgefint(a) == 3) /* lb == 3 implies b != NULL */
    return u_forprime_arith_init(T, uel(a,2), lb == 3? uel(b,2): ULONG_MAX,
                                 T->c, T->q);
  T->strategy = PRST_nextprime;
  affii(subiu(a,T->q), T->pp);
  return 1;
}
int
forprime_init(forprime_t *T, GEN a, GEN b)
{ return forprimestep_init(T,a,b,NULL); }
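
/* Editor's sketch (illustrative addition, not part of PARI): the GEN-level
 * iterator, which keeps working past ULONG_MAX by switching to nextprime()
 * internally. stoi() builds small t_INT arguments; no extra garbage handling
 * is needed since forprime_next() writes into the preallocated T.pp. */
#if 0
static void
example_forprime_GEN(void)
{
  forprime_t T;
  GEN p;
  if (!forprime_init(&T, stoi(2), stoi(100))) return;
  while ((p = forprime_next(&T)))
    pari_printf("%Ps ", p); /* 2 3 5 ... 97 */
}
#endif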

/* assume a <= b <= maxprime()^2, a,b odd, sieve[n] corresponds to
 * a+16*n, a+16*n+2, ..., a+16*n+14 (bits 0 to 7)
 * maxpos = index of the last sieve cell.
 * b-a+2 must be divisible by 16 for use by u_forprime_next */
static void
sieve_block(ulong a, ulong b, ulong maxpos, unsigned char* sieve)
{
  ulong p = 2, lim = usqrt(b), sz = (b-a) >> 1;
  byteptr d = diffptr+1;
  (void)memset(sieve, 0, maxpos+1);
  for (;;)
  { /* p is odd */
    ulong k, r;
    NEXT_PRIME_VIADIFF(p, d); /* starts at p = 3 */
    if (p > lim) break;

    /* solve a + 2k = 0 (mod p) */
    r = a % p;
    if (r == 0)
      k = 0;
    else
    {
      k = p - r;
      if (odd(k)) k += p;
      k >>= 1;
    }
    /* m = a + 2k is the smallest odd m >= a, p | m */
    /* position n (corresponds to a+2n) is sieve[n>>3], bit n&7 */
    while (k <= sz) { sieve[k>>3] |= 1 << (k&7); k += p; /* 2k += 2p */ }
  }
}
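
/* Editor's sketch (illustrative addition, not part of PARI): decoding the
 * bit layout documented above. An odd m in [a,b] sits at position
 * n = (m-a)/2, i.e. byte n>>3, bit n&7; a clear bit means m survived the
 * sieve, hence is prime since b <= maxprime()^2. */
#if 0
static int
example_is_prime_in_block(ulong m, ulong a, const unsigned char *sieve)
{
  ulong n = (m - a) >> 1;               /* position of odd m relative to a */
  return !(sieve[n>>3] & (1 << (n&7))); /* clear bit <=> prime */
}
#endif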

static void
pari_sieve_init(struct pari_sieve *s, ulong a, ulong b)
{
  ulong maxpos = (b - a) >> 4;
  s->start = a; s->end = b;
  s->sieve = (unsigned char*) pari_malloc(maxpos+1);
  s->c = 0; s->q = 1;
  sieve_block(a, b, maxpos, s->sieve);
  s->maxpos = maxpos; /* must be last in case of SIGINT */
}

static struct pari_sieve pari_sieve_modular;

#ifdef LONG_IS_64BIT
#define PARI_MODULAR_BASE ((1UL<<((BITS_IN_LONG-2)>>1))+1)
#else
#define PARI_MODULAR_BASE ((1UL<<(BITS_IN_LONG-1))+1)
#endif

void
pari_init_primes(ulong maxprime)
{
  ulong a = PARI_MODULAR_BASE, b = a + (1UL<<20)-2;
  initprimetable(maxprime);
  pari_sieve_init(&pari_sieve_modular, a, b);
}

void
pari_close_primes(void)
{
  pari_free(diffptr);
  pari_free(pari_sieve_modular.sieve);
}

void
init_modular_small(forprime_t *S)
{
#ifdef LONG_IS_64BIT
  u_forprime_sieve_init(S, &pari_sieve_modular, ULONG_MAX);
#else
  ulong a = (1UL<<((BITS_IN_LONG-2)>>1))+1;
  u_forprime_init(S, a, ULONG_MAX);
#endif
}

void
init_modular_big(forprime_t *S)
{
#ifdef LONG_IS_64BIT
  u_forprime_init(S, HIGHBIT + 1, ULONG_MAX);
#else
  u_forprime_sieve_init(S, &pari_sieve_modular, ULONG_MAX);
#endif
}

/* T->cache is a 0-terminated list of primes; return the first one and
 * remove it from the list. Most of the time the list contains a single
 * prime */
static ulong
shift_cache(forprime_t *T)
{
  long i;
  T->p = T->cache[0];
  for (i = 1;; i++) /* remove one prime from cache */
    if (! (T->cache[i-1] = T->cache[i]) ) break;
  return T->p;
}

ulong
u_forprime_next(forprime_t *T)
{
  if (T->strategy == PRST_diffptr)
  {
    for(;;)
    {
      if (!*(T->d))
      {
        T->strategy = T->isieve? PRST_sieve: PRST_unextprime;
        if (T->q != 1) { arith_set(T); if (!T->p) return 0; }
        /* T->p possibly not a prime if q != 1 */
        break;
      }
      else
      {
        NEXT_PRIME_VIADIFF(T->p, T->d);
        if (T->p > T->b) return 0;
        if (T->q == 1 || T->p % T->q == T->c) return T->p;
      }
    }
  }
  if (T->strategy == PRST_sieve)
  {
    ulong n;
    if (T->cache[0]) return shift_cache(T);
NEXT_CHUNK:
    if (T->psieve)
    {
      T->sieve = T->psieve->sieve;
      T->end = T->psieve->end;
      if (T->end > T->sieveb) T->end = T->sieveb;
      T->maxpos = T->psieve->maxpos;
      T->pos = 0;
      T->psieve = NULL;
    }
    for (n = T->pos; n < T->maxpos; n++)
      if (T->sieve[n] != 0xFF)
      {
        unsigned char mask = T->sieve[n];
        ulong p = T->a + (n<<4);
        long i = 0;
        T->pos = n;
        if (!(mask &  1)) T->cache[i++] = p;
        if (!(mask &  2)) T->cache[i++] = p+2;
        if (!(mask &  4)) T->cache[i++] = p+4;
        if (!(mask &  8)) T->cache[i++] = p+6;
        if (!(mask & 16)) T->cache[i++] = p+8;
        if (!(mask & 32)) T->cache[i++] = p+10;
        if (!(mask & 64)) T->cache[i++] = p+12;
        if (!(mask &128)) T->cache[i++] = p+14;
        T->cache[i] = 0;
        T->pos = n+1;
        return shift_cache(T);
      }
    /* n = T->maxpos, last cell: check p <= b */
    if (T->maxpos && n == T->maxpos && T->sieve[n] != 0xFF)
    {
      unsigned char mask = T->sieve[n];
      ulong p = T->a + (n<<4);
      long i = 0;
      T->pos = n;
      if (!(mask &  1) && p <= T->sieveb)    T->cache[i++] = p;
      if (!(mask &  2) && p <= T->sieveb-2)  T->cache[i++] = p+2;
      if (!(mask &  4) && p <= T->sieveb-4)  T->cache[i++] = p+4;
      if (!(mask &  8) && p <= T->sieveb-6)  T->cache[i++] = p+6;
      if (!(mask & 16) && p <= T->sieveb-8)  T->cache[i++] = p+8;
      if (!(mask & 32) && p <= T->sieveb-10) T->cache[i++] = p+10;
      if (!(mask & 64) && p <= T->sieveb-12) T->cache[i++] = p+12;
      if (!(mask &128) && p <= T->sieveb-14) T->cache[i++] = p+14;
      if (i)
      {
        T->cache[i] = 0;
        T->pos = n+1;
        return shift_cache(T);
      }
    }

    if (T->maxpos && T->end >= T->sieveb) /* done with sieves ? */
    {
      if (T->sieveb == T->b && T->b != ULONG_MAX) return 0;
      T->strategy = PRST_unextprime;
    }
    else
    { /* initialize next chunk */
      T->sieve = T->isieve;
      if (T->maxpos == 0)
        T->a |= 1; /* first time; ensure odd */
      else
        T->a = (T->end + 2) | 1;
      T->end = T->a + T->chunk; /* may overflow */
      if (T->end < T->a || T->end > T->sieveb) T->end = T->sieveb;
      /* end and a are odd; sieve[k] corresponds to a + 16*k + (0,2,...,14).
       * The largest k is (end-a) >> 4 */
      T->pos = 0;
      T->maxpos = (T->end - T->a) >> 4;
      sieve_block(T->a, T->end, T->maxpos, T->sieve);
      goto NEXT_CHUNK;
    }
  }
  if (T->strategy == PRST_unextprime)
  {
    if (T->q == 1)
    {
#ifdef LONG_IS_64BIT
      switch(T->p)
      {
#define retp(x) return T->p = (HIGHBIT+x <= T->b)? HIGHBIT+x: 0
        case HIGHBIT:      retp(29);
        case HIGHBIT + 29: retp(99);
        case HIGHBIT + 99: retp(123);
        case HIGHBIT +123: retp(131);
        case HIGHBIT +131: retp(155);
        case HIGHBIT +155: retp(255);
        case HIGHBIT +255: retp(269);
        case HIGHBIT +269: retp(359);
        case HIGHBIT +359: retp(435);
        case HIGHBIT +435: retp(449);
        case HIGHBIT +449: retp(453);
        case HIGHBIT +453: retp(485);
        case HIGHBIT +485: retp(491);
        case HIGHBIT +491: retp(543);
        case HIGHBIT +543: retp(585);
        case HIGHBIT +585: retp(599);
        case HIGHBIT +599: retp(753);
        case HIGHBIT +753: retp(849);
        case HIGHBIT +849: retp(879);
        case HIGHBIT +879: retp(885);
        case HIGHBIT +885: retp(903);
        case HIGHBIT +903: retp(995);
#undef retp
      }
#endif
      T->p = unextprime(T->p + 1);
    }
    else do {
      T->p += T->q;
      if (T->p < T->q || T->p > T->b) { T->p = 0; break; } /* overflow */
    } while (!uisprime(T->p));
    if (T->p && T->p <= T->b) return T->p;
    /* overflowed ulong, switch to GEN */
    T->strategy = PRST_nextprime;
  }
  return 0; /* overflow */
}

GEN
forprime_next(forprime_t *T)
{
  pari_sp av;
  GEN p;
  if (T->strategy != PRST_nextprime)
  {
    ulong u = u_forprime_next(T);
    if (u) { affui(u, T->pp); return T->pp; }
    /* failure */
    if (T->strategy != PRST_nextprime) return NULL; /* we're done */
    /* overflowed ulong, switch to GEN */
    u = ULONG_MAX;
    if (T->q > 1) u -= (ULONG_MAX-T->c) % T->q;
    affui(u, T->pp);
  }
  av = avma; p = T->pp;
  if (T->q == 1)
  {
    p = nextprime(addiu(p, 1));
    if (T->bb && abscmpii(p, T->bb) > 0) return gc_NULL(av);
  } else do {
    p = addiu(p, T->q);
    if (T->bb && abscmpii(p, T->bb) > 0) return gc_NULL(av);
  } while (!BPSW_psp(p));
  affii(p, T->pp); return gc_const(av, T->pp);
}

void
forprimestep(GEN a, GEN b, GEN q, GEN code)
{
  pari_sp av = avma;
  forprime_t T;

  if (!forprimestep_init(&T, a,b,q)) { set_avma(av); return; }

  push_lex(T.pp,code);
  while (forprime_next(&T))
  {
    closure_evalvoid(code); if (loop_break()) break;
    /* p changed in 'code', complain */
    if (get_lex(-1) != T.pp)
      pari_err(e_MISC,"prime index read-only: was changed to %Ps", get_lex(-1));
  }
  pop_lex(1); set_avma(av);
}
void
forprime(GEN a, GEN b, GEN code) { return forprimestep(a,b,NULL,code); }

int
forcomposite_init(forcomposite_t *C, GEN a, GEN b)
{
  pari_sp av = avma;
  a = gceil(a);
  if (typ(a)!=t_INT) pari_err_TYPE("forcomposite",a);
  if (b) {
    if (typ(b) == t_INFINITY) b = NULL;
    else
    {
      b = gfloor(b);
      if (typ(b)!=t_INT) pari_err_TYPE("forcomposite",b);
    }
  }
  if (signe(a) < 0) pari_err_DOMAIN("forcomposite", "a", "<", gen_0, a);
  if (abscmpiu(a, 4) < 0) a = utoipos(4);
  C->first = 1;
  if (!forprime_init(&C->T, a,b) && cmpii(a,b) > 0)
  {
    C->n = gen_1; /* in case the caller forgets to check the return value */
    C->b = gen_0; return gc_bool(av,0);
  }
  C->n = setloop(a);
  C->b = b;
  C->p = NULL; return 1;
}

GEN
forcomposite_next(forcomposite_t *C)
{
  if (C->first) /* first call ever */
  {
    C->first = 0;
    C->p = forprime_next(&C->T);
  }
  else
    C->n = incloop(C->n);
  if (C->p)
  {
    if (cmpii(C->n, C->p) < 0) return C->n;
    C->n = incloop(C->n);
    /* n = p+1 */
    C->p = forprime_next(&C->T); /* nextprime(p) > n */
    if (C->p) return C->n;
  }
  if (!C->b || cmpii(C->n, C->b) <= 0) return C->n;
  return NULL;
}

void
forcomposite(GEN a, GEN b, GEN code)
{
  pari_sp av = avma;
  forcomposite_t T;
  GEN n;
  if (!forcomposite_init(&T,a,b)) return;
  push_lex(T.n,code);
  while ((n = forcomposite_next(&T)))
  {
    closure_evalvoid(code); if (loop_break()) break;
    /* n changed in 'code', complain */
    if (get_lex(-1) != n)
      pari_err(e_MISC,"index read-only: was changed to %Ps", get_lex(-1));
  }
  pop_lex(1); set_avma(av);
}
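
/* Editor's sketch (illustrative addition, not part of PARI): the composite
 * iterator mirrors the prime one; forcomposite_next() returns NULL once the
 * range is exhausted. stoi() builds the t_INT bounds. */
#if 0
static void
example_forcomposite(void)
{
  forcomposite_t C;
  GEN n;
  if (!forcomposite_init(&C, stoi(4), stoi(20))) return;
  while ((n = forcomposite_next(&C)))
    pari_printf("%Ps ", n); /* 4 6 8 9 10 ... 20 */
}
#endif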