
Testing latest pari + WASM + node.js... and it works?! Wow.

License: GPL3
/* Copyright (C) 2016  The PARI group.

This file is part of the PARI/GP package.

PARI/GP is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version. It is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY WHATSOEVER.

Check the License for details. You should have received a copy of it, along
with the package; see the file 'COPYING'. If not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */

#include "pari.h"
#include "paripriv.h"

/**********************************************************************/
/***                                                                ***/
/***                     Public prime table                         ***/
/***                                                                ***/
/**********************************************************************/

static ulong _maxprime = 0;
static ulong diffptrlen;

/* Building/Rebuilding the diffptr table. The actual work is done by the
 * following two subroutines; the user entry point is the function
 * initprimes() below. initprimes1() is the old algorithm, called when
 * maxnum (size) is moderate. Must be called after pari_init_stack() */
static void
initprimes1(ulong size, long *lenp, ulong *lastp, byteptr p1)
{
  pari_sp av = avma;
  long k;
  byteptr q, r, s, p = (byteptr)stack_calloc(size+2), fin = p + size;

  for (r=q=p,k=1; r<=fin; )
  {
    do { r+=k; k+=2; r+=k; } while (*++q);
    for (s=r; s<=fin; s+=k) *s = 1;
  }
  r = p1; *r++ = 2; *r++ = 1; /* 2 and 3 */
  for (s=q=p+1; ; s=q)
  {
    do q++; while (*q);
    if (q > fin) break;
    *r++ = (unsigned char) ((q-s) << 1);
  }
  *r++ = 0;
  *lenp = r - p1;
  *lastp = ((s - p) << 1) + 1;
  set_avma(av);
}
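
/* Editor's sketch (illustrative addition, not part of PARI): how to read the
 * difference table initprimes1() fills. The table starts with the value 2
 * (the first prime), then one byte per gap between consecutive primes,
 * 0-terminated; NEXT_PRIME_VIADIFF (paripriv.h) advances a prime along
 * these gaps. Assumes diffptr has been built by initprimetable(). */
#if 0
static void
example_walk_diffptr(void)
{
  byteptr d = diffptr; /* global gap table */
  ulong p = 0;
  while (*d && p < 30)
  { /* p runs through 2, 3, 5, 7, ... */
    NEXT_PRIME_VIADIFF(p, d);
    pari_printf("%lu ", p);
  }
}
#endif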

/* Timing in ms (Athlon/850; reports 512K of secondary cache; looks
   like there is 64K of quicker cache too).

     arena |   30m     100m    300m    1000m   2000m  <-- primelimit
     =================================================
     16K      1.1053  1.1407  1.2589  1.4368  1.6086
     24K      1.0000  1.0625  1.1320  1.2443  1.3095
     32K      1.0000  1.0469  1.0761  1.1336  1.1776
     48K      1.0000  1.0000  1.0254  1.0445  1.0546
     50K      1.0000  1.0000  1.0152  1.0345  1.0464
     52K      1.0000  1.0000  1.0203  1.0273  1.0362
     54K      1.0000  1.0000  1.0812  1.0216  1.0281
     56K      1.0526  1.0000  1.0051  1.0144  1.0205
     58K      1.0000  1.0000  1.0000  1.0086  1.0123
     60K      0.9473  0.9844  1.0051  1.0014  1.0055
     62K      1.0000  0.9844  0.9949  0.9971  0.9993
     64K      1.0000  1.0000  1.0000  1.0000  1.0000
     66K      1.2632  1.2187  1.2183  1.2055  1.1953
     68K      1.4211  1.4844  1.4721  1.4425  1.4188
     70K      1.7368  1.7188  1.7107  1.6767  1.6421
     72K      1.9474  1.9531  1.9594  1.9023  1.8573
     74K      2.2105  2.1875  2.1827  2.1207  2.0650
     76K      2.4211  2.4219  2.4010  2.3305  2.2644
     78K      2.5789  2.6250  2.6091  2.5330  2.4571
     80K      2.8421  2.8125  2.8223  2.7213  2.6380
     84K      3.1053  3.1875  3.1776  3.0819  2.9802
     88K      3.5263  3.5312  3.5228  3.4124  3.2992
     92K      3.7895  3.8438  3.8375  3.7213  3.5971
     96K      4.0000  4.1093  4.1218  3.9986  3.9659
     112K     4.3684  4.5781  4.5787  4.4583  4.6115
     128K     4.7368  4.8750  4.9188  4.8075  4.8997
     192K     5.5263  5.7188  5.8020  5.6911  5.7064
     256K     6.0000  6.2187  6.3045  6.1954  6.1033
     384K     6.7368  6.9531  7.0405  6.9181  6.7912
     512K     7.3158  7.5156  7.6294  7.5000  7.4654
     768K     9.1579  9.4531  9.6395  9.5014  9.1075
     1024K   10.368  10.7497 10.9999 10.878  10.8201
     1536K   12.579  13.3124 13.7660 13.747  13.4739
     2048K   13.737  14.4839 15.0509 15.151  15.1282
     3076K   14.789  15.5780 16.2993 16.513  16.3365

   Now the same numbers relative to the model

   (1 + 0.36*sqrt(primelimit)/arena) * (arena <= 64 ? 1.05 : (arena-64)**0.38)

   [SLOW2_IN_ROOTS = 0.36, ALPHA = 0.38]

     arena |   30m     100m    300m    1000m   2000m  <-- primelimit
     =================================================
     16K      1.014   0.9835  0.9942  0.9889  1.004
     24K      0.9526  0.9758  0.9861  0.9942  0.981
     32K      0.971   0.9939  0.9884  0.9849  0.9806
     48K      0.9902  0.9825  0.996   0.9945  0.9885
     50K      0.9917  0.9853  0.9906  0.9926  0.9907
     52K      0.9932  0.9878  0.9999  0.9928  0.9903
     54K      0.9945  0.9902  1.064   0.9939  0.9913
     56K      1.048   0.9924  0.9925  0.993   0.9921
     58K      0.9969  0.9945  0.9909  0.9932  0.9918
     60K      0.9455  0.9809  0.9992  0.9915  0.9923
     62K      0.9991  0.9827  0.9921  0.9924  0.9929
     64K      1       1       1       1       1
     66K      1.02    0.9849  0.9857  0.9772  0.9704
     68K      0.8827  0.9232  0.9176  0.9025  0.8903
     70K      0.9255  0.9177  0.9162  0.9029  0.8881
     72K      0.9309  0.936   0.9429  0.9219  0.9052
     74K      0.9715  0.9644  0.967   0.9477  0.9292
     76K      0.9935  0.9975  0.9946  0.9751  0.9552
     78K      0.9987  1.021   1.021   1.003   0.9819
     80K      1.047   1.041   1.052   1.027   1.006
     84K      1.052   1.086   1.092   1.075   1.053
     88K      1.116   1.125   1.133   1.117   1.096
     92K      1.132   1.156   1.167   1.155   1.134
     96K      1.137   1.177   1.195   1.185   1.196
     112K     1.067   1.13    1.148   1.15    1.217
     128K     1.04    1.083   1.113   1.124   1.178
     192K     0.9368  0.985   1.025   1.051   1.095
     256K     0.8741  0.9224  0.9619  0.995   1.024
     384K     0.8103  0.8533  0.8917  0.9282  0.9568
     512K     0.7753  0.8135  0.8537  0.892   0.935
     768K     0.8184  0.8638  0.9121  0.9586  0.9705
     1024K    0.8241  0.8741  0.927   0.979   1.03
     1536K    0.8505  0.9212  0.9882  1.056   1.096
     2048K    0.8294  0.8954  0.9655  1.041   1.102

*/

#ifndef SLOW2_IN_ROOTS
/* SLOW2_IN_ROOTS below 3: some slowdown starts to be noticeable
 * when things fit into the cache on Sparc.
 * The choice of 2.6 gives a slowdown of 1-2% on UltraSparcII,
 * but makes calculations for "maximum" of 436273009
 * fit into 256K cache (still common for some architectures).
 *
 * One may change it when small caches become uncommon, but the gain
 * is not going to be very noticeable... */
# ifdef i386 /* gcc defines this? */
#  define SLOW2_IN_ROOTS 0.36
# else
#  define SLOW2_IN_ROOTS 2.6
# endif
#endif
#ifndef CACHE_ARENA
# ifdef i386 /* gcc defines this? */
   /* Due to smaller SLOW2_IN_ROOTS, a smaller arena is OK; fit L1 cache */
#  define CACHE_ARENA (63 * 1024UL)  /* No slowdown even with 64K L1 cache */
# else
#  define CACHE_ARENA (200 * 1024UL) /* No slowdown even with 256K L2 cache */
# endif
#endif

#define CACHE_ALPHA  (0.38)  /* Cache performance model parameter */
#define CACHE_CUTOFF (0.018) /* Cache performance is not smooth here */

static double slow2_in_roots = SLOW2_IN_ROOTS;

typedef struct {
  ulong arena;
  double power;
  double cutoff;
} cache_model_t;

static cache_model_t cache_model = { CACHE_ARENA, CACHE_ALPHA, CACHE_CUTOFF };

/* Assume that some calculation requires a chunk of memory to be
   accessed often in a more or less random fashion (as in sieving).
   Assume that the calculation can be done in steps by subdividing the
   chunk into smaller subchunks (arenas) and treating them
   separately. Assume that the overhead of subdivision is equivalent
   to the number of arenas.

   Find an optimal size of the arena taking into account the overhead
   of subdivision, and the overhead of the arena not fitting into the
   cache. Assume that an arena of size slow2_in_roots slows down the
   calculation 2x (compared to very big arenas, when cache hits do
   not matter). Since cache performance varies wildly with
   architecture, load, and weather (especially with cache coloring
   enabled), use an idealized cache model based on the benchmarks above.

   Assume that an independent region of FIXED_TO_CACHE bytes is accessed
   very often concurrently with the arena access.
 */
static ulong
good_arena_size(ulong slow2_size, ulong total, ulong fixed_to_cache,
                cache_model_t *cache_model)
{
  ulong asize, cache_arena = cache_model->arena;
  double Xmin, Xmax, A, B, C1, C2, D, V;
  double alpha = cache_model->power, cut_off = cache_model->cutoff;

  /* Estimated relative slowdown,
     with overhead = max((fixed_to_cache+arena)/cache_arena - 1, 0):

     1 + slow2_size/arena due to initialization overhead;

     max(1, 4.63 * overhead^0.38) due to footprint > cache size.

     [The latter is hard to substantiate theoretically, but this
     function describes the benchmarks pretty closely; it does not hurt that
     one can minimize it explicitly too ;-). The switch between
     different choices of max() happens when overhead=0.018.]

     Thus the problem is minimizing (1 + slow2_size/arena)*overhead**0.38.
     This boils down to F=((X+A)/(X+B))X^alpha, X=overhead,
     B = (1 - fixed_to_cache/cache_arena), A = B + slow2_size/cache_arena,
     alpha = 0.38, and X>=0.018, X>-B.

     We need to find the rightmost root of (X+A)*(X+B) - (A-B)X/alpha to the
     right of 0.018 (if such exists and is below Xmax). Then we manually
     check the remaining region [0, 0.018].

     Since we cannot trust the purely-experimental cache-hit slowdown
     function, as a sanity check always prefer fitting into the
     cache (or "almost fitting") if the F-law predicts that the larger
     value of the arena provides less than 10% speedup.
   */

  /* The simplest case: we fit into cache */
  asize = cache_arena - fixed_to_cache;
  if (total <= asize) return total;
  /* The simple case: fitting into cache doesn't slow us down more than 10% */
  if (asize > 10 * slow2_size) return asize;
  /* Slowdown of not fitting into cache is significant. Try to optimize.
     Do not be afraid to spend some time on optimization - in trivial
     cases we do not reach this point; any gain we get should
     compensate the time spent on optimization. */

  B = (1 - ((double)fixed_to_cache)/cache_arena);
  A = B + ((double)slow2_size)/cache_arena;
  C2 = A*B;
  C1 = (A + B - 1/alpha*(A - B))/2;
  D = C1*C1 - C2;
  if (D > 0)
    V = cut_off*cut_off + 2*C1*cut_off + C2; /* Value at CUT_OFF */
  else
    V = 0; /* Pacify the warning */
  Xmin = cut_off;
  Xmax = ((double)total - fixed_to_cache)/cache_arena; /* Two candidates */

  if ( D <= 0 || (V >= 0 && C1 + cut_off >= 0) ) /* slowdown increasing */
    Xmax = cut_off; /* Only one candidate */
  else if (V >= 0 && /* slowdown concave down */
           ((Xmax + C1) <= 0 || (Xmax*Xmax + 2*C1*Xmax + C2) <= 0))
    /* DO NOTHING */; /* Keep both candidates */
  else if (V <= 0 && (Xmax*Xmax + 2*C1*Xmax + C2) <= 0) /* slowdown decreasing */
    Xmin = cut_off; /* Only one candidate */
  else /* Now we know: 2 roots, the largest is in CUT_OFF..Xmax */
    Xmax = sqrt(D) - C1;
  if (Xmax != Xmin) { /* Xmin == CUT_OFF; check which one is better */
    double v1 = (cut_off + A)/(cut_off + B);
    double v2 = 2.33 * (Xmax + A)/(Xmax + B) * pow(Xmax, alpha);

    if (1.1 * v2 >= v1) /* Prefer fitting into the cache if slowdown < 10% */
      V = v1;
    else
    { Xmin = Xmax; V = v2; }
  } else if (B > 0) /* We need V */
    V = 2.33 * (Xmin + A)/(Xmin + B) * pow(Xmin, alpha);
  if (B > 0 && 1.1 * V > A/B) /* Now Xmin is the minimum. Compare with 0 */
    Xmin = 0;

  asize = (ulong)((1 + Xmin)*cache_arena - fixed_to_cache);
  if (asize > total) asize = total; /* May happen due to approximations */
  return asize;
}
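
/* Editor's sketch (illustrative addition, not part of PARI): the cache model
 * that good_arena_size() minimizes, written out directly as
 * F(X) = ((X+A)/(X+B)) * X^alpha with X = overhead, following the comment
 * above. Useful for cross-checking the root selection by brute force; names
 * and signature are illustrative. */
#if 0
static double
example_cache_slowdown(double X, double slow2_size, double fixed_to_cache,
                       double cache_arena, double alpha)
{
  double B = 1 - fixed_to_cache/cache_arena;
  double A = B + slow2_size/cache_arena;
  return (X + A)/(X + B) * pow(X, alpha); /* relative slowdown at overhead X */
}
#endif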

/* Use as in
    install(set_optimize,lLDG)          \\ Through some M too?
    set_optimize(2,1) \\ disable dependence on limit
    \\ 1: how much cache usable, 2: slowdown of setup, 3: alpha, 4: cutoff
    \\ 2,3,4 are in units of 0.001

    { time_primes_arena(ar,limit) =     \\ ar = arena size in K
        set_optimize(1,floor(ar*1024));
        default(primelimit, 200 000);   \\ 100000 results in *larger* malloc()!
        gettime;
        default(primelimit, floor(limit));
        if(ar >= 1, ar=floor(ar));
        print("arena "ar"K => "gettime"ms");
    }
*/
long
set_optimize(long what, GEN g)
{
  long ret = 0;

  switch (what) {
  case 1:
    ret = (long)cache_model.arena;
    break;
  case 2:
    ret = (long)(slow2_in_roots * 1000);
    break;
  case 3:
    ret = (long)(cache_model.power * 1000);
    break;
  case 4:
    ret = (long)(cache_model.cutoff * 1000);
    break;
  default:
    pari_err_BUG("set_optimize");
    break;
  }
  if (g != NULL) {
    ulong val = itou(g);

    switch (what) {
    case 1: cache_model.arena = val; break;
    case 2: slow2_in_roots = (double)val / 1000.; break;
    case 3: cache_model.power = (double)val / 1000.; break;
    case 4: cache_model.cutoff = (double)val / 1000.; break;
    }
  }
  return ret;
}
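
/* Editor's sketch (illustrative addition, not part of PARI): calling
 * set_optimize() from C. Slot 1 is the usable cache size in bytes; g = NULL
 * only reads the current value, a t_INT argument also sets it (utoi() builds
 * a t_INT from a ulong). */
#if 0
static void
example_set_optimize(void)
{
  long arena = set_optimize(1, NULL);  /* read the current cache arena */
  set_optimize(1, utoi(96 * 1024UL));  /* pretend we have a 96K cache */
  set_optimize(1, utoi((ulong)arena)); /* restore the old value */
}
#endif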

/* s is odd; prime differences (starting from 5-3=2) start at known_primes[2],
   terminated by a 0 byte. Checks n odd numbers starting at 'start', setting
   bytes starting at data to 0 (prime) or 1 (composite) */
static void
sieve_chunk(byteptr known_primes, ulong s, byteptr data, ulong n)
{
  ulong p, cnt = n-1, start = s, delta = 1;
  byteptr q;

  memset(data, 0, n);
  start >>= 1; /* (start - 1)/2 */
  start += n; /* Corresponds to the end */
  /* data corresponds to start, q runs over primediffs */
  for (q = known_primes + 1, p = 3; delta; delta = *++q, p += delta)
  { /* first odd number >= start > p and divisible by p
       = last odd number <= start + 2p - 2 and 0 (mod p)
       = p + last number <= start + p - 2 and 0 (mod 2p)
       = p + start+p-2 - (start+p-2) % 2p
       = start + 2(p - 1 - ((start-1)/2 + (p-1)/2) % p). */
    long off = cnt - ((start+(p>>1)) % p);
    while (off >= 0) { data[off] = 1; off -= p; }
  }
}
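
/* Editor's sketch (illustrative addition, not part of PARI): how the output
 * of sieve_chunk() maps back to integers. After sieve_chunk(p1, s, data, n)
 * with s odd, data[j] describes the odd number s + 2*j (j = 0..n-1); a 0
 * byte means no prime from the table divides it. This reading of the code
 * is the editor's. */
#if 0
static void
example_print_chunk_survivors(byteptr p1, ulong s, ulong n)
{
  byteptr data = (byteptr)pari_malloc(n);
  ulong j;
  sieve_chunk(p1, s, data, n);
  for (j = 0; j < n; j++)
    if (!data[j]) pari_printf("%lu ", s + 2*j); /* unmarked by table primes */
  pari_free(data);
}
#endif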

/* assume maxnum <= 436273289 < 2^29 */
static void
initprimes0(ulong maxnum, long *lenp, ulong *lastp, byteptr p1)
{
  pari_sp av = avma, bot = pari_mainstack->bot;
  long alloced, psize;
  byteptr q, end, p, end1, plast, curdiff;
  ulong last, remains, curlow, rootnum, asize;
  ulong prime_above;
  byteptr p_prime_above;

  maxnum |= 1; /* make it odd. */
  /* base case */
  if (maxnum < 1ul<<17) { initprimes1(maxnum>>1, lenp, lastp, p1); return; }

  /* Checked to be enough up to 40e6, attained at 155893 */
  rootnum = usqrt(maxnum) | 1;
  initprimes1(rootnum>>1, &psize, &last, p1);
  end1 = p1 + psize - 1;
  remains = (maxnum - last) >> 1; /* number of odd numbers to check */

  /* we access the primes array of psize too; but we access it consecutively,
   * thus we do not include it in fixed_to_cache */
  asize = good_arena_size((ulong)(rootnum * slow2_in_roots), remains+1, 0,
                          &cache_model) - 1;
  /* enough room on the stack ? */
  alloced = (((byteptr)avma) <= ((byteptr)bot) + asize);
  if (alloced)
    p = (byteptr)pari_malloc(asize+1);
  else
    p = (byteptr)stack_malloc(asize+1);
  end = p + asize; /* the 0 sentinel goes at end. */
  curlow = last + 2; /* First candidate: we know primes up to last (odd). */
  curdiff = end1;

  /* During each iteration p..end-1 represents a range of odd
     numbers. plast is a pointer which represents the last prime seen;
     it may point before p..end-1. */
  plast = p - 1;
  p_prime_above = p1 + 2;
  prime_above = 3;
  while (remains)
  { /* cycle over arenas; performance not crucial */
    unsigned char was_delta;
    if (asize > remains) { asize = remains; end = p + asize; }
    /* Fake the upper limit appropriate for the given arena */
    while (prime_above*prime_above <= curlow + (asize << 1) && *p_prime_above)
      prime_above += *p_prime_above++;
    was_delta = *p_prime_above;
    *p_prime_above = 0; /* sentinel for sieve_chunk */
    sieve_chunk(p1, curlow, p, asize);
    *p_prime_above = was_delta; /* restore */

    p[asize] = 0; /* sentinel */
    for (q = p; ; plast = q++)
    { /* q runs over addresses corresponding to primes */
      while (*q) q++; /* use sentinel at end */
      if (q >= end) break;
      *curdiff++ = (unsigned char)(q-plast) << 1; /* < 255 for q < 436273291 */
    }
    plast -= asize;
    remains -= asize;
    curlow += (asize<<1);
  }
  last = curlow - ((p - plast) << 1);
  *curdiff++ = 0; /* sentinel */
  *lenp = curdiff - p1;
  *lastp = last;
  if (alloced) pari_free(p); else set_avma(av);
}

ulong
maxprime(void) { return diffptr ? _maxprime : 0; }
ulong
maxprimeN(void) { return diffptr ? diffptrlen-1 : 0; }

void
maxprime_check(ulong c) { if (_maxprime < c) pari_err_MAXPRIME(c); }

/* We ensure 65302 <= maxnum <= 436273289: the LHS ensures modular functions
 * have enough fast primes to work, the RHS ensures that p_{n+1} - p_n < 255
 * (N.B. an even RHS would be incorrect since initprimes0 would make it odd,
 * thereby increasing it by 1) */
byteptr
initprimes(ulong maxnum, long *lenp, ulong *lastp)
{
  byteptr t;
  if (maxnum < 65537)
    maxnum = 65537;
  else if (maxnum > 436273289)
    maxnum = 436273289;
  t = (byteptr)pari_malloc((size_t) (1.09 * maxnum/log((double)maxnum)) + 146);
  initprimes0(maxnum, lenp, lastp, t);
  return (byteptr)pari_realloc(t, *lenp);
}
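
/* Editor's note (illustrative addition, not part of PARI): the allocation
 * above is one gap byte per prime, so it needs about pi(maxnum) bytes; by
 * the Prime Number Theorem pi(x) ~ x/log(x), and the 1.09 factor plus 146
 * bytes of slack cover the range allowed here, with pari_realloc() trimming
 * the excess. A quick empirical check: */
#if 0
static void
example_check_table_size(ulong maxnum)
{
  size_t alloc = (size_t)(1.09 * maxnum/log((double)maxnum)) + 146;
  long len; ulong last;
  byteptr t = initprimes(maxnum, &len, &last);
  pari_printf("allocated %lu, used %ld bytes\n", (ulong)alloc, len);
  pari_free(t);
}
#endif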

void
initprimetable(ulong maxnum)
{
  long len;
  ulong last;
  byteptr p = initprimes(maxnum, &len, &last), old = diffptr;
  diffptrlen = minss(diffptrlen, len);
  _maxprime = minss(_maxprime, last); /* protect against ^C */
  diffptr = p; diffptrlen = len; _maxprime = last;
  if (old) free(old);
}

/* all init_primepointer_xx routines set *pd to the corresponding place
 * in the prime table */
/* smallest p >= a */
ulong
init_primepointer_geq(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  return p;
}
/* largest p < a */
ulong
init_primepointer_lt(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  PREC_PRIME_VIADIFF(p, *pd);
  return p;
}
/* largest p <= a */
ulong
init_primepointer_leq(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  if (p != a) PREC_PRIME_VIADIFF(p, *pd);
  return p;
}
/* smallest p > a */
ulong
init_primepointer_gt(ulong a, byteptr *pd)
{
  ulong n, p;
  prime_table_next_p(a, pd, &p, &n);
  if (p == a) NEXT_PRIME_VIADIFF(p, *pd);
  return p;
}
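
/* Editor's sketch (illustrative addition, not part of PARI): stepping through
 * the prime table from a given bound with an init_primepointer_* helper and
 * NEXT_PRIME_VIADIFF. Assumes the table covers the range (see maxprime()). */
#if 0
static void
example_primes_from(ulong a)
{
  byteptr d;
  ulong i, p = init_primepointer_geq(a, &d); /* smallest prime >= a */
  for (i = 0; i < 10 && p; i++)
  {
    pari_printf("%lu ", p);
    if (!*d) break; /* the 0 byte terminates the table */
    NEXT_PRIME_VIADIFF(p, d);
  }
}
#endif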

/**********************************************************************/
/***                                                                ***/
/***                          forprime                              ***/
/***                                                                ***/
/**********************************************************************/

/* return a good chunk size for the sieve, 16 | chunk + 2 */
static ulong
optimize_chunk(ulong a, ulong b)
{
  /* TODO: Optimize size (surely < 512k to stay in L2 cache, but not so large
   * as to force recalculating too often). */
  ulong chunk = 0x80000UL;
  ulong tmp = (b - a) / chunk + 1;

  if (tmp == 1)
    chunk = b - a + 16;
  else
    chunk = (b - a) / tmp + 15;
  /* ensure 16 | chunk + 2 */
  return (((chunk + 2)>>4)<<4) - 2;
}
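
/* Editor's sketch (illustrative addition, not part of PARI): the invariant
 * optimize_chunk() guarantees. Rounding chunk+2 down to a multiple of 16 and
 * subtracting 2 means 16 always divides chunk + 2, which the sieve layout
 * below (8 odd numbers, i.e. 16 integers, per byte) relies on. */
#if 0
#include <assert.h>
static void
example_chunk_invariant(void)
{
  ulong a, b = 1000000UL;
  for (a = 2; a < b; a += 99991)
    assert((optimize_chunk(a, b) + 2) % 16 == 0); /* 16 | chunk + 2 */
}
#endif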

static void
sieve_init(forprime_t *T, ulong a, ulong b)
{
  T->sieveb = b;
  T->chunk = optimize_chunk(a, b);
  /* >> 1 [only odds] + 3 [convert from bits to bytes] */
  T->isieve = (unsigned char*)stack_malloc(((T->chunk+2) >> 4) + 1);
  T->cache[0] = 0;
  T->a = a;
  T->end = minuu(a + T->chunk, b);
  T->pos = T->maxpos = 0;
}

enum {PRST_none, PRST_diffptr, PRST_sieve, PRST_unextprime, PRST_nextprime};

static void
u_forprime_set_prime_table(forprime_t *T, ulong a)
{
  T->strategy = PRST_diffptr;
  if (a < 3)
  {
    T->p = 0;
    T->d = diffptr;
  }
  else
    T->p = init_primepointer_lt(a, &T->d);
}

/* Set p so that p + q is the smallest integer = c (mod q) and > the original
 * p. Assume 0 < c < q. Set p = 0 on overflow */
static void
arith_set(forprime_t *T)
{
  ulong r = T->p % T->q; /* 0 <= r <= min(p, q-1) */
  pari_sp av = avma;
  GEN d = adduu(T->p - r, T->c);
  if (T->c > r) d = subiu(d, T->q);
  /* d = c mod q, d = c > r? p-r+c-q: p-r+c, so that
   * d <= p and d+q = c>r? p-r+c : p-r+c+q > p */
  T->p = itou_or_0(d); set_avma(av); /* d = 0 is impossible */
}

/* run through primes in the arithmetic progression = c (mod q) */
static int
u_forprime_sieve_arith_init(forprime_t *T, struct pari_sieve *psieve,
                            ulong a, ulong b, ulong c, ulong q)
{
  ulong maxp, maxp2;
  if (!odd(b) && b > 2) b--;
  if (a > b || b < 2)
  {
    T->strategy = PRST_diffptr; /* paranoia */
    T->p = 0; /* empty */
    T->b = 0; /* empty */
    T->d = diffptr;
    return 0;
  }
  maxp = maxprime();
  if (q != 1)
  {
    c %= q;
    if (ugcd(c,q) != 1) { a = maxuu(a,c); b = minuu(b,c); }
    if (odd(q) && (a > 2 || c != 2))
    { /* only *odd* primes. If a <= c = 2, then p = 2 must be included :-( */
      if (!odd(c)) c += q;
      q <<= 1;
    }
  }
  T->q = q;
  T->c = c;
  T->strategy = PRST_none; /* unknown */
  T->psieve = psieve; /* unused for now */
  T->isieve = NULL; /* unused for now */
  T->b = b;
  if (maxp >= b) { /* [a,b] \subset prime table */
    u_forprime_set_prime_table(T, a);
    return 1;
  }
  /* b > maxp */
  if (a >= maxp)
  {
    T->p = a - 1;
    if (T->q > 1) arith_set(T);
  }
  else
    u_forprime_set_prime_table(T, a);

  maxp2 = (maxp & HIGHMASK)? 0 : maxp*maxp;
  /* FIXME: should sieve as well if q != 1, adapt sieve code */
  if (q != 1 || (maxp2 && maxp2 <= a)
             || T->b - maxuu(a,maxp) < maxp / expu(b))
  { if (T->strategy == PRST_none) T->strategy = PRST_unextprime; }
  else
  { /* worth sieving */
#ifdef LONG_IS_64BIT
    const ulong UPRIME_MAX = 18446744073709551557UL;
#else
    const ulong UPRIME_MAX = 4294967291UL;
#endif
    ulong sieveb;
    if (b > UPRIME_MAX) b = UPRIME_MAX;
    sieveb = b;
    if (maxp2 && maxp2 < b) sieveb = maxp2;
    if (T->strategy == PRST_none) T->strategy = PRST_sieve;
    sieve_init(T, maxuu(maxp+2, a), sieveb);
  }
  return 1;
}

int
u_forprime_arith_init(forprime_t *T, ulong a, ulong b, ulong c, ulong q)
{ return u_forprime_sieve_arith_init(T, NULL, a, b, c, q); }

/* will run through primes in [a,b] */
int
u_forprime_init(forprime_t *T, ulong a, ulong b)
{ return u_forprime_arith_init(T, a,b, 0,1); }
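
/* Editor's sketch (illustrative addition, not part of PARI): the canonical
 * use of the word-sized iterator. u_forprime_init() picks a strategy (prime
 * table, sieve, or unextprime); u_forprime_next() returns successive primes,
 * then 0 when the range is exhausted. */
#if 0
static void
example_sum_primes(ulong a, ulong b)
{
  forprime_t T;
  ulong p, s = 0;
  if (!u_forprime_init(&T, a, b)) return; /* empty range */
  while ((p = u_forprime_next(&T))) s += p;
  pari_printf("sum of primes in [%lu,%lu] = %lu\n", a, b, s);
}
#endif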

/* will run through primes in [a,b] */
static int
u_forprime_sieve_init(forprime_t *T, struct pari_sieve *s, ulong b)
{ return u_forprime_sieve_arith_init(T, s, s->start, b, s->c, s->q); }

/* now only run through primes <= c; assume c <= b above */
void
u_forprime_restrict(forprime_t *T, ulong c) { T->b = c; }

/* b = NULL: loop forever */
int
forprimestep_init(forprime_t *T, GEN a, GEN b, GEN q)
{
  long lb;
  a = gceil(a); if (typ(a) != t_INT) pari_err_TYPE("forprime_init",a);
  if (signe(a) <= 0) a = gen_1;
  if (b && typ(b) != t_INFINITY)
  {
    b = gfloor(b);
    if (typ(b) != t_INT) pari_err_TYPE("forprime_init",b);
    if (signe(b) < 0 || cmpii(a,b) > 0)
    {
      T->strategy = PRST_nextprime; /* paranoia */
      T->bb = T->pp = gen_0; return 0;
    }
    lb = lgefint(b);
    T->bb = b;
  }
  else if (!b || inf_get_sign(b) > 0)
  {
    lb = lgefint(a) + 4;
    T->bb = NULL;
  }
  else /* b == -oo */
  {
    T->strategy = PRST_nextprime; /* paranoia */
    T->bb = T->pp = gen_0; return 0;
  }
  T->pp = cgeti(lb);
  T->c = 0;
  T->q = 1;
  /* a, b are positive integers, a <= b */
  if (q)
  {
    switch(typ(q))
    {
      case t_INT: break;
      case t_INTMOD: a = addii(a, modii(subii(gel(q,2),a), gel(q,1)));
                     q = gel(q,1); break;
      default: pari_err_TYPE("forprimestep_init",q);
    }
    if (signe(q) <= 0) pari_err_TYPE("forprimestep_init (q <= 0)",q);
    if (equali1(q)) q = NULL;
    else
    {
      T->q = itou(q);
      T->c = umodiu(a, T->q);
    }
  }
  if (lgefint(a) == 3) /* lb == 3 implies b != NULL */
    return u_forprime_arith_init(T, uel(a,2), lb == 3? uel(b,2): ULONG_MAX,
                                 T->c, T->q);
  T->strategy = PRST_nextprime;
  affii(subiu(a,T->q), T->pp);
  return 1;
}
int
forprime_init(forprime_t *T, GEN a, GEN b)
{ return forprimestep_init(T,a,b,NULL); }
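
/* Editor's sketch (illustrative addition, not part of PARI): the GEN-level
 * iterator, which keeps working past ULONG_MAX by switching to nextprime()
 * internally. stoi() builds small t_INT arguments; no extra garbage handling
 * is needed since forprime_next() writes into the preallocated T.pp. */
#if 0
static void
example_forprime_GEN(void)
{
  forprime_t T;
  GEN p;
  if (!forprime_init(&T, stoi(2), stoi(100))) return;
  while ((p = forprime_next(&T)))
    pari_printf("%Ps ", p); /* 2 3 5 ... 97 */
}
#endif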

/* assume a <= b <= maxprime()^2, a,b odd, sieve[n] corresponds to
 * a+16*n, a+16*n+2, ..., a+16*n+14 (bits 0 to 7)
 * maxpos = index of the last sieve cell.
 * b-a+2 must be divisible by 16 for use by u_forprime_next */
static void
sieve_block(ulong a, ulong b, ulong maxpos, unsigned char* sieve)
{
  ulong p = 2, lim = usqrt(b), sz = (b-a) >> 1;
  byteptr d = diffptr+1;
  (void)memset(sieve, 0, maxpos+1);
  for (;;)
  { /* p is odd */
    ulong k, r;
    NEXT_PRIME_VIADIFF(p, d); /* starts at p = 3 */
    if (p > lim) break;

    /* solve a + 2k = 0 (mod p) */
    r = a % p;
    if (r == 0)
      k = 0;
    else
    {
      k = p - r;
      if (odd(k)) k += p;
      k >>= 1;
    }
    /* m = a + 2k is the smallest odd m >= a, p | m */
    /* position n (corresponds to a+2n) is sieve[n>>3], bit n&7 */
    while (k <= sz) { sieve[k>>3] |= 1 << (k&7); k += p; /* 2k += 2p */ }
  }
}
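
/* Editor's sketch (illustrative addition, not part of PARI): decoding the
 * bit layout documented above. An odd m in [a,b] sits at position
 * n = (m-a)/2, i.e. byte n>>3, bit n&7; a clear bit means m survived the
 * sieve, hence is prime since b <= maxprime()^2. */
#if 0
static int
example_is_prime_in_block(ulong m, ulong a, const unsigned char *sieve)
{
  ulong n = (m - a) >> 1;               /* position of odd m relative to a */
  return !(sieve[n>>3] & (1 << (n&7))); /* clear bit <=> prime */
}
#endif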

static void
pari_sieve_init(struct pari_sieve *s, ulong a, ulong b)
{
  ulong maxpos = (b - a) >> 4;
  s->start = a; s->end = b;
  s->sieve = (unsigned char*) pari_malloc(maxpos+1);
  s->c = 0; s->q = 1;
  sieve_block(a, b, maxpos, s->sieve);
  s->maxpos = maxpos; /* must be last in case of SIGINT */
}

static struct pari_sieve pari_sieve_modular;

#ifdef LONG_IS_64BIT
#define PARI_MODULAR_BASE ((1UL<<((BITS_IN_LONG-2)>>1))+1)
#else
#define PARI_MODULAR_BASE ((1UL<<(BITS_IN_LONG-1))+1)
#endif

void
pari_init_primes(ulong maxprime)
{
  ulong a = PARI_MODULAR_BASE, b = a + (1UL<<20)-2;
  initprimetable(maxprime);
  pari_sieve_init(&pari_sieve_modular, a, b);
}

void
pari_close_primes(void)
{
  pari_free(diffptr);
  pari_free(pari_sieve_modular.sieve);
}

void
init_modular_small(forprime_t *S)
{
#ifdef LONG_IS_64BIT
  u_forprime_sieve_init(S, &pari_sieve_modular, ULONG_MAX);
#else
  ulong a = (1UL<<((BITS_IN_LONG-2)>>1))+1;
  u_forprime_init(S, a, ULONG_MAX);
#endif
}

void
init_modular_big(forprime_t *S)
{
#ifdef LONG_IS_64BIT
  u_forprime_init(S, HIGHBIT + 1, ULONG_MAX);
#else
  u_forprime_sieve_init(S, &pari_sieve_modular, ULONG_MAX);
#endif
}

/* T->cache is a 0-terminated list of primes; return the first one and
 * remove it from the list. Most of the time the list contains a single
 * prime */
static ulong
shift_cache(forprime_t *T)
{
  long i;
  T->p = T->cache[0];
  for (i = 1;; i++) /* remove one prime from cache */
    if (! (T->cache[i-1] = T->cache[i]) ) break;
  return T->p;
}

ulong
u_forprime_next(forprime_t *T)
{
  if (T->strategy == PRST_diffptr)
  {
    for(;;)
    {
      if (!*(T->d))
      {
        T->strategy = T->isieve? PRST_sieve: PRST_unextprime;
        if (T->q != 1) { arith_set(T); if (!T->p) return 0; }
        /* T->p possibly not a prime if q != 1 */
        break;
      }
      else
      {
        NEXT_PRIME_VIADIFF(T->p, T->d);
        if (T->p > T->b) return 0;
        if (T->q == 1 || T->p % T->q == T->c) return T->p;
      }
    }
  }
  if (T->strategy == PRST_sieve)
  {
    ulong n;
    if (T->cache[0]) return shift_cache(T);
NEXT_CHUNK:
    if (T->psieve)
    {
      T->sieve = T->psieve->sieve;
      T->end = T->psieve->end;
      if (T->end > T->sieveb) T->end = T->sieveb;
      T->maxpos = T->psieve->maxpos;
      T->pos = 0;
      T->psieve = NULL;
    }
    for (n = T->pos; n < T->maxpos; n++)
      if (T->sieve[n] != 0xFF)
      {
        unsigned char mask = T->sieve[n];
        ulong p = T->a + (n<<4);
        long i = 0;
        T->pos = n;
        if (!(mask &  1)) T->cache[i++] = p;
        if (!(mask &  2)) T->cache[i++] = p+2;
        if (!(mask &  4)) T->cache[i++] = p+4;
        if (!(mask &  8)) T->cache[i++] = p+6;
        if (!(mask & 16)) T->cache[i++] = p+8;
        if (!(mask & 32)) T->cache[i++] = p+10;
        if (!(mask & 64)) T->cache[i++] = p+12;
        if (!(mask &128)) T->cache[i++] = p+14;
        T->cache[i] = 0;
        T->pos = n+1;
        return shift_cache(T);
      }
    /* n = T->maxpos, last cell: check p <= b */
    if (T->maxpos && n == T->maxpos && T->sieve[n] != 0xFF)
    {
      unsigned char mask = T->sieve[n];
      ulong p = T->a + (n<<4);
      long i = 0;
      T->pos = n;
      if (!(mask &  1) && p <= T->sieveb)    T->cache[i++] = p;
      if (!(mask &  2) && p <= T->sieveb-2)  T->cache[i++] = p+2;
      if (!(mask &  4) && p <= T->sieveb-4)  T->cache[i++] = p+4;
      if (!(mask &  8) && p <= T->sieveb-6)  T->cache[i++] = p+6;
      if (!(mask & 16) && p <= T->sieveb-8)  T->cache[i++] = p+8;
      if (!(mask & 32) && p <= T->sieveb-10) T->cache[i++] = p+10;
      if (!(mask & 64) && p <= T->sieveb-12) T->cache[i++] = p+12;
      if (!(mask &128) && p <= T->sieveb-14) T->cache[i++] = p+14;
      if (i)
      {
        T->cache[i] = 0;
        T->pos = n+1;
        return shift_cache(T);
      }
    }

    if (T->maxpos && T->end >= T->sieveb) /* done with sieves ? */
    {
      if (T->sieveb == T->b && T->b != ULONG_MAX) return 0;
      T->strategy = PRST_unextprime;
    }
    else
    { /* initialize next chunk */
      T->sieve = T->isieve;
      if (T->maxpos == 0)
        T->a |= 1; /* first time; ensure odd */
      else
        T->a = (T->end + 2) | 1;
      T->end = T->a + T->chunk; /* may overflow */
      if (T->end < T->a || T->end > T->sieveb) T->end = T->sieveb;
      /* end and a are odd; sieve[k] corresponds to a + 16*k + (0,2,...,14).
       * The largest k is (end-a) >> 4 */
      T->pos = 0;
      T->maxpos = (T->end - T->a) >> 4;
      sieve_block(T->a, T->end, T->maxpos, T->sieve);
      goto NEXT_CHUNK;
    }
  }
  if (T->strategy == PRST_unextprime)
  {
    if (T->q == 1)
    {
#ifdef LONG_IS_64BIT
      switch(T->p)
      {
#define retp(x) return T->p = (HIGHBIT+x <= T->b)? HIGHBIT+x: 0
        case HIGHBIT:      retp(29);
        case HIGHBIT + 29: retp(99);
        case HIGHBIT + 99: retp(123);
        case HIGHBIT +123: retp(131);
        case HIGHBIT +131: retp(155);
        case HIGHBIT +155: retp(255);
        case HIGHBIT +255: retp(269);
        case HIGHBIT +269: retp(359);
        case HIGHBIT +359: retp(435);
        case HIGHBIT +435: retp(449);
        case HIGHBIT +449: retp(453);
        case HIGHBIT +453: retp(485);
        case HIGHBIT +485: retp(491);
        case HIGHBIT +491: retp(543);
        case HIGHBIT +543: retp(585);
        case HIGHBIT +585: retp(599);
        case HIGHBIT +599: retp(753);
        case HIGHBIT +753: retp(849);
        case HIGHBIT +849: retp(879);
        case HIGHBIT +879: retp(885);
        case HIGHBIT +885: retp(903);
        case HIGHBIT +903: retp(995);
#undef retp
      }
#endif
      T->p = unextprime(T->p + 1);
    }
    else do {
      T->p += T->q;
      if (T->p < T->q || T->p > T->b) { T->p = 0; break; } /* overflow */
    } while (!uisprime(T->p));
    if (T->p && T->p <= T->b) return T->p;
    /* overflowed ulong, switch to GEN */
    T->strategy = PRST_nextprime;
  }
  return 0; /* overflow */
}

GEN
forprime_next(forprime_t *T)
{
  pari_sp av;
  GEN p;
  if (T->strategy != PRST_nextprime)
  {
    ulong u = u_forprime_next(T);
    if (u) { affui(u, T->pp); return T->pp; }
    /* failure */
    if (T->strategy != PRST_nextprime) return NULL; /* we're done */
    /* overflowed ulong, switch to GEN */
    u = ULONG_MAX;
    if (T->q > 1) u -= (ULONG_MAX-T->c) % T->q;
    affui(u, T->pp);
  }
  av = avma; p = T->pp;
  if (T->q == 1)
  {
    p = nextprime(addiu(p, 1));
    if (T->bb && abscmpii(p, T->bb) > 0) return gc_NULL(av);
  } else do {
    p = addiu(p, T->q);
    if (T->bb && abscmpii(p, T->bb) > 0) return gc_NULL(av);
  } while (!BPSW_psp(p));
  affii(p, T->pp); return gc_const(av, T->pp);
}

void
forprimestep(GEN a, GEN b, GEN q, GEN code)
{
  pari_sp av = avma;
  forprime_t T;

  if (!forprimestep_init(&T, a,b,q)) { set_avma(av); return; }

  push_lex(T.pp,code);
  while (forprime_next(&T))
  {
    closure_evalvoid(code); if (loop_break()) break;
    /* p changed in 'code', complain */
    if (get_lex(-1) != T.pp)
      pari_err(e_MISC,"prime index read-only: was changed to %Ps", get_lex(-1));
  }
  pop_lex(1); set_avma(av);
}
void
forprime(GEN a, GEN b, GEN code) { return forprimestep(a,b,NULL,code); }

int
forcomposite_init(forcomposite_t *C, GEN a, GEN b)
{
  pari_sp av = avma;
  a = gceil(a);
  if (typ(a)!=t_INT) pari_err_TYPE("forcomposite",a);
  if (b) {
    if (typ(b) == t_INFINITY) b = NULL;
    else
    {
      b = gfloor(b);
      if (typ(b)!=t_INT) pari_err_TYPE("forcomposite",b);
    }
  }
  if (signe(a) < 0) pari_err_DOMAIN("forcomposite", "a", "<", gen_0, a);
  if (abscmpiu(a, 4) < 0) a = utoipos(4);
  C->first = 1;
  if (!forprime_init(&C->T, a,b) && cmpii(a,b) > 0)
  {
    C->n = gen_1; /* in case the caller forgets to check the return value */
    C->b = gen_0; return gc_bool(av,0);
  }
  C->n = setloop(a);
  C->b = b;
  C->p = NULL; return 1;
}

GEN
forcomposite_next(forcomposite_t *C)
{
  if (C->first) /* first call ever */
  {
    C->first = 0;
    C->p = forprime_next(&C->T);
  }
  else
    C->n = incloop(C->n);
  if (C->p)
  {
    if (cmpii(C->n, C->p) < 0) return C->n;
    C->n = incloop(C->n);
    /* n = p+1 */
    C->p = forprime_next(&C->T); /* nextprime(p) > n */
    if (C->p) return C->n;
  }
  if (!C->b || cmpii(C->n, C->b) <= 0) return C->n;
  return NULL;
}

void
forcomposite(GEN a, GEN b, GEN code)
{
  pari_sp av = avma;
  forcomposite_t T;
  GEN n;
  if (!forcomposite_init(&T,a,b)) return;
  push_lex(T.n,code);
  while ((n = forcomposite_next(&T)))
  {
    closure_evalvoid(code); if (loop_break()) break;
    /* n changed in 'code', complain */
    if (get_lex(-1) != n)
      pari_err(e_MISC,"index read-only: was changed to %Ps", get_lex(-1));
  }
  pop_lex(1); set_avma(av);
}
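
/* Editor's sketch (illustrative addition, not part of PARI): the composite
 * iterator mirrors the prime one; forcomposite_next() returns NULL once the
 * range is exhausted. stoi() builds the t_INT bounds. */
#if 0
static void
example_forcomposite(void)
{
  forcomposite_t C;
  GEN n;
  if (!forcomposite_init(&C, stoi(4), stoi(20))) return;
  while ((n = forcomposite_next(&C)))
    pari_printf("%Ps ", n); /* 4 6 8 9 10 ... 20 */
}
#endif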