Testing latest pari + WASM + node.js... and it works?! Wow.
License: GPL3
ubuntu2004
/* Copyright (C) 2016 The PARI group.12This file is part of the PARI/GP package.34PARI/GP is free software; you can redistribute it and/or modify it under the5terms of the GNU General Public License as published by the Free Software6Foundation; either version 2 of the License, or (at your option) any later7version. It is distributed in the hope that it will be useful, but WITHOUT8ANY WARRANTY WHATSOEVER.910Check the License for details. You should have received a copy of it, along11with the package; see the file 'COPYING'. If not, write to the Free Software12Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */1314#include "pari.h"15#include "paripriv.h"1617/**********************************************************************/18/*** ***/19/*** Public prime table ***/20/*** ***/21/**********************************************************************/2223static ulong _maxprime = 0;24static ulong diffptrlen;2526/* Building/Rebuilding the diffptr table. The actual work is done by the27* following two subroutines; the user entry point is the function28* initprimes() below. initprimes1() is the old algorithm, called when29* maxnum (size) is moderate. Must be called after pari_init_stack() )*/30static void31initprimes1(ulong size, long *lenp, ulong *lastp, byteptr p1)32{33pari_sp av = avma;34long k;35byteptr q, r, s, p = (byteptr)stack_calloc(size+2), fin = p + size;3637for (r=q=p,k=1; r<=fin; )38{39do { r+=k; k+=2; r+=k; } while (*++q);40for (s=r; s<=fin; s+=k) *s = 1;41}42r = p1; *r++ = 2; *r++ = 1; /* 2 and 3 */43for (s=q=p+1; ; s=q)44{45do q++; while (*q);46if (q > fin) break;47*r++ = (unsigned char) ((q-s) << 1);48}49*r++ = 0;50*lenp = r - p1;51*lastp = ((s - p) << 1) + 1;52set_avma(av);53}5455/* Timing in ms (Athlon/850; reports 512K of secondary cache; looks56like there is 64K of quickier cache too).5758arena| 30m 100m 300m 1000m 2000m <-- primelimit59=================================================6016K 1.1053 1.1407 1.2589 1.4368 1.60866124K 1.0000 1.0625 1.1320 1.2443 1.30956232K 1.0000 1.0469 1.0761 1.1336 1.17766348K 1.0000 1.0000 1.0254 1.0445 1.05466450K 1.0000 1.0000 1.0152 1.0345 1.04646552K 1.0000 1.0000 1.0203 1.0273 1.03626654K 1.0000 1.0000 1.0812 1.0216 1.02816756K 1.0526 1.0000 1.0051 1.0144 1.02056858K 1.0000 1.0000 1.0000 1.0086 1.01236960K 0.9473 0.9844 1.0051 1.0014 1.00557062K 1.0000 0.9844 0.9949 0.9971 0.99937164K 1.0000 1.0000 1.0000 1.0000 1.00007266K 1.2632 1.2187 1.2183 1.2055 1.19537368K 1.4211 1.4844 1.4721 1.4425 1.41887470K 1.7368 1.7188 1.7107 1.6767 1.64217572K 1.9474 1.9531 1.9594 1.9023 1.85737674K 2.2105 2.1875 2.1827 2.1207 2.06507776K 2.4211 2.4219 2.4010 2.3305 2.26447878K 2.5789 2.6250 2.6091 2.5330 2.45717980K 2.8421 2.8125 2.8223 2.7213 2.63808084K 3.1053 3.1875 3.1776 3.0819 2.98028188K 3.5263 3.5312 3.5228 3.4124 3.29928292K 3.7895 3.8438 3.8375 3.7213 3.59718396K 4.0000 4.1093 4.1218 3.9986 3.965984112K 4.3684 4.5781 4.5787 4.4583 4.611585128K 4.7368 4.8750 4.9188 4.8075 4.899786192K 5.5263 5.7188 5.8020 5.6911 5.706487256K 6.0000 6.2187 6.3045 6.1954 6.103388384K 6.7368 6.9531 7.0405 6.9181 6.791289512K 7.3158 7.5156 7.6294 7.5000 7.465490768K 9.1579 9.4531 9.6395 9.5014 9.1075911024K 10.368 10.7497 10.9999 10.878 10.8201921536K 12.579 13.3124 13.7660 13.747 13.4739932048K 13.737 14.4839 15.0509 15.151 15.1282943076K 14.789 15.5780 16.2993 16.513 16.33659596Now the same number relative to the model9798(1 + 0.36*sqrt(primelimit)/arena) * (arena <= 64 ? 1.05 : (arena-64)**0.38)99100[SLOW2_IN_ROOTS = 0.36, ALPHA = 0.38]101102arena| 30m 100m 300m 1000m 2000m <-- primelimit103=================================================10416K 1.014 0.9835 0.9942 0.9889 1.00410524K 0.9526 0.9758 0.9861 0.9942 0.98110632K 0.971 0.9939 0.9884 0.9849 0.980610748K 0.9902 0.9825 0.996 0.9945 0.988510850K 0.9917 0.9853 0.9906 0.9926 0.990710952K 0.9932 0.9878 0.9999 0.9928 0.990311054K 0.9945 0.9902 1.064 0.9939 0.991311156K 1.048 0.9924 0.9925 0.993 0.992111258K 0.9969 0.9945 0.9909 0.9932 0.991811360K 0.9455 0.9809 0.9992 0.9915 0.992311462K 0.9991 0.9827 0.9921 0.9924 0.992911564K 1 1 1 1 111666K 1.02 0.9849 0.9857 0.9772 0.970411768K 0.8827 0.9232 0.9176 0.9025 0.890311870K 0.9255 0.9177 0.9162 0.9029 0.888111972K 0.9309 0.936 0.9429 0.9219 0.905212074K 0.9715 0.9644 0.967 0.9477 0.929212176K 0.9935 0.9975 0.9946 0.9751 0.955212278K 0.9987 1.021 1.021 1.003 0.981912380K 1.047 1.041 1.052 1.027 1.00612484K 1.052 1.086 1.092 1.075 1.05312588K 1.116 1.125 1.133 1.117 1.09612692K 1.132 1.156 1.167 1.155 1.13412796K 1.137 1.177 1.195 1.185 1.196128112K 1.067 1.13 1.148 1.15 1.217129128K 1.04 1.083 1.113 1.124 1.178130192K 0.9368 0.985 1.025 1.051 1.095131256K 0.8741 0.9224 0.9619 0.995 1.024132384K 0.8103 0.8533 0.8917 0.9282 0.9568133512K 0.7753 0.8135 0.8537 0.892 0.935134768K 0.8184 0.8638 0.9121 0.9586 0.97051351024K 0.8241 0.8741 0.927 0.979 1.031361536K 0.8505 0.9212 0.9882 1.056 1.0961372048K 0.8294 0.8954 0.9655 1.041 1.102138139*/140141#ifndef SLOW2_IN_ROOTS142/* SLOW2_IN_ROOTS below 3: some slowdown starts to be noticable143* when things fit into the cache on Sparc.144* The choice of 2.6 gives a slowdown of 1-2% on UltraSparcII,145* but makes calculations for "maximum" of 436273009146* fit into 256K cache (still common for some architectures).147*148* One may change it when small caches become uncommon, but the gain149* is not going to be very noticable... */150# ifdef i386 /* gcc defines this? */151# define SLOW2_IN_ROOTS 0.36152# else153# define SLOW2_IN_ROOTS 2.6154# endif155#endif156#ifndef CACHE_ARENA157# ifdef i386 /* gcc defines this? */158/* Due to smaller SLOW2_IN_ROOTS, smaller arena is OK; fit L1 cache */159# define CACHE_ARENA (63 * 1024UL) /* No slowdown even with 64K L1 cache */160# else161# define CACHE_ARENA (200 * 1024UL) /* No slowdown even with 256K L2 cache */162# endif163#endif164165#define CACHE_ALPHA (0.38) /* Cache performance model parameter */166#define CACHE_CUTOFF (0.018) /* Cache performance not smooth here */167168static double slow2_in_roots = SLOW2_IN_ROOTS;169170typedef struct {171ulong arena;172double power;173double cutoff;174} cache_model_t;175176static cache_model_t cache_model = { CACHE_ARENA, CACHE_ALPHA, CACHE_CUTOFF };177178/* Assume that some calculation requires a chunk of memory to be179accessed often in more or less random fashion (as in sieving).180Assume that the calculation can be done in steps by subdividing the181chunk into smaller subchunks (arenas) and treating them182separately. Assume that the overhead of subdivision is equivalent183to the number of arenas.184185Find an optimal size of the arena taking into account the overhead186of subdivision, and the overhead of arena not fitting into the187cache. Assume that arenas of size slow2_in_roots slows down the188calculation 2x (comparing to very big arenas; when cache hits do189not matter). Since cache performance varies wildly with190architecture, load, and wheather (especially with cache coloring191enabled), use an idealized cache model based on benchmarks above.192193Assume that an independent region of FIXED_TO_CACHE bytes is accessed194very often concurrently with the arena access.195*/196static ulong197good_arena_size(ulong slow2_size, ulong total, ulong fixed_to_cache,198cache_model_t *cache_model)199{200ulong asize, cache_arena = cache_model->arena;201double Xmin, Xmax, A, B, C1, C2, D, V;202double alpha = cache_model->power, cut_off = cache_model->cutoff;203204/* Estimated relative slowdown,205with overhead = max((fixed_to_cache+arena)/cache_arena - 1, 0):2062071 + slow2_size/arena due to initialization overhead;208209max(1, 4.63 * overhead^0.38 ) due to footprint > cache size.210211[The latter is hard to substantiate theoretically, but this212function describes benchmarks pretty close; it does not hurt that213one can minimize it explicitly too ;-). The switch between214different choices of max() happens when overhead=0.018.]215216Thus the problem is minimizing (1 + slow2_size/arena)*overhead**0.29.217This boils down to F=((X+A)/(X+B))X^alpha, X=overhead,218B = (1 - fixed_to_cache/cache_arena), A = B + slow2_size/cache_arena,219alpha = 0.38, and X>=0.018, X>-B.220221We need to find the rightmost root of (X+A)*(X+B) - alpha(A-B)X to the222right of 0.018 (if such exists and is below Xmax). Then we manually223check the remaining region [0, 0.018].224225Since we cannot trust the purely-experimental cache-hit slowdown226function, as a sanity check always prefer fitting into the227cache (or "almost fitting") if F-law predicts that the larger228value of the arena provides less than 10% speedup.229*/230231/* The simplest case: we fit into cache */232asize = cache_arena - fixed_to_cache;233if (total <= asize) return total;234/* The simple case: fitting into cache doesn't slow us down more than 10% */235if (asize > 10 * slow2_size) return asize;236/* Slowdown of not fitting into cache is significant. Try to optimize.237Do not be afraid to spend some time on optimization - in trivial238cases we do not reach this point; any gain we get should239compensate the time spent on optimization. */240241B = (1 - ((double)fixed_to_cache)/cache_arena);242A = B + ((double)slow2_size)/cache_arena;243C2 = A*B;244C1 = (A + B - 1/alpha*(A - B))/2;245D = C1*C1 - C2;246if (D > 0)247V = cut_off*cut_off + 2*C1*cut_off + C2; /* Value at CUT_OFF */248else249V = 0; /* Peacify the warning */250Xmin = cut_off;251Xmax = ((double)total - fixed_to_cache)/cache_arena; /* Two candidates */252253if ( D <= 0 || (V >= 0 && C1 + cut_off >= 0) ) /* slowdown increasing */254Xmax = cut_off; /* Only one candidate */255else if (V >= 0 && /* slowdown concave down */256((Xmax + C1) <= 0 || (Xmax*Xmax + 2*C1*Xmax + C2) <= 0))257/* DO NOTHING */; /* Keep both candidates */258else if (V <= 0 && (Xmax*Xmax + 2*C1*Xmax + C2) <= 0) /*slowdown decreasing*/259Xmin = cut_off; /* Only one candidate */260else /* Now we know: 2 roots, the largest is in CUT_OFF..Xmax */261Xmax = sqrt(D) - C1;262if (Xmax != Xmin) { /* Xmin == CUT_OFF; Check which one is better */263double v1 = (cut_off + A)/(cut_off + B);264double v2 = 2.33 * (Xmax + A)/(Xmax + B) * pow(Xmax, alpha);265266if (1.1 * v2 >= v1) /* Prefer fitting into the cache if slowdown < 10% */267V = v1;268else269{ Xmin = Xmax; V = v2; }270} else if (B > 0) /* We need V */271V = 2.33 * (Xmin + A)/(Xmin + B) * pow(Xmin, alpha);272if (B > 0 && 1.1 * V > A/B) /* Now Xmin is the minumum. Compare with 0 */273Xmin = 0;274275asize = (ulong)((1 + Xmin)*cache_arena - fixed_to_cache);276if (asize > total) asize = total; /* May happen due to approximations */277return asize;278}279280/* Use as in281install(set_optimize,lLDG) \\ Through some M too?282set_optimize(2,1) \\ disable dependence on limit283\\ 1: how much cache usable, 2: slowdown of setup, 3: alpha, 4: cutoff284\\ 2,3,4 are in units of 0.001285286{ time_primes_arena(ar,limit) = \\ ar = arena size in K287set_optimize(1,floor(ar*1024));288default(primelimit, 200 000); \\ 100000 results in *larger* malloc()!289gettime;290default(primelimit, floor(limit));291if(ar >= 1, ar=floor(ar));292print("arena "ar"K => "gettime"ms");293}294*/295long296set_optimize(long what, GEN g)297{298long ret = 0;299300switch (what) {301case 1:302ret = (long)cache_model.arena;303break;304case 2:305ret = (long)(slow2_in_roots * 1000);306break;307case 3:308ret = (long)(cache_model.power * 1000);309break;310case 4:311ret = (long)(cache_model.cutoff * 1000);312break;313default:314pari_err_BUG("set_optimize");315break;316}317if (g != NULL) {318ulong val = itou(g);319320switch (what) {321case 1: cache_model.arena = val; break;322case 2: slow2_in_roots = (double)val / 1000.; break;323case 3: cache_model.power = (double)val / 1000.; break;324case 4: cache_model.cutoff = (double)val / 1000.; break;325}326}327return ret;328}329330/* s is odd; prime differences (starting from 5-3=2) start at known_primes[2],331terminated by a 0 byte. Checks n odd numbers starting at 'start', setting332bytes starting at data to 0 (composite) or 1 (prime) */333static void334sieve_chunk(byteptr known_primes, ulong s, byteptr data, ulong n)335{336ulong p, cnt = n-1, start = s, delta = 1;337byteptr q;338339memset(data, 0, n);340start >>= 1; /* (start - 1)/2 */341start += n; /* Corresponds to the end */342/* data corresponds to start, q runs over primediffs */343for (q = known_primes + 1, p = 3; delta; delta = *++q, p += delta)344{ /* first odd number >= start > p and divisible by p345= last odd number <= start + 2p - 2 and 0 (mod p)346= p + last number <= start + p - 2 and 0 (mod 2p)347= p + start+p-2 - (start+p-2) % 2p348= start + 2(p - 1 - ((start-1)/2 + (p-1)/2) % p). */349long off = cnt - ((start+(p>>1)) % p);350while (off >= 0) { data[off] = 1; off -= p; }351}352}353354/* assume maxnum <= 436273289 < 2^29 */355static void356initprimes0(ulong maxnum, long *lenp, ulong *lastp, byteptr p1)357{358pari_sp av = avma, bot = pari_mainstack->bot;359long alloced, psize;360byteptr q, end, p, end1, plast, curdiff;361ulong last, remains, curlow, rootnum, asize;362ulong prime_above;363byteptr p_prime_above;364365maxnum |= 1; /* make it odd. */366/* base case */367if (maxnum < 1ul<<17) { initprimes1(maxnum>>1, lenp, lastp, p1); return; }368369/* Checked to be enough up to 40e6, attained at 155893 */370rootnum = usqrt(maxnum) | 1;371initprimes1(rootnum>>1, &psize, &last, p1);372end1 = p1 + psize - 1;373remains = (maxnum - last) >> 1; /* number of odd numbers to check */374375/* we access primes array of psize too; but we access it consecutively,376* thus we do not include it in fixed_to_cache */377asize = good_arena_size((ulong)(rootnum * slow2_in_roots), remains+1, 0,378&cache_model) - 1;379/* enough room on the stack ? */380alloced = (((byteptr)avma) <= ((byteptr)bot) + asize);381if (alloced)382p = (byteptr)pari_malloc(asize+1);383else384p = (byteptr)stack_malloc(asize+1);385end = p + asize; /* the 0 sentinel goes at end. */386curlow = last + 2; /* First candidate: know primes up to last (odd). */387curdiff = end1;388389/* During each iteration p..end-1 represents a range of odd390numbers. plast is a pointer which represents the last prime seen,391it may point before p..end-1. */392plast = p - 1;393p_prime_above = p1 + 2;394prime_above = 3;395while (remains)396{ /* cycle over arenas; performance not crucial */397unsigned char was_delta;398if (asize > remains) { asize = remains; end = p + asize; }399/* Fake the upper limit appropriate for the given arena */400while (prime_above*prime_above <= curlow + (asize << 1) && *p_prime_above)401prime_above += *p_prime_above++;402was_delta = *p_prime_above;403*p_prime_above = 0; /* sentinel for sieve_chunk */404sieve_chunk(p1, curlow, p, asize);405*p_prime_above = was_delta; /* restore */406407p[asize] = 0; /* sentinel */408for (q = p; ; plast = q++)409{ /* q runs over addresses corresponding to primes */410while (*q) q++; /* use sentinel at end */411if (q >= end) break;412*curdiff++ = (unsigned char)(q-plast) << 1; /* < 255 for q < 436273291 */413}414plast -= asize;415remains -= asize;416curlow += (asize<<1);417}418last = curlow - ((p - plast) << 1);419*curdiff++ = 0; /* sentinel */420*lenp = curdiff - p1;421*lastp = last;422if (alloced) pari_free(p); else set_avma(av);423}424425ulong426maxprime(void) { return diffptr ? _maxprime : 0; }427ulong428maxprimeN(void) { return diffptr ? diffptrlen-1: 0; }429430void431maxprime_check(ulong c) { if (_maxprime < c) pari_err_MAXPRIME(c); }432433/* We ensure 65302 <= maxnum <= 436273289: the LHS ensures modular function434* have enough fast primes to work, the RHS ensures that p_{n+1} - p_n < 255435* (N.B. RHS would be incorrect since initprimes0 would make it odd, thereby436* increasing it by 1) */437byteptr438initprimes(ulong maxnum, long *lenp, ulong *lastp)439{440byteptr t;441if (maxnum < 65537)442maxnum = 65537;443else if (maxnum > 436273289)444maxnum = 436273289;445t = (byteptr)pari_malloc((size_t) (1.09 * maxnum/log((double)maxnum)) + 146);446initprimes0(maxnum, lenp, lastp, t);447return (byteptr)pari_realloc(t, *lenp);448}449450void451initprimetable(ulong maxnum)452{453long len;454ulong last;455byteptr p = initprimes(maxnum, &len, &last), old = diffptr;456diffptrlen = minss(diffptrlen, len);457_maxprime = minss(_maxprime,last); /*Protect against ^C*/458diffptr = p; diffptrlen = len; _maxprime = last;459if (old) free(old);460}461462/* all init_primepointer_xx routines set *ptr to the corresponding place463* in prime table */464/* smallest p >= a */465ulong466init_primepointer_geq(ulong a, byteptr *pd)467{468ulong n, p;469prime_table_next_p(a, pd, &p, &n);470return p;471}472/* largest p < a */473ulong474init_primepointer_lt(ulong a, byteptr *pd)475{476ulong n, p;477prime_table_next_p(a, pd, &p, &n);478PREC_PRIME_VIADIFF(p, *pd);479return p;480}481/* largest p <= a */482ulong483init_primepointer_leq(ulong a, byteptr *pd)484{485ulong n, p;486prime_table_next_p(a, pd, &p, &n);487if (p != a) PREC_PRIME_VIADIFF(p, *pd);488return p;489}490/* smallest p > a */491ulong492init_primepointer_gt(ulong a, byteptr *pd)493{494ulong n, p;495prime_table_next_p(a, pd, &p, &n);496if (p == a) NEXT_PRIME_VIADIFF(p, *pd);497return p;498}499500/**********************************************************************/501/*** ***/502/*** forprime ***/503/*** ***/504/**********************************************************************/505506/* return good chunk size for sieve, 16 | chunk + 2 */507static ulong508optimize_chunk(ulong a, ulong b)509{510/* TODO: Optimize size (surely < 512k to stay in L2 cache, but not so large511* as to force recalculating too often). */512ulong chunk = 0x80000UL;513ulong tmp = (b - a) / chunk + 1;514515if (tmp == 1)516chunk = b - a + 16;517else518chunk = (b - a) / tmp + 15;519/* ensure 16 | chunk + 2 */520return (((chunk + 2)>>4)<<4) - 2;521}522static void523sieve_init(forprime_t *T, ulong a, ulong b)524{525T->sieveb = b;526T->chunk = optimize_chunk(a, b);527/* >> 1 [only odds] + 3 [convert from bits to bytes] */528T->isieve = (unsigned char*)stack_malloc(((T->chunk+2) >> 4) + 1);529T->cache[0] = 0;530T->a = a;531T->end = minuu(a + T->chunk, b);532T->pos = T->maxpos = 0;533}534535enum {PRST_none, PRST_diffptr, PRST_sieve, PRST_unextprime, PRST_nextprime};536537static void538u_forprime_set_prime_table(forprime_t *T, ulong a)539{540T->strategy = PRST_diffptr;541if (a < 3)542{543T->p = 0;544T->d = diffptr;545}546else547T->p = init_primepointer_lt(a, &T->d);548}549550/* Set p so that p + q the smallest integer = c (mod q) and > original p.551* Assume 0 < c < q. Set p = 0 on overflow */552static void553arith_set(forprime_t *T)554{555ulong r = T->p % T->q; /* 0 <= r <= min(p, q-1) */556pari_sp av = avma;557GEN d = adduu(T->p - r, T->c);558if (T->c > r) d = subiu(d, T->q);559/* d = c mod q, d = c > r? p-r+c-q: p-r+c, so that560* d <= p and d+q = c>r? p-r+c : p-r+c+q > p */561T->p = itou_or_0(d); set_avma(av); /* d = 0 is impossible */562}563564/* run through primes in arithmetic progression = c (mod q) */565static int566u_forprime_sieve_arith_init(forprime_t *T, struct pari_sieve *psieve,567ulong a, ulong b, ulong c, ulong q)568{569ulong maxp, maxp2;570if (!odd(b) && b > 2) b--;571if (a > b || b < 2)572{573T->strategy = PRST_diffptr; /* paranoia */574T->p = 0; /* empty */575T->b = 0; /* empty */576T->d = diffptr;577return 0;578}579maxp = maxprime();580if (q != 1)581{582c %= q;583if (ugcd(c,q) != 1) { a = maxuu(a,c); b = minuu(b,c); }584if (odd(q) && (a > 2 || c != 2))585{ /* only *odd* primes. If a <= c = 2, then p = 2 must be included :-( */586if (!odd(c)) c += q;587q <<= 1;588}589}590T->q = q;591T->c = c;592T->strategy = PRST_none; /* unknown */593T->psieve = psieve; /* unused for now */594T->isieve = NULL; /* unused for now */595T->b = b;596if (maxp >= b) { /* [a,b] \subset prime table */597u_forprime_set_prime_table(T, a);598return 1;599}600/* b > maxp */601if (a >= maxp)602{603T->p = a - 1;604if (T->q > 1) arith_set(T);605}606else607u_forprime_set_prime_table(T, a);608609maxp2 = (maxp & HIGHMASK)? 0 : maxp*maxp;610/* FIXME: should sieve as well if q != 1, adapt sieve code */611if (q != 1 || (maxp2 && maxp2 <= a)612|| T->b - maxuu(a,maxp) < maxp / expu(b))613{ if (T->strategy==PRST_none) T->strategy = PRST_unextprime; }614else615{ /* worth sieving */616#ifdef LONG_IS_64BIT617const ulong UPRIME_MAX = 18446744073709551557UL;618#else619const ulong UPRIME_MAX = 4294967291UL;620#endif621ulong sieveb;622if (b > UPRIME_MAX) b = UPRIME_MAX;623sieveb = b;624if (maxp2 && maxp2 < b) sieveb = maxp2;625if (T->strategy==PRST_none) T->strategy = PRST_sieve;626sieve_init(T, maxuu(maxp+2, a), sieveb);627}628return 1;629}630631int632u_forprime_arith_init(forprime_t *T, ulong a, ulong b, ulong c, ulong q)633{ return u_forprime_sieve_arith_init(T, NULL, a, b, c, q); }634635/* will run through primes in [a,b] */636int637u_forprime_init(forprime_t *T, ulong a, ulong b)638{ return u_forprime_arith_init(T, a,b, 0,1); }639640/* will run through primes in [a,b] */641static int642u_forprime_sieve_init(forprime_t *T, struct pari_sieve *s, ulong b)643{ return u_forprime_sieve_arith_init(T, s, s->start, b, s->c, s->q); }644645/* now only run through primes <= c; assume c <= b above */646void647u_forprime_restrict(forprime_t *T, ulong c) { T->b = c; }648649/* b = NULL: loop forever */650int651forprimestep_init(forprime_t *T, GEN a, GEN b, GEN q)652{653long lb;654a = gceil(a); if (typ(a) != t_INT) pari_err_TYPE("forprime_init",a);655if (signe(a) <= 0) a = gen_1;656if (b && typ(b) != t_INFINITY)657{658b = gfloor(b);659if (typ(b) != t_INT) pari_err_TYPE("forprime_init",b);660if (signe(b) < 0 || cmpii(a,b) > 0)661{662T->strategy = PRST_nextprime; /* paranoia */663T->bb = T->pp = gen_0; return 0;664}665lb = lgefint(b);666T->bb = b;667}668else if (!b || inf_get_sign(b) > 0)669{670lb = lgefint(a) + 4;671T->bb = NULL;672}673else /* b == -oo */674{675T->strategy = PRST_nextprime; /* paranoia */676T->bb = T->pp = gen_0; return 0;677}678T->pp = cgeti(lb);679T->c = 0;680T->q = 1;681/* a, b are positive integers, a <= b */682if (q)683{684switch(typ(q))685{686case t_INT: break;687case t_INTMOD: a = addii(a, modii(subii(gel(q,2),a), gel(q,1)));688q = gel(q,1); break;689default: pari_err_TYPE("forprimestep_init",q);690}691if (signe(q) <= 0) pari_err_TYPE("forprimestep_init (q <= 0)",q);692if (equali1(q)) q = NULL;693else694{695T->q = itou(q);696T->c = umodiu(a, T->q);697}698}699if (lgefint(a) == 3) /* lb == 3 implies b != NULL */700return u_forprime_arith_init(T, uel(a,2), lb == 3? uel(b,2): ULONG_MAX,701T->c, T->q);702T->strategy = PRST_nextprime;703affii(subiu(a,T->q), T->pp);704return 1;705}706int707forprime_init(forprime_t *T, GEN a, GEN b)708{ return forprimestep_init(T,a,b,NULL); }709710/* assume a <= b <= maxprime()^2, a,b odd, sieve[n] corresponds to711* a+16*n, a+16*n+2, ..., a+16*n+14 (bits 0 to 7)712* maxpos = index of last sieve cell.713* b-a+2 must be divisible by 16 for use by u_forprime_next */714static void715sieve_block(ulong a, ulong b, ulong maxpos, unsigned char* sieve)716{717ulong p = 2, lim = usqrt(b), sz = (b-a) >> 1;718byteptr d = diffptr+1;719(void)memset(sieve, 0, maxpos+1);720for (;;)721{ /* p is odd */722ulong k, r;723NEXT_PRIME_VIADIFF(p, d); /* starts at p = 3 */724if (p > lim) break;725726/* solve a + 2k = 0 (mod p) */727r = a % p;728if (r == 0)729k = 0;730else731{732k = p - r;733if (odd(k)) k += p;734k >>= 1;735}736/* m = a + 2k is the smallest odd m >= a, p | m */737/* position n (corresponds to a+2n) is sieve[n>>3], bit n&7 */738while (k <= sz) { sieve[k>>3] |= 1 << (k&7); k += p; /* 2k += 2p */ }739}740}741742static void743pari_sieve_init(struct pari_sieve *s, ulong a, ulong b)744{745ulong maxpos= (b - a) >> 4;746s->start = a; s->end = b;747s->sieve = (unsigned char*) pari_malloc(maxpos+1);748s->c = 0; s->q = 1;749sieve_block(a, b, maxpos, s->sieve);750s->maxpos = maxpos; /* must be last in case of SIGINT */751}752753static struct pari_sieve pari_sieve_modular;754755#ifdef LONG_IS_64BIT756#define PARI_MODULAR_BASE ((1UL<<((BITS_IN_LONG-2)>>1))+1)757#else758#define PARI_MODULAR_BASE ((1UL<<(BITS_IN_LONG-1))+1)759#endif760761void762pari_init_primes(ulong maxprime)763{764ulong a = PARI_MODULAR_BASE, b = a + (1UL<<20)-2;765initprimetable(maxprime);766pari_sieve_init(&pari_sieve_modular, a, b);767}768769void770pari_close_primes(void)771{772pari_free(diffptr);773pari_free(pari_sieve_modular.sieve);774}775776void777init_modular_small(forprime_t *S)778{779#ifdef LONG_IS_64BIT780u_forprime_sieve_init(S, &pari_sieve_modular, ULONG_MAX);781#else782ulong a = (1UL<<((BITS_IN_LONG-2)>>1))+1;783u_forprime_init(S, a, ULONG_MAX);784#endif785}786787void788init_modular_big(forprime_t *S)789{790#ifdef LONG_IS_64BIT791u_forprime_init(S, HIGHBIT + 1, ULONG_MAX);792#else793u_forprime_sieve_init(S, &pari_sieve_modular, ULONG_MAX);794#endif795}796797/* T->cache is a 0-terminated list of primes, return the first one and798* remove it from list. Most of the time the list contains a single prime */799static ulong800shift_cache(forprime_t *T)801{802long i;803T->p = T->cache[0];804for (i = 1;; i++) /* remove one prime from cache */805if (! (T->cache[i-1] = T->cache[i]) ) break;806return T->p;807}808809ulong810u_forprime_next(forprime_t *T)811{812if (T->strategy == PRST_diffptr)813{814for(;;)815{816if (!*(T->d))817{818T->strategy = T->isieve? PRST_sieve: PRST_unextprime;819if (T->q != 1) { arith_set(T); if (!T->p) return 0; }820/* T->p possibly not a prime if q != 1 */821break;822}823else824{825NEXT_PRIME_VIADIFF(T->p, T->d);826if (T->p > T->b) return 0;827if (T->q == 1 || T->p % T->q == T->c) return T->p;828}829}830}831if (T->strategy == PRST_sieve)832{833ulong n;834if (T->cache[0]) return shift_cache(T);835NEXT_CHUNK:836if (T->psieve)837{838T->sieve = T->psieve->sieve;839T->end = T->psieve->end;840if (T->end > T->sieveb) T->end = T->sieveb;841T->maxpos = T->psieve->maxpos;842T->pos = 0;843T->psieve = NULL;844}845for (n = T->pos; n < T->maxpos; n++)846if (T->sieve[n] != 0xFF)847{848unsigned char mask = T->sieve[n];849ulong p = T->a + (n<<4);850long i = 0;851T->pos = n;852if (!(mask & 1)) T->cache[i++] = p;853if (!(mask & 2)) T->cache[i++] = p+2;854if (!(mask & 4)) T->cache[i++] = p+4;855if (!(mask & 8)) T->cache[i++] = p+6;856if (!(mask & 16)) T->cache[i++] = p+8;857if (!(mask & 32)) T->cache[i++] = p+10;858if (!(mask & 64)) T->cache[i++] = p+12;859if (!(mask &128)) T->cache[i++] = p+14;860T->cache[i] = 0;861T->pos = n+1;862return shift_cache(T);863}864/* n = T->maxpos, last cell: check p <= b */865if (T->maxpos && n == T->maxpos && T->sieve[n] != 0xFF)866{867unsigned char mask = T->sieve[n];868ulong p = T->a + (n<<4);869long i = 0;870T->pos = n;871if (!(mask & 1) && p <= T->sieveb) T->cache[i++] = p;872if (!(mask & 2) && p <= T->sieveb-2) T->cache[i++] = p+2;873if (!(mask & 4) && p <= T->sieveb-4) T->cache[i++] = p+4;874if (!(mask & 8) && p <= T->sieveb-6) T->cache[i++] = p+6;875if (!(mask & 16) && p <= T->sieveb-8) T->cache[i++] = p+8;876if (!(mask & 32) && p <= T->sieveb-10) T->cache[i++] = p+10;877if (!(mask & 64) && p <= T->sieveb-12) T->cache[i++] = p+12;878if (!(mask &128) && p <= T->sieveb-14) T->cache[i++] = p+14;879if (i)880{881T->cache[i] = 0;882T->pos = n+1;883return shift_cache(T);884}885}886887if (T->maxpos && T->end >= T->sieveb) /* done with sieves ? */888{889if (T->sieveb == T->b && T->b != ULONG_MAX) return 0;890T->strategy = PRST_unextprime;891}892else893{ /* initialize next chunk */894T->sieve = T->isieve;895if (T->maxpos == 0)896T->a |= 1; /* first time; ensure odd */897else898T->a = (T->end + 2) | 1;899T->end = T->a + T->chunk; /* may overflow */900if (T->end < T->a || T->end > T->sieveb) T->end = T->sieveb;901/* end and a are odd; sieve[k] contains the a + 8*2k + (0,2,...,14).902* The largest k is (end-a) >> 4 */903T->pos = 0;904T->maxpos = (T->end - T->a) >> 4;905sieve_block(T->a, T->end, T->maxpos, T->sieve);906goto NEXT_CHUNK;907}908}909if (T->strategy == PRST_unextprime)910{911if (T->q == 1)912{913#ifdef LONG_IS_64BIT914switch(T->p)915{916#define retp(x) return T->p = (HIGHBIT+x <= T->b)? HIGHBIT+x: 0917case HIGHBIT: retp(29);918case HIGHBIT + 29: retp(99);919case HIGHBIT + 99: retp(123);920case HIGHBIT +123: retp(131);921case HIGHBIT +131: retp(155);922case HIGHBIT +155: retp(255);923case HIGHBIT +255: retp(269);924case HIGHBIT +269: retp(359);925case HIGHBIT +359: retp(435);926case HIGHBIT +435: retp(449);927case HIGHBIT +449: retp(453);928case HIGHBIT +453: retp(485);929case HIGHBIT +485: retp(491);930case HIGHBIT +491: retp(543);931case HIGHBIT +543: retp(585);932case HIGHBIT +585: retp(599);933case HIGHBIT +599: retp(753);934case HIGHBIT +753: retp(849);935case HIGHBIT +849: retp(879);936case HIGHBIT +879: retp(885);937case HIGHBIT +885: retp(903);938case HIGHBIT +903: retp(995);939#undef retp940}941#endif942T->p = unextprime(T->p + 1);943}944else do {945T->p += T->q;946if (T->p < T->q || T->p > T->b) { T->p = 0; break; } /* overflow */947} while (!uisprime(T->p));948if (T->p && T->p <= T->b) return T->p;949/* overflow ulong, switch to GEN */950T->strategy = PRST_nextprime;951}952return 0; /* overflow */953}954955GEN956forprime_next(forprime_t *T)957{958pari_sp av;959GEN p;960if (T->strategy != PRST_nextprime)961{962ulong u = u_forprime_next(T);963if (u) { affui(u, T->pp); return T->pp; }964/* failure */965if (T->strategy != PRST_nextprime) return NULL; /* we're done */966/* overflow ulong, switch to GEN */967u = ULONG_MAX;968if (T->q > 1) u -= (ULONG_MAX-T->c) % T->q;969affui(u, T->pp);970}971av = avma; p = T->pp;972if (T->q == 1)973{974p = nextprime(addiu(p, 1));975if (T->bb && abscmpii(p, T->bb) > 0) return gc_NULL(av);976} else do {977p = addiu(p, T->q);978if (T->bb && abscmpii(p, T->bb) > 0) return gc_NULL(av);979} while (!BPSW_psp(p));980affii(p, T->pp); return gc_const(av, T->pp);981}982983void984forprimestep(GEN a, GEN b, GEN q, GEN code)985{986pari_sp av = avma;987forprime_t T;988989if (!forprimestep_init(&T, a,b,q)) { set_avma(av); return; }990991push_lex(T.pp,code);992while(forprime_next(&T))993{994closure_evalvoid(code); if (loop_break()) break;995/* p changed in 'code', complain */996if (get_lex(-1) != T.pp)997pari_err(e_MISC,"prime index read-only: was changed to %Ps", get_lex(-1));998}999pop_lex(1); set_avma(av);1000}1001void1002forprime(GEN a, GEN b, GEN code) { return forprimestep(a,b,NULL,code); }10031004int1005forcomposite_init(forcomposite_t *C, GEN a, GEN b)1006{1007pari_sp av = avma;1008a = gceil(a);1009if (typ(a)!=t_INT) pari_err_TYPE("forcomposite",a);1010if (b) {1011if (typ(b) == t_INFINITY) b = NULL;1012else1013{1014b = gfloor(b);1015if (typ(b)!=t_INT) pari_err_TYPE("forcomposite",b);1016}1017}1018if (signe(a) < 0) pari_err_DOMAIN("forcomposite", "a", "<", gen_0, a);1019if (abscmpiu(a, 4) < 0) a = utoipos(4);1020C->first = 1;1021if (!forprime_init(&C->T, a,b) && cmpii(a,b) > 0)1022{1023C->n = gen_1; /* in case caller forgets to check the return value */1024C->b = gen_0; return gc_bool(av,0);1025}1026C->n = setloop(a);1027C->b = b;1028C->p = NULL; return 1;1029}10301031GEN1032forcomposite_next(forcomposite_t *C)1033{1034if (C->first) /* first call ever */1035{1036C->first = 0;1037C->p = forprime_next(&C->T);1038}1039else1040C->n = incloop(C->n);1041if (C->p)1042{1043if (cmpii(C->n, C->p) < 0) return C->n;1044C->n = incloop(C->n);1045/* n = p+1 */1046C->p = forprime_next(&C->T); /* nextprime(p) > n */1047if (C->p) return C->n;1048}1049if (!C->b || cmpii(C->n, C->b) <= 0) return C->n;1050return NULL;1051}10521053void1054forcomposite(GEN a, GEN b, GEN code)1055{1056pari_sp av = avma;1057forcomposite_t T;1058GEN n;1059if (!forcomposite_init(&T,a,b)) return;1060push_lex(T.n,code);1061while((n = forcomposite_next(&T)))1062{1063closure_evalvoid(code); if (loop_break()) break;1064/* n changed in 'code', complain */1065if (get_lex(-1) != n)1066pari_err(e_MISC,"index read-only: was changed to %Ps", get_lex(-1));1067}1068pop_lex(1); set_avma(av);1069}107010711072