Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download

Testing latest pari + WASM + node.js... and it works?! Wow.

28485 views
License: GPL3
ubuntu2004
1
/* - debug support */
2
3
#ifdef MPQS_DEBUG_VERBOSE
4
# ifndef MPQS_DEBUG
5
# define MPQS_DEBUG
6
# endif
7
# define PRINT_IF_VERBOSE(x) err_printf(x)
8
#else
9
# define PRINT_IF_VERBOSE(x)
10
#endif
11
12
#ifdef MPQS_DEBUG
13
# define MPQS_DEBUGLEVEL 1000 /* infinity */
14
#else
15
# define MPQS_DEBUGLEVEL DEBUGLEVEL
16
#endif
17
18
/* - non-configurable sizing parameters */
19
20
/* 'large primes' must be smaller than min(MPQS_LP_BOUND, largest_FB_p) */
21
#define MPQS_LP_BOUND 12500000 /* works for 32 and 64bit */
22
23
/* see mpqs_locate_A_range() for an explanation of the following. I had
24
* some good results with about -log2(0.85) but in the range I was testing,
25
* this shifts the primes for A only by one position in the FB. Don't go
26
* over the top with this one... */
27
#define MPQS_A_FUDGE 0.15 /* ~ -log2(0.9) */
28
29
#define MPQS_CANDIDATE_ARRAY_SIZE 2000 /* max. this many cand's per poly */
30
31
/* - structures, types, and constants */
32
33
/* -- reasonably-sized integers */
34
#ifdef LONG_IS_64BIT
35
typedef int mpqs_int32_t;
36
typedef unsigned int mpqs_uint32_t;
37
#else
38
typedef long mpqs_int32_t;
39
typedef ulong mpqs_uint32_t;
40
#endif
41
42
/* -- factor base entries should occupy 32 bytes (and we'll keep them
43
* aligned, for good L1 cache hit rates). Some of the entries will be
44
* abused for e.g. -1 and (factors of) k instead for real factor base
45
* primes, and for a sentinel at the end. This is why __p is a signed
46
* field.-- The two start fields depend on the current polynomial and
47
* keep changing during sieving, the flags will also change depending on
48
* the current A. */
49
/* Let (z1, z2) be the roots of Q(x) = A x^2 + Bx + C mod p_i; then
50
* Q(z1 + p_i Z) == 0 mod p_i and Q(z2 + p_i Z) == 0 mod p_i;
51
* start_1, start_2 are the positions where p_i divides Q(x) for the
52
* first time, already adjusted for the fact that the sieving array,
53
* nominally [-M, M], is represented by a 0-based C array of length
54
* 2M + 1. For the prime factors of A and those of k, the two roots
55
* are equal mod p_i. */
56
57
#define MPQS_FB_ENTRY_PAD 32
58
59
typedef union mpqs_FB_entry {
60
char __pad[MPQS_FB_ENTRY_PAD];
61
struct {
62
/* the prime p, the two arith. prog. mod p, sqrt(kN) mod p */
63
mpqs_int32_t __p, __start1, __start2, __sqrt_kN;
64
float __flogp; /* log(p) as a 4-byte float */
65
unsigned char __val; /* 8-bit approx. scaled log for sieving */
66
unsigned char __flags;
67
} __entry;
68
} mpqs_FB_entry_t;
69
70
/* --- convenience accessor macros for the preceding: */
71
#define fbe_p __entry.__p
72
#define fbe_flogp __entry.__flogp
73
#define fbe_start1 __entry.__start1
74
#define fbe_start2 __entry.__start2
75
#define fbe_sqrt_kN __entry.__sqrt_kN
76
#define fbe_logval __entry.__val
77
#define fbe_flags __entry.__flags
78
79
/* --- flag bits for fbe_flags: */
80
#define MPQS_FBE_CLEAR 0x0 /* no flags */
81
82
/* following used for odd FB primes, and applies to the divisors of A but not
83
* those of k. Must occupy the rightmost bit because we also use it as a
84
* shift count after extracting it from the byte. */
85
#define MPQS_FBE_DIVIDES_A 0x1ul /* and Q(x) mod p only has degree 1 */
86
87
/* TODO (tentative): one bit to mark normal FB primes,
88
* one to mark the factors of k,
89
* one to mark primes used in sieving,
90
* later maybe one to mark primes of which we'll be tracking the square,
91
* one to mark primes currently in use for A;
92
* once we segment the FB, one bit marking the members of the first segment */
93
94
/* -- multiplier k and attached quantities */
95
typedef struct mpqs_multiplier {
96
mpqs_uint32_t k; /* the multiplier (odd, squarefree) */
97
mpqs_uint32_t omega_k; /* number (0, 1 or 2) of primes dividing k */
98
mpqs_uint32_t kp[2]; /* prime factors of k, if any */
99
} mpqs_multiplier_t;
100
101
#define MPQS_POSSIBLE_MULTIPLIERS 15 /* how many values for k we'll try */
102
/* following must be in range of the cand_multipliers table below */
103
104
static const mpqs_multiplier_t cand_multipliers[] = {
105
{ 1, 0, { 0, 0}},
106
{ 3, 1, { 3, 0}},
107
{ 5, 1, { 5, 0}},
108
{ 7, 1, { 7, 0}},
109
{ 11, 1, {11, 0}},
110
{ 13, 1, {13, 0}},
111
{ 15, 2, { 3, 5}},
112
{ 17, 1, {17, 0}},
113
{ 19, 1, {19, 0}},
114
{ 21, 2, { 3, 7}},
115
{ 23, 1, {23, 0}},
116
{ 29, 1, {29, 0}},
117
{ 31, 1, {31, 0}},
118
{ 33, 2, { 3, 11}},
119
{ 35, 2, { 5, 7}},
120
{ 37, 1, {37, 0}},
121
{ 39, 2, { 3, 13}},
122
{ 41, 1, {41, 0}},
123
{ 43, 1, {43, 0}},
124
{ 47, 1, {47, 0}},
125
{ 51, 2, { 3, 17}},
126
{ 53, 1, {53, 0}},
127
{ 55, 2, { 5, 11}},
128
{ 57, 2, { 3, 19}},
129
{ 59, 1, {59, 0}},
130
{ 61, 1, {61, 0}},
131
{ 65, 2, { 5, 13}},
132
{ 67, 1, {67, 0}},
133
{ 69, 2, { 3, 23}},
134
{ 71, 1, {71, 0}},
135
{ 73, 1, {73, 0}},
136
{ 77, 2, { 7, 11}},
137
{ 79, 1, {79, 0}},
138
{ 83, 1, {83, 0}},
139
{ 85, 2, { 5, 17}},
140
{ 87, 2, { 3, 29}},
141
{ 89, 1, {89, 0}},
142
{ 91, 2, { 7, 13}},
143
{ 93, 2, { 3, 31}},
144
{ 95, 2, { 5, 19}},
145
{ 97, 1, {97, 0}}
146
};
147
148
/* -- the array of (Chinese remainder) idempotents which add/subtract up to
149
* the middle coefficient B, and for convenience, the FB subscripts of the
150
* primes in current use for A. We keep these together since both arrays
151
* are of the same size and are used at the same times. */
152
typedef struct mqps_per_A_prime {
153
GEN _H; /* summand for B */
154
mpqs_int32_t _i; /* subscript into FB */
155
} mpqs_per_A_prime_t;
156
157
/* following cooperate with names of local variables in the self_init fcns.
158
* per_A_pr must exist and be an alias for the eponymous handle pointer for
159
* all of these, and FB must exist and correspond to the handle FB pointer
160
* for all but the first two of them. */
161
#define MPQS_H(i) (per_A_pr[i]._H)
162
#define MPQS_I(i) (per_A_pr[i]._i)
163
#define MPQS_AP(i) (FB[MPQS_I(i)].fbe_p)
164
#define MPQS_LP(i) (FB[MPQS_I(i)].fbe_flogp)
165
#define MPQS_SQRT(i) (FB[MPQS_I(i)].fbe_sqrt_kN)
166
#define MPQS_FLG(i) (FB[MPQS_I(i)].fbe_flags)
167
168
/* -- the array of addends / subtrahends for changing polynomials during
169
* self-initialization: (1/A) H[i] mod p_j, with i subscripting the inner
170
* array in each entry, and j choosing the entry in an outer array.
171
* Entries will occupy 64 bytes each no matter what (which imposes at most 17
172
* prime factors for A; thus i will range from 0 to at most 15.) This wastes a
173
* little memory for smaller N but makes it easier for compilers to generate
174
* efficient code. */
175
176
/* NOTE: At present, memory locality vis-a-vis accesses to this array is good
177
* in the slow (new A) branch of mpqs_self_init(), but poor in the fast
178
* (same A, new B) branch, which now loops over the outer array index,
179
* reading just one field of each inner array each time through the FB
180
* loop. This doesn't really harm, but will improve one day when we do
181
* segmented sieve arrays with the attached segmented FB-range accesses. */
182
#define MPQS_MAX_OMEGA_A 17
183
typedef struct mpqs_inv_A_H {
184
mpqs_uint32_t _i[MPQS_MAX_OMEGA_A - 1];
185
} mpqs_inv_A_H_t;
186
187
#define MPQS_INV_A_H(i,j) (inv_A_H[j]._i[i])
188
189
/* -- global handle to keep track of everything used through one factorization
190
* attempt. The order of the fields is determined by keeping most frequently
191
* used stuff near the beginning. */
192
typedef struct mpqs_handle {
193
/* pointers */
194
unsigned char *sieve_array;/* 0-based, representing [-M,M-1] */
195
unsigned char *sieve_array_end; /* points at sieve_array[M-1] */
196
mpqs_FB_entry_t *FB; /* (aligned) FB array itself */
197
long *candidates; /* collects promising sieve subscripts */
198
long *relaprimes; /* prime/exponent pairs in a relation */
199
mpqs_inv_A_H_t *inv_A_H; /* self-init: (aligned) stepping array, and */
200
mpqs_per_A_prime_t *per_A_pr; /* FB subscripts of primes in A etc. */
201
202
/* other stuff that's being used all the time */
203
mpqs_int32_t M; /* sieving over |x| <= M */
204
mpqs_int32_t size_of_FB; /* # primes in FB (or dividing k) */
205
/* the following three are in non-descending order, and the first two must
206
* be adjusted for omega_k at the beginning */
207
mpqs_int32_t index0_FB; /* lowest subscript into FB of a "real" prime
208
* (i.e. other than -1, 2, factors of k) */
209
mpqs_int32_t index1_FB; /* lowest subscript into FB for sieving */
210
mpqs_int32_t index2_FB; /* primes for A are chosen relative to this */
211
unsigned char index2_moved;/* true when we're starved for small A's */
212
unsigned char sieve_threshold; /* distinguishes candidates in sieve */
213
GEN N, kN; /* number to be factored, with multiplier */
214
GEN A, B; /* leading, middle coefficient */
215
mpqs_int32_t omega_A; /* number of primes going into each A */
216
mpqs_int32_t no_B; /* number of B's for each A: 2^(omega_A-1) */
217
double l2_target_A; /* ~log2 of desired typical A */
218
/* counters and bit pattern determining and numbering current polynomial: */
219
mpqs_uint32_t bin_index; /* bit pattern for selecting primes for A */
220
mpqs_uint32_t index_i; /* running count of A's */
221
mpqs_uint32_t index_j; /* B's ordinal number in A's cohort */
222
223
/* further sizing parameters: */
224
mpqs_int32_t target_rels; /* target number of full relations */
225
mpqs_int32_t largest_FB_p; /* largest prime in the FB */
226
mpqs_int32_t pmin_index1; /* lower bound for primes used for sieving */
227
mpqs_int32_t lp_scale; /* factor by which LPs may exceed FB primes */
228
229
/* subscripts determining where to pick primes for A */
230
/* FIXME: lp_bound might have to be mpqs_int64_t ? */
231
long lp_bound; /* cutoff for Large Primes */
232
long digit_size_kN;
233
const mpqs_multiplier_t *_k; /* multiplier k and attached quantities */
234
double tolerance; /* controls the tightness of the sieve */
235
double dkN; /* - double prec. approximation of kN */
236
double l2sqrtkN; /* ~log2(sqrt(kN)) */
237
double l2M; /* ~log2(M) (cf. below) */
238
/* TODO: need an index2_FB here to remember where to start picking primes */
239
/* bookkeeping pointers to containers of aligned memory chunks: */
240
void *FB_chunk; /* (unaligned) chunk containing the FB */
241
void *invAH_chunk; /* (unaligned) chunk for self-init array */
242
} mpqs_handle_t;
243
244
/* -- sizing table entries */
245
246
/* For "tolerance", see mpqs_set_sieve_threshold(). The LP scale, for very
247
* large kN, prevents us from accumulating vast amounts of LP relations with
248
* little chance of hitting any particular large prime a second time and being
249
* able to combine a full relation from two LP ones; however, the sieve
250
* threshold (determined by the tolerance) already works against very large LPs
251
* being produced. The present relations "database" can detect duplicate full
252
* relations only during the sort/combine phases, so we must do some sort
253
* points even for tiny kN where we do not admit large primes at all.
254
* Some constraints imposed by the present implementation:
255
* + omega_A should be at least 3, and no more than MPQS_MAX_OMEGA_A
256
* + The size of the FB must be large enough compared to omega_A
257
* (about 2*omega_A + 3, but this is always true below) */
258
/* XXX Changes needed for segmented mode:
259
* XXX When using it (kN large enough),
260
* XXX - M must become a multiple of the (cache block) segment size
261
* XXX (or to keep things simple: a multiple of 32K)
262
* XXX - we need index3_FB to seperate (smaller) primes used for normal
263
* XXX sieving from larger ones used with transaction buffers
264
* XXX (and the locate_A_range and attached logic must be changed to
265
* XXX cap index2_FB below index3_FB instead of below size_of_FB)
266
*/
267
typedef struct mpqs_parameterset {
268
float tolerance; /* "mesh width" of the sieve */
269
mpqs_int32_t lp_scale; /* factor by which LPs may exceed FB primes */
270
mpqs_int32_t M; /* size of half the sieving interval */
271
mpqs_int32_t size_of_FB; /* #primes to use for FB (including 2) */
272
mpqs_int32_t omega_A; /* #primes to go into each A */
273
/* Following is auto-adjusted to account for prime factors of k inserted
274
* near the start of the FB. NB never ever sieve on the prime 2,which would
275
* just contribute a constant at each sieve point. */
276
mpqs_int32_t pmin_index1; /* lower bound for primes used for sieving */
277
} mpqs_parameterset_t;
278
279
/* - the table of sizing parameters itself */
280
281
/* indexed by size of kN in decimal digits, subscript 0 corresponding to
282
* 9 (or fewer) digits */
283
static const mpqs_parameterset_t mpqs_parameters[] =
284
{ /* tol lp_scl M szFB oA pmx1 */
285
{ /*9*/ 0.8, 1, 350, 19, 3, 5},
286
{ /*10*/ 0.8, 1, 300, 23, 3, 5},
287
{ /*11*/ 0.8, 1, 1000, 27, 3, 5},
288
{ /*12*/ 0.8, 1, 1100, 27, 3, 5},
289
{ /*13*/ 0.8, 1, 1400, 31, 3, 5},
290
{ /*14*/ 0.8, 1, 2200, 33, 3, 5},
291
{ /*15*/ 0.8, 1, 2300, 39, 3, 5},
292
{ /*16*/ 0.8, 1, 2900, 43, 3, 5},
293
{ /*17*/ 0.8, 1, 3200, 51, 3, 5},
294
{ /*18*/ 0.8, 1, 2800, 55, 3, 5},
295
{ /*19*/ 0.8, 1, 3400, 65, 3, 5},
296
{ /*20*/ 0.8, 1, 3400, 71, 3, 5},
297
{ /*21*/ 0.8, 1, 5400, 90, 3, 5},
298
{ /*22*/ 0.8, 1, 5700, 95, 3, 5},
299
{ /*23*/ 0.8, 1, 5700, 110, 3, 5},
300
{ /*24*/ 0.8, 1, 6000, 130, 4, 7},
301
{ /*25*/ 0.8, 1, 6500, 140, 4, 7},
302
{ /*26*/ 0.9, 1, 9000, 160, 4, 7},
303
{ /*27*/ 1.12, 1, 10000, 160, 4, 7},
304
{ /*28*/ 1.17, 1, 13000, 180, 4, 11},
305
{ /*29*/ 1.22, 1, 14000, 220, 4, 11},
306
{ /*30*/ 1.30, 1, 13000, 240, 4, 11},
307
{ /*31*/ 1.33, 1, 11000, 240, 4, 13},
308
{ /*32*/ 1.36, 1, 14000, 300, 5, 13},
309
{ /*33*/ 1.40, 1, 15000, 340, 5, 13},
310
{ /*34*/ 1.43, 1, 15000, 380, 5, 17},
311
{ /*35*/ 1.48, 30, 15000, 380, 5, 17},
312
{ /*36*/ 1.53, 45, 16000, 440, 5, 17},
313
{ /*37*/ 1.60, 60, 15000, 420, 6, 19},
314
{ /*38*/ 1.66, 70, 15000, 520, 6, 19},
315
{ /*39*/ 1.69, 80, 16000, 540, 6, 23},
316
/* around here, the largest prime in FB becomes comparable to M in size */
317
{ /*40*/ 1.69, 80, 16000, 600, 6, 23},
318
{ /*41*/ 1.69, 80, 16000, 700, 6, 23},
319
{ /*42*/ 1.69, 80, 24000, 900, 6, 29},
320
{ /*43*/ 1.69, 80, 26000, 1000, 6, 29},
321
{ /*44*/ 1.69, 80, 18000, 1100, 7, 31},
322
{ /*45*/ 1.69, 80, 20000, 1200, 7, 31},
323
{ /*46*/ 1.69, 80, 22000, 1300, 7, 37},
324
{ /*47*/ 1.69, 80, 24000, 1400, 7, 37},
325
{ /*48*/ 1.69, 80, 24000, 1600, 7, 37},
326
{ /*49*/ 1.72, 80, 28000, 1900, 7, 41},
327
{ /*50*/ 1.75, 80, 36000, 2100, 7, 41},
328
{ /*51*/ 1.80, 80, 32000, 2100, 7, 43},
329
{ /*52*/ 1.85, 80, 44000, 2300, 7, 43},
330
{ /*53*/ 1.90, 80, 44000, 2600, 7, 47},
331
{ /*54*/ 1.95, 80, 40000, 2700, 7, 47},
332
{ /*55*/ 1.95, 80, 48000, 3200, 7, 53},
333
{ /*56*/ 1.95, 80, 56000, 3400, 7, 53},
334
{ /*57*/ 2.00, 80, 40000, 3000, 8, 53},
335
{ /*58*/ 2.05, 80, 64000, 3400, 8, 59},
336
{ /*59*/ 2.10, 80, 64000, 3800, 8, 59},
337
{ /*60*/ 2.15, 80, 80000, 4300, 8, 61},
338
{ /*61*/ 2.20, 80, 80000, 4800, 8, 61},
339
{ /*62*/ 2.25, 80, 80000, 4600, 8, 67},
340
{ /*63*/ 2.39, 80, 80000, 4800, 8, 67},
341
{ /*64*/ 2.30, 80, 88000, 5400, 8, 67},
342
{ /*65*/ 2.31, 80, 120000, 6600, 8, 71},
343
{ /*66*/ 2.32, 80, 120000, 6800, 8, 71},
344
{ /*67*/ 2.33, 80, 144000, 7600, 8, 73},
345
{ /*68*/ 2.34, 80, 144000, 9000, 8, 73},
346
{ /*69*/ 2.35, 80, 160000, 9500, 8, 79},
347
{ /*70*/ 2.36, 80, 176000, 10500, 8, 79},
348
{ /*71*/ 2.37, 80, 240000, 11000, 9, 79},
349
{ /*72*/ 2.38, 80, 240000, 12500, 9, 83},
350
{ /*73*/ 2.41, 80, 240000, 13000, 9, 83},
351
{ /*74*/ 2.46, 80, 256000, 13250, 9, 83},
352
{ /*75*/ 2.51, 80, 256000, 14500, 9, 89},
353
{ /*76*/ 2.56, 80, 256000, 15250, 9, 89},
354
{ /*77*/ 2.58, 80, 320000, 17000, 9, 89},
355
{ /*78*/ 2.60, 80, 320000, 18000, 9, 89},
356
{ /*79*/ 2.63, 80, 320000, 19500, 9, 97},
357
{ /*80*/ 2.65, 80, 448000, 21000, 9, 97},
358
{ /*81*/ 2.72, 80, 448000, 22000, 9, 97},
359
{ /*82*/ 2.77, 80, 448000, 24000, 9, 101},
360
{ /*83*/ 2.82, 80, 480000, 23000, 10, 101},
361
{ /*84*/ 2.84, 80, 480000, 24000, 10, 103},
362
{ /*85*/ 2.86, 80, 512000, 28000, 10, 103},
363
{ /*86*/ 2.88, 80, 448000, 29000, 10, 107},
364
/* architectures with 1MBy L2 cache will become noticeably slower here
365
* as 2*M exceeds that mark - to be addressed in a future version by
366
* segmenting the sieve interval */
367
{ /*87*/ 2.90, 80, 512000, 32000, 10, 107},
368
{ /*88*/ 2.91, 80, 512000, 35000, 10, 109},
369
{ /*89*/ 2.92, 80, 512000, 38000, 10, 109},
370
{ /*90*/ 2.93, 80, 512000, 40000, 10, 113},
371
{ /*91*/ 2.94, 80, 770000, 32200, 10, 113},
372
/* entries below due to Thomas Denny, never tested */
373
{ /*92*/ 3.6, 90, 2000000, 35000, 9, 113},
374
{ /*93*/ 3.7, 90, 2000000, 37000, 9, 113},
375
{ /*94*/ 3.7, 90, 2000000, 39500, 9, 127},
376
{ /*95*/ 3.7, 90, 2500000, 41500, 9, 127},
377
{ /*96*/ 3.8, 90, 2500000, 45000, 10, 127},
378
{ /*97*/ 3.8, 90, 2500000, 47500, 10, 131},
379
{ /*98*/ 3.7, 90, 3000000, 51000, 10, 131},
380
{ /*99*/ 3.8, 90, 3000000, 53000, 10, 133},
381
{/*100*/ 3.8, 90, 875000, 50000, 10, 133},
382
{/*101*/ 3.8, 90, 3500000, 54000, 10, 139},
383
{/*102*/ 3.8, 90, 3500000, 57000, 10, 139},
384
{/*103*/ 3.9, 90, 4000000, 61000, 10, 139},
385
{/*104*/ 3.9, 90, 4000000, 66000, 10, 149},
386
{/*105*/ 3.9, 90, 4000000, 70000, 10, 149},
387
{/*106*/ 3.9, 90, 4000000, 75000, 10, 151},
388
{/*107*/ 3.9, 90, 4000000, 80000, 10, 151},
389
};
390
391
#define MPQS_MAX_DIGIT_SIZE_KN 107
392
393