CoCalc -- mpqs.h

Testing latest pari + WASM + node.js... and it works?! Wow.
pari / 2021-07-01 / pari-2.14.0.alpha / src / modules / mpqs.h
²⁸⁴⁸⁵ views
License: GPL3
ubuntu2004
1
/* - debug support */
2

3
#ifdef MPQS_DEBUG_VERBOSE
4
#  ifndef MPQS_DEBUG
5
#  define MPQS_DEBUG
6
#  endif
7
#  define PRINT_IF_VERBOSE(x) err_printf(x)
8
#else
9
#  define PRINT_IF_VERBOSE(x)
10
#endif
11

12
#ifdef MPQS_DEBUG
13
#  define MPQS_DEBUGLEVEL 1000  /* infinity */
14
#else
15
#  define MPQS_DEBUGLEVEL DEBUGLEVEL
16
#endif
17

18
/* - non-configurable sizing parameters */
19

20
/* 'large primes' must be smaller than min(MPQS_LP_BOUND, largest_FB_p) */
21
#define MPQS_LP_BOUND              12500000 /* works for 32 and 64bit */
22

23
/* see mpqs_locate_A_range() for an explanation of the following.  I had
24
 * some good results with about -log2(0.85) but in the range I was testing,
25
 * this shifts the primes for A only by one position in the FB.  Don't go
26
 * over the top with this one... */
27
#define MPQS_A_FUDGE               0.15 /* ~ -log2(0.9) */
28

29
#define MPQS_CANDIDATE_ARRAY_SIZE  2000 /* max. this many cand's per poly */
30

31
/* - structures, types, and constants */
32

33
/* -- reasonably-sized integers */
34
#ifdef LONG_IS_64BIT
35
typedef int  mpqs_int32_t;
36
typedef unsigned int  mpqs_uint32_t;
37
#else
38
typedef long mpqs_int32_t;
39
typedef ulong mpqs_uint32_t;
40
#endif
41

42
/* -- factor base entries should occupy 32 bytes  (and we'll keep them
43
 * aligned, for good L1 cache hit rates).  Some of the entries will be
44
 * abused for e.g. -1 and (factors of) k instead for real factor base
45
 * primes, and for a sentinel at the end.  This is why __p is a signed
46
 * field.-- The two start fields depend on the current polynomial and
47
 * keep changing during sieving, the flags will also change depending on
48
 * the current A. */
49
/* Let (z1, z2) be the roots of Q(x) = A x^2 + Bx + C mod p_i; then
50
 * Q(z1 + p_i Z) == 0 mod p_i and Q(z2 + p_i Z) == 0 mod p_i;
51
 * start_1, start_2 are the positions where p_i divides Q(x) for the
52
 * first time, already adjusted for the fact that the sieving array,
53
 * nominally [-M, M], is represented by a 0-based C array of length
54
 * 2M + 1.  For the prime factors of A and those of k, the two roots
55
 * are equal mod p_i. */
56

57
#define MPQS_FB_ENTRY_PAD 32
58

59
typedef union mpqs_FB_entry {
60
  char __pad[MPQS_FB_ENTRY_PAD];
61
  struct {
62
    /* the prime p, the two arith. prog. mod p, sqrt(kN) mod p */
63
    mpqs_int32_t __p, __start1, __start2, __sqrt_kN;
64
    float __flogp; /* log(p) as a 4-byte float */
65
    unsigned char __val; /* 8-bit approx. scaled log for sieving */
66
    unsigned char __flags;
67
  } __entry;
68
} mpqs_FB_entry_t;
69

70
/* --- convenience accessor macros for the preceding: */
71
#define fbe_p           __entry.__p
72
#define fbe_flogp       __entry.__flogp
73
#define fbe_start1      __entry.__start1
74
#define fbe_start2      __entry.__start2
75
#define fbe_sqrt_kN     __entry.__sqrt_kN
76
#define fbe_logval      __entry.__val
77
#define fbe_flags       __entry.__flags
78

79
/* --- flag bits for fbe_flags: */
80
#define MPQS_FBE_CLEAR       0x0 /* no flags */
81

82
/* following used for odd FB primes, and applies to the divisors of A but not
83
 * those of k.  Must occupy the rightmost bit because we also use it as a
84
 * shift count after extracting it from the byte. */
85
#define MPQS_FBE_DIVIDES_A   0x1ul /* and Q(x) mod p only has degree 1 */
86

87
/* TODO (tentative): one bit to mark normal FB primes,
88
 * one to mark the factors of k,
89
 * one to mark primes used in sieving,
90
 * later maybe one to mark primes of which we'll be tracking the square,
91
 * one to mark primes currently in use for A;
92
 * once we segment the FB, one bit marking the members of the first segment */
93

94
/* -- multiplier k and attached quantities */
95
typedef struct mpqs_multiplier {
96
  mpqs_uint32_t k;       /* the multiplier (odd, squarefree) */
97
  mpqs_uint32_t omega_k; /* number (0, 1 or 2) of primes dividing k */
98
  mpqs_uint32_t kp[2]; /* prime factors of k, if any */
99
} mpqs_multiplier_t;
100

101
#define MPQS_POSSIBLE_MULTIPLIERS  15 /* how many values for k we'll try */
102
/* following must be in range of the cand_multipliers table below */
103

104
static const mpqs_multiplier_t cand_multipliers[] = {
105
  {  1, 0, { 0,  0}},
106
  {  3, 1, { 3,  0}},
107
  {  5, 1, { 5,  0}},
108
  {  7, 1, { 7,  0}},
109
  { 11, 1, {11,  0}},
110
  { 13, 1, {13,  0}},
111
  { 15, 2, { 3,  5}},
112
  { 17, 1, {17,  0}},
113
  { 19, 1, {19,  0}},
114
  { 21, 2, { 3,  7}},
115
  { 23, 1, {23,  0}},
116
  { 29, 1, {29,  0}},
117
  { 31, 1, {31,  0}},
118
  { 33, 2, { 3, 11}},
119
  { 35, 2, { 5,  7}},
120
  { 37, 1, {37,  0}},
121
  { 39, 2, { 3, 13}},
122
  { 41, 1, {41,  0}},
123
  { 43, 1, {43,  0}},
124
  { 47, 1, {47,  0}},
125
  { 51, 2, { 3, 17}},
126
  { 53, 1, {53,  0}},
127
  { 55, 2, { 5, 11}},
128
  { 57, 2, { 3, 19}},
129
  { 59, 1, {59,  0}},
130
  { 61, 1, {61,  0}},
131
  { 65, 2, { 5, 13}},
132
  { 67, 1, {67,  0}},
133
  { 69, 2, { 3, 23}},
134
  { 71, 1, {71,  0}},
135
  { 73, 1, {73,  0}},
136
  { 77, 2, { 7, 11}},
137
  { 79, 1, {79,  0}},
138
  { 83, 1, {83,  0}},
139
  { 85, 2, { 5, 17}},
140
  { 87, 2, { 3, 29}},
141
  { 89, 1, {89,  0}},
142
  { 91, 2, { 7, 13}},
143
  { 93, 2, { 3, 31}},
144
  { 95, 2, { 5, 19}},
145
  { 97, 1, {97,  0}}
146
};
147

148
/* -- the array of (Chinese remainder) idempotents which add/subtract up to
149
 * the middle coefficient B, and for convenience, the FB subscripts of the
150
 * primes in current use for A.  We keep these together since both arrays
151
 * are of the same size and are used at the same times. */
152
typedef struct mqps_per_A_prime {
153
  GEN _H;          /* summand for B */
154
  mpqs_int32_t _i; /* subscript into FB */
155
} mpqs_per_A_prime_t;
156

157
/* following cooperate with names of local variables in the self_init fcns.
158
 * per_A_pr must exist and be an alias for the eponymous handle pointer for
159
 * all of these, and FB must exist and correspond to the handle FB pointer
160
 * for all but the first two of them. */
161
#define MPQS_H(i) (per_A_pr[i]._H)
162
#define MPQS_I(i) (per_A_pr[i]._i)
163
#define MPQS_AP(i) (FB[MPQS_I(i)].fbe_p)
164
#define MPQS_LP(i) (FB[MPQS_I(i)].fbe_flogp)
165
#define MPQS_SQRT(i) (FB[MPQS_I(i)].fbe_sqrt_kN)
166
#define MPQS_FLG(i) (FB[MPQS_I(i)].fbe_flags)
167

168
/* -- the array of addends / subtrahends for changing polynomials during
169
 * self-initialization: (1/A) H[i] mod p_j, with i subscripting the inner
170
 * array in each entry, and j choosing the entry in an outer array.
171
 * Entries will occupy 64 bytes each no matter what (which imposes at most 17
172
 * prime factors for A; thus i will range from 0 to at most 15.) This wastes a
173
 * little memory for smaller N but makes it easier for compilers to generate
174
 * efficient code. */
175

176
/* NOTE: At present, memory locality vis-a-vis accesses to this array is good
177
 * in the slow (new A) branch of mpqs_self_init(), but poor in the fast
178
 * (same A, new B) branch, which now loops over the outer array index,
179
 * reading just one field of each inner array each time through the FB
180
 * loop.  This doesn't really harm, but will improve one day when we do
181
 * segmented sieve arrays with the attached segmented FB-range accesses. */
182
#define MPQS_MAX_OMEGA_A 17
183
typedef struct mpqs_inv_A_H {
184
  mpqs_uint32_t _i[MPQS_MAX_OMEGA_A - 1];
185
} mpqs_inv_A_H_t;
186

187
#define MPQS_INV_A_H(i,j) (inv_A_H[j]._i[i])
188

189
/* -- global handle to keep track of everything used through one factorization
190
 * attempt. The order of the fields is determined by keeping most frequently
191
 * used stuff near the beginning. */
192
typedef struct mpqs_handle {
193
  /* pointers */
194
  unsigned char *sieve_array;/* 0-based, representing [-M,M-1] */
195
  unsigned char *sieve_array_end; /* points at sieve_array[M-1] */
196
  mpqs_FB_entry_t *FB;       /* (aligned) FB array itself */
197
  long *candidates;          /* collects promising sieve subscripts */
198
  long *relaprimes;          /* prime/exponent pairs in a relation */
199
  mpqs_inv_A_H_t *inv_A_H;   /* self-init: (aligned) stepping array, and */
200
  mpqs_per_A_prime_t *per_A_pr; /* FB subscripts of primes in A etc. */
201

202
  /* other stuff that's being used all the time */
203
  mpqs_int32_t M;            /* sieving over |x| <= M */
204
  mpqs_int32_t size_of_FB;   /* # primes in FB (or dividing k) */
205
  /* the following three are in non-descending order, and the first two must
206
   * be adjusted for omega_k at the beginning */
207
  mpqs_int32_t index0_FB;    /* lowest subscript into FB of a "real" prime
208
                              * (i.e. other than -1, 2, factors of k) */
209
  mpqs_int32_t index1_FB;    /* lowest subscript into FB for sieving */
210
  mpqs_int32_t index2_FB;    /* primes for A are chosen relative to this */
211
  unsigned char index2_moved;/* true when we're starved for small A's */
212
  unsigned char sieve_threshold; /* distinguishes candidates in sieve */
213
  GEN N, kN;                 /* number to be factored, with multiplier */
214
  GEN A, B;                  /* leading, middle coefficient */
215
  mpqs_int32_t omega_A;      /* number of primes going into each A */
216
  mpqs_int32_t no_B;         /* number of B's for each A: 2^(omega_A-1) */
217
  double l2_target_A;        /* ~log2 of desired typical A */
218
  /* counters and bit pattern determining and numbering current polynomial: */
219
  mpqs_uint32_t bin_index;   /* bit pattern for selecting primes for A */
220
  mpqs_uint32_t index_i;     /* running count of A's */
221
  mpqs_uint32_t index_j;     /* B's ordinal number in A's cohort */
222

223
  /* further sizing parameters: */
224
  mpqs_int32_t target_rels;  /* target number of full relations */
225
  mpqs_int32_t largest_FB_p; /* largest prime in the FB */
226
  mpqs_int32_t pmin_index1;  /* lower bound for primes used for sieving */
227
  mpqs_int32_t lp_scale;     /* factor by which LPs may exceed FB primes */
228

229
  /* subscripts determining where to pick primes for A */
230
  /* FIXME: lp_bound might have to be mpqs_int64_t ? */
231
  long lp_bound;             /* cutoff for Large Primes */
232
  long digit_size_kN;
233
  const mpqs_multiplier_t *_k;  /* multiplier k and attached quantities */
234
  double tolerance;          /* controls the tightness of the sieve */
235
  double dkN;                /* - double prec. approximation of kN */
236
  double l2sqrtkN;           /* ~log2(sqrt(kN)) */
237
  double l2M;                /* ~log2(M) (cf. below) */
238
  /* TODO: need an index2_FB here to remember where to start picking primes */
239
  /* bookkeeping pointers to containers of aligned memory chunks: */
240
  void *FB_chunk;            /* (unaligned) chunk containing the FB */
241
  void *invAH_chunk;         /* (unaligned) chunk for self-init array */
242
} mpqs_handle_t;
243

244
/* -- sizing table entries */
245

246
/* For "tolerance", see mpqs_set_sieve_threshold(). The LP scale, for very
247
 * large kN, prevents us from accumulating vast amounts of LP relations with
248
 * little chance of hitting any particular large prime a second time and being
249
 * able to combine a full relation from two LP ones; however, the sieve
250
 * threshold (determined by the tolerance) already works against very large LPs
251
 * being produced. The present relations "database" can detect duplicate full
252
 * relations only during the sort/combine phases, so we must do some sort
253
 * points even for tiny kN where we do not admit large primes at all.
254
 * Some constraints imposed by the present implementation:
255
 * + omega_A should be at least 3, and no more than MPQS_MAX_OMEGA_A
256
 * + The size of the FB must be large enough compared to omega_A
257
 *   (about 2*omega_A + 3, but this is always true below) */
258
/* XXX Changes needed for segmented mode:
259
 * XXX When using it (kN large enough),
260
 * XXX - M must become a multiple of the (cache block) segment size
261
 * XXX   (or to keep things simple: a multiple of 32K)
262
 * XXX - we need index3_FB to seperate (smaller) primes used for normal
263
 * XXX   sieving from larger ones used with transaction buffers
264
 * XXX   (and the locate_A_range and attached logic must be changed to
265
 * XXX   cap index2_FB below index3_FB instead of below size_of_FB)
266
 */
267
typedef struct mpqs_parameterset {
268
  float tolerance;          /* "mesh width" of the sieve */
269
  mpqs_int32_t lp_scale;    /* factor by which LPs may exceed FB primes */
270
  mpqs_int32_t M;           /* size of half the sieving interval */
271
  mpqs_int32_t size_of_FB;  /* #primes to use for FB (including 2) */
272
  mpqs_int32_t omega_A;     /* #primes to go into each A */
273
  /* Following is auto-adjusted to account for prime factors of k inserted
274
   * near the start of the FB. NB never ever sieve on the prime 2,which would
275
   * just contribute a constant at each sieve point. */
276
  mpqs_int32_t pmin_index1; /* lower bound for primes used for sieving */
277
} mpqs_parameterset_t;
278

279
/* - the table of sizing parameters itself */
280

281
/* indexed by size of kN in decimal digits, subscript 0 corresponding to
282
 * 9 (or fewer) digits */
283
static const mpqs_parameterset_t mpqs_parameters[] =
284
{ /*       tol lp_scl     M   szFB  oA pmx1 */
285
  {  /*9*/ 0.8,   1,    350,    19,  3,   5},
286
  { /*10*/ 0.8,   1,    300,    23,  3,   5},
287
  { /*11*/ 0.8,   1,   1000,    27,  3,   5},
288
  { /*12*/ 0.8,   1,   1100,    27,  3,   5},
289
  { /*13*/ 0.8,   1,   1400,    31,  3,   5},
290
  { /*14*/ 0.8,   1,   2200,    33,  3,   5},
291
  { /*15*/ 0.8,   1,   2300,    39,  3,   5},
292
  { /*16*/ 0.8,   1,   2900,    43,  3,   5},
293
  { /*17*/ 0.8,   1,   3200,    51,  3,   5},
294
  { /*18*/ 0.8,   1,   2800,    55,  3,   5},
295
  { /*19*/ 0.8,   1,   3400,    65,  3,   5},
296
  { /*20*/ 0.8,   1,   3400,    71,  3,   5},
297
  { /*21*/ 0.8,   1,   5400,    90,  3,   5},
298
  { /*22*/ 0.8,   1,   5700,    95,  3,   5},
299
  { /*23*/ 0.8,   1,   5700,   110,  3,   5},
300
  { /*24*/ 0.8,   1,   6000,   130,  4,   7},
301
  { /*25*/ 0.8,   1,   6500,   140,  4,   7},
302
  { /*26*/ 0.9,   1,   9000,   160,  4,   7},
303
  { /*27*/ 1.12,  1,  10000,   160,  4,   7},
304
  { /*28*/ 1.17,  1,  13000,   180,  4,  11},
305
  { /*29*/ 1.22,  1,  14000,   220,  4,  11},
306
  { /*30*/ 1.30,  1,  13000,   240,  4,  11},
307
  { /*31*/ 1.33,  1,  11000,   240,  4,  13},
308
  { /*32*/ 1.36,  1,  14000,   300,  5,  13},
309
  { /*33*/ 1.40,  1,  15000,   340,  5,  13},
310
  { /*34*/ 1.43,  1,  15000,   380,  5,  17},
311
  { /*35*/ 1.48, 30,  15000,   380,  5,  17},
312
  { /*36*/ 1.53, 45,  16000,   440,  5,  17},
313
  { /*37*/ 1.60, 60,  15000,   420,  6,  19},
314
  { /*38*/ 1.66, 70,  15000,   520,  6,  19},
315
  { /*39*/ 1.69, 80,  16000,   540,  6,  23},
316
  /* around here, the largest prime in FB becomes comparable to M in size */
317
  { /*40*/ 1.69, 80,  16000,   600,  6,  23},
318
  { /*41*/ 1.69, 80,  16000,   700,  6,  23},
319
  { /*42*/ 1.69, 80,  24000,   900,  6,  29},
320
  { /*43*/ 1.69, 80,  26000,  1000,  6,  29},
321
  { /*44*/ 1.69, 80,  18000,  1100,  7,  31},
322
  { /*45*/ 1.69, 80,  20000,  1200,  7,  31},
323
  { /*46*/ 1.69, 80,  22000,  1300,  7,  37},
324
  { /*47*/ 1.69, 80,  24000,  1400,  7,  37},
325
  { /*48*/ 1.69, 80,  24000,  1600,  7,  37},
326
  { /*49*/ 1.72, 80,  28000,  1900,  7,  41},
327
  { /*50*/ 1.75, 80,  36000,  2100,  7,  41},
328
  { /*51*/ 1.80, 80,  32000,  2100,  7,  43},
329
  { /*52*/ 1.85, 80,  44000,  2300,  7,  43},
330
  { /*53*/ 1.90, 80,  44000,  2600,  7,  47},
331
  { /*54*/ 1.95, 80,  40000,  2700,  7,  47},
332
  { /*55*/ 1.95, 80,  48000,  3200,  7,  53},
333
  { /*56*/ 1.95, 80,  56000,  3400,  7,  53},
334
  { /*57*/ 2.00, 80,  40000,  3000,  8,  53},
335
  { /*58*/ 2.05, 80,  64000,  3400,  8,  59},
336
  { /*59*/ 2.10, 80,  64000,  3800,  8,  59},
337
  { /*60*/ 2.15, 80,  80000,  4300,  8,  61},
338
  { /*61*/ 2.20, 80,  80000,  4800,  8,  61},
339
  { /*62*/ 2.25, 80,  80000,  4600,  8,  67},
340
  { /*63*/ 2.39, 80,  80000,  4800,  8,  67},
341
  { /*64*/ 2.30, 80,  88000,  5400,  8,  67},
342
  { /*65*/ 2.31, 80, 120000,  6600,  8,  71},
343
  { /*66*/ 2.32, 80, 120000,  6800,  8,  71},
344
  { /*67*/ 2.33, 80, 144000,  7600,  8,  73},
345
  { /*68*/ 2.34, 80, 144000,  9000,  8,  73},
346
  { /*69*/ 2.35, 80, 160000,  9500,  8,  79},
347
  { /*70*/ 2.36, 80, 176000, 10500,  8,  79},
348
  { /*71*/ 2.37, 80, 240000, 11000,  9,  79},
349
  { /*72*/ 2.38, 80, 240000, 12500,  9,  83},
350
  { /*73*/ 2.41, 80, 240000, 13000,  9,  83},
351
  { /*74*/ 2.46, 80, 256000, 13250,  9,  83},
352
  { /*75*/ 2.51, 80, 256000, 14500,  9,  89},
353
  { /*76*/ 2.56, 80, 256000, 15250,  9,  89},
354
  { /*77*/ 2.58, 80, 320000, 17000,  9,  89},
355
  { /*78*/ 2.60, 80, 320000, 18000,  9,  89},
356
  { /*79*/ 2.63, 80, 320000, 19500,  9,  97},
357
  { /*80*/ 2.65, 80, 448000, 21000,  9,  97},
358
  { /*81*/ 2.72, 80, 448000, 22000,  9,  97},
359
  { /*82*/ 2.77, 80, 448000, 24000,  9, 101},
360
  { /*83*/ 2.82, 80, 480000, 23000, 10, 101},
361
  { /*84*/ 2.84, 80, 480000, 24000, 10, 103},
362
  { /*85*/ 2.86, 80, 512000, 28000, 10, 103},
363
  { /*86*/ 2.88, 80, 448000, 29000, 10, 107},
364
  /* architectures with 1MBy L2 cache will become noticeably slower here
365
   * as 2*M exceeds that mark - to be addressed in a future version by
366
   * segmenting the sieve interval */
367
  { /*87*/ 2.90, 80, 512000, 32000, 10, 107},
368
  { /*88*/ 2.91, 80, 512000, 35000, 10, 109},
369
  { /*89*/ 2.92, 80, 512000, 38000, 10, 109},
370
  { /*90*/ 2.93, 80, 512000, 40000, 10, 113},
371
  { /*91*/ 2.94, 80, 770000, 32200, 10, 113},
372
  /* entries below due to Thomas Denny, never tested */
373
  { /*92*/ 3.6, 90, 2000000, 35000,  9, 113},
374
  { /*93*/ 3.7, 90, 2000000, 37000,  9, 113},
375
  { /*94*/ 3.7, 90, 2000000, 39500,  9, 127},
376
  { /*95*/ 3.7, 90, 2500000, 41500,  9, 127},
377
  { /*96*/ 3.8, 90, 2500000, 45000, 10, 127},
378
  { /*97*/ 3.8, 90, 2500000, 47500, 10, 131},
379
  { /*98*/ 3.7, 90, 3000000, 51000, 10, 131},
380
  { /*99*/ 3.8, 90, 3000000, 53000, 10, 133},
381
  {/*100*/ 3.8, 90,  875000, 50000, 10, 133},
382
  {/*101*/ 3.8, 90, 3500000, 54000, 10, 139},
383
  {/*102*/ 3.8, 90, 3500000, 57000, 10, 139},
384
  {/*103*/ 3.9, 90, 4000000, 61000, 10, 139},
385
  {/*104*/ 3.9, 90, 4000000, 66000, 10, 149},
386
  {/*105*/ 3.9, 90, 4000000, 70000, 10, 149},
387
  {/*106*/ 3.9, 90, 4000000, 75000, 10, 151},
388
  {/*107*/ 3.9, 90, 4000000, 80000, 10, 151},
389
};
390

391
#define MPQS_MAX_DIGIT_SIZE_KN 107
392

393
Product

Resources

Company