CoCalc -- mpqs.c

Testing latest pari + WASM + node.js... and it works?! Wow.
pari / 2021-07-01 / pari-2.14.0.alpha / src / modules / mpqs.c
²⁸⁴⁹⁵ views
License: GPL3
ubuntu2004
1
/* Copyright (C) 2000  The PARI group.
2

3
This file is part of the PARI/GP package.
4

5
PARI/GP is free software; you can redistribute it and/or modify it under the
6
terms of the GNU General Public License as published by the Free Software
7
Foundation; either version 2 of the License, or (at your option) any later
8
version. It is distributed in the hope that it will be useful, but WITHOUT
9
ANY WARRANTY WHATSOEVER.
10

11
Check the License for details. You should have received a copy of it, along
12
with the package; see the file 'COPYING'. If not, write to the Free Software
13
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
14

15
/* Self-Initializing Multi-Polynomial Quadratic Sieve, based on code developed
16
 * as part of the LiDIA project.
17
 *
18
 * Original version: Thomas Papanikolaou and Xavier Roblot
19
 * Extensively modified by The PARI group. */
20
/* Notation commonly used in this file, and sketch of algorithm:
21
 *
22
 * Given an odd integer N > 1 to be factored, we throw in a small odd squarefree
23
 * multiplier k so as to make kN = 1 mod 4 and to have many small primes over
24
 * which X^2 - kN splits.  We compute a factor base FB of such primes then
25
 * look for values x0 such that Q0(x0) = x0^2 - kN can be decomposed over FB,
26
 * up to a possible factor dividing k and a possible "large prime". Relations
27
 * involving the latter can be combined into full relations which don't; full
28
 * relations, by Gaussian elimination over F2 for the exponent vectors lead us
29
 * to an expression X^2 - Y^2 divisible by N and hopefully to a nontrivial
30
 * splitting when we compute gcd(X + Y, N).  Note that this can never
31
 * split prime powers.
32
 *
33
 * Candidates x0 are found by sieving along arithmetic progressions modulo the
34
 * small primes in FB and evaluation of candidates picks out those x0 where
35
 * many of these progressions coincide, resulting in a highly divisible Q0(x0).
36
 *
37
 * The Multi-Polynomial version improves this by choosing a modest subset of
38
 * FB primes (let A be their product) and forcing these to divide Q0(x).
39
 * Write Q(x) = Q0(2Ax + B) = (2Ax + B)^2 - kN = 4A(Ax^2 + Bx + C), where B is
40
 * suitably chosen.  For each A, there are 2^omega_A possible values for B
41
 * but we'll use only half of these, since the other half is easily covered by
42
 * exploiting the symmetry x -> -x of the original Q0. The "Self-Initializating"
43
 * bit refers to the fact that switching from one B to the next is fast, whereas
44
 * switching to the next A involves some recomputation (C is never needed).
45
 * Thus we quickly run through many polynomials sharing the same A.
46
 *
47
 * The sieve ranges over values x0 such that |x0| < M  (we use x = x0 + M
48
 * as array subscript).  The coefficients A are chosen so that A*M ~ sqrt(kN).
49
 * Then |B| is bounded by ~ (j+4)*A, and |C| = -C ~ (M/4)*sqrt(kN), so
50
 * Q(x0)/(4A) takes values roughly between -|C| and 3|C|.
51
 *
52
 * Refinements. We do not use the smallest FB primes for sieving, incorporating
53
 * them only after selecting candidates).  The substition of 2Ax+B into
54
 * X^2 - kN, with odd B, forces 2 to occur; when kN is 1 mod 8, it occurs at
55
 * least to the 3rd power; when kN = 5 mod 8, it occurs exactly to the 2nd
56
 * power.  We never sieve on 2 and always pull out the power of 2 directly. The
57
 * prime factors of k show up whenever 2Ax + B has a factor in common with k;
58
 * we don't sieve on these either but easily recognize them in a candidate. */
59
#include "pari.h"
60
#include "paripriv.h"
61

62
#define DEBUGLEVEL DEBUGLEVEL_mpqs
63

64
/** DEBUG **/
65
/* #define MPQS_DEBUG_VERBOSE 1 */
66
#include "mpqs.h"
67

68
#define REL_OFFSET 20
69
#define REL_MASK ((1UL<<REL_OFFSET)-1)
70
#define MAX_PE_PAIR 60
71

72
#ifdef HAS_SSE2
73
#include <emmintrin.h>
74
#define AND(a,b) ((mpqs_bit_array)__builtin_ia32_andps((__v4sf)(a), (__v4sf)(b)))
75
#define EXT0(a) ((ulong)__builtin_ia32_vec_ext_v2di((__v2di)(a), 0))
76
#define EXT1(a) ((ulong)__builtin_ia32_vec_ext_v2di((__v2di)(a), 1))
77
#define TEST(a) (EXT0(a) || EXT1(a))
78
typedef __v2di mpqs_bit_array;
79
const mpqs_bit_array mpqs_mask = { (long) 0x8080808080808080L, (long) 0x8080808080808080UL };
80
#else
81
/* Use ulong for the bit arrays */
82
typedef ulong mpqs_bit_array;
83
#define AND(a,b) ((a)&(b))
84
#define TEST(a) (a)
85

86
#ifdef LONG_IS_64BIT
87
const mpqs_bit_array mpqs_mask = 0x8080808080808080UL;
88
#else
89
const mpqs_bit_array mpqs_mask = 0x80808080UL;
90
#endif
91
#endif
92

93
static GEN rel_q(GEN c) { return gel(c,3); }
94
static GEN rel_Y(GEN c) { return gel(c,1); }
95
static GEN rel_p(GEN c) { return gel(c,2); }
96

97
static void
98
frel_add(hashtable *frel, GEN R)
99
{
100
  ulong h = hash_GEN(R);
101
  if (!hash_search2(frel, (void*)R, h))
102
    hash_insert2(frel, (void*)R, (void*)1, h);
103
}
104

105
/*********************************************************************/
106
/**                         INITIAL SIZING                          **/
107
/*********************************************************************/
108
/* # of decimal digits of argument */
109
static long
110
decimal_len(GEN N)
111
{ pari_sp av = avma; return gc_long(av, 1+logint(N, utoipos(10))); }
112

113
/* To be called after choosing k and putting kN into the handle:
114
 * Pick up the parameters for given size of kN in decimal digits and fill in
115
 * the handle. Return 0 when kN is too large, 1 when we're ok. */
116
static int
117
mpqs_set_parameters(mpqs_handle_t *h)
118
{
119
  long s, D;
120
  const mpqs_parameterset_t *P;
121

122
  h->digit_size_kN = D = decimal_len(h->kN);
123
  if (D > MPQS_MAX_DIGIT_SIZE_KN) return 0;
124
  P = &(mpqs_parameters[maxss(0, D - 9)]);
125
  h->tolerance   = P->tolerance;
126
  h->lp_scale    = P->lp_scale;
127
  /* make room for prime factors of k if any: */
128
  h->size_of_FB  = s = P->size_of_FB + h->_k->omega_k;
129
  /* for the purpose of Gauss elimination etc., prime factors of k behave
130
   * like real FB primes, so take them into account when setting the goal: */
131
  h->target_rels = (s >= 200 ? s + 10 : (mpqs_int32_t)(s * 1.05));
132
  h->M           = P->M;
133
  h->omega_A     = P->omega_A;
134
  h->no_B        = 1UL << (P->omega_A - 1);
135
  h->pmin_index1 = P->pmin_index1;
136
  /* certain subscripts into h->FB should also be offset by omega_k: */
137
  h->index0_FB   = 3 + h->_k->omega_k;
138
  if (DEBUGLEVEL >= 5)
139
  {
140
    err_printf("MPQS: kN = %Ps\n", h->kN);
141
    err_printf("MPQS: kN has %ld decimal digits\n", D);
142
    err_printf("\t(estimated memory needed: %4.1fMBy)\n",
143
               (s + 1)/8388608. * h->target_rels);
144
  }
145
  return 1;
146
}
147

148
/*********************************************************************/
149
/**                       OBJECT HOUSEKEEPING                       **/
150
/*********************************************************************/
151

152
/* factor base constructor. Really a home-grown memalign(3c) underneath.
153
 * We don't want FB entries to straddle L1 cache line boundaries, and
154
 * malloc(3c) only guarantees alignment adequate for all primitive data
155
 * types of the platform ABI - typically to 8 or 16 byte boundaries.
156
 * Also allocate the inv_A_H array.
157
 * The FB array pointer is returned for convenience */
158
static mpqs_FB_entry_t *
159
mpqs_FB_ctor(mpqs_handle_t *h)
160
{
161
  /* leave room for slots 0, 1, and sentinel slot at the end of the array */
162
  long size_FB_chunk = (h->size_of_FB + 3) * sizeof(mpqs_FB_entry_t);
163
  /* like FB, except this one does not have a sentinel slot at the end */
164
  long size_IAH_chunk = (h->size_of_FB + 2) * sizeof(mpqs_inv_A_H_t);
165
  char *fbp = (char*)stack_malloc(size_FB_chunk + 64);
166
  char *iahp = (char*)stack_malloc(size_IAH_chunk + 64);
167
  long fbl, iahl;
168

169
  h->FB_chunk = (void *)fbp;
170
  h->invAH_chunk = (void *)iahp;
171
  /* round up to next higher 64-bytes-aligned address */
172
  fbl = (((long)fbp) + 64) & ~0x3FL;
173
  /* and put the actual array there */
174
  h->FB = (mpqs_FB_entry_t *)fbl;
175

176
  iahl = (((long)iahp) + 64) & ~0x3FL;
177
  h->inv_A_H = (mpqs_inv_A_H_t *)iahl;
178
  return (mpqs_FB_entry_t *)fbl;
179
}
180

181
/* sieve array constructor;  also allocates the candidates array
182
 * and temporary storage for relations under construction */
183
static void
184
mpqs_sieve_array_ctor(mpqs_handle_t *h)
185
{
186
  long size = (h->M << 1) + 1;
187
  mpqs_int32_t size_of_FB = h->size_of_FB;
188

189
  h->sieve_array = (unsigned char *) stack_calloc_align(size, sizeof(mpqs_mask));
190
  h->sieve_array_end = h->sieve_array + size - 2;
191
  h->sieve_array_end[1] = 255; /* sentinel */
192
  h->candidates = (long *)stack_malloc(MPQS_CANDIDATE_ARRAY_SIZE * sizeof(long));
193
  /* whereas mpqs_self_init() uses size_of_FB+1, we just use the size as
194
   * it is, not counting FB[1], to start off the following estimate */
195
  if (size_of_FB > MAX_PE_PAIR) size_of_FB = MAX_PE_PAIR;
196
  /* and for tracking which primes occur in the current relation: */
197
  h->relaprimes = (long *) stack_malloc((size_of_FB << 1) * sizeof(long));
198
}
199

200
/* allocate GENs for current polynomial and self-initialization scratch data */
201
static void
202
mpqs_poly_ctor(mpqs_handle_t *h)
203
{
204
  mpqs_int32_t i, w = h->omega_A;
205
  h->per_A_pr = (mpqs_per_A_prime_t *)
206
                stack_calloc(w * sizeof(mpqs_per_A_prime_t));
207
  /* A is the product of w primes, each below word size.
208
   * |B| <= (w + 4) * A, so can have at most one word more
209
   * H holds residues modulo A: the same size as used for A is sufficient. */
210
  h->A = cgeti(w + 2);
211
  h->B = cgeti(w + 3);
212
  for (i = 0; i < w; i++) h->per_A_pr[i]._H = cgeti(w + 2);
213
}
214

215
/*********************************************************************/
216
/**                        FACTOR BASE SETUP                        **/
217
/*********************************************************************/
218
/* fill in the best-guess multiplier k for N. We force kN = 1 mod 4.
219
 * Caller should proceed to fill in kN
220
 * See Knuth-Schroeppel function in
221
 * Robert D. Silverman
222
 * The multiple polynomial quadratic sieve
223
 * Math. Comp. 48 (1987), 329-339
224
 * https://www.ams.org/journals/mcom/1987-48-177/S0025-5718-1987-0866119-8/
225
 */
226
static ulong
227
mpqs_find_k(mpqs_handle_t *h)
228
{
229
  const pari_sp av = avma;
230
  const long N_mod_8 = mod8(h->N), N_mod_4 = N_mod_8 & 3;
231
  long dl = decimal_len(h->N);
232
  long D = maxss(0, minss(dl,MPQS_MAX_DIGIT_SIZE_KN)-9);
233
  long MPQS_MULTIPLIER_SEARCH_DEPTH = mpqs_parameters[D].size_of_FB;
234
  forprime_t S;
235
  struct {
236
    const mpqs_multiplier_t *_k;
237
    long np; /* number of primes in factorbase so far for this k */
238
    double value; /* the larger, the better */
239
  } cache[MPQS_POSSIBLE_MULTIPLIERS];
240
  ulong MPQS_NB_MULTIPLIERS = dl < 40 ? 5 : MPQS_POSSIBLE_MULTIPLIERS;
241
  ulong p, i, nbk;
242

243
  for (i = nbk = 0; i < numberof(cand_multipliers); i++)
244
  {
245
    const mpqs_multiplier_t *cand_k = &cand_multipliers[i];
246
    long k = cand_k->k;
247
    double v;
248
    if ((k & 3) != N_mod_4) continue; /* want kN = 1 (mod 4) */
249
    v = -log((double)k)/2;
250
    if ((k & 7) == N_mod_8) v += M_LN2; /* kN = 1 (mod 8) */
251
    cache[nbk].np = 0;
252
    cache[nbk]._k = cand_k;
253
    cache[nbk].value = v;
254
    if (++nbk == MPQS_NB_MULTIPLIERS) break; /* enough */
255
  }
256
  /* next test is an impossible situation: kills spurious gcc-5.1 warnings
257
   * "array subscript is above array bounds" */
258
  if (nbk > MPQS_POSSIBLE_MULTIPLIERS) nbk = MPQS_POSSIBLE_MULTIPLIERS;
259
  u_forprime_init(&S, 2, ULONG_MAX);
260
  while ( (p = u_forprime_next(&S)) )
261
  {
262
    long kroNp = kroiu(h->N, p), seen = 0;
263
    if (!kroNp) return p;
264
    for (i = 0; i < nbk; i++)
265
    {
266
      long krokp;
267
      if (cache[i].np > MPQS_MULTIPLIER_SEARCH_DEPTH) continue;
268
      seen++;
269
      krokp = krouu(cache[i]._k->k % p, p);
270
      if (krokp == kroNp) /* kronecker(k*N, p)=1 */
271
      {
272
        cache[i].value += 2*log((double) p)/p;
273
        cache[i].np++;
274
      } else if (krokp == 0)
275
      {
276
        cache[i].value += log((double) p)/p;
277
        cache[i].np++;
278
      }
279
    }
280
    if (!seen) break; /* we're gone through SEARCH_DEPTH primes for all k */
281
  }
282
  if (!p) pari_err_OVERFLOW("mpqs_find_k [ran out of primes]");
283
  {
284
    long best_i = 0;
285
    double v = cache[0].value;
286
    for (i = 1; i < nbk; i++)
287
      if (cache[i].value > v) { best_i = i; v = cache[i].value; }
288
    h->_k = cache[best_i]._k; return gc_ulong(av,0);
289
  }
290
}
291

292
/* Create a factor base of 'size' primes p_i such that legendre(k*N, p_i) != -1
293
 * We could have shifted subscripts down from their historical arrangement,
294
 * but this seems too risky for the tiny potential gain in memory economy.
295
 * The real constraint is that the subscripts of anything which later shows
296
 * up at the Gauss stage must be nonnegative, because the exponent vectors
297
 * there use the same subscripts to refer to the same FB entries.  Thus in
298
 * particular, the entry representing -1 could be put into FB[0], but could
299
 * not be moved to FB[-1] (although mpqs_FB_ctor() could be easily adapted
300
 * to support negative subscripts).-- The historically grown layout is:
301
 * FB[0] is unused.
302
 * FB[1] is not explicitly used but stands for -1.
303
 * FB[2] contains 2 (always).
304
 * Before we are called, the size_of_FB field in the handle will already have
305
 * been adjusted by _k->omega_k, so there's room for the primes dividing k,
306
 * which when present will occupy FB[3] and following.
307
 * The "real" odd FB primes begin at FB[h->index0_FB].
308
 * FB[size_of_FB+1] is the last prime p_i.
309
 * FB[size_of_FB+2] is a sentinel to simplify some of our loops.
310
 * Thus we allocate size_of_FB+3 slots for FB.
311
 *
312
 * If a prime factor of N is found during the construction, it is returned
313
 * in f, otherwise f = 0. */
314

315
/* returns the FB array pointer for convenience */
316
static mpqs_FB_entry_t *
317
mpqs_create_FB(mpqs_handle_t *h, ulong *f)
318
{
319
  mpqs_FB_entry_t *FB = mpqs_FB_ctor(h);
320
  const pari_sp av = avma;
321
  mpqs_int32_t size = h->size_of_FB;
322
  long i;
323
  mpqs_uint32_t k = h->_k->k;
324
  forprime_t S;
325

326
  FB[2].fbe_p = 2;
327
  /* the fbe_logval and the fbe_sqrt_kN for 2 are never used */
328
  FB[2].fbe_flags = MPQS_FBE_CLEAR;
329
  for (i = 3; i < h->index0_FB; i++)
330
  { /* this loop executes h->_k->omega_k = 0, 1, or 2 times */
331
    mpqs_uint32_t kp = (ulong)h->_k->kp[i-3];
332
    if (MPQS_DEBUGLEVEL >= 7) err_printf(",<%lu>", (ulong)kp);
333
    FB[i].fbe_p = kp;
334
    /* we could flag divisors of k here, but no need so far */
335
    FB[i].fbe_flags = MPQS_FBE_CLEAR;
336
    FB[i].fbe_flogp = (float)log2((double) kp);
337
    FB[i].fbe_sqrt_kN = 0;
338
  }
339
  (void)u_forprime_init(&S, 3, ULONG_MAX);
340
  while (i < size + 2)
341
  {
342
    ulong p = u_forprime_next(&S);
343
    if (p > k || k % p)
344
    {
345
      ulong kNp = umodiu(h->kN, p);
346
      long kr = krouu(kNp, p);
347
      if (kr != -1)
348
      {
349
        if (kr == 0) { *f = p; return FB; }
350
        FB[i].fbe_p = (mpqs_uint32_t) p;
351
        FB[i].fbe_flags = MPQS_FBE_CLEAR;
352
        /* dyadic logarithm of p; single precision suffices */
353
        FB[i].fbe_flogp = (float)log2((double)p);
354
        /* cannot yet fill in fbe_logval because the scaling multiplier
355
         * depends on the largest prime in FB, as yet unknown */
356

357
        /* x such that x^2 = kN (mod p_i) */
358
        FB[i++].fbe_sqrt_kN = (mpqs_uint32_t)Fl_sqrt(kNp, p);
359
      }
360
    }
361
  }
362
  set_avma(av);
363
  if (MPQS_DEBUGLEVEL >= 7)
364
  {
365
    err_printf("MPQS: FB [-1,2");
366
    for (i = 3; i < h->index0_FB; i++) err_printf(",<%lu>", FB[i].fbe_p);
367
    for (; i < size + 2; i++) err_printf(",%lu", FB[i].fbe_p);
368
    err_printf("]\n");
369
  }
370

371
  FB[i].fbe_p = 0;              /* sentinel */
372
  h->largest_FB_p = FB[i-1].fbe_p; /* at subscript size_of_FB + 1 */
373

374
  /* locate the smallest prime that will be used for sieving */
375
  for (i = h->index0_FB; FB[i].fbe_p != 0; i++)
376
    if (FB[i].fbe_p >= h->pmin_index1) break;
377
  h->index1_FB = i;
378
  /* with our parameters this will never fall off the end of the FB */
379
  *f = 0; return FB;
380
}
381

382
/*********************************************************************/
383
/**                      MISC HELPER FUNCTIONS                      **/
384
/*********************************************************************/
385

386
/* Effect of the following:  multiplying the base-2 logarithm of some
387
 * quantity by log_multiplier will rescale something of size
388
 *    log2 ( sqrt(kN) * M / (largest_FB_prime)^tolerance )
389
 * to 232.  Note that sqrt(kN) * M is just A*M^2, the value our polynomials
390
 * take at the outer edges of the sieve interval.  The scale here leaves
391
 * a little wiggle room for accumulated rounding errors from the approximate
392
 * byte-sized scaled logarithms for the factor base primes which we add up
393
 * in the sieving phase.-- The threshold is then chosen so that a point in
394
 * the sieve has to reach a result which, under the same scaling, represents
395
 *    log2 ( sqrt(kN) * M / (largest_FB_prime)^tolerance )
396
 * in order to be accepted as a candidate. */
397
/* The old formula was...
398
 *   log_multiplier =
399
 *      127.0 / (0.5 * log2 (handle->dkN) + log2((double)M)
400
 *               - tolerance * log2((double)handle->largest_FB_p));
401
 * and we used to use this with a constant threshold of 128. */
402

403
/* NOTE: We used to divide log_multiplier by an extra factor 2, and in
404
 * compensation we were multiplying by 2 when the fbe_logp fields were being
405
 * filled in, making all those bytes even.  Tradeoff: the extra bit of
406
 * precision is helpful, but interferes with a possible sieving optimization
407
 * (artifically shift right the logp's of primes in A, and just run over both
408
 * arithmetical progressions  (which coincide in this case)  instead of
409
 * skipping the second one, to avoid the conditional branch in the
410
 * mpqs_sieve() loops).  We could still do this, but might lose a little bit
411
 * accuracy for those primes.  Probably no big deal. */
412
static void
413
mpqs_set_sieve_threshold(mpqs_handle_t *h)
414
{
415
  mpqs_FB_entry_t *FB = h->FB;
416
  double log_maxval, log_multiplier;
417
  long i;
418

419
  h->l2sqrtkN = 0.5 * log2(h->dkN);
420
  h->l2M = log2((double)h->M);
421
  log_maxval = h->l2sqrtkN + h->l2M - MPQS_A_FUDGE;
422
  log_multiplier = 232.0 / log_maxval;
423
  h->sieve_threshold = (unsigned char) (log_multiplier *
424
    (log_maxval - h->tolerance * log2((double)h->largest_FB_p))) + 1;
425
  /* That "+ 1" really helps - we may want to tune towards somewhat smaller
426
   * tolerances  (or introduce self-tuning one day)... */
427

428
  /* If this turns out to be <128, scream loudly.
429
   * That means that the FB or the tolerance or both are way too
430
   * large for the size of kN.  (Normally, the threshold should end
431
   * up in the 150...170 range.) */
432
  if (h->sieve_threshold < 128) {
433
    h->sieve_threshold = 128;
434
    pari_warn(warner,
435
        "MPQS: sizing out of tune, FB size or tolerance\n\ttoo large");
436
  }
437
  if (DEBUGLEVEL >= 5)
438
    err_printf("MPQS: sieve threshold: %ld\n",h->sieve_threshold);
439
  /* Now fill in the byte-sized approximate scaled logarithms of p_i */
440
  if (DEBUGLEVEL >= 5)
441
    err_printf("MPQS: computing logarithm approximations for p_i in FB\n");
442
  for (i = h->index0_FB; i < h->size_of_FB + 2; i++)
443
    FB[i].fbe_logval = (unsigned char) (log_multiplier * FB[i].fbe_flogp);
444
}
445

446
/* Given the partially populated handle, find the optimum place in the FB
447
 * to pick prime factors for A from.  The lowest admissible subscript is
448
 * index0_FB, but unless kN is very small, we stay away a bit from that.
449
 * The highest admissible is size_of_FB + 1, where the largest FB prime
450
 * resides.  The ideal corner is about (sqrt(kN)/M) ^ (1/omega_A),
451
 * so that A will end up of size comparable to sqrt(kN)/M;  experimentally
452
 * it seems desirable to stay slightly below this.  Moreover, the selection
453
 * of the individual primes happens to err on the large side, for which we
454
 * compensate a bit, using the (small positive) quantity MPQS_A_FUDGE.
455
 * We rely on a few auxiliary fields in the handle to be already set by
456
 * mqps_set_sieve_threshold() before we are called.
457
 * Return 1 on success, and 0 otherwise. */
458
static int
459
mpqs_locate_A_range(mpqs_handle_t *h)
460
{
461
  /* i will be counted up to the desirable index2_FB + 1, and omega_A is never
462
   * less than 3, and we want
463
   *   index2_FB - (omega_A - 1) + 1 >= index0_FB + omega_A - 3,
464
   * so: */
465
  long i = h->index0_FB + 2*(h->omega_A) - 4;
466
  double l2_target_pA;
467
  mpqs_FB_entry_t *FB = h->FB;
468

469
  h->l2_target_A = (h->l2sqrtkN - h->l2M - MPQS_A_FUDGE);
470
  l2_target_pA = h->l2_target_A / h->omega_A;
471

472
  /* find the sweet spot, normally shouldn't take long */
473
  while (FB[i].fbe_p && FB[i].fbe_flogp <= l2_target_pA) i++;
474

475
  /* check whether this hasn't walked off the top end... */
476
  /* The following should actually NEVER happen. */
477
  if (i > h->size_of_FB - 3)
478
  { /* this isn't going to work at all. */
479
    pari_warn(warner,
480
        "MPQS: sizing out of tune, FB too small or\n\tway too few primes in A");
481
    return 0;
482
  }
483
  h->index2_FB = i - 1; return 1;
484
  /* assert: index0_FB + (omega_A - 3) [the lowest FB subscript used in primes
485
   * for A]  + (omega_A - 2) <= index2_FB  [the subscript from which the choice
486
   * of primes for A starts, putting omega_A - 1 of them at or below index2_FB,
487
   * and the last and largest one above, cf. mpqs_si_choose_primes]. Moreover,
488
   * index2_FB indicates the last prime below the ideal size, unless (when kN
489
   * is tiny) the ideal size was too small to use. */
490
}
491

492
/*********************************************************************/
493
/**                       SELF-INITIALIZATION                       **/
494
/*********************************************************************/
495

496
#ifdef MPQS_DEBUG
497
/* Debug-only helper routine: check correctness of the root z mod p_i
498
 * by evaluting A * z^2 + B * z + C mod p_i  (which should be 0). */
499
static void
500
check_root(mpqs_handle_t *h, GEN mC, long p, long start)
501
{
502
  pari_sp av = avma;
503
  long z = start - ((long)(h->M) % p);
504
  if (umodiu(subii(mulsi(z, addii(h->B, mulsi(z, h->A))), mC), p))
505
  {
506
    err_printf("MPQS: p = %ld\n", p);
507
    err_printf("MPQS: A = %Ps\n", h->A);
508
    err_printf("MPQS: B = %Ps\n", h->B);
509
    err_printf("MPQS: C = %Ps\n", negi(mC));
510
    err_printf("MPQS: z = %ld\n", z);
511
    pari_err_BUG("MPQS: self_init: found wrong polynomial");
512
  }
513
  set_avma(av);
514
}
515
#endif
516

517
/* Increment *x > 0 to a larger value which has the same number of 1s in its
518
 * binary representation.  Wraparound can be detected by the caller as long as
519
 * we keep total_no_of_primes_for_A strictly less than BITS_IN_LONG.
520
 *
521
 * Changed switch to increment *x in all cases to the next larger number
522
 * which (a) has the same count of 1 bits and (b) does not arise from the
523
 * old value by moving a single 1 bit one position to the left  (which was
524
 * undesirable for the sieve). --GN based on discussion with TP */
525
INLINE void
526
mpqs_increment(mpqs_uint32_t *x)
527
{
528
  mpqs_uint32_t r1_mask, r01_mask, slider=1UL;
529

530
  switch (*x & 0x1F)
531
  { /* 32-way computed jump handles 22 out of 32 cases */
532
  case 29:
533
    (*x)++; break; /* shifts a single bit, but we postprocess this case */
534
  case 26:
535
    (*x) += 2; break; /* again */
536
  case 1: case 3: case 6: case 9: case 11:
537
  case 17: case 19: case 22: case 25: case 27:
538
    (*x) += 3; return;
539
  case 20:
540
    (*x) += 4; break; /* again */
541
  case 5: case 12: case 14: case 21:
542
    (*x) += 5; return;
543
  case 2: case 7: case 13: case 18: case 23:
544
    (*x) += 6; return;
545
  case 10:
546
    (*x) += 7; return;
547
  case 8:
548
    (*x) += 8; break; /* and again */
549
  case 4: case 15:
550
    (*x) += 12; return;
551
  default: /* 0, 16, 24, 28, 30, 31 */
552
    /* isolate rightmost 1 */
553
    r1_mask = ((*x ^ (*x - 1)) + 1) >> 1;
554
    /* isolate rightmost 1 which has a 0 to its left */
555
    r01_mask = ((*x ^ (*x + r1_mask)) + r1_mask) >> 2;
556
    /* simple cases.  Both of these shift a single bit one position to the
557
       left, and will need postprocessing */
558
    if (r1_mask == r01_mask) { *x += r1_mask; break; }
559
    if (r1_mask == 1) { *x += r01_mask; break; }
560
    /* General case: add r01_mask, kill off as many 1 bits as possible to its
561
     * right while at the same time filling in 1 bits from the LSB. */
562
    if (r1_mask == 2) { *x += (r01_mask>>1) + 1; return; }
563
    while (r01_mask > r1_mask && slider < r1_mask)
564
    {
565
      r01_mask >>= 1; slider <<= 1;
566
    }
567
    *x += r01_mask + slider - 1;
568
    return;
569
  }
570
  /* post-process cases which couldn't be finalized above */
571
  r1_mask = ((*x ^ (*x - 1)) + 1) >> 1;
572
  r01_mask = ((*x ^ (*x + r1_mask)) + r1_mask) >> 2;
573
  if (r1_mask == r01_mask) { *x += r1_mask; return; }
574
  if (r1_mask == 1) { *x += r01_mask; return; }
575
  if (r1_mask == 2) { *x += (r01_mask>>1) + 1; return; }
576
  while (r01_mask > r1_mask && slider < r1_mask)
577
  {
578
    r01_mask >>= 1; slider <<= 1;
579
  }
580
  *x += r01_mask + slider - 1;
581
}
582

583
/* self-init (1): advancing the bit pattern, and choice of primes for A.
584
 * On first call, h->bin_index = 0. On later occasions, we need to begin
585
 * by clearing the MPQS_FBE_DIVIDES_A bit in the fbe_flags of the former
586
 * prime factors of A (use per_A_pr to find them). Upon successful return, that
587
 * array will have been filled in, and the flag bits will have been turned on
588
 * again in the right places.
589
 * Return 1 when all is fine and 0 when we found we'd be using more bits to
590
 * the left in bin_index than we have matching primes in the FB. In the latter
591
 * case, bin_index will be zeroed out, index2_FB will be incremented by 2,
592
 * index2_moved will be turned on; the caller, after checking that index2_FB
593
 * has not become too large, should just call us again, which then succeeds:
594
 * we'll start again with a right-justified sequence of 1 bits in bin_index,
595
 * now interpreted as selecting primes relative to the new index2_FB. */
596
INLINE int
597
mpqs_si_choose_primes(mpqs_handle_t *h)
598
{
599
  mpqs_FB_entry_t *FB = h->FB;
600
  mpqs_per_A_prime_t *per_A_pr = h->per_A_pr;
601
  double l2_last_p = h->l2_target_A;
602
  mpqs_int32_t omega_A = h->omega_A;
603
  int i, j, v2, prev_last_p_idx;
604
  int room = h->index2_FB - h->index0_FB - omega_A + 4;
605
  /* The notion of room here (cf mpqs_locate_A_range() above) is the number
606
   * of primes at or below index2_FB which are eligible for A. We need
607
   * >= omega_A - 1 of them, and it is guaranteed by mpqs_locate_A_range() that
608
   * this many are available: the lowest FB slot used for A is never less than
609
   * index0_FB + omega_A - 3. When omega_A = 3 (very small kN), we allow
610
   * ourselves to reach all the way down to index0_FB; otherwise, we keep away
611
   * from it by at least one position.  For omega_A >= 4 this avoids situations
612
   * where the selection of the smaller primes here has advanced to a lot of
613
   * very small ones, and the single last larger one has soared away to bump
614
   * into the top end of the FB. */
615
  mpqs_uint32_t room_mask;
616
  mpqs_int32_t p;
617
  ulong bits;
618

619
  /* XXX also clear the index_j field here? */
620
  if (h->bin_index == 0)
621
  { /* first time here, or after increasing index2_FB, initialize to a pattern
622
     * of omega_A - 1 consecutive 1 bits. Caller has ensured that there are
623
     * enough primes for this in the FB below index2_FB. */
624
    h->bin_index = (1UL << (omega_A - 1)) - 1;
625
    prev_last_p_idx = 0;
626
  }
627
  else
628
  { /* clear out old flags */
629
    for (i = 0; i < omega_A; i++) MPQS_FLG(i) = MPQS_FBE_CLEAR;
630
    prev_last_p_idx = MPQS_I(omega_A-1);
631

632
    if (room > 30) room = 30;
633
    room_mask = ~((1UL << room) - 1);
634

635
    /* bump bin_index to next acceptable value. If index2_moved is off, call
636
     * mpqs_increment() once; otherwise, repeat until there's something in the
637
     * least significant 2 bits - to ensure that we never re-use an A which
638
     * we'd used before increasing index2_FB - but also stop if something shows
639
     * up in the forbidden bits on the left where we'd run out of bits or walk
640
     * beyond index0_FB + omega_A - 3. */
641
    mpqs_increment(&h->bin_index);
642
    if (h->index2_moved)
643
    {
644
      while ((h->bin_index & (room_mask | 0x3)) == 0)
645
        mpqs_increment(&h->bin_index);
646
    }
647
    /* did we fall off the edge on the left? */
648
    if ((h->bin_index & room_mask) != 0)
649
    { /* Yes. Turn on the index2_moved flag in the handle */
650
      h->index2_FB += 2; /* caller to check this isn't too large!!! */
651
      h->index2_moved = 1;
652
      h->bin_index = 0;
653
      if (MPQS_DEBUGLEVEL >= 5)
654
        err_printf("MPQS: wrapping, more primes for A now chosen near FB[%ld] = %ld\n",
655
                   (long)h->index2_FB,
656
                   (long)FB[h->index2_FB].fbe_p);
657
      return 0; /* back off - caller should retry */
658
    }
659
  }
660
  /* assert: we aren't occupying any of the room_mask bits now, and if
661
   * index2_moved had already been on, at least one of the two LSBs is on */
662
  bits = h->bin_index;
663
  if (MPQS_DEBUGLEVEL >= 6)
664
    err_printf("MPQS: new bit pattern for primes for A: 0x%lX\n", bits);
665

666
  /* map bits to FB subscripts, counting downward with bit 0 corresponding
667
   * to index2_FB, and accumulate logarithms against l2_last_p */
668
  j = h->index2_FB;
669
  v2 = vals((long)bits);
670
  if (v2) { j -= v2; bits >>= v2; }
671
  for (i = omega_A - 2; i >= 0; i--)
672
  {
673
    MPQS_I(i) = j;
674
    l2_last_p -= MPQS_LP(i);
675
    MPQS_FLG(i) |= MPQS_FBE_DIVIDES_A;
676
    bits &= ~1UL;
677
    if (!bits) break; /* i = 0 */
678
    v2 = vals((long)bits); /* > 0 */
679
    bits >>= v2; j -= v2;
680
  }
681
  /* Choose the larger prime.  Note we keep index2_FB <= size_of_FB - 3 */
682
  for (j = h->index2_FB + 1; (p = FB[j].fbe_p); j++)
683
    if (FB[j].fbe_flogp > l2_last_p) break;
684
  /* The following trick avoids generating a large proportion of duplicate
685
   * relations when the last prime falls into an area where there are large
686
   * gaps from one FB prime to the next, and would otherwise often be repeated
687
   * (so that successive A's would wind up too similar to each other). While
688
   * this trick isn't perfect, it gets rid of a major part of the potential
689
   * duplication. */
690
  if (p && j == prev_last_p_idx) { j++; p = FB[j].fbe_p; }
691
  MPQS_I(omega_A - 1) = p? j: h->size_of_FB + 1;
692
  MPQS_FLG(omega_A - 1) |= MPQS_FBE_DIVIDES_A;
693

694
  if (MPQS_DEBUGLEVEL >= 6)
695
  {
696
    err_printf("MPQS: chose primes for A");
697
    for (i = 0; i < omega_A; i++)
698
      err_printf(" FB[%ld]=%ld%s", (long)MPQS_I(i), (long)MPQS_AP(i),
699
                 i < omega_A - 1 ? "," : "\n");
700
  }
701
  return 1;
702
}
703

704
/* There are 4 parts to self-initialization, exercised at different times:
705
 * - choosing a new sqfree coef. A (selecting its prime factors, FB bookkeeping)
706
 * - doing the actual computations attached to a new A
707
 * - choosing a new B keeping the same A (much simpler)
708
 * - a small common bit that needs to happen in both cases.
709
 * As to the first item, the scheme works as follows: pick omega_A - 1 prime
710
 * factors for A below the index2_FB point which marks their ideal size, and
711
 * one prime above this point, choosing the latter so log2(A) ~ l2_target_A.
712
 * Lower prime factors are chosen using bit patterns of constant weight,
713
 * gradually moving away from index2_FB towards smaller FB subscripts.
714
 * If this bumps into index0_FB (for very small input), back up by increasing
715
 * index2_FB by two, and from then on choosing only bit patterns with either or
716
 * both of their bottom bits set, so at least one of the omega_A - 1 smaller
717
 * prime factor will be beyond the original index2_FB point. In this way we
718
 * avoid re-using the same A. (The choice of the upper "flyer" prime is
719
 * constrained by the size of the FB, which normally should never a problem.
720
 * For tiny kN, we might have to live with a nonoptimal choice.)
721
 *
722
 * Mathematically, we solve a quadratic (over F_p for each prime p in the FB
723
 * which doesn't divide A), a linear equation for each prime p | A, and
724
 * precompute differences between roots mod p so we can adjust the roots
725
 * quickly when we change B. See Thomas Sosnowski's Diplomarbeit. */
726
/* compute coefficients of sieving polynomial for self initializing variant.
727
 * Coefficients A and B are set (preallocated GENs) and several tables are
728
 * updated. */
729
static int
730
mpqs_self_init(mpqs_handle_t *h)
731
{
732
  const ulong size_of_FB = h->size_of_FB + 1;
733
  mpqs_FB_entry_t *FB = h->FB;
734
  mpqs_inv_A_H_t *inv_A_H = h->inv_A_H;
735
  const pari_sp av = avma;
736
  GEN p1, A = h->A, B = h->B;
737
  mpqs_per_A_prime_t *per_A_pr = h->per_A_pr;
738
  long i, j;
739

740
#ifdef MPQS_DEBUG
741
  err_printf("MPQS DEBUG: enter self init, avma = 0x%lX\n", (ulong)avma);
742
#endif
743
  if (++h->index_j == (mpqs_uint32_t)h->no_B)
744
  { /* all the B's have been used, choose new A; this is indicated by setting
745
     * index_j to 0 */
746
    h->index_j = 0;
747
    h->index_i++; /* count finished A's */
748
  }
749

750
  if (h->index_j == 0)
751
  { /* compute first polynomial with new A */
752
    GEN a, b, A2;
753
    if (!mpqs_si_choose_primes(h))
754
    { /* Ran out of room towards small primes, and index2_FB was raised. */
755
      if (size_of_FB - h->index2_FB < 4) return 0; /* Fail */
756
      (void)mpqs_si_choose_primes(h); /* now guaranteed to succeed */
757
    }
758
    /* bin_index and per_A_pr now populated with consistent values */
759

760
    /* compute A = product of omega_A primes given by bin_index */
761
    a = b = NULL;
762
    for (i = 0; i < h->omega_A; i++)
763
    {
764
      ulong p = MPQS_AP(i);
765
      a = a? muliu(a, p): utoipos(p);
766
    }
767
    affii(a, A);
768
    /* Compute H[i], 0 <= i < omega_A.  Also compute the initial
769
     * B = sum(v_i*H[i]), by taking all v_i = +1
770
     * TODO: following needs to be changed later for segmented FB and sieve
771
     * interval, where we'll want to precompute several B's. */
772
    for (i = 0; i < h->omega_A; i++)
773
    {
774
      ulong p = MPQS_AP(i);
775
      GEN t = divis(A, (long)p);
776
      t = remii(mulii(t, muluu(Fl_inv(umodiu(t, p), p), MPQS_SQRT(i))), A);
777
      affii(t, MPQS_H(i));
778
      b = b? addii(b, t): t;
779
    }
780
    affii(b, B); set_avma(av);
781

782
    /* ensure B = 1 mod 4 */
783
    if (mod2(B) == 0)
784
      affii(addii(B, mului(mod4(A), A)), B); /* B += (A % 4) * A; */
785

786
    A2 = shifti(A, 1);
787
    /* compute the roots z1, z2, of the polynomial Q(x) mod p_j and
788
     * initialize start1[i] with the first value p_i | Q(z1 + i p_j)
789
     * initialize start2[i] with the first value p_i | Q(z2 + i p_j)
790
     * The following loop does The Right Thing for primes dividing k (where
791
     * sqrt_kN is 0 mod p). Primes dividing A are skipped here, and are handled
792
     * further down in the common part of SI. */
793
    for (j = 3; (ulong)j <= size_of_FB; j++)
794
    {
795
      ulong s, mb, t, m, p, iA2, iA;
796
      if (FB[j].fbe_flags & MPQS_FBE_DIVIDES_A) continue;
797
      p = (ulong)FB[j].fbe_p;
798
      m = h->M % p;
799
      iA2 = Fl_inv(umodiu(A2, p), p); /* = 1/(2*A) mod p_j */
800
      iA = iA2 << 1; if (iA > p) iA -= p;
801
      mb = umodiu(B, p); if (mb) mb = p - mb; /* mb = -B mod p */
802
      s = FB[j].fbe_sqrt_kN;
803
      t = Fl_add(m, Fl_mul(Fl_sub(mb, s, p), iA2, p), p);
804
      FB[j].fbe_start1 = (mpqs_int32_t)t;
805
      FB[j].fbe_start2 = (mpqs_int32_t)Fl_add(t, Fl_mul(s, iA, p), p);
806
      for (i = 0; i < h->omega_A - 1; i++)
807
      {
808
        ulong h = umodiu(MPQS_H(i), p);
809
        MPQS_INV_A_H(i,j) = Fl_mul(h, iA, p); /* 1/A * H[i] mod p_j */
810
      }
811
    }
812
  }
813
  else
814
  { /* no "real" computation -- use recursive formula */
815
    /* The following exploits that B is the sum of omega_A terms +-H[i]. Each
816
     * time we switch to a new B, we choose a new pattern of signs; the
817
     * precomputation of the inv_A_H array allows us to change the two
818
     * arithmetic progressions equally fast. The choice of sign patterns does
819
     * not follow the bit pattern of the ordinal number of B in the current
820
     * cohort; rather, we use a Gray code, changing only one sign each time.
821
     * When the i-th rightmost bit of the new ordinal number index_j of B is 1,
822
     * the sign of H[i] is changed; the next bit to the left tells us whether
823
     * we should be adding or subtracting the difference term. We never need to
824
     * change the sign of H[omega_A-1] (the topmost one), because that would
825
     * just give us the same sieve items Q(x) again with the opposite sign
826
     * of x.  This is why we only precomputed inv_A_H up to i = omega_A - 2. */
827
    ulong p, v2 = vals(h->index_j); /* new starting positions for sieving */
828
    j = h->index_j >> v2;
829
    p1 = shifti(MPQS_H(v2), 1);
830
    if (j & 2)
831
    { /* j = 3 mod 4 */
832
      for (j = 3; (ulong)j <= size_of_FB; j++)
833
      {
834
        if (FB[j].fbe_flags & MPQS_FBE_DIVIDES_A) continue;
835
        p = (ulong)FB[j].fbe_p;
836
        FB[j].fbe_start1 = Fl_sub(FB[j].fbe_start1, MPQS_INV_A_H(v2,j), p);
837
        FB[j].fbe_start2 = Fl_sub(FB[j].fbe_start2, MPQS_INV_A_H(v2,j), p);
838
      }
839
      p1 = addii(B, p1);
840
    }
841
    else
842
    { /* j = 1 mod 4 */
843
      for (j = 3; (ulong)j <= size_of_FB; j++)
844
      {
845
        if (FB[j].fbe_flags & MPQS_FBE_DIVIDES_A) continue;
846
        p = (ulong)FB[j].fbe_p;
847
        FB[j].fbe_start1 = Fl_add(FB[j].fbe_start1, MPQS_INV_A_H(v2,j), p);
848
        FB[j].fbe_start2 = Fl_add(FB[j].fbe_start2, MPQS_INV_A_H(v2,j), p);
849
      }
850
      p1 = subii(B, p1);
851
    }
852
    affii(p1, B);
853
  }
854

855
  /* p=2 is a special case.  start1[2], start2[2] are never looked at,
856
   * so don't bother setting them. */
857

858
  /* compute zeros of polynomials that have only one zero mod p since p | A */
859
  p1 = diviiexact(subii(h->kN, sqri(B)), shifti(A, 2)); /* coefficient -C */
860
  for (i = 0; i < h->omega_A; i++)
861
  {
862
    ulong p = MPQS_AP(i), s = h->M + Fl_div(umodiu(p1, p), umodiu(B, p), p);
863
    FB[MPQS_I(i)].fbe_start1 = FB[MPQS_I(i)].fbe_start2 = (mpqs_int32_t)(s % p);
864
  }
865
#ifdef MPQS_DEBUG
866
  for (j = 3; j <= size_of_FB; j++)
867
  {
868
    check_root(h, p1, FB[j].fbe_p, FB[j].fbe_start1);
869
    check_root(h, p1, FB[j].fbe_p, FB[j].fbe_start2);
870
  }
871
#endif
872
  if (MPQS_DEBUGLEVEL >= 6)
873
    err_printf("MPQS: chose Q_%ld(x) = %Ps x^2 %c %Ps x + C\n",
874
               (long) h->index_j, h->A,
875
               signe(h->B) < 0? '-': '+', absi_shallow(h->B));
876
  set_avma(av);
877
#ifdef MPQS_DEBUG
878
  err_printf("MPQS DEBUG: leave self init, avma = 0x%lX\n", (ulong)avma);
879
#endif
880
  return 1;
881
}
882

883
/*********************************************************************/
884
/**                           THE SIEVE                             **/
885
/*********************************************************************/
886
/* p4 = 4*p, logp ~ log(p), B/E point to the beginning/end of a sieve array */
887
INLINE void
888
mpqs_sieve_p(unsigned char *B, unsigned char *E, long p4, long p,
889
             unsigned char logp)
890
{
891
  register unsigned char *e = E - p4;
892
  /* Unrolled loop. It might be better to let the compiler worry about this
893
   * kind of optimization, based on its knowledge of whatever useful tricks the
894
   * machine instruction set architecture is offering */
895
  while (e - B >= 0) /* signed comparison */
896
  {
897
    (*B) += logp, B += p;
898
    (*B) += logp, B += p;
899
    (*B) += logp, B += p;
900
    (*B) += logp, B += p;
901
  }
902
  while (E - B >= 0) (*B) += logp, B += p;
903
}
904

905
INLINE void
906
mpqs_sieve_p1(unsigned char *B, unsigned char *E, long s1, long s2,
907
             unsigned char logp)
908
{
909
  while (E - B >= 0)
910
  {
911
    (*B) += logp, B += s1;
912
    if (E - B < 0) break;
913
    (*B) += logp, B += s2;
914
  }
915
}
916

917
INLINE void
918
mpqs_sieve_p2(unsigned char *B, unsigned char *E, long p4, long s1, long s2,
919
             unsigned char logp)
920
{
921
  register unsigned char *e = E - p4;
922
  /* Unrolled loop. It might be better to let the compiler worry about this
923
   * kind of optimization, based on its knowledge of whatever useful tricks the
924
   * machine instruction set architecture is offering */
925
  while (e - B >= 0) /* signed comparison */
926
  {
927
    (*B) += logp, B += s1;
928
    (*B) += logp, B += s2;
929
    (*B) += logp, B += s1;
930
    (*B) += logp, B += s2;
931
    (*B) += logp, B += s1;
932
    (*B) += logp, B += s2;
933
    (*B) += logp, B += s1;
934
    (*B) += logp, B += s2;
935
  }
936
  while (E - B >= 0) {(*B) += logp, B += s1; if (E - B < 0) break; (*B) += logp, B += s2;}
937
}
938
static void
939
mpqs_sieve(mpqs_handle_t *h)
940
{
941
  long p, l = h->index1_FB;
942
  mpqs_FB_entry_t *FB = &(h->FB[l]);
943
  unsigned char *S = h->sieve_array, *Send = h->sieve_array_end;
944
  long size = h->M << 1, size4 = size >> 3;
945
  memset((void*)S, 0, size * sizeof(unsigned char));
946
  for (  ; (p = FB->fbe_p) && p <= size4; FB++) /* l++ */
947
  {
948
    unsigned char logp = FB->fbe_logval;
949
    long s1 = FB->fbe_start1, s2 = FB->fbe_start2;
950
    /* sieve with FB[l] from start1[l], and from start2[l] if s1 != s2 */
951
    if (s1 == s2) mpqs_sieve_p(S + s1, Send, p << 2, p, logp);
952
    else
953
    {
954
      if (s1>s2) lswap(s1,s2)
955
      mpqs_sieve_p2(S + s1, Send, p << 2, s2-s1,p+s1-s2, logp);
956
    }
957
  }
958
  for (   ; (p = FB->fbe_p) && p <= size; FB++) /* l++ */
959
  {
960
    unsigned char logp = FB->fbe_logval;
961
    long s1 = FB->fbe_start1, s2 = FB->fbe_start2;
962
    /* sieve with FB[l] from start1[l], and from start2[l] if s1 != s2 */
963
    if (s1 == s2) mpqs_sieve_p(S + s1, Send, p << 2, p, logp);
964
    else
965
    {
966
      if (s1>s2) lswap(s1,s2)
967
      mpqs_sieve_p1(S + s1, Send, s2-s1, p+s1-s2, logp);
968
    }
969
  }
970
  for (    ; (p = FB->fbe_p); FB++)
971
  {
972
    unsigned char logp = FB->fbe_logval;
973
    long s1 = FB->fbe_start1, s2 = FB->fbe_start2;
974
    if (s1 < size) S[s1] += logp;
975
    if (s2!=s1 && s2 < size) S[s2] += logp;
976
  }
977
}
978

979
/* Could use the fact that 4 | M, but let the compiler worry about unrolling. */
980
static long
981
mpqs_eval_sieve(mpqs_handle_t *h)
982
{
983
  long x = 0, count = 0, M2 = h->M << 1;
984
  unsigned char t = h->sieve_threshold;
985
  unsigned char *S = h->sieve_array;
986
  mpqs_bit_array * U = (mpqs_bit_array *) S;
987
  long *cand = h->candidates;
988
  const long sizemask = sizeof(mpqs_mask);
989

990
  /* Exploiting the sentinel, we don't need to check for x < M2 in the inner
991
   * while loop; more than makes up for the lack of explicit unrolling. */
992
  while (count < MPQS_CANDIDATE_ARRAY_SIZE - 1)
993
  {
994
    long j, y;
995
    while (!TEST(AND(U[x],mpqs_mask))) x++;
996
    y = x*sizemask;
997
    for (j=0; j<sizemask; j++, y++)
998
    {
999
      if (y >= M2)
1000
        { cand[count] = 0; return count; }
1001
      if (S[y]>=t)
1002
        cand[count++] = y;
1003
    }
1004
    x++;
1005
  }
1006
  cand[count] = 0; return count;
1007
}
1008

1009
/*********************************************************************/
1010
/**                     CONSTRUCTING RELATIONS                      **/
1011
/*********************************************************************/
1012

1013
/* only used for debugging */
1014
static void
1015
split_relp(GEN rel, GEN *prelp, GEN *prelc)
1016
{
1017
  long j, l = lg(rel);
1018
  GEN relp, relc;
1019
  *prelp = relp = cgetg(l, t_VECSMALL);
1020
  *prelc = relc = cgetg(l, t_VECSMALL);
1021
  for (j=1; j<l; j++)
1022
  {
1023
    relc[j] = rel[j] >> REL_OFFSET;
1024
    relp[j] = rel[j] & REL_MASK;
1025
  }
1026
}
1027

1028
#ifdef MPQS_DEBUG
1029
static GEN
1030
mpqs_factorback(mpqs_handle_t *h, GEN relp)
1031
{
1032
  GEN N = h->N, Q = gen_1;
1033
  long j, l = lg(relp);
1034
  for (j = 1; j < l; j++)
1035
  {
1036
    long e = relp[j] >> REL_OFFSET, i = relp[j] & REL_MASK;
1037
    if (i == 1) Q = Fp_neg(Q,N); /* special case -1 */
1038
    else Q = Fp_mul(Q, Fp_powu(utoipos(h->FB[i].fbe_p), e, N), N);
1039
  }
1040
  return Q;
1041
}
1042
static void
1043
mpqs_check_rel(mpqs_handle_t *h, GEN c)
1044
{
1045
  pari_sp av = avma;
1046
  int LP = (lg(c) == 4);
1047
  GEN rhs = mpqs_factorback(h, rel_p(c));
1048
  GEN Y = rel_Y(c), Qx_2 = remii(sqri(Y), h->N);
1049
  if (LP) rhs = modii(mulii(rhs, rel_q(c)), h->N);
1050
  if (!equalii(Qx_2, rhs))
1051
  {
1052
    GEN relpp, relpc;
1053
    split_relp(rel_p(c), &relpp, &relpc);
1054
    err_printf("MPQS: %Ps : %Ps %Ps\n", Y, relpp,relpc);
1055
    err_printf("\tQx_2 = %Ps\n", Qx_2);
1056
    err_printf("\t rhs = %Ps\n", rhs);
1057
    pari_err_BUG(LP? "MPQS: wrong large prime relation found"
1058
                   : "MPQS: wrong full relation found");
1059
  }
1060
  PRINT_IF_VERBOSE(LP? "\b(;)": "\b(:)");
1061
  set_avma(av);
1062
}
1063
#endif
1064

1065
static void
1066
rel_to_ei(GEN ei, GEN relp)
1067
{
1068
  long j, l = lg(relp);
1069
  for (j=1; j<l; j++)
1070
  {
1071
    long e = relp[j] >> REL_OFFSET, i = relp[j] & REL_MASK;
1072
    ei[i] += e;
1073
  }
1074
}
1075
static void
1076
mpqs_add_factor(GEN relp, long *i, ulong ei, ulong pi)
1077
{ relp[++*i] = pi | (ei << REL_OFFSET); }
1078

1079
static int
1080
zv_is_even(GEN V)
1081
{
1082
  long i, l = lg(V);
1083
  for (i=1; i<l; i++)
1084
    if (odd(uel(V,i))) return 0;
1085
  return 1;
1086
}
1087

1088
static GEN
1089
combine_large_primes(mpqs_handle_t *h, GEN rel1, GEN rel2)
1090
{
1091
  GEN new_Y, new_Y1, Y1 = rel_Y(rel1), Y2 = rel_Y(rel2);
1092
  long l, lei = h->size_of_FB + 1, nb = 0;
1093
  GEN ei, relp, iq, q = rel_q(rel1);
1094

1095
  if (!invmod(q, h->N, &iq)) return equalii(iq, h->N)? NULL: iq; /* rare */
1096
  ei = zero_zv(lei);
1097
  rel_to_ei(ei, rel_p(rel1));
1098
  rel_to_ei(ei, rel_p(rel2));
1099
  if (zv_is_even(ei)) return NULL;
1100
  new_Y = modii(mulii(mulii(Y1, Y2), iq), h->N);
1101
  new_Y1 = subii(h->N, new_Y);
1102
  if (abscmpii(new_Y1, new_Y) < 0) new_Y = new_Y1;
1103
  relp = cgetg(MAX_PE_PAIR+1,t_VECSMALL);
1104
  if (odd(ei[1])) mpqs_add_factor(relp, &nb, 1, 1);
1105
  for (l = 2; l <= lei; l++)
1106
    if (ei[l]) mpqs_add_factor(relp, &nb, ei[l],l);
1107
  setlg(relp, nb+1);
1108
  if (DEBUGLEVEL >= 6)
1109
  {
1110
    GEN relpp, relpc, rel1p, rel1c, rel2p, rel2c;
1111
    split_relp(relp,&relpp,&relpc);
1112
    split_relp(rel1,&rel1p,&rel1c);
1113
    split_relp(rel2,&rel2p,&rel2c);
1114
    err_printf("MPQS: combining\n");
1115
    err_printf("    {%Ps @ %Ps : %Ps}\n", q, Y1, rel1p, rel1c);
1116
    err_printf("  * {%Ps @ %Ps : %Ps}\n", q, Y2, rel2p, rel2c);
1117
    err_printf(" == {%Ps, %Ps}\n", relpp, relpc);
1118
  }
1119
#ifdef MPQS_DEBUG
1120
  {
1121
    pari_sp av1 = avma;
1122
    if (!equalii(modii(sqri(new_Y), h->N), mpqs_factorback(h, relp)))
1123
      pari_err_BUG("MPQS: combined large prime relation is false");
1124
    set_avma(av1);
1125
  }
1126
#endif
1127
  return mkvec2(new_Y, relp);
1128
}
1129

1130
/* nc candidates */
1131
static GEN
1132
mpqs_eval_cand(mpqs_handle_t *h, long nc, hashtable *frel, hashtable *lprel)
1133
{
1134
  mpqs_FB_entry_t *FB = h->FB;
1135
  GEN A = h->A, B = h->B;
1136
  long *relaprimes = h->relaprimes, *candidates = h->candidates;
1137
  long pi, i;
1138
  int pii;
1139
  mpqs_per_A_prime_t *per_A_pr = h->per_A_pr;
1140

1141
  for (i = 0; i < nc; i++)
1142
  {
1143
    pari_sp btop = avma;
1144
    GEN Qx, Qx_part, Y, relp = cgetg(MAX_PE_PAIR+1,t_VECSMALL);
1145
    long powers_of_2, p, x = candidates[i], nb = 0;
1146
    int relaprpos = 0;
1147
    long k;
1148
    unsigned char thr = h->sieve_array[x];
1149
    /* Y = 2*A*x + B, Qx = Y^2/(4*A) = Q(x) */
1150
    Y = addii(mulis(A, 2 * (x - h->M)), B);
1151
    Qx = subii(sqri(Y), h->kN); /* != 0 since N not a square and (N,k) = 1 */
1152
    if (signe(Qx) < 0)
1153
    {
1154
      setabssign(Qx);
1155
      mpqs_add_factor(relp, &nb, 1, 1); /* i = 1, ei = 1, pi */
1156
    }
1157
    /* Qx > 0, divide by powers of 2; we're really dealing with 4*A*Q(x), so we
1158
     * always have at least 2^2 here, and at least 2^3 when kN = 1 mod 4 */
1159
    powers_of_2 = vali(Qx);
1160
    Qx = shifti(Qx, -powers_of_2);
1161
    mpqs_add_factor(relp, &nb, powers_of_2, 2); /* i = 1, ei = 1, pi */
1162
    /* When N is small, it may happen that N | Qx outright. In any case, when
1163
     * no extensive prior trial division / Rho / ECM was attempted, gcd(Qx,N)
1164
     * may turn out to be a nontrivial factor of N (not in FB or we'd have
1165
     * found it already, but possibly smaller than the large prime bound). This
1166
     * is too rare to check for here in the inner loop, but it will be caught
1167
     * if such an LP relation is ever combined with another. */
1168

1169
    /* Pass 1 over odd primes in FB: pick up all possible divisors of Qx
1170
     * including those sitting in k or in A, and remember them in relaprimes.
1171
     * Do not yet worry about possible repeated factors, these will be found in
1172
     * the Pass 2. Pass 1 recognizes divisors of A by their corresponding flags
1173
     * bit in the FB entry. (Divisors of k are ignored at this stage.)
1174
     * We construct a preliminary table of FB subscripts and "exponents" of FB
1175
     * primes which divide Qx. (We store subscripts, not the primes themselves.)
1176
     * We distinguish three cases:
1177
     * 0) prime in A which does not divide Qx/A,
1178
     * 1) prime not in A which divides Qx/A,
1179
     * 2) prime in A which divides Qx/A.
1180
     * Cases 1 and 2 need checking for repeated factors, kind 0 doesn't.
1181
     * Cases 0 and 1 contribute 1 to the exponent in the relation, case 2
1182
     * contributes 2.
1183
     * Factors in common with k are simpler: if they occur, they occur
1184
     * exactly to the first power, and this makes no difference in Pass 1,
1185
     * so they behave just like every normal odd FB prime. */
1186
    for (Qx_part = A, pi = 3; pi< h->index1_FB; pi++)
1187
    {
1188
      ulong p = FB[pi].fbe_p;
1189
      long xp = x % p;
1190
      /* Here we used that MPQS_FBE_DIVIDES_A = 1. */
1191

1192
      if (xp == FB[pi].fbe_start1 || xp == FB[pi].fbe_start2)
1193
      { /* p divides Q(x)/A and possibly A, case 2 or 3 */
1194
        ulong ei = FB[pi].fbe_flags & MPQS_FBE_DIVIDES_A;
1195
        relaprimes[relaprpos++] = pi;
1196
        relaprimes[relaprpos++] = 1 + ei;
1197
        Qx_part = muliu(Qx_part, p);
1198
      }
1199
    }
1200
    for (  ; thr && (p = FB[pi].fbe_p); pi++)
1201
    {
1202
      long xp = x % p;
1203
      /* Here we used that MPQS_FBE_DIVIDES_A = 1. */
1204

1205
      if (xp == FB[pi].fbe_start1 || xp == FB[pi].fbe_start2)
1206
      { /* p divides Q(x)/A and possibly A, case 2 or 3 */
1207
        ulong ei = FB[pi].fbe_flags & MPQS_FBE_DIVIDES_A;
1208
        relaprimes[relaprpos++] = pi;
1209
        relaprimes[relaprpos++] = 1 + ei;
1210
        Qx_part = muliu(Qx_part, p);
1211
        thr -= FB[pi].fbe_logval;
1212
      }
1213
    }
1214
    for (k = 0;  k< h->omega_A; k++)
1215
    {
1216
      long pi = MPQS_I(k);
1217
      ulong p = FB[pi].fbe_p;
1218
      long xp = x % p;
1219
      if (!(xp == FB[pi].fbe_start1 || xp == FB[pi].fbe_start2))
1220
      { /* p divides A but does not divide Q(x)/A, case 1 */
1221
        relaprimes[relaprpos++] = pi;
1222
        relaprimes[relaprpos++] = 0;
1223
      }
1224
    }
1225
    /* We have accumulated the known factors of Qx except for possible repeated
1226
     * factors and for possible large primes.  Divide off what we have.
1227
     * This is faster than dividing off A and each prime separately. */
1228
    Qx = diviiexact(Qx, Qx_part);
1229

1230
#ifdef MPQS_DEBUG
1231
    err_printf("MPQS DEBUG: eval loop 3, avma = 0x%lX\n", (ulong)avma);
1232
#endif
1233
    /* Pass 2: deal with repeated factors and store tentative relation. At this
1234
     * point, the only primes which can occur again in the adjusted Qx are
1235
     * those in relaprimes which are followed by 1 or 2. We must pick up those
1236
     * followed by a 0, too. */
1237
    PRINT_IF_VERBOSE("a");
1238
    for (pii = 0; pii < relaprpos; pii += 2)
1239
    {
1240
      ulong r, ei = relaprimes[pii+1];
1241
      GEN q;
1242

1243
      pi = relaprimes[pii];
1244
      /* p | k (identified by its index before index0_FB)* or p | A (ei = 0) */
1245
      if ((mpqs_int32_t)pi < h->index0_FB || ei == 0)
1246
      {
1247
        mpqs_add_factor(relp, &nb, 1, pi);
1248
        continue;
1249
      }
1250
      p = FB[pi].fbe_p;
1251
      /* p might still divide the current adjusted Qx. Try it. */
1252
      switch(cmpiu(Qx, p))
1253
      {
1254
        case 0: ei++; Qx = gen_1; break;
1255
        case 1:
1256
          q = absdiviu_rem(Qx, p, &r);
1257
          while (r == 0) { ei++; Qx = q; q = absdiviu_rem(Qx, p, &r); }
1258
          break;
1259
      }
1260
      mpqs_add_factor(relp, &nb, ei, pi);
1261
    }
1262

1263
#ifdef MPQS_DEBUG
1264
    err_printf("MPQS DEBUG: eval loop 4, avma = 0x%lX\n", (ulong)avma);
1265
#endif
1266
    PRINT_IF_VERBOSE("\bb");
1267
    setlg(relp, nb+1);
1268
    if (is_pm1(Qx))
1269
    {
1270
      GEN rel = gerepilecopy(btop, mkvec2(absi_shallow(Y), relp));
1271
#ifdef MPQS_DEBUG
1272
      mpqs_check_rel(h, rel);
1273
#endif
1274
      frel_add(frel, rel);
1275
    }
1276
    else if (cmpiu(Qx, h->lp_bound) <= 0)
1277
    {
1278
      ulong q = itou(Qx);
1279
      GEN rel = mkvec3(absi_shallow(Y),relp,Qx);
1280
      GEN col = hash_haskey_GEN(lprel, (void*)q);
1281
#ifdef MPQS_DEBUG
1282
      mpqs_check_rel(h, rel);
1283
#endif
1284
      if (!col) /* relation up to large prime */
1285
        hash_insert(lprel, (void*)q, (void*)gerepilecopy(btop,rel));
1286
      else if ((rel = combine_large_primes(h, rel, col)))
1287
      {
1288
        if (typ(rel) == t_INT) return rel; /* very unlikely */
1289
#ifdef MPQS_DEBUG
1290
        mpqs_check_rel(h, rel);
1291
#endif
1292
        frel_add(frel, gerepilecopy(btop,rel));
1293
      }
1294
      else
1295
        set_avma(btop);
1296
    }
1297
    else
1298
    { /* TODO: check for double large prime */
1299
      PRINT_IF_VERBOSE("\b.");
1300
      set_avma(btop);
1301
    }
1302
  }
1303
  PRINT_IF_VERBOSE("\n");
1304
  return NULL;
1305
}
1306

1307
/*********************************************************************/
1308
/**                    FROM RELATIONS TO DIVISORS                   **/
1309
/*********************************************************************/
1310

1311
/* create an F2m from a relations list */
1312
static GEN
1313
rels_to_F2Ms(GEN rel)
1314
{
1315
  long i, cols = lg(rel)-1;
1316
  GEN m = cgetg(cols+1, t_VEC);
1317
  for (i = 1; i <= cols; i++)
1318
  {
1319
    GEN relp = gmael(rel,i,2), rel2;
1320
    long j, l = lg(relp), o = 0, k;
1321
    for (j = 1; j < l; j++)
1322
      if (odd(relp[j] >> REL_OFFSET)) o++;
1323
    rel2 = cgetg(o+1, t_VECSMALL);
1324
    for (j = 1, k = 1; j < l; j++)
1325
      if (odd(relp[j] >> REL_OFFSET))
1326
        rel2[k++] = relp[j] & REL_MASK;
1327
    gel(m, i) = rel2;
1328
  }
1329
  return m;
1330
}
1331

1332
static int
1333
split(GEN *D, long *e)
1334
{
1335
  ulong mask;
1336
  long flag;
1337
  if (MR_Jaeschke(*D)) { *e = 1; return 1; } /* probable prime */
1338
  if (Z_issquareall(*D, D))
1339
  { /* squares could cost us a lot of time */
1340
    if (DEBUGLEVEL >= 5) err_printf("MPQS: decomposed a square\n");
1341
    *e = 2; return 1;
1342
  }
1343
  mask = 7;
1344
  /* 5th/7th powers aren't worth the trouble. OTOH once we have the hooks for
1345
   * dealing with cubes, higher powers can be handled essentially for free) */
1346
  if ((flag = is_357_power(*D, D, &mask)))
1347
  {
1348
    if (DEBUGLEVEL >= 5)
1349
      err_printf("MPQS: decomposed a %s power\n", uordinal(flag));
1350
    *e = flag; return 1;
1351
  }
1352
  *e = 0; return 0; /* known composite */
1353
}
1354

1355
/* return a GEN structure containing NULL but safe for gerepileupto */
1356
static GEN
1357
mpqs_solve_linear_system(mpqs_handle_t *h, hashtable *frel)
1358
{
1359
  mpqs_FB_entry_t *FB = h->FB;
1360
  pari_sp av = avma;
1361
  GEN rels = hash_keys(frel), N = h->N, r, c, res, ei, M, Ker;
1362
  long i, j, nrows, rlast, rnext, rmax, rank;
1363

1364
  M = rels_to_F2Ms(rels);
1365
  Ker = F2Ms_ker(M, h->size_of_FB+1); rank = lg(Ker)-1;
1366
  if (DEBUGLEVEL >= 4)
1367
  {
1368
    if (DEBUGLEVEL >= 7)
1369
      err_printf("\\\\ MPQS RELATION MATRIX\nFREL=%Ps\nKERNEL=%Ps\n",M, Ker);
1370
    err_printf("MPQS: Gauss done: kernel has rank %ld, taking gcds...\n", rank);
1371
  }
1372
  if (!rank)
1373
  { /* trivial kernel; main loop may look for more relations */
1374
    if (DEBUGLEVEL >= 3)
1375
      pari_warn(warner, "MPQS: no solutions found from linear system solver");
1376
    return gc_NULL(av); /* no factors found */
1377
  }
1378

1379
  /* Expect up to 2^rank pairwise coprime factors, but a kernel basis vector
1380
   * may not contribute to the decomposition; r stores the factors and c what
1381
   * we know about them (0: composite, 1: probably prime, >= 2: proper power) */
1382
  ei = cgetg(h->size_of_FB + 2, t_VECSMALL);
1383
  rmax = logint(N, utoi(3));
1384
  if (rank <= BITS_IN_LONG-2)
1385
    rmax = minss(rmax, 1L<<rank); /* max # of factors we can hope for */
1386
  r = cgetg(rmax+1, t_VEC);
1387
  c = cgetg(rmax+1, t_VECSMALL);
1388
  rnext = rlast = 1;
1389
  nrows = lg(M)-1;
1390
  for (i = 1; i <= rank; i++)
1391
  { /* loop over kernel basis */
1392
    GEN X = gen_1, Y_prod = gen_1, X_plus_Y, D;
1393
    pari_sp av2 = avma, av3;
1394
    long done = 0; /* # probably-prime factors or powers whose bases we won't
1395
                    * handle any further */
1396
    memset((void *)(ei+1), 0, (h->size_of_FB + 1) * sizeof(long));
1397
    for (j = 1; j <= nrows; j++)
1398
      if (F2m_coeff(Ker, j, i))
1399
      {
1400
        GEN R = gel(rels,j);
1401
        Y_prod = gerepileuptoint(av2, remii(mulii(Y_prod, gel(R,1)), N));
1402
        rel_to_ei(ei, gel(R,2));
1403
      }
1404
    av3 = avma;
1405
    for (j = 2; j <= h->size_of_FB + 1; j++)
1406
      if (ei[j])
1407
      {
1408
        GEN q = utoipos(FB[j].fbe_p);
1409
        if (ei[j] & 1) pari_err_BUG("MPQS (relation is a nonsquare)");
1410
        X = remii(mulii(X, Fp_powu(q, (ulong)ei[j]>>1, N)), N);
1411
        X = gerepileuptoint(av3, X);
1412
      }
1413
    if (MPQS_DEBUGLEVEL >= 1 && !dvdii(subii(sqri(X), sqri(Y_prod)), N))
1414
    {
1415
      err_printf("MPQS: X^2 - Y^2 != 0 mod N\n");
1416
      err_printf("\tindex i = %ld\n", i);
1417
      pari_warn(warner, "MPQS: wrong relation found after Gauss");
1418
    }
1419
    /* At this point, gcd(X-Y, N) * gcd(X+Y, N) = N:
1420
     * 1) N | X^2 - Y^2, so it divides the LHS;
1421
     * 2) let P be any prime factor of N. If P | X-Y and P | X+Y, then P | 2X
1422
     * But X is a product of powers of FB primes => coprime to N.
1423
     * Hence we work with gcd(X+Y, N) alone. */
1424
    X_plus_Y = addii(X, Y_prod);
1425
    if (rnext == 1)
1426
    { /* we still haven't decomposed, and want both a gcd and its cofactor. */
1427
      D = gcdii(X_plus_Y, N);
1428
      if (is_pm1(D) || equalii(D,N)) { set_avma(av2); continue; }
1429
      /* got something that works */
1430
      if (DEBUGLEVEL >= 5)
1431
        err_printf("MPQS: splitting N after %ld kernel vector%s\n",
1432
                   i+1, (i? "s" : ""));
1433
      gel(r,1) = diviiexact(N, D);
1434
      gel(r,2) = D;
1435
      rlast = rnext = 3;
1436
      if (split(&gel(r,1), &c[1])) done++;
1437
      if (split(&gel(r,2), &c[2])) done++;
1438
      if (done == 2 || rmax == 2) break;
1439
      if (DEBUGLEVEL >= 5)
1440
        err_printf("MPQS: got two factors, looking for more...\n");
1441
    }
1442
    else
1443
    { /* we already have factors */
1444
      for (j = 1; j < rnext; j++)
1445
      { /* loop over known-composite factors */
1446
        /* skip probable primes and also roots of pure powers: they are a lot
1447
         * smaller than N and should be easy to deal with later */
1448
        if (c[j]) { done++; continue; }
1449
        av3 = avma; D = gcdii(X_plus_Y, gel(r,j));
1450
        if (is_pm1(D) || equalii(D, gel(r,j))) { set_avma(av3); continue; }
1451
        /* got one which splits this factor */
1452
        if (DEBUGLEVEL >= 5)
1453
          err_printf("MPQS: resplitting a factor after %ld kernel vectors\n",
1454
                     i+1);
1455
        gel(r,j) = diviiexact(gel(r,j), D);
1456
        gel(r,rnext) = D;
1457
        if (split(&gel(r,j), &c[j])) done++;
1458
        /* Don't increment done: happens later when we revisit c[rnext] during
1459
         * the present inner loop. */
1460
        (void)split(&gel(r,rnext), &c[rnext]);
1461
        if (++rnext > rmax) break; /* all possible factors seen */
1462
      } /* loop over known composite factors */
1463

1464
      if (rnext > rlast)
1465
      {
1466
        if (DEBUGLEVEL >= 5)
1467
          err_printf("MPQS: got %ld factors%s\n", rlast - 1,
1468
                     (done < rlast ? ", looking for more..." : ""));
1469
        rlast = rnext;
1470
      }
1471
      /* break out if we have rmax factors or all current factors are probable
1472
       * primes or tiny roots from pure powers */
1473
      if (rnext > rmax || done == rnext - 1) break;
1474
    }
1475
  }
1476
  if (rnext == 1) return gc_NULL(av); /* no factors found */
1477

1478
  /* normal case: convert to ifac format as described in ifactor1.c (value,
1479
   * exponent, class [unknown, known composite, known prime]) */
1480
  rlast = rnext - 1; /* # of distinct factors found */
1481
  res = cgetg(3*rlast + 1, t_VEC);
1482
  if (DEBUGLEVEL >= 6) err_printf("MPQS: wrapping up %ld factors\n", rlast);
1483
  for (i = j = 1; i <= rlast; i++, j += 3)
1484
  {
1485
    long C  = c[i];
1486
    icopyifstack(gel(r,i), gel(res,j)); /* factor */
1487
    gel(res,j+1) = C <= 1? gen_1: utoipos(C); /* exponent */
1488
    gel(res,j+2) = C ? NULL: gen_0; /* unknown or known composite */
1489
    if (DEBUGLEVEL >= 6)
1490
      err_printf("\tpackaging %ld: %Ps ^%ld (%s)\n", i, gel(r,i),
1491
                 itos(gel(res,j+1)), (C? "unknown": "composite"));
1492
  }
1493
  return res;
1494
}
1495

1496
/*********************************************************************/
1497
/**               MAIN ENTRY POINT AND DRIVER ROUTINE               **/
1498
/*********************************************************************/
1499
static void
1500
toolarge()
1501
{ pari_warn(warner, "MPQS: number too big to be factored with MPQS,\n\tgiving up"); }
1502

1503
/* Factors N using the self-initializing multipolynomial quadratic sieve
1504
 * (SIMPQS).  Returns one of the two factors, or (usually) a vector of factors
1505
 * and exponents and information about which ones are still composite, or NULL
1506
 * when we can't seem to make any headway. */
1507
GEN
1508
mpqs(GEN N)
1509
{
1510
  const long size_N = decimal_len(N);
1511
  mpqs_handle_t H;
1512
  GEN fact; /* will in the end hold our factor(s) */
1513
  mpqs_FB_entry_t *FB; /* factor base */
1514
  double dbg_target, DEFEAT;
1515
  ulong p;
1516
  pari_timer T;
1517
  hashtable lprel, frel;
1518
  pari_sp av = avma;
1519

1520
  if (DEBUGLEVEL >= 4) err_printf("MPQS: number to factor N = %Ps\n", N);
1521
  if (size_N > MPQS_MAX_DIGIT_SIZE_KN) { toolarge(); return NULL; }
1522
  if (DEBUGLEVEL >= 4)
1523
  {
1524
    timer_start(&T);
1525
    err_printf("MPQS: factoring number of %ld decimal digits\n", size_N);
1526
  }
1527
  H.N = N;
1528
  H.bin_index = 0;
1529
  H.index_i = 0;
1530
  H.index_j = 0;
1531
  H.index2_moved = 0;
1532
  p = mpqs_find_k(&H);
1533
  if (p) { set_avma(av); return utoipos(p); }
1534
  if (DEBUGLEVEL >= 5)
1535
    err_printf("MPQS: found multiplier %ld for N\n", H._k->k);
1536
  H.kN = muliu(N, H._k->k);
1537
  if (!mpqs_set_parameters(&H)) { toolarge(); return NULL; }
1538

1539
  if (DEBUGLEVEL >= 5)
1540
    err_printf("MPQS: creating factor base and allocating arrays...\n");
1541
  FB = mpqs_create_FB(&H, &p);
1542
  if (p) { set_avma(av); return utoipos(p); }
1543
  mpqs_sieve_array_ctor(&H);
1544
  mpqs_poly_ctor(&H);
1545

1546
  H.lp_bound = minss(H.largest_FB_p, MPQS_LP_BOUND);
1547
  /* don't allow large primes to have room for two factors both bigger than
1548
   * what the FB contains (...yet!) */
1549
  H.lp_bound *= minss(H.lp_scale, H.largest_FB_p - 1);
1550
  H.dkN = gtodouble(H.kN);
1551
  /* compute the threshold and fill in the byte-sized scaled logarithms */
1552
  mpqs_set_sieve_threshold(&H);
1553
  if (!mpqs_locate_A_range(&H)) return NULL;
1554
  if (DEBUGLEVEL >= 4)
1555
  {
1556
    err_printf("MPQS: sieving interval = [%ld, %ld]\n", -(long)H.M, (long)H.M);
1557
    /* that was a little white lie, we stop one position short at the top */
1558
    err_printf("MPQS: size of factor base = %ld\n", (long)H.size_of_FB);
1559
    err_printf("MPQS: striving for %ld relations\n", (long)H.target_rels);
1560
    err_printf("MPQS: coefficients A will be built from %ld primes each\n",
1561
               (long)H.omega_A);
1562
    err_printf("MPQS: primes for A to be chosen near FB[%ld] = %ld\n",
1563
               (long)H.index2_FB, (long)FB[H.index2_FB].fbe_p);
1564
    err_printf("MPQS: smallest prime used for sieving FB[%ld] = %ld\n",
1565
               (long)H.index1_FB, (long)FB[H.index1_FB].fbe_p);
1566
    err_printf("MPQS: largest prime in FB = %ld\n", (long)H.largest_FB_p);
1567
    err_printf("MPQS: bound for `large primes' = %ld\n", (long)H.lp_bound);
1568
    if (DEBUGLEVEL >= 5)
1569
      err_printf("MPQS: sieve threshold = %u\n", (unsigned int)H.sieve_threshold);
1570
    err_printf("MPQS: computing relations:");
1571
  }
1572

1573
  /* main loop which
1574
   * - computes polynomials and their zeros (SI)
1575
   * - does the sieving
1576
   * - tests candidates of the sieve array */
1577

1578
  /* Let (A, B_i) the current pair of coeffs. If i == 0 a new A is generated */
1579
  H.index_j = (mpqs_uint32_t)-1;  /* increment below will have it start at 0 */
1580

1581
  dbg_target = H.target_rels / 100.;
1582
  DEFEAT = H.target_rels * 1.5;
1583
  hash_init_GEN(&frel, H.target_rels, gequal, 1);
1584
  hash_init_ulong(&lprel,H.target_rels, 1);
1585
  for(;;)
1586
  {
1587
    long tc;
1588
    /* self initialization: compute polynomial and its zeros */
1589
    if (!mpqs_self_init(&H))
1590
    { /* have run out of primes for A; give up */
1591
      if (DEBUGLEVEL >= 2)
1592
        err_printf("MPQS: Ran out of primes for A, giving up.\n");
1593
      return gc_NULL(av);
1594
    }
1595
    mpqs_sieve(&H);
1596
    tc = mpqs_eval_sieve(&H);
1597
    if (DEBUGLEVEL >= 6)
1598
      err_printf("MPQS: found %lu candidate%s\n", tc, (tc==1? "" : "s"));
1599
    if (tc)
1600
    {
1601
      fact = mpqs_eval_cand(&H, tc, &frel, &lprel);
1602
      if (fact)
1603
      { /* factor found during combining */
1604
        if (DEBUGLEVEL >= 4)
1605
        {
1606
          err_printf("\nMPQS: split N whilst combining, time = %ld ms\n",
1607
                     timer_delay(&T));
1608
          err_printf("MPQS: found factor = %Ps\n", fact);
1609
        }
1610
        return gerepileupto(av, fact);
1611
      }
1612
    }
1613
    if (DEBUGLEVEL >= 4 && frel.nb > dbg_target)
1614
    {
1615
      err_printf(" %ld%%", 100*frel.nb/ H.target_rels);
1616
      if (DEBUGLEVEL >= 5) err_printf(" (%ld ms)", timer_delay(&T));
1617
      dbg_target += H.target_rels / 100.;
1618
    }
1619
    if (frel.nb < (ulong)H.target_rels) continue; /* main loop */
1620

1621
    if (DEBUGLEVEL >= 4)
1622
    {
1623
      timer_start(&T);
1624
      err_printf("\nMPQS: starting Gauss over F_2 on %ld distinct relations\n",
1625
                 frel.nb);
1626
    }
1627
    fact = mpqs_solve_linear_system(&H, &frel);
1628
    if (fact)
1629
    { /* solution found */
1630
      if (DEBUGLEVEL >= 4)
1631
      {
1632
        err_printf("\nMPQS: time in Gauss and gcds = %ld ms\n",timer_delay(&T));
1633
        if (typ(fact) == t_INT) err_printf("MPQS: found factor = %Ps\n", fact);
1634
        else
1635
        {
1636
          long j, nf = (lg(fact)-1)/3;
1637
          err_printf("MPQS: found %ld factors =\n", nf);
1638
          for (j = 1; j <= nf; j++)
1639
            err_printf("\t%Ps%s\n", gel(fact,3*j-2), (j < nf)? ",": "");
1640
        }
1641
      }
1642
      return gerepileupto(av, fact);
1643
    }
1644
    if (DEBUGLEVEL >= 4)
1645
    {
1646
      err_printf("\nMPQS: time in Gauss and gcds = %ld ms\n",timer_delay(&T));
1647
      err_printf("MPQS: no factors found.\n");
1648
      if (frel.nb < DEFEAT)
1649
        err_printf("\nMPQS: restarting sieving ...\n");
1650
      else
1651
        err_printf("\nMPQS: giving up.\n");
1652
    }
1653
    if (frel.nb >= DEFEAT) return gc_NULL(av);
1654
    H.target_rels += 10;
1655
  }
1656
}
1657

1658
Product

Resources

Company