CoCalc -- gcdll.c

Testing latest pari + WASM + node.js... and it works?! Wow.
pari / 2021-07-01 / pari-2.14.0.alpha / src / kernel / none / gcdll.c
²⁸⁴⁹⁵ views
License: GPL3
ubuntu2004
1
#line 2 "../src/kernel/none/gcdll.c"
2
/* Copyright (C) 2000  The PARI group.
3

4
This file is part of the PARI/GP package.
5

6
PARI/GP is free software; you can redistribute it and/or modify it under the
7
terms of the GNU General Public License as published by the Free Software
8
Foundation; either version 2 of the License, or (at your option) any later
9
version. It is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY WHATSOEVER.
11

12
Check the License for details. You should have received a copy of it, along
13
with the package; see the file 'COPYING'. If not, write to the Free Software
14
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
15

16
/***********************************************************************/
17
/**                                                                   **/
18
/**                          GCD                                      **/
19
/**                                                                   **/
20
/***********************************************************************/
21
/* Fast ulong gcd.  Called with y odd; x can be arbitrary (but will most of
22
 * the time be smaller than y). */
23

24
/* Gotos are Harmful, and Programming is a Science.  E.W.Dijkstra. */
25
INLINE ulong
26
gcduodd(ulong x, ulong y)         /* assume y&1==1, y > 1 */
27
{
28
  if (!x) return y;
29
  /* fix up x */
30
  while (!(x&1)) x>>=1;
31
  if (x==1) return 1;
32
  if (x==y) return y;
33
  else if (x>y) goto xislarger;/* will be rare, given how we'll use this */
34
  /* loop invariants: x,y odd and distinct. */
35
 yislarger:
36
  if ((x^y)&2)                 /* ...01, ...11 or vice versa */
37
    y=(x>>2)+(y>>2)+1;         /* ==(x+y)>>2 except it can't overflow */
38
  else                         /* ...01,...01 or ...11,...11 */
39
    y=(y-x)>>2;                /* now y!=0 in either case */
40
  while (!(y&1)) y>>=1;        /* kill any windfall-gained powers of 2 */
41
  if (y==1) return 1;          /* comparand == return value... */
42
  if (x==y) return y;          /* this and the next is just one comparison */
43
  else if (x<y) goto yislarger;/* else fall through to xislarger */
44

45
 xislarger:                    /* same as above, seen through a mirror */
46
  if ((x^y)&2)
47
    x=(x>>2)+(y>>2)+1;
48
  else
49
    x=(x-y)>>2;                /* x!=0 */
50
  while (!(x&1)) x>>=1;
51
  if (x==1) return 1;
52
  if (x==y) return y;
53
  else if (x>y) goto xislarger;
54

55
  goto yislarger;
56
}
57
/* Gotos are useful, and Programming is an Art.  D.E.Knuth. */
58
/* PS: Of course written with Dijkstra's lessons firmly in mind... --GN */
59

60
/* at least one of a or b is odd, return gcd(a,b) */
61
INLINE ulong
62
mygcduodd(ulong a, ulong b)
63
{
64
  ulong c;
65
  if (b&1)
66
  {
67
    if (a==1 || b==1)
68
      c = 1;
69
    else
70
     c = gcduodd(a, b);
71
  }
72
  else
73
  {
74
    if (a==1)
75
      c = 1;
76
    else
77
      c = gcduodd(b, a);
78
  }
79
  return c;
80
}
81

82
/* modified right shift binary algorithm with at most one division */
83
ulong
84
ugcd(ulong a,ulong b)
85
{
86
  long v;
87
  if (!b) return a;
88
  if (!a) return b;
89
  if (a>b) { a %= b; if (!a) return b; }
90
  else     { b %= a; if (!b) return a; }
91
  v = vals(a|b);
92
  return mygcduodd(a>>v, b>>v) << v;
93
}
94
long
95
cgcd(long a,long b) { return (long)ugcd(labs(a), labs(b)); }
96

97
/* For gcdii(): assume a>b>0, return gcd(a,b) as a GEN */
98
static GEN
99
igcduu(ulong a, ulong b)
100
{
101
  long v;
102
  a %= b; if (!a) return utoipos(b);
103
  v = vals(a|b);
104
  return utoipos( mygcduodd(a>>v, b>>v) << v );
105
}
106

107
/*Warning: overflows silently if lcm does not fit*/
108
ulong
109
ulcm(ulong a, ulong b)
110
{
111
  ulong d = ugcd(a,b);
112
  if (!d) return 0;
113
  return d == 1? a*b: a*(b/d);
114
}
115
long
116
clcm(long a,long b) { return ulcm(labs(a), labs(b)); }
117

118
/********************************************************************/
119
/**                                                                **/
120
/**               INTEGER EXTENDED GCD  (AND INVMOD)               **/
121
/**                                                                **/
122
/********************************************************************/
123
/* Two basic ideas - (1) avoid many integer divisions, especially when the
124
 * quotient is 1 which happens ~ 40% of the time.  (2) Use Lehmer's trick as
125
 * modified by Jebelean of extracting a couple of words' worth of leading bits
126
 * from both operands, and compute partial quotients from them as long as we
127
 * can be sure of their values.  Jebelean's modifications consist in
128
 * inequalities from which we can quickly decide whether to carry on or to
129
 * return to the outer loop, and in re-shifting after the first word's worth of
130
 * bits has been used up.  All of this is described in R. Lercier's thesis
131
 * [pp148-153 & 163f.], except his outer loop isn't quite right: the catch-up
132
 * divisions needed when one partial quotient is larger than a word are missing.
133
 *
134
 * The API consists of invmod() and bezout() below; the single-word routines
135
 * xgcduu and xxgcduu may be called directly if desired; lgcdii() probably
136
 * doesn't make much sense out of context.
137
 *
138
 * The whole lot is a factor 6 .. 8 faster on word-sized operands, and asym-
139
 * ptotically about a factor 2.5 .. 3, depending on processor architecture,
140
 * than the naive continued-division code.  Unfortunately, thanks to the
141
 * unrolled loops and all, the code is lengthy. */
142

143
/*==================================
144
 * xgcduu(d,d1,f,v,v1,s)
145
 * xxgcduu(d,d1,f,u,u1,v,v1,s)
146
 * rgcduu(d,d1,vmax,u,u1,v,v1,s)
147
 *==================================*/
148
/*
149
 * Fast `final' extended gcd algorithm, acting on two ulongs.  Ideally this
150
 * should be replaced with assembler versions wherever possible.  The present
151
 * code essentially does `subtract, compare, and possibly divide' at each step,
152
 * which is reasonable when hardware division (a) exists, (b) is a bit slowish
153
 * and (c) does not depend a lot on the operand values (as on i486).  When
154
 * wordsize division is in fact an assembler routine based on subtraction,
155
 * this strategy may not be the most efficient one.
156
 *
157
 * xxgcduu() should be called with  d > d1 > 0, returns gcd(d,d1), and assigns
158
 * the usual signless cont.frac. recurrence matrix to [u, u1; v, v1]  (i.e.,
159
 * the product of all the [0, 1; 1 q_j] where the leftmost factor arises from
160
 * the quotient of the first division step),  and the information about the
161
 * implied signs to s  (-1 when an odd number of divisions has been done,
162
 * 1 otherwise).  xgcduu() is exactly the same except that u,u1 are not com-
163
 * puted (and not returned, of course).
164
 *
165
 * The input flag f should be set to 1 if we know in advance that gcd(d,d1)==1
166
 * (so we can stop the chain division one step early:  as soon as the remainder
167
 * equals 1).  Use this when you intend to use only what would be v, or only
168
 * what would be u and v, after that final division step, but not u1 and v1.
169
 * With the flag in force and thus without that final step, the interesting
170
 * quantity/ies will still sit in [u1 and] v1, of course.
171
 *
172
 * For computing the inverse of a single-word INTMOD known to exist, pass f=1
173
 * to xgcduu(), and obtain the result from s and v1.  (The routine does the
174
 * right thing when d1==1 already.)  For finishing a multiword modinv known
175
 * to exist, pass f=1 to xxgcduu(), and multiply the returned matrix  (with
176
 * properly adjusted signs)  onto the values v' and v1' previously obtained
177
 * from the multiword division steps.  Actually, just take the scalar product
178
 * of [v',v1'] with [u1,-v1], and change the sign if s==-1.  (If the final
179
 * step had been carried out, it would be [-u,v], and s would also change.)
180
 * For reducing a rational number to lowest terms, pass f=0 to xgcduu().
181
 * Finally, f=0 with xxgcduu() is useful for Bezout computations.
182
 * (It is safe for invmod() to call xgcduu() with f=1, because f&1 doesn't
183
 * make a difference when gcd(d,d1)>1.  The speedup is negligible.)
184
 *
185
 * In principle, when gcd(d,d1) is known to be 1, it is straightforward to
186
 * recover the final u,u1 given only v,v1 and s.  However, it probably isn't
187
 * worthwhile, as it trades a few multiplications for a division.
188
 *
189
 * rgcduu() is a variant of xxgcduu() which does not have f  (the effect is
190
 * that of f=0),  but instead has a ulong vmax parameter, for use in rational
191
 * reconstruction below.  It returns when v1 exceeds vmax;  v will never
192
 * exceed vmax.  (vmax=0 is taken as a synonym of ULONG_MAX i.e. unlimited,
193
 * in which case rgcduu behaves exactly like xxgcduu with f=0.)  The return
194
 * value of rgcduu() is typically meaningless;  the interesting part is the
195
 * matrix. */
196

197
ulong
198
xgcduu(ulong d, ulong d1, int f, ulong* v, ulong* v1, long *s)
199
{
200
  ulong xv,xv1, xs, q,res;
201
  LOCAL_HIREMAINDER;
202

203
  /* The above blurb contained a lie.  The main loop always stops when d1
204
   * has become equal to 1.  If (d1 == 1 && !(f&1)) after the loop, we do
205
   * the final `division' of d by 1 `by hand' as it were.
206
   *
207
   * The loop has already been unrolled once.  Aggressive optimization could
208
   * well lead to a totally unrolled assembler version.
209
   *
210
   * On modern x86 architectures, this loop is a pig anyway.  The division
211
   * instruction always puts its result into the same pair of registers, and
212
   * we always want to use one of them straight away, so pipeline performance
213
   * will suck big time.  An assembler version should probably do a first loop
214
   * computing and storing all the quotients -- their number is bounded in
215
   * advance -- and then assembling the matrix in a second pass.  On other
216
   * architectures where we can cycle through four or so groups of registers
217
   * and exploit a fast ALU result-to-operand feedback path, this is much less
218
   * of an issue. */
219
  xs = res = 0;
220
  xv = 0UL; xv1 = 1UL;
221
  while (d1 > 1UL)
222
  {
223
    d -= d1; /* no need to use subll */
224
    if (d >= d1)
225
    {
226
      hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
227
      xv += q * xv1;
228
    }
229
    else
230
      xv += xv1;
231
    if (d <= 1UL) { xs=1; break; } /* possible loop exit */
232
    /* repeat with inverted roles */
233
    d1 -= d;
234
    if (d1 >= d)
235
    {
236
      hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
237
      xv1 += q * xv;
238
    }
239
    else
240
      xv1 += xv;
241
  }
242

243
  if (!(f&1))
244
  { /* division by 1 postprocessing if needed */
245
    if (xs && d==1)
246
    { xv1 += d1 * xv; xs = 0; res = 1UL; }
247
    else if (!xs && d1==1)
248
    { xv += d * xv1; xs = 1; res = 1UL; }
249
  }
250

251
  if (xs)
252
  {
253
    *s = -1; *v = xv1; *v1 = xv;
254
    return (res ? res : (d==1 ? 1UL : d1));
255
  }
256
  else
257
  {
258
    *s = 1; *v = xv; *v1 = xv1;
259
    return (res ? res : (d1==1 ? 1UL : d));
260
  }
261
}
262

263
ulong
264
xxgcduu(ulong d, ulong d1, int f,
265
        ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)
266
{
267
  ulong xu,xu1, xv,xv1, xs, q,res;
268
  LOCAL_HIREMAINDER;
269

270
  xs = res = 0;
271
  xu = xv1 = 1UL;
272
  xu1 = xv = 0UL;
273
  while (d1 > 1UL)
274
  {
275
    /* no need to use subll */
276
    d -= d1;
277
    if (d >= d1)
278
    {
279
      hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
280
      xv += q * xv1;
281
      xu += q * xu1;
282
    }
283
    else
284
    { xv += xv1; xu += xu1; }
285
    if (d <= 1UL) { xs=1; break; } /* possible loop exit */
286
    /* repeat with inverted roles */
287
    d1 -= d;
288
    if (d1 >= d)
289
    {
290
      hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
291
      xv1 += q * xv;
292
      xu1 += q * xu;
293
    }
294
    else
295
    { xv1 += xv; xu1 += xu; }
296
  }
297

298
  if (!(f&1))
299
  { /* division by 1 postprocessing if needed */
300
    if (xs && d==1)
301
    {
302
      xv1 += d1 * xv;
303
      xu1 += d1 * xu;
304
      xs = 0; res = 1UL;
305
    }
306
    else if (!xs && d1==1)
307
    {
308
      xv += d * xv1;
309
      xu += d * xu1;
310
      xs = 1; res = 1UL;
311
    }
312
  }
313

314
  if (xs)
315
  {
316
    *s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
317
    return (res ? res : (d==1 ? 1UL : d1));
318
  }
319
  else
320
  {
321
    *s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
322
    return (res ? res : (d1==1 ? 1UL : d));
323
  }
324
}
325

326
ulong
327
rgcduu(ulong d, ulong d1, ulong vmax,
328
       ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)
329
{
330
  ulong xu,xu1, xv,xv1, xs, q, res=0;
331
  int f = 0;
332
  LOCAL_HIREMAINDER;
333

334
  if (vmax == 0) vmax = ULONG_MAX;
335
  xs = res = 0;
336
  xu = xv1 = 1UL;
337
  xu1 = xv = 0UL;
338
  while (d1 > 1UL)
339
  {
340
    d -= d1; /* no need to use subll */
341
    if (d >= d1)
342
    {
343
      hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
344
      xv += q * xv1;
345
      xu += q * xu1;
346
    }
347
    else
348
    { xv += xv1; xu += xu1; }
349
    /* possible loop exit */
350
    if (xv > vmax) { f=xs=1; break; }
351
    if (d <= 1UL) { xs=1; break; }
352
    /* repeat with inverted roles */
353
    d1 -= d;
354
    if (d1 >= d)
355
    {
356
      hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
357
      xv1 += q * xv;
358
      xu1 += q * xu;
359
    }
360
    else
361
    { xv1 += xv; xu1 += xu; }
362
    /* possible loop exit */
363
    if (xv1 > vmax) { f=1; break; }
364
  }
365

366
  if (!(f&1))
367
  { /* division by 1 postprocessing if needed */
368
    if (xs && d==1)
369
    {
370
      xv1 += d1 * xv;
371
      xu1 += d1 * xu;
372
      xs = 0; res = 1UL;
373
    }
374
    else if (!xs && d1==1)
375
    {
376
      xv += d * xv1;
377
      xu += d * xu1;
378
      xs = 1; res = 1UL;
379
    }
380
  }
381

382
  if (xs)
383
  {
384
    *s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
385
    return (res ? res : (d==1 ? 1UL : d1));
386
  }
387
  else
388
  {
389
    *s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
390
    return (res ? res : (d1==1 ? 1UL : d));
391
  }
392
}
393

394
/*==================================
395
 * cbezout(a,b,uu,vv)
396
 *==================================
397
 * Same as bezout() but for C longs.
398
 *    Return g = gcd(a,b) >= 0, and assign longs u,v through pointers uu,vv
399
 *    such that g = u*a + v*b.
400
 * Special cases:
401
 *    a = b = 0 ==> pick u=1, v=0 (and return 1, surprisingly)
402
 *    a != 0 = b ==> keep v=0
403
 *    a = 0 != b ==> keep u=0
404
 *    |a| = |b| != 0 ==> keep u=0, set v=+-1
405
 * Assignments through uu,vv happen unconditionally. */
406
long
407
cbezout(long a,long b,long *uu,long *vv)
408
{
409
  long s,*t;
410
  ulong d = labs(a), d1 = labs(b);
411
  ulong r,u,u1,v,v1;
412

413
  if (!b)
414
  {
415
    *vv=0L;
416
    if (!a) { *uu=1L; return 0L; }
417
    *uu = a < 0 ? -1L : 1L;
418
    return (long)d;
419
  }
420
  else if (!a || (d == d1))
421
  {
422
    *uu = 0L; *vv = b < 0 ? -1L : 1L;
423
    return (long)d1;
424
  }
425
  else if (d == 1) /* frequently used by nfinit */
426
  {
427
    *uu = a; *vv = 0L;
428
    return 1L;
429
  }
430
  else if (d < d1)
431
  {
432
/* bug in gcc-2.95.3:
433
 * s = a; a = b; b = s; produces wrong result a = b. This is OK:  */
434
    { long _x = a; a = b; b = _x; } /* in order to keep the right signs */
435
    r = d; d = d1; d1 = r;
436
    t = uu; uu = vv; vv = t;
437
  }
438
  /* d > d1 > 0 */
439
  r = xxgcduu(d, d1, 0, &u, &u1, &v, &v1, &s);
440
  if (s < 0)
441
  {
442
    *uu = a < 0 ? (long)u : -(long)u;
443
    *vv = b < 0 ? -(long)v : (long)v;
444
  }
445
  else
446
  {
447
    *uu = a < 0 ? -(long)u : (long)u;
448
    *vv = b < 0 ? (long)v : -(long)v;
449
  }
450
  return (long)r;
451
}
452

453
/*==================================
454
 * lgcdii(d,d1,u,u1,v,v1,vmax)
455
 *==================================*/
456
/* Lehmer's partial extended gcd algorithm, acting on two t_INT GENs.
457
 *
458
 * Tries to determine, using the leading 2*BITS_IN_LONG significant bits of d
459
 * and a quantity of bits from d1 obtained by a shift of the same displacement,
460
 * as many partial quotients of d/d1 as possible, and assigns to [u,u1;v,v1]
461
 * the product of all the [0,1; 1,qj] thus obtained, where the leftmost
462
 * factor arises from the quotient of the first division step.
463
 *
464
 * For use in rational reconstruction, vmax can be given a nonzero value.
465
 * In this case, we will return early as soon as v1 > vmax (i.e. v <= vmax)
466
 *
467
 * MUST be called with  d > d1 > 0, and with  d occupying more than one
468
 * significant word.  Returns the number of reduction/swap steps carried out,
469
 * possibly zero, or under certain conditions minus that number.  When the
470
 * return value is nonzero, the caller should use the returned recurrence
471
 * matrix to update its own copies of d,d1.  When the return value is
472
 * nonpositive, and the latest remainder after updating turns out to be
473
 * nonzero, the caller should at once attempt a full division, rather than
474
 * trying lgcdii() again -- this typically happens when we are about to
475
 * encounter a quotient larger than half a word. (This is not detected
476
 * infallibly -- after a positive return value, it is possible that the next
477
 * stage will end up needing a full division.  After a negative return value,
478
 * however, this is certain, and should be acted upon.)
479
 *
480
 * The sign information, for which xgcduu() has its return argument s, is now
481
 * implicit in the LSB of our return value, and the caller may take advantage
482
 * of the fact that a return value of +-1 implies u==0,u1==v==1  [only v1 pro-
483
 * vides interesting information in this case].  One might also use the fact
484
 * that if the return value is +-2, then u==1, but this is rather marginal.
485
 *
486
 * If it was not possible to determine even the first quotient, either because
487
 * we're too close to an integer quotient or because the quotient would be
488
 * larger than one word  (if the `leading digit' of d1 after shifting is all
489
 * zeros), we return 0 and do not assign anything to the last four args.
490
 *
491
 * The division chain might even run to completion. It is up to the caller to
492
 * detect this case. This routine does not change d or d1; this is also up to
493
 * the caller */
494
int
495
lgcdii(ulong* d, ulong* d1, ulong* u, ulong* u1, ulong* v, ulong* v1,
496
       ulong vmax)
497
{
498
  /* Strategy:  (1) Extract/shift most significant bits.  We assume that d
499
   * has at least two significant words, but we can cope with a one-word d1.
500
   * Let dd,dd1 be the most significant dividend word and matching part of the
501
   * divisor.
502
   * (2) Check for overflow on the first division.  For our purposes, this
503
   * happens when the upper half of dd1 is zero.  (Actually this is detected
504
   * during extraction.)
505
   * (3) Get a fix on the first quotient.  We compute q = floor(dd/dd1), which
506
   * is an upper bound for floor(d/d1), and which gives the true value of the
507
   * latter if (and-almost-only-if) the remainder dd' = dd-q*dd1 is >= q.
508
   * (If it isn't, we give up.  This is annoying because the subsequent full
509
   * division will repeat some work already done, but it happens infrequently.
510
   * Doing the extra-bit-fetch in this case would be awkward.)
511
   * (4) Finish initializations.
512
   *
513
   * The remainder of the action is comparatively boring... The main loop has
514
   * been unrolled once (so we don't swap things and we can apply Jebelean's
515
   * termination conditions which alternatingly take two different forms during
516
   * successive iterations).  When we first run out of sufficient bits to form
517
   * a quotient, and have an extra word of each operand, we pull out two whole
518
   * word's worth of dividend bits, and divisor bits of matching significance;
519
   * to these we apply our partial matrix (disregarding overflow because the
520
   * result mod 2^(2*BITS_IN_LONG) will in fact give the correct values), and
521
   * re-extract one word's worth of the current dividend and a matching amount
522
   * of divisor bits.  The affair will normally terminate with matrix entries
523
   * just short of a whole word.  (We terminate the inner loop before these can
524
   * possibly overflow.) */
525
  ulong dd,dd1,ddlo,dd1lo, sh,shc; /* `digits', shift count */
526
  ulong xu,xu1, xv,xv1, q,res; /* recurrences, partial quotient, count */
527
  ulong tmp0,tmp1,tmp2,tmpd,tmpu,tmpv; /* temps */
528
  ulong dm1, d1m1;
529
  long ld, ld1, lz;
530
  int skip = 0;
531
  LOCAL_OVERFLOW;
532
  LOCAL_HIREMAINDER;
533

534
  /* following is just for convenience: vmax==0 means no bound */
535
  if (vmax == 0) vmax = ULONG_MAX;
536
  ld = lgefint(d); ld1 = lgefint(d1); lz = ld - ld1; /* >= 0 */
537
  if (lz > 1) return 0; /* rare */
538

539
  d = int_MSW(d);  dm1  = *int_precW(d);
540
  d1 = int_MSW(d1);d1m1 = *int_precW(d1);
541
  dd1lo = 0; /* unless we find something better */
542
  sh = bfffo(*d);
543

544
  if (sh)
545
  { /* do the shifting */
546
    shc = BITS_IN_LONG - sh;
547
    if (lz)
548
    { /* dividend longer than divisor */
549
      dd1 = (*d1 >> shc);
550
      if (!(HIGHMASK & dd1)) return 0; /* overflow detected */
551
      if (ld1 > 3)
552
        dd1lo = (*d1 << sh) + (d1m1 >> shc);
553
      else
554
        dd1lo = (*d1 << sh);
555
    }
556
    else
557
    { /* dividend and divisor have the same length */
558
      dd1 = (*d1 << sh);
559
      if (!(HIGHMASK & dd1)) return 0;
560
      if (ld1 > 3)
561
      {
562
        dd1 += (d1m1 >> shc);
563
        if (ld1 > 4)
564
          dd1lo = (d1m1 << sh) + (*int_precW(int_precW(d1)) >> shc);
565
        else
566
          dd1lo = (d1m1 << sh);
567
      }
568
    }
569
    /* following lines assume d to have 2 or more significant words */
570
    dd = (*d << sh) + (dm1 >> shc);
571
    if (ld > 4)
572
      ddlo = (dm1 << sh) + (*int_precW(int_precW(d)) >> shc);
573
    else
574
      ddlo = (dm1 << sh);
575
  }
576
  else
577
  { /* no shift needed */
578
    if (lz) return 0; /* dividend longer than divisor: overflow */
579
    dd1 = *d1;
580
    if (!(HIGHMASK & dd1)) return 0;
581
    if(ld1 > 3) dd1lo = d1m1;
582
    /* assume again that d has another significant word */
583
    dd = *d; ddlo = dm1;
584
  }
585
  /* First subtraction/division stage.  (If a subtraction initially suffices,
586
   * we don't divide at all.)  If a Jebelean condition is violated, and we
587
   * can't fix it even by looking at the low-order bits in ddlo,dd1lo, we
588
   * give up and ask for a full division.  Otherwise we commit the result,
589
   * possibly deciding to re-shift immediately afterwards. */
590
  dd -= dd1;
591
  if (dd < dd1)
592
  { /* first quotient known to be == 1 */
593
    xv1 = 1UL;
594
    if (!dd) /* !(Jebelean condition), extraspecial case */
595
    { /* This actually happens. Now q==1 is known, but we underflow already.
596
       * OTOH we've just shortened d by a whole word. Thus we are happy and
597
       * return. */
598
      *u = 0; *v = *u1 = *v1 = 1UL;
599
      return -1; /* Next step will be a full division. */
600
    }
601
  }
602
  else
603
  { /* division indicated */
604
    hiremainder = 0;
605
    xv1 = 1 + divll(dd, dd1); /* xv1: alternative spelling of `q', here ;) */
606
    dd = hiremainder;
607
    if (dd < xv1) /* !(Jebelean cond'), nonextra special case */
608
    { /* Attempt to complete the division using the less significant bits,
609
       * before skipping right past the 1st loop to the reshift stage. */
610
      ddlo = subll(ddlo, mulll(xv1, dd1lo));
611
      dd = subllx(dd, hiremainder);
612

613
      /* If we now have an overflow, q was too large. Thanks to our decision
614
       * not to get here unless the original dd1 had bits set in the upper half
615
       * of the word, we now know that the correct quotient is in fact q-1. */
616
      if (overflow)
617
      {
618
        xv1--;
619
        ddlo = addll(ddlo,dd1lo);
620
        dd = addllx(dd,dd1); /* overflows again which cancels the borrow */
621
        /* ...and fall through to skip=1 below */
622
      }
623
      else
624
      /* Test Jebelean condition anew, at this point using _all_ the extracted
625
       * bits we have.  This is clutching at straws; we have a more or less
626
       * even chance of succeeding this time.  Note that if we fail, we really
627
       * do not know whether the correct quotient would have been q or some
628
       * smaller value. */
629
        if (!dd && ddlo < xv1) return 0;
630

631
      /* Otherwise, we now know that q is correct, but we cannot go into the
632
       * 1st loop.  Raise a flag so we'll remember to skip past the loop.
633
       * Get here also after the q-1 adjustment case. */
634
      skip = 1;
635
    } /* if !(Jebelean) then */
636
  }
637
  res = 1;
638
  if (xv1 > vmax)
639
  { /* gone past the bound already */
640
    *u = 0UL; *u1 = 1UL; *v = 1UL; *v1 = xv1;
641
    return res;
642
  }
643
  xu = 0UL; xv = xu1 = 1UL;
644

645
  /* Some invariants from here across the first loop:
646
   *
647
   * At this point, and again after we are finished with the first loop and
648
   * subsequent conditional, a division and the attached update of the
649
   * recurrence matrix have just been carried out completely.  The matrix
650
   * xu,xu1;xv,xv1 has been initialized (or updated, possibly with permuted
651
   * columns), and the current remainder == next divisor (dd at the moment)
652
   * is nonzero (it might be zero here, but then skip will have been set).
653
   *
654
   * After the first loop, or when skip is set already, it will also be the
655
   * case that there aren't sufficiently many bits to continue without re-
656
   * shifting.  If the divisor after reshifting is zero, or indeed if it
657
   * doesn't have more than half a word's worth of bits, we will have to
658
   * return at that point.  Otherwise, we proceed into the second loop.
659
   *
660
   * Furthermore, when we reach the re-shift stage, dd:ddlo and dd1:dd1lo will
661
   * already reflect the result of applying the current matrix to the old
662
   * ddorig:ddlo and dd1orig:dd1lo.  (For the first iteration above, this
663
   * was easy to achieve, and we didn't even need to peek into the (now
664
   * no longer existent!) saved words.  After the loop, we'll stop for a
665
   * moment to merge in the ddlo,dd1lo contributions.)
666
   *
667
   * Note that after the 1st division, even an a priori quotient of 1 cannot be
668
   * trusted until we've checked Jebelean's condition: it might be too small */
669
  if (!skip)
670
  {
671
    for(;;)
672
    { /* First half of loop divides dd into dd1, and leaves the recurrence
673
       * matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer
674
       * entries) when successful. */
675
      tmpd = dd1 - dd;
676
      if (tmpd < dd)
677
      { /* quotient suspected to be 1 */
678
        tmpu = xu + xu1; /* cannot overflow -- everything bounded by
679
                          * the original dd during first loop */
680
        tmpv = xv + xv1;
681
      }
682
      else
683
      { /* division indicated */
684
        hiremainder = 0;
685
        q = 1 + divll(tmpd, dd);
686
        tmpd = hiremainder;
687
        tmpu = xu + q*xu1; /* can't overflow, but may need to be undone */
688
        tmpv = xv + q*xv1;
689
      }
690

691
      tmp0 = addll(tmpv, xv1);
692
      if ((tmpd < tmpu) || overflow ||
693
          (dd - tmpd < tmp0)) /* !(Jebelean cond.) */
694
        break; /* skip ahead to reshift stage */
695
      else
696
      { /* commit dd1, xu, xv */
697
        res++;
698
        dd1 = tmpd; xu = tmpu; xv = tmpv;
699
        if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }
700
      }
701

702
      /* Second half of loop divides dd1 into dd, and the matrix returns to its
703
       * normal arrangement. */
704
      tmpd = dd - dd1;
705
      if (tmpd < dd1)
706
      { /* quotient suspected to be 1 */
707
        tmpu = xu1 + xu; /* cannot overflow */
708
        tmpv = xv1 + xv;
709
      }
710
      else
711
      { /* division indicated */
712
        hiremainder = 0;
713
        q = 1 + divll(tmpd, dd1);
714
        tmpd = hiremainder;
715
        tmpu = xu1 + q*xu;
716
        tmpv = xv1 + q*xv;
717
      }
718

719
      tmp0 = addll(tmpu, xu);
720
      if ((tmpd < tmpv) || overflow ||
721
          (dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */
722
        break; /* skip ahead to reshift stage */
723
      else
724
      { /* commit dd, xu1, xv1 */
725
        res++;
726
        dd = tmpd; xu1 = tmpu; xv1 = tmpv;
727
        if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }
728
      }
729
    } /* end of first loop */
730

731
    /* Intermezzo: update dd:ddlo, dd1:dd1lo.  (But not if skip is set.) */
732
    if (res&1)
733
    { /* after failed division in 1st half of loop:
734
       * [dd1:dd1lo,dd:ddlo] = [ddorig:ddlo,dd1orig:dd1lo]
735
       *                       * [ -xu, xu1 ; xv, -xv1 ]
736
       * Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the
737
       * high-order remainders + overflows onto [dd1,dd] */
738
      tmp1 = mulll(ddlo, xu); tmp0 = hiremainder;
739
      tmp1 = subll(mulll(dd1lo,xv), tmp1);
740
      dd1 += subllx(hiremainder, tmp0);
741
      tmp2 = mulll(ddlo, xu1); tmp0 = hiremainder;
742
      ddlo = subll(tmp2, mulll(dd1lo,xv1));
743
      dd += subllx(tmp0, hiremainder);
744
      dd1lo = tmp1;
745
    }
746
    else
747
    { /* after failed division in 2nd half of loop:
748
       * [dd:ddlo,dd1:dd1lo] = [ddorig:ddlo,dd1orig:dd1lo]
749
       *                       * [ xu1, -xu ; -xv1, xv ]
750
       * Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the
751
       * high-order remainders + overflows onto [dd,dd1] */
752
      tmp1 = mulll(ddlo, xu1); tmp0 = hiremainder;
753
      tmp1 = subll(tmp1, mulll(dd1lo,xv1));
754
      dd += subllx(tmp0, hiremainder);
755
      tmp2 = mulll(ddlo, xu); tmp0 = hiremainder;
756
      dd1lo = subll(mulll(dd1lo,xv), tmp2);
757
      dd1 += subllx(hiremainder, tmp0);
758
      ddlo = tmp1;
759
    }
760
  } /* end of skip-pable section:  get here also, with res==1, when there
761
     * was a problem immediately after the very first division. */
762

763
  /* Re-shift.  Note: the shift count _can_ be zero, viz. under the following
764
   * precise conditions:  The original dd1 had its topmost bit set, so the 1st
765
   * q was 1, and after subtraction, dd had its topmost bit unset.  If now dd=0,
766
   * we'd have taken the return exit already, so we couldn't have got here.
767
   * If not, then it must have been the second division which has gone amiss
768
   * (because dd1 was very close to an exact multiple of the remainder dd value,
769
   * so this will be very rare).  At this point, we'd have a fairly slim chance
770
   * of fixing things by re-examining dd1:dd1lo vs. dd:ddlo, but this is not
771
   * guaranteed to work. Instead of trying, we return at once and let caller
772
   * see to it that the initial subtraction is re-done usingall the bits of
773
   * both operands, which already helps, and the next round will either be a
774
   * full division  (if dd occupied a halfword or less), or another llgcdii()
775
   * first step.  In the latter case, since we try a little harder during our
776
   * first step, we may actually be able to fix the problem, and get here again
777
   * with improved low-order bits and with another step under our belt.
778
   * Otherwise we'll have given up above and forced a full division.
779
   *
780
   * If res is even, the shift count _cannot_ be zero.  (The first step forces
781
   * a zero into the remainder's MSB, and all subsequent remainders will have
782
   * inherited it.)
783
   *
784
   * The re-shift stage exits if the next divisor has at most half a word's
785
   * worth of bits.
786
   *
787
   * For didactic reasons, the second loop will be arranged in the same way
788
   * as the first -- beginning with the division of dd into dd1, as if res
789
   * was odd.  To cater for this, if res is actually even, we swap things
790
   * around during reshifting.  (During the second loop, the parity of res
791
   * does not matter; we know in which half of the loop we are when we decide
792
   * to return.) */
793
  if (res&1)
794
  { /* after odd number of division(s) */
795
    if (dd1 && (sh = bfffo(dd1)))
796
    {
797
      shc = BITS_IN_LONG - sh;
798
      dd = (ddlo >> shc) + (dd << sh);
799
      if (!(HIGHMASK & dd))
800
      {
801
        *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
802
        return -res; /* full division asked for */
803
      }
804
      dd1 = (dd1lo >> shc) + (dd1 << sh);
805
    }
806
    else
807
    { /* time to return: <= 1 word left, or sh==0 */
808
      *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
809
      return res;
810
    }
811
  }
812
  else
813
  { /* after even number of divisions */
814
    if (dd)
815
    {
816
      sh = bfffo(dd); /* > 0 */
817
      shc = BITS_IN_LONG - sh;
818
      /* dd:ddlo will become the new dd1, and v.v. */
819
      tmpd = (ddlo >> shc) + (dd << sh);
820
      dd = (dd1lo >> shc) + (dd1 << sh);
821
      dd1 = tmpd;
822
      /* This completes the swap; now dd is again the current divisor */
823
      if (HIGHMASK & dd)
824
      { /* recurrence matrix is the wrong way round; swap it */
825
        tmp0 = xu; xu = xu1; xu1 = tmp0;
826
        tmp0 = xv; xv = xv1; xv1 = tmp0;
827
      }
828
      else
829
      { /* recurrence matrix is the wrong way round; fix this */
830
        *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
831
        return -res;                /* full division asked for */
832
      }
833
    }
834
    else
835
    { /* time to return: <= 1 word left */
836
      *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
837
      return res;
838
    }
839
  } /* end reshift */
840

841
  /* The Second Loop.  Rip-off of the first, but we now check for overflow
842
   * in the recurrences.  Returns instead of breaking when we cannot fix the
843
   * quotient any longer. */
844
  for(;;)
845
  { /* First half of loop divides dd into dd1, and leaves the recurrence
846
     * matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer
847
     * entries) when successful */
848
    tmpd = dd1 - dd;
849
    if (tmpd < dd)
850
    { /* quotient suspected to be 1 */
851
      tmpu = xu + xu1;
852
      tmpv = addll(xv, xv1); /* xv,xv1 will overflow first */
853
      tmp1 = overflow;
854
    }
855
    else
856
    { /* division indicated */
857
      hiremainder = 0;
858
      q = 1 + divll(tmpd, dd);
859
      tmpd = hiremainder;
860
      tmpu = xu + q*xu1;
861
      tmpv = addll(xv, mulll(q,xv1));
862
      tmp1 = overflow | hiremainder;
863
    }
864

865
    tmp0 = addll(tmpv, xv1);
866
    if ((tmpd < tmpu) || overflow || tmp1 ||
867
        (dd - tmpd < tmp0)) /* !(Jebelean cond.) */
868
    { /* The recurrence matrix has not yet been warped... */
869
      *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
870
      break;
871
    }
872
    /* commit dd1, xu, xv */
873
    res++;
874
    dd1 = tmpd; xu = tmpu; xv = tmpv;
875
    if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }
876

877
    /* Second half of loop divides dd1 into dd, and the matrix returns to its
878
     * normal arrangement */
879
    tmpd = dd - dd1;
880
    if (tmpd < dd1)
881
    { /* quotient suspected to be 1 */
882
      tmpu = xu1 + xu;
883
      tmpv = addll(xv1, xv);
884
      tmp1 = overflow;
885
    }
886
    else
887
    { /* division indicated */
888
      hiremainder = 0;
889
      q = 1 + divll(tmpd, dd1);
890
      tmpd = hiremainder;
891
      tmpu = xu1 + q*xu;
892
      tmpv = addll(xv1, mulll(q, xv));
893
      tmp1 = overflow | hiremainder;
894
    }
895

896
    tmp0 = addll(tmpu, xu);
897
    if ((tmpd < tmpv) || overflow || tmp1 ||
898
        (dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */
899
    { /* The recurrence matrix has not yet been unwarped, so it is
900
       * the wrong way round;  fix this. */
901
      *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
902
      break;
903
    }
904

905
    res++; /* commit dd, xu1, xv1 */
906
    dd = tmpd; xu1 = tmpu; xv1 = tmpv;
907
    if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }
908
  } /* end of second loop */
909

910
  return res;
911
}
912

913
/* 1 / Mod(x,p). Assume x < p */
914
ulong
915
Fl_invsafe(ulong x, ulong p)
916
{
917
  long s;
918
  ulong xv, xv1, g = xgcduu(p, x, 1, &xv, &xv1, &s);
919
  if (g != 1UL) return 0UL;
920
  xv = xv1 % p; if (s < 0) xv = p - xv;
921
  return xv;
922
}
923

924
/* 1 / Mod(x,p). Assume x < p */
925
ulong
926
Fl_inv(ulong x, ulong p)
927
{
928
  ulong xv  = Fl_invsafe(x, p);
929
  if (!xv && p!=1UL) pari_err_INV("Fl_inv", mkintmod(utoi(x), utoi(p)));
930
  return xv;
931
}
932

933
Product

Resources

Company