Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download

Testing latest pari + WASM + node.js... and it works?! Wow.

28495 views
License: GPL3
ubuntu2004
1
#line 2 "../src/kernel/none/gcdll.c"
2
/* Copyright (C) 2000 The PARI group.
3
4
This file is part of the PARI/GP package.
5
6
PARI/GP is free software; you can redistribute it and/or modify it under the
7
terms of the GNU General Public License as published by the Free Software
8
Foundation; either version 2 of the License, or (at your option) any later
9
version. It is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY WHATSOEVER.
11
12
Check the License for details. You should have received a copy of it, along
13
with the package; see the file 'COPYING'. If not, write to the Free Software
14
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
15
16
/***********************************************************************/
17
/** **/
18
/** GCD **/
19
/** **/
20
/***********************************************************************/
21
/* Fast ulong gcd. Called with y odd; x can be arbitrary (but will most of
22
* the time be smaller than y). */
23
24
/* Gotos are Harmful, and Programming is a Science. E.W.Dijkstra. */
25
INLINE ulong
26
gcduodd(ulong x, ulong y) /* assume y&1==1, y > 1 */
27
{
28
if (!x) return y;
29
/* fix up x */
30
while (!(x&1)) x>>=1;
31
if (x==1) return 1;
32
if (x==y) return y;
33
else if (x>y) goto xislarger;/* will be rare, given how we'll use this */
34
/* loop invariants: x,y odd and distinct. */
35
yislarger:
36
if ((x^y)&2) /* ...01, ...11 or vice versa */
37
y=(x>>2)+(y>>2)+1; /* ==(x+y)>>2 except it can't overflow */
38
else /* ...01,...01 or ...11,...11 */
39
y=(y-x)>>2; /* now y!=0 in either case */
40
while (!(y&1)) y>>=1; /* kill any windfall-gained powers of 2 */
41
if (y==1) return 1; /* comparand == return value... */
42
if (x==y) return y; /* this and the next is just one comparison */
43
else if (x<y) goto yislarger;/* else fall through to xislarger */
44
45
xislarger: /* same as above, seen through a mirror */
46
if ((x^y)&2)
47
x=(x>>2)+(y>>2)+1;
48
else
49
x=(x-y)>>2; /* x!=0 */
50
while (!(x&1)) x>>=1;
51
if (x==1) return 1;
52
if (x==y) return y;
53
else if (x>y) goto xislarger;
54
55
goto yislarger;
56
}
57
/* Gotos are useful, and Programming is an Art. D.E.Knuth. */
58
/* PS: Of course written with Dijkstra's lessons firmly in mind... --GN */
59
60
/* at least one of a or b is odd, return gcd(a,b) */
61
INLINE ulong
62
mygcduodd(ulong a, ulong b)
63
{
64
ulong c;
65
if (b&1)
66
{
67
if (a==1 || b==1)
68
c = 1;
69
else
70
c = gcduodd(a, b);
71
}
72
else
73
{
74
if (a==1)
75
c = 1;
76
else
77
c = gcduodd(b, a);
78
}
79
return c;
80
}
81
82
/* modified right shift binary algorithm with at most one division */
83
ulong
84
ugcd(ulong a,ulong b)
85
{
86
long v;
87
if (!b) return a;
88
if (!a) return b;
89
if (a>b) { a %= b; if (!a) return b; }
90
else { b %= a; if (!b) return a; }
91
v = vals(a|b);
92
return mygcduodd(a>>v, b>>v) << v;
93
}
94
long
95
cgcd(long a,long b) { return (long)ugcd(labs(a), labs(b)); }
96
97
/* For gcdii(): assume a>b>0, return gcd(a,b) as a GEN */
98
static GEN
99
igcduu(ulong a, ulong b)
100
{
101
long v;
102
a %= b; if (!a) return utoipos(b);
103
v = vals(a|b);
104
return utoipos( mygcduodd(a>>v, b>>v) << v );
105
}
106
107
/*Warning: overflows silently if lcm does not fit*/
108
ulong
109
ulcm(ulong a, ulong b)
110
{
111
ulong d = ugcd(a,b);
112
if (!d) return 0;
113
return d == 1? a*b: a*(b/d);
114
}
115
long
116
clcm(long a,long b) { return ulcm(labs(a), labs(b)); }
117
118
/********************************************************************/
119
/** **/
120
/** INTEGER EXTENDED GCD (AND INVMOD) **/
121
/** **/
122
/********************************************************************/
123
/* Two basic ideas - (1) avoid many integer divisions, especially when the
124
* quotient is 1 which happens ~ 40% of the time. (2) Use Lehmer's trick as
125
* modified by Jebelean of extracting a couple of words' worth of leading bits
126
* from both operands, and compute partial quotients from them as long as we
127
* can be sure of their values. Jebelean's modifications consist in
128
* inequalities from which we can quickly decide whether to carry on or to
129
* return to the outer loop, and in re-shifting after the first word's worth of
130
* bits has been used up. All of this is described in R. Lercier's thesis
131
* [pp148-153 & 163f.], except his outer loop isn't quite right: the catch-up
132
* divisions needed when one partial quotient is larger than a word are missing.
133
*
134
* The API consists of invmod() and bezout() below; the single-word routines
135
* xgcduu and xxgcduu may be called directly if desired; lgcdii() probably
136
* doesn't make much sense out of context.
137
*
138
* The whole lot is a factor 6 .. 8 faster on word-sized operands, and asym-
139
* ptotically about a factor 2.5 .. 3, depending on processor architecture,
140
* than the naive continued-division code. Unfortunately, thanks to the
141
* unrolled loops and all, the code is lengthy. */
142
143
/*==================================
144
* xgcduu(d,d1,f,v,v1,s)
145
* xxgcduu(d,d1,f,u,u1,v,v1,s)
146
* rgcduu(d,d1,vmax,u,u1,v,v1,s)
147
*==================================*/
148
/*
149
* Fast `final' extended gcd algorithm, acting on two ulongs. Ideally this
150
* should be replaced with assembler versions wherever possible. The present
151
* code essentially does `subtract, compare, and possibly divide' at each step,
152
* which is reasonable when hardware division (a) exists, (b) is a bit slowish
153
* and (c) does not depend a lot on the operand values (as on i486). When
154
* wordsize division is in fact an assembler routine based on subtraction,
155
* this strategy may not be the most efficient one.
156
*
157
* xxgcduu() should be called with d > d1 > 0, returns gcd(d,d1), and assigns
158
* the usual signless cont.frac. recurrence matrix to [u, u1; v, v1] (i.e.,
159
* the product of all the [0, 1; 1 q_j] where the leftmost factor arises from
160
* the quotient of the first division step), and the information about the
161
* implied signs to s (-1 when an odd number of divisions has been done,
162
* 1 otherwise). xgcduu() is exactly the same except that u,u1 are not com-
163
* puted (and not returned, of course).
164
*
165
* The input flag f should be set to 1 if we know in advance that gcd(d,d1)==1
166
* (so we can stop the chain division one step early: as soon as the remainder
167
* equals 1). Use this when you intend to use only what would be v, or only
168
* what would be u and v, after that final division step, but not u1 and v1.
169
* With the flag in force and thus without that final step, the interesting
170
* quantity/ies will still sit in [u1 and] v1, of course.
171
*
172
* For computing the inverse of a single-word INTMOD known to exist, pass f=1
173
* to xgcduu(), and obtain the result from s and v1. (The routine does the
174
* right thing when d1==1 already.) For finishing a multiword modinv known
175
* to exist, pass f=1 to xxgcduu(), and multiply the returned matrix (with
176
* properly adjusted signs) onto the values v' and v1' previously obtained
177
* from the multiword division steps. Actually, just take the scalar product
178
* of [v',v1'] with [u1,-v1], and change the sign if s==-1. (If the final
179
* step had been carried out, it would be [-u,v], and s would also change.)
180
* For reducing a rational number to lowest terms, pass f=0 to xgcduu().
181
* Finally, f=0 with xxgcduu() is useful for Bezout computations.
182
* (It is safe for invmod() to call xgcduu() with f=1, because f&1 doesn't
183
* make a difference when gcd(d,d1)>1. The speedup is negligible.)
184
*
185
* In principle, when gcd(d,d1) is known to be 1, it is straightforward to
186
* recover the final u,u1 given only v,v1 and s. However, it probably isn't
187
* worthwhile, as it trades a few multiplications for a division.
188
*
189
* rgcduu() is a variant of xxgcduu() which does not have f (the effect is
190
* that of f=0), but instead has a ulong vmax parameter, for use in rational
191
* reconstruction below. It returns when v1 exceeds vmax; v will never
192
* exceed vmax. (vmax=0 is taken as a synonym of ULONG_MAX i.e. unlimited,
193
* in which case rgcduu behaves exactly like xxgcduu with f=0.) The return
194
* value of rgcduu() is typically meaningless; the interesting part is the
195
* matrix. */
196
197
ulong
198
xgcduu(ulong d, ulong d1, int f, ulong* v, ulong* v1, long *s)
199
{
200
ulong xv,xv1, xs, q,res;
201
LOCAL_HIREMAINDER;
202
203
/* The above blurb contained a lie. The main loop always stops when d1
204
* has become equal to 1. If (d1 == 1 && !(f&1)) after the loop, we do
205
* the final `division' of d by 1 `by hand' as it were.
206
*
207
* The loop has already been unrolled once. Aggressive optimization could
208
* well lead to a totally unrolled assembler version.
209
*
210
* On modern x86 architectures, this loop is a pig anyway. The division
211
* instruction always puts its result into the same pair of registers, and
212
* we always want to use one of them straight away, so pipeline performance
213
* will suck big time. An assembler version should probably do a first loop
214
* computing and storing all the quotients -- their number is bounded in
215
* advance -- and then assembling the matrix in a second pass. On other
216
* architectures where we can cycle through four or so groups of registers
217
* and exploit a fast ALU result-to-operand feedback path, this is much less
218
* of an issue. */
219
xs = res = 0;
220
xv = 0UL; xv1 = 1UL;
221
while (d1 > 1UL)
222
{
223
d -= d1; /* no need to use subll */
224
if (d >= d1)
225
{
226
hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
227
xv += q * xv1;
228
}
229
else
230
xv += xv1;
231
if (d <= 1UL) { xs=1; break; } /* possible loop exit */
232
/* repeat with inverted roles */
233
d1 -= d;
234
if (d1 >= d)
235
{
236
hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
237
xv1 += q * xv;
238
}
239
else
240
xv1 += xv;
241
}
242
243
if (!(f&1))
244
{ /* division by 1 postprocessing if needed */
245
if (xs && d==1)
246
{ xv1 += d1 * xv; xs = 0; res = 1UL; }
247
else if (!xs && d1==1)
248
{ xv += d * xv1; xs = 1; res = 1UL; }
249
}
250
251
if (xs)
252
{
253
*s = -1; *v = xv1; *v1 = xv;
254
return (res ? res : (d==1 ? 1UL : d1));
255
}
256
else
257
{
258
*s = 1; *v = xv; *v1 = xv1;
259
return (res ? res : (d1==1 ? 1UL : d));
260
}
261
}
262
263
ulong
264
xxgcduu(ulong d, ulong d1, int f,
265
ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)
266
{
267
ulong xu,xu1, xv,xv1, xs, q,res;
268
LOCAL_HIREMAINDER;
269
270
xs = res = 0;
271
xu = xv1 = 1UL;
272
xu1 = xv = 0UL;
273
while (d1 > 1UL)
274
{
275
/* no need to use subll */
276
d -= d1;
277
if (d >= d1)
278
{
279
hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
280
xv += q * xv1;
281
xu += q * xu1;
282
}
283
else
284
{ xv += xv1; xu += xu1; }
285
if (d <= 1UL) { xs=1; break; } /* possible loop exit */
286
/* repeat with inverted roles */
287
d1 -= d;
288
if (d1 >= d)
289
{
290
hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
291
xv1 += q * xv;
292
xu1 += q * xu;
293
}
294
else
295
{ xv1 += xv; xu1 += xu; }
296
}
297
298
if (!(f&1))
299
{ /* division by 1 postprocessing if needed */
300
if (xs && d==1)
301
{
302
xv1 += d1 * xv;
303
xu1 += d1 * xu;
304
xs = 0; res = 1UL;
305
}
306
else if (!xs && d1==1)
307
{
308
xv += d * xv1;
309
xu += d * xu1;
310
xs = 1; res = 1UL;
311
}
312
}
313
314
if (xs)
315
{
316
*s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
317
return (res ? res : (d==1 ? 1UL : d1));
318
}
319
else
320
{
321
*s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
322
return (res ? res : (d1==1 ? 1UL : d));
323
}
324
}
325
326
ulong
327
rgcduu(ulong d, ulong d1, ulong vmax,
328
ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)
329
{
330
ulong xu,xu1, xv,xv1, xs, q, res=0;
331
int f = 0;
332
LOCAL_HIREMAINDER;
333
334
if (vmax == 0) vmax = ULONG_MAX;
335
xs = res = 0;
336
xu = xv1 = 1UL;
337
xu1 = xv = 0UL;
338
while (d1 > 1UL)
339
{
340
d -= d1; /* no need to use subll */
341
if (d >= d1)
342
{
343
hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;
344
xv += q * xv1;
345
xu += q * xu1;
346
}
347
else
348
{ xv += xv1; xu += xu1; }
349
/* possible loop exit */
350
if (xv > vmax) { f=xs=1; break; }
351
if (d <= 1UL) { xs=1; break; }
352
/* repeat with inverted roles */
353
d1 -= d;
354
if (d1 >= d)
355
{
356
hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;
357
xv1 += q * xv;
358
xu1 += q * xu;
359
}
360
else
361
{ xv1 += xv; xu1 += xu; }
362
/* possible loop exit */
363
if (xv1 > vmax) { f=1; break; }
364
}
365
366
if (!(f&1))
367
{ /* division by 1 postprocessing if needed */
368
if (xs && d==1)
369
{
370
xv1 += d1 * xv;
371
xu1 += d1 * xu;
372
xs = 0; res = 1UL;
373
}
374
else if (!xs && d1==1)
375
{
376
xv += d * xv1;
377
xu += d * xu1;
378
xs = 1; res = 1UL;
379
}
380
}
381
382
if (xs)
383
{
384
*s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
385
return (res ? res : (d==1 ? 1UL : d1));
386
}
387
else
388
{
389
*s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
390
return (res ? res : (d1==1 ? 1UL : d));
391
}
392
}
393
394
/*==================================
395
* cbezout(a,b,uu,vv)
396
*==================================
397
* Same as bezout() but for C longs.
398
* Return g = gcd(a,b) >= 0, and assign longs u,v through pointers uu,vv
399
* such that g = u*a + v*b.
400
* Special cases:
401
* a = b = 0 ==> pick u=1, v=0 (and return 1, surprisingly)
402
* a != 0 = b ==> keep v=0
403
* a = 0 != b ==> keep u=0
404
* |a| = |b| != 0 ==> keep u=0, set v=+-1
405
* Assignments through uu,vv happen unconditionally. */
406
long
407
cbezout(long a,long b,long *uu,long *vv)
408
{
409
long s,*t;
410
ulong d = labs(a), d1 = labs(b);
411
ulong r,u,u1,v,v1;
412
413
if (!b)
414
{
415
*vv=0L;
416
if (!a) { *uu=1L; return 0L; }
417
*uu = a < 0 ? -1L : 1L;
418
return (long)d;
419
}
420
else if (!a || (d == d1))
421
{
422
*uu = 0L; *vv = b < 0 ? -1L : 1L;
423
return (long)d1;
424
}
425
else if (d == 1) /* frequently used by nfinit */
426
{
427
*uu = a; *vv = 0L;
428
return 1L;
429
}
430
else if (d < d1)
431
{
432
/* bug in gcc-2.95.3:
433
* s = a; a = b; b = s; produces wrong result a = b. This is OK: */
434
{ long _x = a; a = b; b = _x; } /* in order to keep the right signs */
435
r = d; d = d1; d1 = r;
436
t = uu; uu = vv; vv = t;
437
}
438
/* d > d1 > 0 */
439
r = xxgcduu(d, d1, 0, &u, &u1, &v, &v1, &s);
440
if (s < 0)
441
{
442
*uu = a < 0 ? (long)u : -(long)u;
443
*vv = b < 0 ? -(long)v : (long)v;
444
}
445
else
446
{
447
*uu = a < 0 ? -(long)u : (long)u;
448
*vv = b < 0 ? (long)v : -(long)v;
449
}
450
return (long)r;
451
}
452
453
/*==================================
454
* lgcdii(d,d1,u,u1,v,v1,vmax)
455
*==================================*/
456
/* Lehmer's partial extended gcd algorithm, acting on two t_INT GENs.
457
*
458
* Tries to determine, using the leading 2*BITS_IN_LONG significant bits of d
459
* and a quantity of bits from d1 obtained by a shift of the same displacement,
460
* as many partial quotients of d/d1 as possible, and assigns to [u,u1;v,v1]
461
* the product of all the [0,1; 1,qj] thus obtained, where the leftmost
462
* factor arises from the quotient of the first division step.
463
*
464
* For use in rational reconstruction, vmax can be given a nonzero value.
465
* In this case, we will return early as soon as v1 > vmax (i.e. v <= vmax)
466
*
467
* MUST be called with d > d1 > 0, and with d occupying more than one
468
* significant word. Returns the number of reduction/swap steps carried out,
469
* possibly zero, or under certain conditions minus that number. When the
470
* return value is nonzero, the caller should use the returned recurrence
471
* matrix to update its own copies of d,d1. When the return value is
472
* nonpositive, and the latest remainder after updating turns out to be
473
* nonzero, the caller should at once attempt a full division, rather than
474
* trying lgcdii() again -- this typically happens when we are about to
475
* encounter a quotient larger than half a word. (This is not detected
476
* infallibly -- after a positive return value, it is possible that the next
477
* stage will end up needing a full division. After a negative return value,
478
* however, this is certain, and should be acted upon.)
479
*
480
* The sign information, for which xgcduu() has its return argument s, is now
481
* implicit in the LSB of our return value, and the caller may take advantage
482
* of the fact that a return value of +-1 implies u==0,u1==v==1 [only v1 pro-
483
* vides interesting information in this case]. One might also use the fact
484
* that if the return value is +-2, then u==1, but this is rather marginal.
485
*
486
* If it was not possible to determine even the first quotient, either because
487
* we're too close to an integer quotient or because the quotient would be
488
* larger than one word (if the `leading digit' of d1 after shifting is all
489
* zeros), we return 0 and do not assign anything to the last four args.
490
*
491
* The division chain might even run to completion. It is up to the caller to
492
* detect this case. This routine does not change d or d1; this is also up to
493
* the caller */
494
int
495
lgcdii(ulong* d, ulong* d1, ulong* u, ulong* u1, ulong* v, ulong* v1,
496
ulong vmax)
497
{
498
/* Strategy: (1) Extract/shift most significant bits. We assume that d
499
* has at least two significant words, but we can cope with a one-word d1.
500
* Let dd,dd1 be the most significant dividend word and matching part of the
501
* divisor.
502
* (2) Check for overflow on the first division. For our purposes, this
503
* happens when the upper half of dd1 is zero. (Actually this is detected
504
* during extraction.)
505
* (3) Get a fix on the first quotient. We compute q = floor(dd/dd1), which
506
* is an upper bound for floor(d/d1), and which gives the true value of the
507
* latter if (and-almost-only-if) the remainder dd' = dd-q*dd1 is >= q.
508
* (If it isn't, we give up. This is annoying because the subsequent full
509
* division will repeat some work already done, but it happens infrequently.
510
* Doing the extra-bit-fetch in this case would be awkward.)
511
* (4) Finish initializations.
512
*
513
* The remainder of the action is comparatively boring... The main loop has
514
* been unrolled once (so we don't swap things and we can apply Jebelean's
515
* termination conditions which alternatingly take two different forms during
516
* successive iterations). When we first run out of sufficient bits to form
517
* a quotient, and have an extra word of each operand, we pull out two whole
518
* word's worth of dividend bits, and divisor bits of matching significance;
519
* to these we apply our partial matrix (disregarding overflow because the
520
* result mod 2^(2*BITS_IN_LONG) will in fact give the correct values), and
521
* re-extract one word's worth of the current dividend and a matching amount
522
* of divisor bits. The affair will normally terminate with matrix entries
523
* just short of a whole word. (We terminate the inner loop before these can
524
* possibly overflow.) */
525
ulong dd,dd1,ddlo,dd1lo, sh,shc; /* `digits', shift count */
526
ulong xu,xu1, xv,xv1, q,res; /* recurrences, partial quotient, count */
527
ulong tmp0,tmp1,tmp2,tmpd,tmpu,tmpv; /* temps */
528
ulong dm1, d1m1;
529
long ld, ld1, lz;
530
int skip = 0;
531
LOCAL_OVERFLOW;
532
LOCAL_HIREMAINDER;
533
534
/* following is just for convenience: vmax==0 means no bound */
535
if (vmax == 0) vmax = ULONG_MAX;
536
ld = lgefint(d); ld1 = lgefint(d1); lz = ld - ld1; /* >= 0 */
537
if (lz > 1) return 0; /* rare */
538
539
d = int_MSW(d); dm1 = *int_precW(d);
540
d1 = int_MSW(d1);d1m1 = *int_precW(d1);
541
dd1lo = 0; /* unless we find something better */
542
sh = bfffo(*d);
543
544
if (sh)
545
{ /* do the shifting */
546
shc = BITS_IN_LONG - sh;
547
if (lz)
548
{ /* dividend longer than divisor */
549
dd1 = (*d1 >> shc);
550
if (!(HIGHMASK & dd1)) return 0; /* overflow detected */
551
if (ld1 > 3)
552
dd1lo = (*d1 << sh) + (d1m1 >> shc);
553
else
554
dd1lo = (*d1 << sh);
555
}
556
else
557
{ /* dividend and divisor have the same length */
558
dd1 = (*d1 << sh);
559
if (!(HIGHMASK & dd1)) return 0;
560
if (ld1 > 3)
561
{
562
dd1 += (d1m1 >> shc);
563
if (ld1 > 4)
564
dd1lo = (d1m1 << sh) + (*int_precW(int_precW(d1)) >> shc);
565
else
566
dd1lo = (d1m1 << sh);
567
}
568
}
569
/* following lines assume d to have 2 or more significant words */
570
dd = (*d << sh) + (dm1 >> shc);
571
if (ld > 4)
572
ddlo = (dm1 << sh) + (*int_precW(int_precW(d)) >> shc);
573
else
574
ddlo = (dm1 << sh);
575
}
576
else
577
{ /* no shift needed */
578
if (lz) return 0; /* dividend longer than divisor: overflow */
579
dd1 = *d1;
580
if (!(HIGHMASK & dd1)) return 0;
581
if(ld1 > 3) dd1lo = d1m1;
582
/* assume again that d has another significant word */
583
dd = *d; ddlo = dm1;
584
}
585
/* First subtraction/division stage. (If a subtraction initially suffices,
586
* we don't divide at all.) If a Jebelean condition is violated, and we
587
* can't fix it even by looking at the low-order bits in ddlo,dd1lo, we
588
* give up and ask for a full division. Otherwise we commit the result,
589
* possibly deciding to re-shift immediately afterwards. */
590
dd -= dd1;
591
if (dd < dd1)
592
{ /* first quotient known to be == 1 */
593
xv1 = 1UL;
594
if (!dd) /* !(Jebelean condition), extraspecial case */
595
{ /* This actually happens. Now q==1 is known, but we underflow already.
596
* OTOH we've just shortened d by a whole word. Thus we are happy and
597
* return. */
598
*u = 0; *v = *u1 = *v1 = 1UL;
599
return -1; /* Next step will be a full division. */
600
}
601
}
602
else
603
{ /* division indicated */
604
hiremainder = 0;
605
xv1 = 1 + divll(dd, dd1); /* xv1: alternative spelling of `q', here ;) */
606
dd = hiremainder;
607
if (dd < xv1) /* !(Jebelean cond'), nonextra special case */
608
{ /* Attempt to complete the division using the less significant bits,
609
* before skipping right past the 1st loop to the reshift stage. */
610
ddlo = subll(ddlo, mulll(xv1, dd1lo));
611
dd = subllx(dd, hiremainder);
612
613
/* If we now have an overflow, q was too large. Thanks to our decision
614
* not to get here unless the original dd1 had bits set in the upper half
615
* of the word, we now know that the correct quotient is in fact q-1. */
616
if (overflow)
617
{
618
xv1--;
619
ddlo = addll(ddlo,dd1lo);
620
dd = addllx(dd,dd1); /* overflows again which cancels the borrow */
621
/* ...and fall through to skip=1 below */
622
}
623
else
624
/* Test Jebelean condition anew, at this point using _all_ the extracted
625
* bits we have. This is clutching at straws; we have a more or less
626
* even chance of succeeding this time. Note that if we fail, we really
627
* do not know whether the correct quotient would have been q or some
628
* smaller value. */
629
if (!dd && ddlo < xv1) return 0;
630
631
/* Otherwise, we now know that q is correct, but we cannot go into the
632
* 1st loop. Raise a flag so we'll remember to skip past the loop.
633
* Get here also after the q-1 adjustment case. */
634
skip = 1;
635
} /* if !(Jebelean) then */
636
}
637
res = 1;
638
if (xv1 > vmax)
639
{ /* gone past the bound already */
640
*u = 0UL; *u1 = 1UL; *v = 1UL; *v1 = xv1;
641
return res;
642
}
643
xu = 0UL; xv = xu1 = 1UL;
644
645
/* Some invariants from here across the first loop:
646
*
647
* At this point, and again after we are finished with the first loop and
648
* subsequent conditional, a division and the attached update of the
649
* recurrence matrix have just been carried out completely. The matrix
650
* xu,xu1;xv,xv1 has been initialized (or updated, possibly with permuted
651
* columns), and the current remainder == next divisor (dd at the moment)
652
* is nonzero (it might be zero here, but then skip will have been set).
653
*
654
* After the first loop, or when skip is set already, it will also be the
655
* case that there aren't sufficiently many bits to continue without re-
656
* shifting. If the divisor after reshifting is zero, or indeed if it
657
* doesn't have more than half a word's worth of bits, we will have to
658
* return at that point. Otherwise, we proceed into the second loop.
659
*
660
* Furthermore, when we reach the re-shift stage, dd:ddlo and dd1:dd1lo will
661
* already reflect the result of applying the current matrix to the old
662
* ddorig:ddlo and dd1orig:dd1lo. (For the first iteration above, this
663
* was easy to achieve, and we didn't even need to peek into the (now
664
* no longer existent!) saved words. After the loop, we'll stop for a
665
* moment to merge in the ddlo,dd1lo contributions.)
666
*
667
* Note that after the 1st division, even an a priori quotient of 1 cannot be
668
* trusted until we've checked Jebelean's condition: it might be too small */
669
if (!skip)
670
{
671
for(;;)
672
{ /* First half of loop divides dd into dd1, and leaves the recurrence
673
* matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer
674
* entries) when successful. */
675
tmpd = dd1 - dd;
676
if (tmpd < dd)
677
{ /* quotient suspected to be 1 */
678
tmpu = xu + xu1; /* cannot overflow -- everything bounded by
679
* the original dd during first loop */
680
tmpv = xv + xv1;
681
}
682
else
683
{ /* division indicated */
684
hiremainder = 0;
685
q = 1 + divll(tmpd, dd);
686
tmpd = hiremainder;
687
tmpu = xu + q*xu1; /* can't overflow, but may need to be undone */
688
tmpv = xv + q*xv1;
689
}
690
691
tmp0 = addll(tmpv, xv1);
692
if ((tmpd < tmpu) || overflow ||
693
(dd - tmpd < tmp0)) /* !(Jebelean cond.) */
694
break; /* skip ahead to reshift stage */
695
else
696
{ /* commit dd1, xu, xv */
697
res++;
698
dd1 = tmpd; xu = tmpu; xv = tmpv;
699
if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }
700
}
701
702
/* Second half of loop divides dd1 into dd, and the matrix returns to its
703
* normal arrangement. */
704
tmpd = dd - dd1;
705
if (tmpd < dd1)
706
{ /* quotient suspected to be 1 */
707
tmpu = xu1 + xu; /* cannot overflow */
708
tmpv = xv1 + xv;
709
}
710
else
711
{ /* division indicated */
712
hiremainder = 0;
713
q = 1 + divll(tmpd, dd1);
714
tmpd = hiremainder;
715
tmpu = xu1 + q*xu;
716
tmpv = xv1 + q*xv;
717
}
718
719
tmp0 = addll(tmpu, xu);
720
if ((tmpd < tmpv) || overflow ||
721
(dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */
722
break; /* skip ahead to reshift stage */
723
else
724
{ /* commit dd, xu1, xv1 */
725
res++;
726
dd = tmpd; xu1 = tmpu; xv1 = tmpv;
727
if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }
728
}
729
} /* end of first loop */
730
731
/* Intermezzo: update dd:ddlo, dd1:dd1lo. (But not if skip is set.) */
732
if (res&1)
733
{ /* after failed division in 1st half of loop:
734
* [dd1:dd1lo,dd:ddlo] = [ddorig:ddlo,dd1orig:dd1lo]
735
* * [ -xu, xu1 ; xv, -xv1 ]
736
* Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the
737
* high-order remainders + overflows onto [dd1,dd] */
738
tmp1 = mulll(ddlo, xu); tmp0 = hiremainder;
739
tmp1 = subll(mulll(dd1lo,xv), tmp1);
740
dd1 += subllx(hiremainder, tmp0);
741
tmp2 = mulll(ddlo, xu1); tmp0 = hiremainder;
742
ddlo = subll(tmp2, mulll(dd1lo,xv1));
743
dd += subllx(tmp0, hiremainder);
744
dd1lo = tmp1;
745
}
746
else
747
{ /* after failed division in 2nd half of loop:
748
* [dd:ddlo,dd1:dd1lo] = [ddorig:ddlo,dd1orig:dd1lo]
749
* * [ xu1, -xu ; -xv1, xv ]
750
* Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the
751
* high-order remainders + overflows onto [dd,dd1] */
752
tmp1 = mulll(ddlo, xu1); tmp0 = hiremainder;
753
tmp1 = subll(tmp1, mulll(dd1lo,xv1));
754
dd += subllx(tmp0, hiremainder);
755
tmp2 = mulll(ddlo, xu); tmp0 = hiremainder;
756
dd1lo = subll(mulll(dd1lo,xv), tmp2);
757
dd1 += subllx(hiremainder, tmp0);
758
ddlo = tmp1;
759
}
760
} /* end of skip-pable section: get here also, with res==1, when there
761
* was a problem immediately after the very first division. */
762
763
/* Re-shift. Note: the shift count _can_ be zero, viz. under the following
764
* precise conditions: The original dd1 had its topmost bit set, so the 1st
765
* q was 1, and after subtraction, dd had its topmost bit unset. If now dd=0,
766
* we'd have taken the return exit already, so we couldn't have got here.
767
* If not, then it must have been the second division which has gone amiss
768
* (because dd1 was very close to an exact multiple of the remainder dd value,
769
* so this will be very rare). At this point, we'd have a fairly slim chance
770
* of fixing things by re-examining dd1:dd1lo vs. dd:ddlo, but this is not
771
* guaranteed to work. Instead of trying, we return at once and let caller
772
* see to it that the initial subtraction is re-done usingall the bits of
773
* both operands, which already helps, and the next round will either be a
774
* full division (if dd occupied a halfword or less), or another llgcdii()
775
* first step. In the latter case, since we try a little harder during our
776
* first step, we may actually be able to fix the problem, and get here again
777
* with improved low-order bits and with another step under our belt.
778
* Otherwise we'll have given up above and forced a full division.
779
*
780
* If res is even, the shift count _cannot_ be zero. (The first step forces
781
* a zero into the remainder's MSB, and all subsequent remainders will have
782
* inherited it.)
783
*
784
* The re-shift stage exits if the next divisor has at most half a word's
785
* worth of bits.
786
*
787
* For didactic reasons, the second loop will be arranged in the same way
788
* as the first -- beginning with the division of dd into dd1, as if res
789
* was odd. To cater for this, if res is actually even, we swap things
790
* around during reshifting. (During the second loop, the parity of res
791
* does not matter; we know in which half of the loop we are when we decide
792
* to return.) */
793
if (res&1)
794
{ /* after odd number of division(s) */
795
if (dd1 && (sh = bfffo(dd1)))
796
{
797
shc = BITS_IN_LONG - sh;
798
dd = (ddlo >> shc) + (dd << sh);
799
if (!(HIGHMASK & dd))
800
{
801
*u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
802
return -res; /* full division asked for */
803
}
804
dd1 = (dd1lo >> shc) + (dd1 << sh);
805
}
806
else
807
{ /* time to return: <= 1 word left, or sh==0 */
808
*u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
809
return res;
810
}
811
}
812
else
813
{ /* after even number of divisions */
814
if (dd)
815
{
816
sh = bfffo(dd); /* > 0 */
817
shc = BITS_IN_LONG - sh;
818
/* dd:ddlo will become the new dd1, and v.v. */
819
tmpd = (ddlo >> shc) + (dd << sh);
820
dd = (dd1lo >> shc) + (dd1 << sh);
821
dd1 = tmpd;
822
/* This completes the swap; now dd is again the current divisor */
823
if (HIGHMASK & dd)
824
{ /* recurrence matrix is the wrong way round; swap it */
825
tmp0 = xu; xu = xu1; xu1 = tmp0;
826
tmp0 = xv; xv = xv1; xv1 = tmp0;
827
}
828
else
829
{ /* recurrence matrix is the wrong way round; fix this */
830
*u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
831
return -res; /* full division asked for */
832
}
833
}
834
else
835
{ /* time to return: <= 1 word left */
836
*u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
837
return res;
838
}
839
} /* end reshift */
840
841
/* The Second Loop. Rip-off of the first, but we now check for overflow
842
* in the recurrences. Returns instead of breaking when we cannot fix the
843
* quotient any longer. */
844
for(;;)
845
{ /* First half of loop divides dd into dd1, and leaves the recurrence
846
* matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer
847
* entries) when successful */
848
tmpd = dd1 - dd;
849
if (tmpd < dd)
850
{ /* quotient suspected to be 1 */
851
tmpu = xu + xu1;
852
tmpv = addll(xv, xv1); /* xv,xv1 will overflow first */
853
tmp1 = overflow;
854
}
855
else
856
{ /* division indicated */
857
hiremainder = 0;
858
q = 1 + divll(tmpd, dd);
859
tmpd = hiremainder;
860
tmpu = xu + q*xu1;
861
tmpv = addll(xv, mulll(q,xv1));
862
tmp1 = overflow | hiremainder;
863
}
864
865
tmp0 = addll(tmpv, xv1);
866
if ((tmpd < tmpu) || overflow || tmp1 ||
867
(dd - tmpd < tmp0)) /* !(Jebelean cond.) */
868
{ /* The recurrence matrix has not yet been warped... */
869
*u = xu; *u1 = xu1; *v = xv; *v1 = xv1;
870
break;
871
}
872
/* commit dd1, xu, xv */
873
res++;
874
dd1 = tmpd; xu = tmpu; xv = tmpv;
875
if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }
876
877
/* Second half of loop divides dd1 into dd, and the matrix returns to its
878
* normal arrangement */
879
tmpd = dd - dd1;
880
if (tmpd < dd1)
881
{ /* quotient suspected to be 1 */
882
tmpu = xu1 + xu;
883
tmpv = addll(xv1, xv);
884
tmp1 = overflow;
885
}
886
else
887
{ /* division indicated */
888
hiremainder = 0;
889
q = 1 + divll(tmpd, dd1);
890
tmpd = hiremainder;
891
tmpu = xu1 + q*xu;
892
tmpv = addll(xv1, mulll(q, xv));
893
tmp1 = overflow | hiremainder;
894
}
895
896
tmp0 = addll(tmpu, xu);
897
if ((tmpd < tmpv) || overflow || tmp1 ||
898
(dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */
899
{ /* The recurrence matrix has not yet been unwarped, so it is
900
* the wrong way round; fix this. */
901
*u = xu1; *u1 = xu; *v = xv1; *v1 = xv;
902
break;
903
}
904
905
res++; /* commit dd, xu1, xv1 */
906
dd = tmpd; xu1 = tmpu; xv1 = tmpv;
907
if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }
908
} /* end of second loop */
909
910
return res;
911
}
912
913
/* 1 / Mod(x,p). Assume x < p */
914
ulong
915
Fl_invsafe(ulong x, ulong p)
916
{
917
long s;
918
ulong xv, xv1, g = xgcduu(p, x, 1, &xv, &xv1, &s);
919
if (g != 1UL) return 0UL;
920
xv = xv1 % p; if (s < 0) xv = p - xv;
921
return xv;
922
}
923
924
/* 1 / Mod(x,p). Assume x < p */
925
ulong
926
Fl_inv(ulong x, ulong p)
927
{
928
ulong xv = Fl_invsafe(x, p);
929
if (!xv && p!=1UL) pari_err_INV("Fl_inv", mkintmod(utoi(x), utoi(p)));
930
return xv;
931
}
932
933