Testing latest pari + WASM + node.js... and it works?! Wow.
License: GPL3
ubuntu2004
#line 2 "../src/kernel/none/gcdll.c"1/* Copyright (C) 2000 The PARI group.23This file is part of the PARI/GP package.45PARI/GP is free software; you can redistribute it and/or modify it under the6terms of the GNU General Public License as published by the Free Software7Foundation; either version 2 of the License, or (at your option) any later8version. It is distributed in the hope that it will be useful, but WITHOUT9ANY WARRANTY WHATSOEVER.1011Check the License for details. You should have received a copy of it, along12with the package; see the file 'COPYING'. If not, write to the Free Software13Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */1415/***********************************************************************/16/** **/17/** GCD **/18/** **/19/***********************************************************************/20/* Fast ulong gcd. Called with y odd; x can be arbitrary (but will most of21* the time be smaller than y). */2223/* Gotos are Harmful, and Programming is a Science. E.W.Dijkstra. */24INLINE ulong25gcduodd(ulong x, ulong y) /* assume y&1==1, y > 1 */26{27if (!x) return y;28/* fix up x */29while (!(x&1)) x>>=1;30if (x==1) return 1;31if (x==y) return y;32else if (x>y) goto xislarger;/* will be rare, given how we'll use this */33/* loop invariants: x,y odd and distinct. */34yislarger:35if ((x^y)&2) /* ...01, ...11 or vice versa */36y=(x>>2)+(y>>2)+1; /* ==(x+y)>>2 except it can't overflow */37else /* ...01,...01 or ...11,...11 */38y=(y-x)>>2; /* now y!=0 in either case */39while (!(y&1)) y>>=1; /* kill any windfall-gained powers of 2 */40if (y==1) return 1; /* comparand == return value... */41if (x==y) return y; /* this and the next is just one comparison */42else if (x<y) goto yislarger;/* else fall through to xislarger */4344xislarger: /* same as above, seen through a mirror */45if ((x^y)&2)46x=(x>>2)+(y>>2)+1;47else48x=(x-y)>>2; /* x!=0 */49while (!(x&1)) x>>=1;50if (x==1) return 1;51if (x==y) return y;52else if (x>y) goto xislarger;5354goto yislarger;55}56/* Gotos are useful, and Programming is an Art. D.E.Knuth. */57/* PS: Of course written with Dijkstra's lessons firmly in mind... --GN */5859/* at least one of a or b is odd, return gcd(a,b) */60INLINE ulong61mygcduodd(ulong a, ulong b)62{63ulong c;64if (b&1)65{66if (a==1 || b==1)67c = 1;68else69c = gcduodd(a, b);70}71else72{73if (a==1)74c = 1;75else76c = gcduodd(b, a);77}78return c;79}8081/* modified right shift binary algorithm with at most one division */82ulong83ugcd(ulong a,ulong b)84{85long v;86if (!b) return a;87if (!a) return b;88if (a>b) { a %= b; if (!a) return b; }89else { b %= a; if (!b) return a; }90v = vals(a|b);91return mygcduodd(a>>v, b>>v) << v;92}93long94cgcd(long a,long b) { return (long)ugcd(labs(a), labs(b)); }9596/* For gcdii(): assume a>b>0, return gcd(a,b) as a GEN */97static GEN98igcduu(ulong a, ulong b)99{100long v;101a %= b; if (!a) return utoipos(b);102v = vals(a|b);103return utoipos( mygcduodd(a>>v, b>>v) << v );104}105106/*Warning: overflows silently if lcm does not fit*/107ulong108ulcm(ulong a, ulong b)109{110ulong d = ugcd(a,b);111if (!d) return 0;112return d == 1? a*b: a*(b/d);113}114long115clcm(long a,long b) { return ulcm(labs(a), labs(b)); }116117/********************************************************************/118/** **/119/** INTEGER EXTENDED GCD (AND INVMOD) **/120/** **/121/********************************************************************/122/* Two basic ideas - (1) avoid many integer divisions, especially when the123* quotient is 1 which happens ~ 40% of the time. (2) Use Lehmer's trick as124* modified by Jebelean of extracting a couple of words' worth of leading bits125* from both operands, and compute partial quotients from them as long as we126* can be sure of their values. Jebelean's modifications consist in127* inequalities from which we can quickly decide whether to carry on or to128* return to the outer loop, and in re-shifting after the first word's worth of129* bits has been used up. All of this is described in R. Lercier's thesis130* [pp148-153 & 163f.], except his outer loop isn't quite right: the catch-up131* divisions needed when one partial quotient is larger than a word are missing.132*133* The API consists of invmod() and bezout() below; the single-word routines134* xgcduu and xxgcduu may be called directly if desired; lgcdii() probably135* doesn't make much sense out of context.136*137* The whole lot is a factor 6 .. 8 faster on word-sized operands, and asym-138* ptotically about a factor 2.5 .. 3, depending on processor architecture,139* than the naive continued-division code. Unfortunately, thanks to the140* unrolled loops and all, the code is lengthy. */141142/*==================================143* xgcduu(d,d1,f,v,v1,s)144* xxgcduu(d,d1,f,u,u1,v,v1,s)145* rgcduu(d,d1,vmax,u,u1,v,v1,s)146*==================================*/147/*148* Fast `final' extended gcd algorithm, acting on two ulongs. Ideally this149* should be replaced with assembler versions wherever possible. The present150* code essentially does `subtract, compare, and possibly divide' at each step,151* which is reasonable when hardware division (a) exists, (b) is a bit slowish152* and (c) does not depend a lot on the operand values (as on i486). When153* wordsize division is in fact an assembler routine based on subtraction,154* this strategy may not be the most efficient one.155*156* xxgcduu() should be called with d > d1 > 0, returns gcd(d,d1), and assigns157* the usual signless cont.frac. recurrence matrix to [u, u1; v, v1] (i.e.,158* the product of all the [0, 1; 1 q_j] where the leftmost factor arises from159* the quotient of the first division step), and the information about the160* implied signs to s (-1 when an odd number of divisions has been done,161* 1 otherwise). xgcduu() is exactly the same except that u,u1 are not com-162* puted (and not returned, of course).163*164* The input flag f should be set to 1 if we know in advance that gcd(d,d1)==1165* (so we can stop the chain division one step early: as soon as the remainder166* equals 1). Use this when you intend to use only what would be v, or only167* what would be u and v, after that final division step, but not u1 and v1.168* With the flag in force and thus without that final step, the interesting169* quantity/ies will still sit in [u1 and] v1, of course.170*171* For computing the inverse of a single-word INTMOD known to exist, pass f=1172* to xgcduu(), and obtain the result from s and v1. (The routine does the173* right thing when d1==1 already.) For finishing a multiword modinv known174* to exist, pass f=1 to xxgcduu(), and multiply the returned matrix (with175* properly adjusted signs) onto the values v' and v1' previously obtained176* from the multiword division steps. Actually, just take the scalar product177* of [v',v1'] with [u1,-v1], and change the sign if s==-1. (If the final178* step had been carried out, it would be [-u,v], and s would also change.)179* For reducing a rational number to lowest terms, pass f=0 to xgcduu().180* Finally, f=0 with xxgcduu() is useful for Bezout computations.181* (It is safe for invmod() to call xgcduu() with f=1, because f&1 doesn't182* make a difference when gcd(d,d1)>1. The speedup is negligible.)183*184* In principle, when gcd(d,d1) is known to be 1, it is straightforward to185* recover the final u,u1 given only v,v1 and s. However, it probably isn't186* worthwhile, as it trades a few multiplications for a division.187*188* rgcduu() is a variant of xxgcduu() which does not have f (the effect is189* that of f=0), but instead has a ulong vmax parameter, for use in rational190* reconstruction below. It returns when v1 exceeds vmax; v will never191* exceed vmax. (vmax=0 is taken as a synonym of ULONG_MAX i.e. unlimited,192* in which case rgcduu behaves exactly like xxgcduu with f=0.) The return193* value of rgcduu() is typically meaningless; the interesting part is the194* matrix. */195196ulong197xgcduu(ulong d, ulong d1, int f, ulong* v, ulong* v1, long *s)198{199ulong xv,xv1, xs, q,res;200LOCAL_HIREMAINDER;201202/* The above blurb contained a lie. The main loop always stops when d1203* has become equal to 1. If (d1 == 1 && !(f&1)) after the loop, we do204* the final `division' of d by 1 `by hand' as it were.205*206* The loop has already been unrolled once. Aggressive optimization could207* well lead to a totally unrolled assembler version.208*209* On modern x86 architectures, this loop is a pig anyway. The division210* instruction always puts its result into the same pair of registers, and211* we always want to use one of them straight away, so pipeline performance212* will suck big time. An assembler version should probably do a first loop213* computing and storing all the quotients -- their number is bounded in214* advance -- and then assembling the matrix in a second pass. On other215* architectures where we can cycle through four or so groups of registers216* and exploit a fast ALU result-to-operand feedback path, this is much less217* of an issue. */218xs = res = 0;219xv = 0UL; xv1 = 1UL;220while (d1 > 1UL)221{222d -= d1; /* no need to use subll */223if (d >= d1)224{225hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;226xv += q * xv1;227}228else229xv += xv1;230if (d <= 1UL) { xs=1; break; } /* possible loop exit */231/* repeat with inverted roles */232d1 -= d;233if (d1 >= d)234{235hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;236xv1 += q * xv;237}238else239xv1 += xv;240}241242if (!(f&1))243{ /* division by 1 postprocessing if needed */244if (xs && d==1)245{ xv1 += d1 * xv; xs = 0; res = 1UL; }246else if (!xs && d1==1)247{ xv += d * xv1; xs = 1; res = 1UL; }248}249250if (xs)251{252*s = -1; *v = xv1; *v1 = xv;253return (res ? res : (d==1 ? 1UL : d1));254}255else256{257*s = 1; *v = xv; *v1 = xv1;258return (res ? res : (d1==1 ? 1UL : d));259}260}261262ulong263xxgcduu(ulong d, ulong d1, int f,264ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)265{266ulong xu,xu1, xv,xv1, xs, q,res;267LOCAL_HIREMAINDER;268269xs = res = 0;270xu = xv1 = 1UL;271xu1 = xv = 0UL;272while (d1 > 1UL)273{274/* no need to use subll */275d -= d1;276if (d >= d1)277{278hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;279xv += q * xv1;280xu += q * xu1;281}282else283{ xv += xv1; xu += xu1; }284if (d <= 1UL) { xs=1; break; } /* possible loop exit */285/* repeat with inverted roles */286d1 -= d;287if (d1 >= d)288{289hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;290xv1 += q * xv;291xu1 += q * xu;292}293else294{ xv1 += xv; xu1 += xu; }295}296297if (!(f&1))298{ /* division by 1 postprocessing if needed */299if (xs && d==1)300{301xv1 += d1 * xv;302xu1 += d1 * xu;303xs = 0; res = 1UL;304}305else if (!xs && d1==1)306{307xv += d * xv1;308xu += d * xu1;309xs = 1; res = 1UL;310}311}312313if (xs)314{315*s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;316return (res ? res : (d==1 ? 1UL : d1));317}318else319{320*s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;321return (res ? res : (d1==1 ? 1UL : d));322}323}324325ulong326rgcduu(ulong d, ulong d1, ulong vmax,327ulong* u, ulong* u1, ulong* v, ulong* v1, long *s)328{329ulong xu,xu1, xv,xv1, xs, q, res=0;330int f = 0;331LOCAL_HIREMAINDER;332333if (vmax == 0) vmax = ULONG_MAX;334xs = res = 0;335xu = xv1 = 1UL;336xu1 = xv = 0UL;337while (d1 > 1UL)338{339d -= d1; /* no need to use subll */340if (d >= d1)341{342hiremainder = 0; q = 1 + divll(d,d1); d = hiremainder;343xv += q * xv1;344xu += q * xu1;345}346else347{ xv += xv1; xu += xu1; }348/* possible loop exit */349if (xv > vmax) { f=xs=1; break; }350if (d <= 1UL) { xs=1; break; }351/* repeat with inverted roles */352d1 -= d;353if (d1 >= d)354{355hiremainder = 0; q = 1 + divll(d1,d); d1 = hiremainder;356xv1 += q * xv;357xu1 += q * xu;358}359else360{ xv1 += xv; xu1 += xu; }361/* possible loop exit */362if (xv1 > vmax) { f=1; break; }363}364365if (!(f&1))366{ /* division by 1 postprocessing if needed */367if (xs && d==1)368{369xv1 += d1 * xv;370xu1 += d1 * xu;371xs = 0; res = 1UL;372}373else if (!xs && d1==1)374{375xv += d * xv1;376xu += d * xu1;377xs = 1; res = 1UL;378}379}380381if (xs)382{383*s = -1; *u = xu1; *u1 = xu; *v = xv1; *v1 = xv;384return (res ? res : (d==1 ? 1UL : d1));385}386else387{388*s = 1; *u = xu; *u1 = xu1; *v = xv; *v1 = xv1;389return (res ? res : (d1==1 ? 1UL : d));390}391}392393/*==================================394* cbezout(a,b,uu,vv)395*==================================396* Same as bezout() but for C longs.397* Return g = gcd(a,b) >= 0, and assign longs u,v through pointers uu,vv398* such that g = u*a + v*b.399* Special cases:400* a = b = 0 ==> pick u=1, v=0 (and return 1, surprisingly)401* a != 0 = b ==> keep v=0402* a = 0 != b ==> keep u=0403* |a| = |b| != 0 ==> keep u=0, set v=+-1404* Assignments through uu,vv happen unconditionally. */405long406cbezout(long a,long b,long *uu,long *vv)407{408long s,*t;409ulong d = labs(a), d1 = labs(b);410ulong r,u,u1,v,v1;411412if (!b)413{414*vv=0L;415if (!a) { *uu=1L; return 0L; }416*uu = a < 0 ? -1L : 1L;417return (long)d;418}419else if (!a || (d == d1))420{421*uu = 0L; *vv = b < 0 ? -1L : 1L;422return (long)d1;423}424else if (d == 1) /* frequently used by nfinit */425{426*uu = a; *vv = 0L;427return 1L;428}429else if (d < d1)430{431/* bug in gcc-2.95.3:432* s = a; a = b; b = s; produces wrong result a = b. This is OK: */433{ long _x = a; a = b; b = _x; } /* in order to keep the right signs */434r = d; d = d1; d1 = r;435t = uu; uu = vv; vv = t;436}437/* d > d1 > 0 */438r = xxgcduu(d, d1, 0, &u, &u1, &v, &v1, &s);439if (s < 0)440{441*uu = a < 0 ? (long)u : -(long)u;442*vv = b < 0 ? -(long)v : (long)v;443}444else445{446*uu = a < 0 ? -(long)u : (long)u;447*vv = b < 0 ? (long)v : -(long)v;448}449return (long)r;450}451452/*==================================453* lgcdii(d,d1,u,u1,v,v1,vmax)454*==================================*/455/* Lehmer's partial extended gcd algorithm, acting on two t_INT GENs.456*457* Tries to determine, using the leading 2*BITS_IN_LONG significant bits of d458* and a quantity of bits from d1 obtained by a shift of the same displacement,459* as many partial quotients of d/d1 as possible, and assigns to [u,u1;v,v1]460* the product of all the [0,1; 1,qj] thus obtained, where the leftmost461* factor arises from the quotient of the first division step.462*463* For use in rational reconstruction, vmax can be given a nonzero value.464* In this case, we will return early as soon as v1 > vmax (i.e. v <= vmax)465*466* MUST be called with d > d1 > 0, and with d occupying more than one467* significant word. Returns the number of reduction/swap steps carried out,468* possibly zero, or under certain conditions minus that number. When the469* return value is nonzero, the caller should use the returned recurrence470* matrix to update its own copies of d,d1. When the return value is471* nonpositive, and the latest remainder after updating turns out to be472* nonzero, the caller should at once attempt a full division, rather than473* trying lgcdii() again -- this typically happens when we are about to474* encounter a quotient larger than half a word. (This is not detected475* infallibly -- after a positive return value, it is possible that the next476* stage will end up needing a full division. After a negative return value,477* however, this is certain, and should be acted upon.)478*479* The sign information, for which xgcduu() has its return argument s, is now480* implicit in the LSB of our return value, and the caller may take advantage481* of the fact that a return value of +-1 implies u==0,u1==v==1 [only v1 pro-482* vides interesting information in this case]. One might also use the fact483* that if the return value is +-2, then u==1, but this is rather marginal.484*485* If it was not possible to determine even the first quotient, either because486* we're too close to an integer quotient or because the quotient would be487* larger than one word (if the `leading digit' of d1 after shifting is all488* zeros), we return 0 and do not assign anything to the last four args.489*490* The division chain might even run to completion. It is up to the caller to491* detect this case. This routine does not change d or d1; this is also up to492* the caller */493int494lgcdii(ulong* d, ulong* d1, ulong* u, ulong* u1, ulong* v, ulong* v1,495ulong vmax)496{497/* Strategy: (1) Extract/shift most significant bits. We assume that d498* has at least two significant words, but we can cope with a one-word d1.499* Let dd,dd1 be the most significant dividend word and matching part of the500* divisor.501* (2) Check for overflow on the first division. For our purposes, this502* happens when the upper half of dd1 is zero. (Actually this is detected503* during extraction.)504* (3) Get a fix on the first quotient. We compute q = floor(dd/dd1), which505* is an upper bound for floor(d/d1), and which gives the true value of the506* latter if (and-almost-only-if) the remainder dd' = dd-q*dd1 is >= q.507* (If it isn't, we give up. This is annoying because the subsequent full508* division will repeat some work already done, but it happens infrequently.509* Doing the extra-bit-fetch in this case would be awkward.)510* (4) Finish initializations.511*512* The remainder of the action is comparatively boring... The main loop has513* been unrolled once (so we don't swap things and we can apply Jebelean's514* termination conditions which alternatingly take two different forms during515* successive iterations). When we first run out of sufficient bits to form516* a quotient, and have an extra word of each operand, we pull out two whole517* word's worth of dividend bits, and divisor bits of matching significance;518* to these we apply our partial matrix (disregarding overflow because the519* result mod 2^(2*BITS_IN_LONG) will in fact give the correct values), and520* re-extract one word's worth of the current dividend and a matching amount521* of divisor bits. The affair will normally terminate with matrix entries522* just short of a whole word. (We terminate the inner loop before these can523* possibly overflow.) */524ulong dd,dd1,ddlo,dd1lo, sh,shc; /* `digits', shift count */525ulong xu,xu1, xv,xv1, q,res; /* recurrences, partial quotient, count */526ulong tmp0,tmp1,tmp2,tmpd,tmpu,tmpv; /* temps */527ulong dm1, d1m1;528long ld, ld1, lz;529int skip = 0;530LOCAL_OVERFLOW;531LOCAL_HIREMAINDER;532533/* following is just for convenience: vmax==0 means no bound */534if (vmax == 0) vmax = ULONG_MAX;535ld = lgefint(d); ld1 = lgefint(d1); lz = ld - ld1; /* >= 0 */536if (lz > 1) return 0; /* rare */537538d = int_MSW(d); dm1 = *int_precW(d);539d1 = int_MSW(d1);d1m1 = *int_precW(d1);540dd1lo = 0; /* unless we find something better */541sh = bfffo(*d);542543if (sh)544{ /* do the shifting */545shc = BITS_IN_LONG - sh;546if (lz)547{ /* dividend longer than divisor */548dd1 = (*d1 >> shc);549if (!(HIGHMASK & dd1)) return 0; /* overflow detected */550if (ld1 > 3)551dd1lo = (*d1 << sh) + (d1m1 >> shc);552else553dd1lo = (*d1 << sh);554}555else556{ /* dividend and divisor have the same length */557dd1 = (*d1 << sh);558if (!(HIGHMASK & dd1)) return 0;559if (ld1 > 3)560{561dd1 += (d1m1 >> shc);562if (ld1 > 4)563dd1lo = (d1m1 << sh) + (*int_precW(int_precW(d1)) >> shc);564else565dd1lo = (d1m1 << sh);566}567}568/* following lines assume d to have 2 or more significant words */569dd = (*d << sh) + (dm1 >> shc);570if (ld > 4)571ddlo = (dm1 << sh) + (*int_precW(int_precW(d)) >> shc);572else573ddlo = (dm1 << sh);574}575else576{ /* no shift needed */577if (lz) return 0; /* dividend longer than divisor: overflow */578dd1 = *d1;579if (!(HIGHMASK & dd1)) return 0;580if(ld1 > 3) dd1lo = d1m1;581/* assume again that d has another significant word */582dd = *d; ddlo = dm1;583}584/* First subtraction/division stage. (If a subtraction initially suffices,585* we don't divide at all.) If a Jebelean condition is violated, and we586* can't fix it even by looking at the low-order bits in ddlo,dd1lo, we587* give up and ask for a full division. Otherwise we commit the result,588* possibly deciding to re-shift immediately afterwards. */589dd -= dd1;590if (dd < dd1)591{ /* first quotient known to be == 1 */592xv1 = 1UL;593if (!dd) /* !(Jebelean condition), extraspecial case */594{ /* This actually happens. Now q==1 is known, but we underflow already.595* OTOH we've just shortened d by a whole word. Thus we are happy and596* return. */597*u = 0; *v = *u1 = *v1 = 1UL;598return -1; /* Next step will be a full division. */599}600}601else602{ /* division indicated */603hiremainder = 0;604xv1 = 1 + divll(dd, dd1); /* xv1: alternative spelling of `q', here ;) */605dd = hiremainder;606if (dd < xv1) /* !(Jebelean cond'), nonextra special case */607{ /* Attempt to complete the division using the less significant bits,608* before skipping right past the 1st loop to the reshift stage. */609ddlo = subll(ddlo, mulll(xv1, dd1lo));610dd = subllx(dd, hiremainder);611612/* If we now have an overflow, q was too large. Thanks to our decision613* not to get here unless the original dd1 had bits set in the upper half614* of the word, we now know that the correct quotient is in fact q-1. */615if (overflow)616{617xv1--;618ddlo = addll(ddlo,dd1lo);619dd = addllx(dd,dd1); /* overflows again which cancels the borrow */620/* ...and fall through to skip=1 below */621}622else623/* Test Jebelean condition anew, at this point using _all_ the extracted624* bits we have. This is clutching at straws; we have a more or less625* even chance of succeeding this time. Note that if we fail, we really626* do not know whether the correct quotient would have been q or some627* smaller value. */628if (!dd && ddlo < xv1) return 0;629630/* Otherwise, we now know that q is correct, but we cannot go into the631* 1st loop. Raise a flag so we'll remember to skip past the loop.632* Get here also after the q-1 adjustment case. */633skip = 1;634} /* if !(Jebelean) then */635}636res = 1;637if (xv1 > vmax)638{ /* gone past the bound already */639*u = 0UL; *u1 = 1UL; *v = 1UL; *v1 = xv1;640return res;641}642xu = 0UL; xv = xu1 = 1UL;643644/* Some invariants from here across the first loop:645*646* At this point, and again after we are finished with the first loop and647* subsequent conditional, a division and the attached update of the648* recurrence matrix have just been carried out completely. The matrix649* xu,xu1;xv,xv1 has been initialized (or updated, possibly with permuted650* columns), and the current remainder == next divisor (dd at the moment)651* is nonzero (it might be zero here, but then skip will have been set).652*653* After the first loop, or when skip is set already, it will also be the654* case that there aren't sufficiently many bits to continue without re-655* shifting. If the divisor after reshifting is zero, or indeed if it656* doesn't have more than half a word's worth of bits, we will have to657* return at that point. Otherwise, we proceed into the second loop.658*659* Furthermore, when we reach the re-shift stage, dd:ddlo and dd1:dd1lo will660* already reflect the result of applying the current matrix to the old661* ddorig:ddlo and dd1orig:dd1lo. (For the first iteration above, this662* was easy to achieve, and we didn't even need to peek into the (now663* no longer existent!) saved words. After the loop, we'll stop for a664* moment to merge in the ddlo,dd1lo contributions.)665*666* Note that after the 1st division, even an a priori quotient of 1 cannot be667* trusted until we've checked Jebelean's condition: it might be too small */668if (!skip)669{670for(;;)671{ /* First half of loop divides dd into dd1, and leaves the recurrence672* matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer673* entries) when successful. */674tmpd = dd1 - dd;675if (tmpd < dd)676{ /* quotient suspected to be 1 */677tmpu = xu + xu1; /* cannot overflow -- everything bounded by678* the original dd during first loop */679tmpv = xv + xv1;680}681else682{ /* division indicated */683hiremainder = 0;684q = 1 + divll(tmpd, dd);685tmpd = hiremainder;686tmpu = xu + q*xu1; /* can't overflow, but may need to be undone */687tmpv = xv + q*xv1;688}689690tmp0 = addll(tmpv, xv1);691if ((tmpd < tmpu) || overflow ||692(dd - tmpd < tmp0)) /* !(Jebelean cond.) */693break; /* skip ahead to reshift stage */694else695{ /* commit dd1, xu, xv */696res++;697dd1 = tmpd; xu = tmpu; xv = tmpv;698if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }699}700701/* Second half of loop divides dd1 into dd, and the matrix returns to its702* normal arrangement. */703tmpd = dd - dd1;704if (tmpd < dd1)705{ /* quotient suspected to be 1 */706tmpu = xu1 + xu; /* cannot overflow */707tmpv = xv1 + xv;708}709else710{ /* division indicated */711hiremainder = 0;712q = 1 + divll(tmpd, dd1);713tmpd = hiremainder;714tmpu = xu1 + q*xu;715tmpv = xv1 + q*xv;716}717718tmp0 = addll(tmpu, xu);719if ((tmpd < tmpv) || overflow ||720(dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */721break; /* skip ahead to reshift stage */722else723{ /* commit dd, xu1, xv1 */724res++;725dd = tmpd; xu1 = tmpu; xv1 = tmpv;726if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }727}728} /* end of first loop */729730/* Intermezzo: update dd:ddlo, dd1:dd1lo. (But not if skip is set.) */731if (res&1)732{ /* after failed division in 1st half of loop:733* [dd1:dd1lo,dd:ddlo] = [ddorig:ddlo,dd1orig:dd1lo]734* * [ -xu, xu1 ; xv, -xv1 ]735* Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the736* high-order remainders + overflows onto [dd1,dd] */737tmp1 = mulll(ddlo, xu); tmp0 = hiremainder;738tmp1 = subll(mulll(dd1lo,xv), tmp1);739dd1 += subllx(hiremainder, tmp0);740tmp2 = mulll(ddlo, xu1); tmp0 = hiremainder;741ddlo = subll(tmp2, mulll(dd1lo,xv1));742dd += subllx(tmp0, hiremainder);743dd1lo = tmp1;744}745else746{ /* after failed division in 2nd half of loop:747* [dd:ddlo,dd1:dd1lo] = [ddorig:ddlo,dd1orig:dd1lo]748* * [ xu1, -xu ; -xv1, xv ]749* Actually, we only multiply [ddlo,dd1lo] onto the matrix and add the750* high-order remainders + overflows onto [dd,dd1] */751tmp1 = mulll(ddlo, xu1); tmp0 = hiremainder;752tmp1 = subll(tmp1, mulll(dd1lo,xv1));753dd += subllx(tmp0, hiremainder);754tmp2 = mulll(ddlo, xu); tmp0 = hiremainder;755dd1lo = subll(mulll(dd1lo,xv), tmp2);756dd1 += subllx(hiremainder, tmp0);757ddlo = tmp1;758}759} /* end of skip-pable section: get here also, with res==1, when there760* was a problem immediately after the very first division. */761762/* Re-shift. Note: the shift count _can_ be zero, viz. under the following763* precise conditions: The original dd1 had its topmost bit set, so the 1st764* q was 1, and after subtraction, dd had its topmost bit unset. If now dd=0,765* we'd have taken the return exit already, so we couldn't have got here.766* If not, then it must have been the second division which has gone amiss767* (because dd1 was very close to an exact multiple of the remainder dd value,768* so this will be very rare). At this point, we'd have a fairly slim chance769* of fixing things by re-examining dd1:dd1lo vs. dd:ddlo, but this is not770* guaranteed to work. Instead of trying, we return at once and let caller771* see to it that the initial subtraction is re-done usingall the bits of772* both operands, which already helps, and the next round will either be a773* full division (if dd occupied a halfword or less), or another llgcdii()774* first step. In the latter case, since we try a little harder during our775* first step, we may actually be able to fix the problem, and get here again776* with improved low-order bits and with another step under our belt.777* Otherwise we'll have given up above and forced a full division.778*779* If res is even, the shift count _cannot_ be zero. (The first step forces780* a zero into the remainder's MSB, and all subsequent remainders will have781* inherited it.)782*783* The re-shift stage exits if the next divisor has at most half a word's784* worth of bits.785*786* For didactic reasons, the second loop will be arranged in the same way787* as the first -- beginning with the division of dd into dd1, as if res788* was odd. To cater for this, if res is actually even, we swap things789* around during reshifting. (During the second loop, the parity of res790* does not matter; we know in which half of the loop we are when we decide791* to return.) */792if (res&1)793{ /* after odd number of division(s) */794if (dd1 && (sh = bfffo(dd1)))795{796shc = BITS_IN_LONG - sh;797dd = (ddlo >> shc) + (dd << sh);798if (!(HIGHMASK & dd))799{800*u = xu; *u1 = xu1; *v = xv; *v1 = xv1;801return -res; /* full division asked for */802}803dd1 = (dd1lo >> shc) + (dd1 << sh);804}805else806{ /* time to return: <= 1 word left, or sh==0 */807*u = xu; *u1 = xu1; *v = xv; *v1 = xv1;808return res;809}810}811else812{ /* after even number of divisions */813if (dd)814{815sh = bfffo(dd); /* > 0 */816shc = BITS_IN_LONG - sh;817/* dd:ddlo will become the new dd1, and v.v. */818tmpd = (ddlo >> shc) + (dd << sh);819dd = (dd1lo >> shc) + (dd1 << sh);820dd1 = tmpd;821/* This completes the swap; now dd is again the current divisor */822if (HIGHMASK & dd)823{ /* recurrence matrix is the wrong way round; swap it */824tmp0 = xu; xu = xu1; xu1 = tmp0;825tmp0 = xv; xv = xv1; xv1 = tmp0;826}827else828{ /* recurrence matrix is the wrong way round; fix this */829*u = xu1; *u1 = xu; *v = xv1; *v1 = xv;830return -res; /* full division asked for */831}832}833else834{ /* time to return: <= 1 word left */835*u = xu1; *u1 = xu; *v = xv1; *v1 = xv;836return res;837}838} /* end reshift */839840/* The Second Loop. Rip-off of the first, but we now check for overflow841* in the recurrences. Returns instead of breaking when we cannot fix the842* quotient any longer. */843for(;;)844{ /* First half of loop divides dd into dd1, and leaves the recurrence845* matrix xu,...,xv1 groomed the wrong way round (xu,xv will be the newer846* entries) when successful */847tmpd = dd1 - dd;848if (tmpd < dd)849{ /* quotient suspected to be 1 */850tmpu = xu + xu1;851tmpv = addll(xv, xv1); /* xv,xv1 will overflow first */852tmp1 = overflow;853}854else855{ /* division indicated */856hiremainder = 0;857q = 1 + divll(tmpd, dd);858tmpd = hiremainder;859tmpu = xu + q*xu1;860tmpv = addll(xv, mulll(q,xv1));861tmp1 = overflow | hiremainder;862}863864tmp0 = addll(tmpv, xv1);865if ((tmpd < tmpu) || overflow || tmp1 ||866(dd - tmpd < tmp0)) /* !(Jebelean cond.) */867{ /* The recurrence matrix has not yet been warped... */868*u = xu; *u1 = xu1; *v = xv; *v1 = xv1;869break;870}871/* commit dd1, xu, xv */872res++;873dd1 = tmpd; xu = tmpu; xv = tmpv;874if (xv > vmax) { *u = xu1; *u1 = xu; *v = xv1; *v1 = xv; return res; }875876/* Second half of loop divides dd1 into dd, and the matrix returns to its877* normal arrangement */878tmpd = dd - dd1;879if (tmpd < dd1)880{ /* quotient suspected to be 1 */881tmpu = xu1 + xu;882tmpv = addll(xv1, xv);883tmp1 = overflow;884}885else886{ /* division indicated */887hiremainder = 0;888q = 1 + divll(tmpd, dd1);889tmpd = hiremainder;890tmpu = xu1 + q*xu;891tmpv = addll(xv1, mulll(q, xv));892tmp1 = overflow | hiremainder;893}894895tmp0 = addll(tmpu, xu);896if ((tmpd < tmpv) || overflow || tmp1 ||897(dd1 - tmpd < tmp0)) /* !(Jebelean cond.) */898{ /* The recurrence matrix has not yet been unwarped, so it is899* the wrong way round; fix this. */900*u = xu1; *u1 = xu; *v = xv1; *v1 = xv;901break;902}903904res++; /* commit dd, xu1, xv1 */905dd = tmpd; xu1 = tmpu; xv1 = tmpv;906if (xv1 > vmax) { *u = xu; *u1 = xu1; *v = xv; *v1 = xv1; return res; }907} /* end of second loop */908909return res;910}911912/* 1 / Mod(x,p). Assume x < p */913ulong914Fl_invsafe(ulong x, ulong p)915{916long s;917ulong xv, xv1, g = xgcduu(p, x, 1, &xv, &xv1, &s);918if (g != 1UL) return 0UL;919xv = xv1 % p; if (s < 0) xv = p - xv;920return xv;921}922923/* 1 / Mod(x,p). Assume x < p */924ulong925Fl_inv(ulong x, ulong p)926{927ulong xv = Fl_invsafe(x, p);928if (!xv && p!=1UL) pari_err_INV("Fl_inv", mkintmod(utoi(x), utoi(p)));929return xv;930}931932933