/* Path: blob/master/src/java.desktop/share/native/libmlib_image/mlib_ImageConv_u16nw.c */
/* 41152 views */
/*1* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation. Oracle designates this7* particular file as subject to the "Classpath" exception as provided8* by Oracle in the LICENSE file that accompanied this code.9*10* This code is distributed in the hope that it will be useful, but WITHOUT11* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or12* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License13* version 2 for more details (a copy is included in the LICENSE file that14* accompanied this code).15*16* You should have received a copy of the GNU General Public License version17* 2 along with this work; if not, write to the Free Software Foundation,18* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.19*20* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA21* or visit www.oracle.com if you need additional information or have any22* questions.23*/242526/*27* FUNCTION28* Internal functions for mlib_ImageConv* on U8/S16/U16 types and29* MLIB_EDGE_DST_NO_WRITE mask30*/3132#include "mlib_image.h"33#include "mlib_c_ImageConv.h"3435/*36This define switches between functions of different data types37*/38#define IMG_TYPE 33940/***************************************************************/41#if IMG_TYPE == 14243#define DTYPE mlib_u844#define CONV_FUNC(KERN) mlib_c_conv##KERN##nw_u845#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u846#define DSCALE (1 << 24)47#define FROM_S32(x) (((x) >> 24) ^ 128)48#define S64TOS32(x) (x)49#define SAT_OFF -(1u << 31)5051#elif IMG_TYPE == 25253#define DTYPE mlib_s1654#define CONV_FUNC(KERN) mlib_conv##KERN##nw_s1655#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s1656#define DSCALE 65536.057#define 
FROM_S32(x) ((x) >> 16)58#define S64TOS32(x) ((x) & 0xffffffff)59#define SAT_OFF6061#elif IMG_TYPE == 36263#define DTYPE mlib_u1664#define CONV_FUNC(KERN) mlib_conv##KERN##nw_u1665#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u1666#define DSCALE 65536.067#define FROM_S32(x) (((x) >> 16) ^ 0x8000)68#define S64TOS32(x) (x)69#define SAT_OFF -(1u << 31)7071#endif /* IMG_TYPE == 1 */7273/***************************************************************/74#define BUFF_SIZE 16007576#define CACHE_SIZE (64*1024)7778/***************************************************************/79#define FTYPE mlib_d648081#ifndef MLIB_USE_FTOI_CLAMPING8283#define CLAMP_S32(x) \84(((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))8586#else8788#define CLAMP_S32(x) ((mlib_s32)(x))8990#endif /* MLIB_USE_FTOI_CLAMPING */9192/***************************************************************/93#define D2I(x) CLAMP_S32((x) SAT_OFF)9495/***************************************************************/96#ifdef _LITTLE_ENDIAN9798#define STORE2(res0, res1) \99dp[0 ] = res1; \100dp[chan1] = res0101102#else103104#define STORE2(res0, res1) \105dp[0 ] = res0; \106dp[chan1] = res1107108#endif /* _LITTLE_ENDIAN */109110/***************************************************************/111#ifdef _NO_LONGLONG112113#define LOAD_BUFF(buff) \114buff[i ] = sp[0]; \115buff[i + 1] = sp[chan1]116117#else /* _NO_LONGLONG */118119#ifdef _LITTLE_ENDIAN120121#define LOAD_BUFF(buff) \122*(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])123124#else /* _LITTLE_ENDIAN */125126#define LOAD_BUFF(buff) \127*(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])128129#endif /* _LITTLE_ENDIAN */130#endif /* _NO_LONGLONG */131132/***************************************************************/133typedef union {134mlib_d64 d64;135struct {136mlib_s32 i0;137mlib_s32 i1;138} i32s;139struct {140mlib_s32 f0;141mlib_s32 f1;142} f32s;143} 
d64_2x32;144145/***************************************************************/146#define DEF_VARS(type) \147type *adr_src, *sl, *sp = NULL; \148type *adr_dst, *dl, *dp = NULL; \149FTYPE *pbuff = buff; \150mlib_s32 wid, hgt, sll, dll; \151mlib_s32 nchannel, chan1; \152mlib_s32 i, j, c153154/***************************************************************/155#define GET_SRC_DST_PARAMETERS(type) \156hgt = mlib_ImageGetHeight(src); \157wid = mlib_ImageGetWidth(src); \158nchannel = mlib_ImageGetChannels(src); \159sll = mlib_ImageGetStride(src) / sizeof(type); \160dll = mlib_ImageGetStride(dst) / sizeof(type); \161adr_src = (type *)mlib_ImageGetData(src); \162adr_dst = (type *)mlib_ImageGetData(dst)163164/***************************************************************/165#if IMG_TYPE == 1166167/* Test for the presence of any "1" bit in bits1688 to 31 of val. If present, then val is either169negative or >255. If over/underflows of 8 bits170are uncommon, then this technique can be a win,171since only a single test, rather than two, is172necessary to determine if clamping is needed.173On the other hand, if over/underflows are common,174it adds an extra test.175*/176#define CLAMP_STORE(dst, val) \177if (val & 0xffffff00) { \178if (val < MLIB_U8_MIN) \179dst = MLIB_U8_MIN; \180else \181dst = MLIB_U8_MAX; \182} else { \183dst = (mlib_u8)val; \184}185186#elif IMG_TYPE == 2187188#define CLAMP_STORE(dst, val) \189if (val >= MLIB_S16_MAX) \190dst = MLIB_S16_MAX; \191else if (val <= MLIB_S16_MIN) \192dst = MLIB_S16_MIN; \193else \194dst = (mlib_s16)val195196#elif IMG_TYPE == 3197198#define CLAMP_STORE(dst, val) \199if (val >= MLIB_U16_MAX) \200dst = MLIB_U16_MAX; \201else if (val <= MLIB_U16_MIN) \202dst = MLIB_U16_MIN; \203else \204dst = (mlib_u16)val205206#endif /* IMG_TYPE == 1 */207208/***************************************************************/209#define MAX_KER 7210#define MAX_N 15211212static mlib_status mlib_ImageConv1xN(mlib_image *dst,213const mlib_image 
*src,214const mlib_d64 *k,215mlib_s32 n,216mlib_s32 dn,217mlib_s32 cmask)218{219FTYPE buff[BUFF_SIZE];220mlib_s32 off, kh;221mlib_s32 d0, d1;222const FTYPE *pk;223FTYPE k0, k1, k2, k3;224FTYPE p0, p1, p2, p3, p4;225DEF_VARS(DTYPE);226DTYPE *sl_c, *dl_c, *sl0;227mlib_s32 l, hsize, max_hsize;228GET_SRC_DST_PARAMETERS(DTYPE);229230hgt -= (n - 1);231adr_dst += dn*dll;232233max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;234235if (!max_hsize) max_hsize = 1;236237if (max_hsize > BUFF_SIZE) {238pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);239}240241chan1 = nchannel;242243sl_c = adr_src;244dl_c = adr_dst;245246for (l = 0; l < hgt; l += hsize) {247hsize = hgt - l;248249if (hsize > max_hsize) hsize = max_hsize;250251for (c = 0; c < nchannel; c++) {252if (!(cmask & (1 << (chan1 - 1 - c)))) continue;253254sl = sl_c + c;255dl = dl_c + c;256257for (j = 0; j < hsize; j++) pbuff[j] = 0.0;258259for (i = 0; i < wid; i++) {260sl0 = sl;261262for (off = 0; off < (n - 4); off += 4) {263pk = k + off;264sp = sl0;265266k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];267p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];268sp += 3*sll;269270for (j = 0; j < hsize; j += 2) {271p0 = p2; p1 = p3; p2 = p4;272p3 = sp[0];273p4 = sp[sll];274275pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;276pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;277278sp += 2*sll;279}280281sl0 += 4*sll;282}283284pk = k + off;285sp = sl0;286287k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];288p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];289290dp = dl;291kh = n - off;292293if (kh == 4) {294sp += 3*sll;295296for (j = 0; j <= (hsize - 2); j += 2) {297p0 = p2; p1 = p3; p2 = p4;298p3 = sp[0];299p4 = sp[sll];300301d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);302d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);303304dp[0 ] = FROM_S32(d0);305dp[dll] = FROM_S32(d1);306307pbuff[j] = 0;308pbuff[j + 1] = 0;309310sp += 2*sll;311dp += 2*dll;312}313314if (j < hsize) {315p0 = p2; p1 = p3; p2 = p4;316p3 = sp[0];317318d0 = D2I(p0*k0 + p1*k1 + p2*k2 + 
p3*k3 + pbuff[j]);319320pbuff[j] = 0;321322dp[0] = FROM_S32(d0);323}324325} else if (kh == 3) {326sp += 2*sll;327328for (j = 0; j <= (hsize - 2); j += 2) {329p0 = p2; p1 = p3;330p2 = sp[0];331p3 = sp[sll];332333d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);334d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);335336dp[0 ] = FROM_S32(d0);337dp[dll] = FROM_S32(d1);338339pbuff[j] = 0;340pbuff[j + 1] = 0;341342sp += 2*sll;343dp += 2*dll;344}345346if (j < hsize) {347p0 = p2; p1 = p3;348p2 = sp[0];349350d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);351352pbuff[j] = 0;353354dp[0] = FROM_S32(d0);355}356357} else if (kh == 2) {358sp += sll;359360for (j = 0; j <= (hsize - 2); j += 2) {361p0 = p2;362p1 = sp[0];363p2 = sp[sll];364365d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);366d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);367368dp[0 ] = FROM_S32(d0);369dp[dll] = FROM_S32(d1);370371pbuff[j] = 0;372pbuff[j + 1] = 0;373374sp += 2*sll;375dp += 2*dll;376}377378if (j < hsize) {379p0 = p2;380p1 = sp[0];381382d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);383384pbuff[j] = 0;385386dp[0] = FROM_S32(d0);387}388389} else /* if (kh == 1) */ {390for (j = 0; j < hsize; j++) {391p0 = sp[0];392393d0 = D2I(p0*k0 + pbuff[j]);394395dp[0] = FROM_S32(d0);396397pbuff[j] = 0;398399sp += sll;400dp += dll;401}402}403404sl += chan1;405dl += chan1;406}407}408409sl_c += max_hsize*sll;410dl_c += max_hsize*dll;411}412413if (pbuff != buff) mlib_free(pbuff);414415return MLIB_SUCCESS;416}417418/***************************************************************/419mlib_status CONV_FUNC(MxN)(mlib_image *dst,420const mlib_image *src,421const mlib_s32 *kernel,422mlib_s32 m,423mlib_s32 n,424mlib_s32 dm,425mlib_s32 dn,426mlib_s32 scale,427mlib_s32 cmask)428{429FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];430FTYPE **buffs = buffs_arr, *buffd;431FTYPE akernel[256], *k = akernel, fscale = DSCALE;432mlib_s32 mn, l, off, kw, bsize, buff_ind;433mlib_s32 d0, d1;434FTYPE k0, k1, k2, k3, k4, k5, k6;435FTYPE p0, p1, p2, p3, p4, p5, p6, p7;436d64_2x32 
dd;437DEF_VARS(DTYPE);438mlib_s32 chan2;439mlib_s32 *buffo, *buffi;440mlib_status status = MLIB_SUCCESS;441442GET_SRC_DST_PARAMETERS(DTYPE);443444if (scale > 30) {445fscale *= 1.0/(1 << 30);446scale -= 30;447}448449fscale /= (1 << scale);450451mn = m*n;452453if (mn > 256) {454k = mlib_malloc(mn*sizeof(mlib_d64));455456if (k == NULL) return MLIB_FAILURE;457}458459for (i = 0; i < mn; i++) {460k[i] = kernel[i]*fscale;461}462463if (m == 1) {464status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);465FREE_AND_RETURN_STATUS;466}467468bsize = (n + 3)*wid;469470if ((bsize > BUFF_SIZE) || (n > MAX_N)) {471pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));472473if (pbuff == NULL) {474status = MLIB_FAILURE;475FREE_AND_RETURN_STATUS;476}477buffs = (FTYPE **)(pbuff + bsize);478}479480for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;481for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];482buffd = buffs[n] + wid;483buffo = (mlib_s32*)(buffd + wid);484buffi = buffo + (wid &~ 1);485486chan1 = nchannel;487chan2 = chan1 + chan1;488489wid -= (m - 1);490hgt -= (n - 1);491adr_dst += dn*dll + dm*nchannel;492493for (c = 0; c < nchannel; c++) {494if (!(cmask & (1 << (chan1 - 1 - c)))) continue;495496sl = adr_src + c;497dl = adr_dst + c;498499for (l = 0; l < n; l++) {500FTYPE *buff = buffs[l];501502for (i = 0; i < wid + (m - 1); i++) {503buff[i] = (FTYPE)sl[i*chan1];504}505506sl += sll;507}508509buff_ind = 0;510511for (i = 0; i < wid; i++) buffd[i] = 0.0;512513for (j = 0; j < hgt; j++) {514FTYPE **buffc = buffs + buff_ind;515FTYPE *buffn = buffc[n];516FTYPE *pk = k;517518for (l = 0; l < n; l++) {519FTYPE *buff_l = buffc[l];520521for (off = 0; off < m;) {522FTYPE *buff = buff_l + off;523524kw = m - off;525526if (kw > 2*MAX_KER) kw = MAX_KER; else527if (kw > MAX_KER) kw = kw/2;528off += kw;529530sp = sl;531dp = dl;532533p2 = buff[0]; p3 = buff[1]; p4 = buff[2];534p5 = buff[3]; p6 = buff[4]; p7 = buff[5];535536k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = 
pk[3];537k4 = pk[4]; k5 = pk[5]; k6 = pk[6];538pk += kw;539540if (kw == 7) {541542if (l < (n - 1) || off < m) {543for (i = 0; i <= (wid - 2); i += 2) {544p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;545546p6 = buff[i + 6]; p7 = buff[i + 7];547548buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;549buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;550}551552} else {553for (i = 0; i <= (wid - 2); i += 2) {554p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;555556p6 = buff[i + 6]; p7 = buff[i + 7];557558LOAD_BUFF(buffi);559560dd.d64 = *(FTYPE *)(buffi + i);561buffn[i ] = (FTYPE)dd.i32s.i0;562buffn[i + 1] = (FTYPE)dd.i32s.i1;563564d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);565d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);566567dp[0 ] = FROM_S32(d0);568dp[chan1] = FROM_S32(d1);569570buffd[i ] = 0.0;571buffd[i + 1] = 0.0;572573sp += chan2;574dp += chan2;575}576}577578} else if (kw == 6) {579580if (l < (n - 1) || off < m) {581for (i = 0; i <= (wid - 2); i += 2) {582p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;583584p5 = buff[i + 5]; p6 = buff[i + 6];585586buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;587buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;588}589590} else {591for (i = 0; i <= (wid - 2); i += 2) {592p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;593594p5 = buff[i + 5]; p6 = buff[i + 6];595596buffn[i ] = (FTYPE)sp[0];597buffn[i + 1] = (FTYPE)sp[chan1];598599d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);600d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);601602dp[0 ] = FROM_S32(d0);603dp[chan1] = FROM_S32(d1);604605buffd[i ] = 0.0;606buffd[i + 1] = 0.0;607608sp += chan2;609dp += chan2;610}611}612613} else if (kw == 5) {614615if (l < (n - 1) || off < m) {616for (i = 0; i <= (wid - 2); i += 2) {617p0 = p2; p1 = p3; p2 = p4; p3 = p5;618619p4 = buff[i + 4]; p5 = buff[i + 5];620621buffd[i 
] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;622buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;623}624625} else {626for (i = 0; i <= (wid - 2); i += 2) {627p0 = p2; p1 = p3; p2 = p4; p3 = p5;628629p4 = buff[i + 4]; p5 = buff[i + 5];630631buffn[i ] = (FTYPE)sp[0];632buffn[i + 1] = (FTYPE)sp[chan1];633634d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);635d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);636637dp[0 ] = FROM_S32(d0);638dp[chan1] = FROM_S32(d1);639640buffd[i ] = 0.0;641buffd[i + 1] = 0.0;642643sp += chan2;644dp += chan2;645}646}647648} else if (kw == 4) {649650if (l < (n - 1) || off < m) {651for (i = 0; i <= (wid - 2); i += 2) {652p0 = p2; p1 = p3; p2 = p4;653654p3 = buff[i + 3]; p4 = buff[i + 4];655656buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;657buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;658}659660} else {661for (i = 0; i <= (wid - 2); i += 2) {662p0 = p2; p1 = p3; p2 = p4;663664p3 = buff[i + 3]; p4 = buff[i + 4];665666buffn[i ] = (FTYPE)sp[0];667buffn[i + 1] = (FTYPE)sp[chan1];668669d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);670d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);671672dp[0 ] = FROM_S32(d0);673dp[chan1] = FROM_S32(d1);674675buffd[i ] = 0.0;676buffd[i + 1] = 0.0;677678sp += chan2;679dp += chan2;680}681}682683} else if (kw == 3) {684685if (l < (n - 1) || off < m) {686for (i = 0; i <= (wid - 2); i += 2) {687p0 = p2; p1 = p3;688689p2 = buff[i + 2]; p3 = buff[i + 3];690691buffd[i ] += p0*k0 + p1*k1 + p2*k2;692buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;693}694695} else {696for (i = 0; i <= (wid - 2); i += 2) {697p0 = p2; p1 = p3;698699p2 = buff[i + 2]; p3 = buff[i + 3];700701buffn[i ] = (FTYPE)sp[0];702buffn[i + 1] = (FTYPE)sp[chan1];703704d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]);705d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);706707dp[0 ] = FROM_S32(d0);708dp[chan1] = FROM_S32(d1);709710buffd[i ] = 0.0;711buffd[i + 1] = 0.0;712713sp += chan2;714dp += chan2;715}716}717718} else /*if (kw == 2)*/ 
{719720if (l < (n - 1) || off < m) {721for (i = 0; i <= (wid - 2); i += 2) {722p0 = p2;723724p1 = buff[i + 1]; p2 = buff[i + 2];725726buffd[i ] += p0*k0 + p1*k1;727buffd[i + 1] += p1*k0 + p2*k1;728}729730} else {731for (i = 0; i <= (wid - 2); i += 2) {732p0 = p2;733734p1 = buff[i + 1]; p2 = buff[i + 2];735736buffn[i ] = (FTYPE)sp[0];737buffn[i + 1] = (FTYPE)sp[chan1];738739d0 = D2I(p0*k0 + p1*k1 + buffd[i ]);740d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);741742dp[0 ] = FROM_S32(d0);743dp[chan1] = FROM_S32(d1);744745buffd[i ] = 0.0;746buffd[i + 1] = 0.0;747748sp += chan2;749dp += chan2;750}751}752}753}754}755756/* last pixels */757for (; i < wid; i++) {758FTYPE *pk = k, s = 0;759mlib_s32 x, d0;760761for (l = 0; l < n; l++) {762FTYPE *buff = buffc[l] + i;763764for (x = 0; x < m; x++) s += buff[x] * (*pk++);765}766767d0 = D2I(s);768dp[0] = FROM_S32(d0);769770buffn[i] = (FTYPE)sp[0];771772sp += chan1;773dp += chan1;774}775776for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];777778/* next line */779sl += sll;780dl += dll;781782buff_ind++;783784if (buff_ind >= n + 1) buff_ind = 0;785}786}787788FREE_AND_RETURN_STATUS;789}790791/***************************************************************/792/* for x86, using integer multiplies is faster */793794#define STORE_RES(res, x) \795x >>= shift2; \796CLAMP_STORE(res, x)797798mlib_status CONV_FUNC_I(MxN)(mlib_image *dst,799const mlib_image *src,800const mlib_s32 *kernel,801mlib_s32 m,802mlib_s32 n,803mlib_s32 dm,804mlib_s32 dn,805mlib_s32 scale,806mlib_s32 cmask)807{808mlib_s32 buff[BUFF_SIZE], *buffd = buff;809mlib_s32 l, off, kw;810mlib_s32 d0, d1, shift1, shift2;811mlib_s32 k0, k1, k2, k3, k4, k5, k6;812mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;813DTYPE *adr_src, *sl, *sp = NULL;814DTYPE *adr_dst, *dl, *dp = NULL;815mlib_s32 wid, hgt, sll, dll;816mlib_s32 nchannel, chan1;817mlib_s32 i, j, c;818mlib_s32 chan2;819mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;820GET_SRC_DST_PARAMETERS(DTYPE);821822#if IMG_TYPE != 1823shift1 = 
16;824#else825shift1 = 8;826#endif /* IMG_TYPE != 1 */827shift2 = scale - shift1;828829chan1 = nchannel;830chan2 = chan1 + chan1;831832wid -= (m - 1);833hgt -= (n - 1);834adr_dst += dn*dll + dm*nchannel;835836if (wid > BUFF_SIZE) {837buffd = mlib_malloc(sizeof(mlib_s32)*wid);838839if (buffd == NULL) return MLIB_FAILURE;840}841842if (m*n > MAX_N*MAX_N) {843k = mlib_malloc(sizeof(mlib_s32)*(m*n));844845if (k == NULL) {846if (buffd != buff) mlib_free(buffd);847return MLIB_FAILURE;848}849}850851for (i = 0; i < m*n; i++) {852k[i] = kernel[i] >> shift1;853}854855for (c = 0; c < nchannel; c++) {856if (!(cmask & (1 << (nchannel - 1 - c)))) continue;857858sl = adr_src + c;859dl = adr_dst + c;860861for (i = 0; i < wid; i++) buffd[i] = 0;862863for (j = 0; j < hgt; j++) {864mlib_s32 *pk = k;865866for (l = 0; l < n; l++) {867DTYPE *sp0 = sl + l*sll;868869for (off = 0; off < m;) {870sp = sp0 + off*chan1;871dp = dl;872873kw = m - off;874875if (kw > 2*MAX_KER) kw = MAX_KER; else876if (kw > MAX_KER) kw = kw/2;877off += kw;878879p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];880p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];881882k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];883k4 = pk[4]; k5 = pk[5]; k6 = pk[6];884pk += kw;885886sp += (kw - 1)*chan1;887888if (kw == 7) {889890if (l < (n - 1) || off < m) {891for (i = 0; i <= (wid - 2); i += 2) {892p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;893p6 = sp[0];894p7 = sp[chan1];895896buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;897buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;898899sp += chan2;900}901902} else {903for (i = 0; i <= (wid - 2); i += 2) {904p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;905p6 = sp[0];906p7 = sp[chan1];907908d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]);909d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);910911STORE_RES(dp[0 ], d0);912STORE_RES(dp[chan1], d1);913914buffd[i ] = 
0;915buffd[i + 1] = 0;916917sp += chan2;918dp += chan2;919}920}921922} else if (kw == 6) {923924if (l < (n - 1) || off < m) {925for (i = 0; i <= (wid - 2); i += 2) {926p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;927p5 = sp[0];928p6 = sp[chan1];929930buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;931buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;932933sp += chan2;934}935936} else {937for (i = 0; i <= (wid - 2); i += 2) {938p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;939p5 = sp[0];940p6 = sp[chan1];941942d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]);943d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);944945STORE_RES(dp[0 ], d0);946STORE_RES(dp[chan1], d1);947948buffd[i ] = 0;949buffd[i + 1] = 0;950951sp += chan2;952dp += chan2;953}954}955956} else if (kw == 5) {957958if (l < (n - 1) || off < m) {959for (i = 0; i <= (wid - 2); i += 2) {960p0 = p2; p1 = p3; p2 = p4; p3 = p5;961p4 = sp[0];962p5 = sp[chan1];963964buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;965buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;966967sp += chan2;968}969970} else {971for (i = 0; i <= (wid - 2); i += 2) {972p0 = p2; p1 = p3; p2 = p4; p3 = p5;973p4 = sp[0];974p5 = sp[chan1];975976d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]);977d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);978979STORE_RES(dp[0 ], d0);980STORE_RES(dp[chan1], d1);981982buffd[i ] = 0;983buffd[i + 1] = 0;984985sp += chan2;986dp += chan2;987}988}989990} else if (kw == 4) {991992if (l < (n - 1) || off < m) {993for (i = 0; i <= (wid - 2); i += 2) {994p0 = p2; p1 = p3; p2 = p4;995p3 = sp[0];996p4 = sp[chan1];997998buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;999buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;10001001sp += chan2;1002}10031004} else {1005for (i = 0; i <= (wid - 2); i += 2) {1006p0 = p2; p1 = p3; p2 = p4;1007p3 = sp[0];1008p4 = sp[chan1];10091010d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]);1011d1 = (p1*k0 + p2*k1 + p3*k2 
+ p4*k3 + buffd[i + 1]);10121013STORE_RES(dp[0 ], d0);1014STORE_RES(dp[chan1], d1);10151016buffd[i ] = 0;1017buffd[i + 1] = 0;10181019sp += chan2;1020dp += chan2;1021}1022}10231024} else if (kw == 3) {10251026if (l < (n - 1) || off < m) {1027for (i = 0; i <= (wid - 2); i += 2) {1028p0 = p2; p1 = p3;1029p2 = sp[0];1030p3 = sp[chan1];10311032buffd[i ] += p0*k0 + p1*k1 + p2*k2;1033buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;10341035sp += chan2;1036}10371038} else {1039for (i = 0; i <= (wid - 2); i += 2) {1040p0 = p2; p1 = p3;1041p2 = sp[0];1042p3 = sp[chan1];10431044d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]);1045d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);10461047STORE_RES(dp[0 ], d0);1048STORE_RES(dp[chan1], d1);10491050buffd[i ] = 0;1051buffd[i + 1] = 0;10521053sp += chan2;1054dp += chan2;1055}1056}10571058} else if (kw == 2) {10591060if (l < (n - 1) || off < m) {1061for (i = 0; i <= (wid - 2); i += 2) {1062p0 = p2;1063p1 = sp[0];1064p2 = sp[chan1];10651066buffd[i ] += p0*k0 + p1*k1;1067buffd[i + 1] += p1*k0 + p2*k1;10681069sp += chan2;1070}10711072} else {1073for (i = 0; i <= (wid - 2); i += 2) {1074p0 = p2;1075p1 = sp[0];1076p2 = sp[chan1];10771078d0 = (p0*k0 + p1*k1 + buffd[i ]);1079d1 = (p1*k0 + p2*k1 + buffd[i + 1]);10801081STORE_RES(dp[0 ], d0);1082STORE_RES(dp[chan1], d1);10831084buffd[i ] = 0;1085buffd[i + 1] = 0;10861087sp += chan2;1088dp += chan2;1089}1090}10911092} else /*if (kw == 1)*/ {10931094if (l < (n - 1) || off < m) {1095for (i = 0; i <= (wid - 2); i += 2) {1096p0 = sp[0];1097p1 = sp[chan1];10981099buffd[i ] += p0*k0;1100buffd[i + 1] += p1*k0;11011102sp += chan2;1103}11041105} else {1106for (i = 0; i <= (wid - 2); i += 2) {1107p0 = sp[0];1108p1 = sp[chan1];11091110d0 = (p0*k0 + buffd[i ]);1111d1 = (p1*k0 + buffd[i + 1]);11121113STORE_RES(dp[0 ], d0);1114STORE_RES(dp[chan1], d1);11151116buffd[i ] = 0;1117buffd[i + 1] = 0;11181119sp += chan2;1120dp += chan2;1121}1122}1123}1124}1125}11261127/* last pixels */1128for (; i < wid; i++) {1129mlib_s32 *pk = k, s = 
0;1130mlib_s32 x;11311132for (l = 0; l < n; l++) {1133sp = sl + l*sll + i*chan1;11341135for (x = 0; x < m; x++) {1136s += sp[0] * pk[0];1137sp += chan1;1138pk ++;1139}1140}11411142STORE_RES(dp[0], s);11431144sp += chan1;1145dp += chan1;1146}11471148sl += sll;1149dl += dll;1150}1151}11521153if (buffd != buff) mlib_free(buffd);1154if (k != k_locl) mlib_free(k);11551156return MLIB_SUCCESS;1157}11581159/***************************************************************/116011611162