CoCalc -- Math3D.cpp

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Math3D.cpp
3185 views
1
// Copyright (c) 2012- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "GPU/Math3D.h"
19
#include "Common/Common.h"
20
#include "Common/Math/SIMDHeaders.h"
21

22
#if PPSSPP_ARCH(SSE2)
23
// For the SSE4 stuff.
24
#include <smmintrin.h>
25
#endif
26

27
namespace Math3D {
28

29
template<>
30
float Vec2<float>::Length() const
31
{
32
	// Doubt this is worth it for a vec2 :/
33
#if defined(_M_SSE)
34
	float ret;
35
	__m128d tmp = _mm_load_sd((const double*)&x);
36
	__m128 xy = _mm_castpd_ps(tmp);
37
	__m128 sq = _mm_mul_ps(xy, xy);
38
	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
39
	const __m128 res = _mm_add_ss(sq, r2);
40
	_mm_store_ss(&ret, _mm_sqrt_ss(res));
41
	return ret;
42
#elif PPSSPP_ARCH(ARM64_NEON)
43
	float32x2_t vec = vld1_f32(&x);
44
	float32x2_t sq = vmul_f32(vec, vec);
45
	float32x2_t add2 = vpadd_f32(sq, sq);
46
	float32x2_t res = vsqrt_f32(add2);
47
	return vget_lane_f32(res, 0);
48
#else
49
	return sqrtf(Length2());
50
#endif
51
}
52

53
template<>
54
void Vec2<float>::SetLength(const float l)
55
{
56
	(*this) *= l / Length();
57
}
58

59
template<>
60
Vec2<float> Vec2<float>::WithLength(const float l) const
61
{
62
	return (*this) * l / Length();
63
}
64

65
template<>
66
float Vec2<float>::Distance2To(const Vec2<float> &other) const {
67
	return Vec2<float>(other-(*this)).Length2();
68
}
69

70
template<>
71
Vec2<float> Vec2<float>::Normalized() const
72
{
73
	return (*this) / Length();
74
}
75

76
template<>
77
float Vec2<float>::Normalize()
78
{
79
	float len = Length();
80
	(*this) = (*this)/len;
81
	return len;
82
}
83

84
template<>
85
float Vec3<float>::Length() const
86
{
87
#if defined(_M_SSE)
88
	float ret;
89
	__m128 xyz = _mm_loadu_ps(&x);
90
	__m128 sq = _mm_mul_ps(xyz, xyz);
91
	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
92
	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
93
	const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));
94
	_mm_store_ss(&ret, _mm_sqrt_ss(res));
95
	return ret;
96
#elif PPSSPP_ARCH(ARM64_NEON)
97
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
98
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
99
	float32x2_t add2 = vpadd_f32(add1, add1);
100
	float32x2_t res = vsqrt_f32(add2);
101
	return vget_lane_f32(res, 0);
102
#else
103
	return sqrtf(Length2());
104
#endif
105
}
106

107
template<>
108
void Vec3<float>::SetLength(const float l)
109
{
110
	(*this) *= l / Length();
111
}
112

113
template<>
114
Vec3<float> Vec3<float>::WithLength(const float l) const
115
{
116
	return (*this) * l / Length();
117
}
118

119
template<>
120
float Vec3<float>::Distance2To(const Vec3<float> &other) const {
121
	return Vec3<float>(other-(*this)).Length2();
122
}
123

124
#if defined(_M_SSE)
125
// Computes an approximate reciprocal square root of the sum of squares of lanes 0..2 of v
// and returns it replicated into all four lanes. Lane 3 of v does not affect the result.
__m128 SSENormalizeMultiplierSSE2(__m128 v) {
	const __m128 squared = _mm_mul_ps(v, v);
	// Move lanes 1 and 2 down to lane 0 so scalar adds can combine them.
	const __m128 lane1 = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 1));
	const __m128 lane2 = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 2));
	// Summation order: lane2 + (lane1 + lane0).
	const __m128 lowerSum = _mm_add_ss(lane1, squared);
	const __m128 total = _mm_add_ss(lane2, lowerSum);

	// Scalar reciprocal-sqrt estimate in lane 0, then broadcast to every lane.
	const __m128 estimate = _mm_rsqrt_ss(total);
	return _mm_shuffle_ps(estimate, estimate, _MM_SHUFFLE(0, 0, 0, 0));
}
135

136
// SSE4.1 variant of the normalize multiplier: approximate reciprocal square root of the
// three-lane dot product of v with itself.
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
__m128 SSENormalizeMultiplierSSE4(__m128 v) {
	// Only used for Vec3f, so the 4th component (which might be garbage) is left out:
	// mask 0x77 multiplies lanes 0..2 and stores the sum in lanes 0..2.
	const __m128 dot3 = _mm_dp_ps(v, v, 0x77);
	return _mm_rsqrt_ps(dot3);
}
144

145
__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
146
{
147
	if (useSSE4)
148
		return SSENormalizeMultiplierSSE4(v);
149
	return SSENormalizeMultiplierSSE2(v);
150
}
151

152
template<>
153
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
154
{
155
	const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
156
	return _mm_mul_ps(normalize, vec);
157
}
158

159
template<>
160
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
161
	const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
162
	const __m128 result = _mm_mul_ps(normalize, vec);
163
	const __m128 mask = _mm_cmpunord_ps(result, vec);
164
	const __m128 replace = _mm_and_ps(_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f), mask);
165
	// Replace with the constant if the mask matched.
166
	return _mm_or_ps(_mm_andnot_ps(mask, result), replace);
167
}
168
#elif PPSSPP_ARCH(ARM64_NEON)
169
template<>
170
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {
171
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
172
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
173
	float32x2_t summed = vpadd_f32(add1, add1);
174

175
	float32x2_t e = vrsqrte_f32(summed);
176
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
177
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
178

179
	float32x4_t factor = vdupq_lane_f32(e, 0);
180
	return Vec3<float>(vmulq_f32(vec, factor));
181
}
182

183
template<>
184
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
185
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
186
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
187
	float32x2_t summed = vpadd_f32(add1, add1);
188
	if (vget_lane_f32(summed, 0) == 0.0f) {
189
		return Vec3<float>(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2));
190
	}
191

192
	float32x2_t e = vrsqrte_f32(summed);
193
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
194
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
195

196
	float32x4_t factor = vdupq_lane_f32(e, 0);
197
	return Vec3<float>(vmulq_f32(vec, factor));
198
}
199
#else
200
template<>
201
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
202
{
203
	return (*this) / Length();
204
}
205

206
template<>
207
Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
208
	float len = Length();
209
	if (len == 0.0f) {
210
		return Vec3<float>(0.0f, 0.0f, 1.0f);
211
	}
212
	return *this / len;
213
}
214
#endif
215

216
template<>
217
float Vec3<float>::Normalize()
218
{
219
	float len = Length();
220
	(*this) = (*this)/len;
221
	return len;
222
}
223

224
template<>
225
float Vec3<float>::NormalizeOr001() {
226
	float len = Length();
227
	if (len == 0.0f) {
228
		z = 1.0f;
229
	} else {
230
		*this /= len;
231
	}
232
	return len;
233
}
234

235
template<>
236
Vec3Packed<float> Vec3Packed<float>::FromRGB(unsigned int rgb)
237
{
238
	return Vec3Packed((rgb & 0xFF) * (1.0f/255.0f),
239
				((rgb >> 8) & 0xFF) * (1.0f/255.0f),
240
				((rgb >> 16) & 0xFF) * (1.0f/255.0f));
241
}
242

243
template<>
244
Vec3Packed<int> Vec3Packed<int>::FromRGB(unsigned int rgb)
245
{
246
	return Vec3Packed(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
247
}
248

249
template<>
250
unsigned int Vec3Packed<float>::ToRGB() const
251
{
252
	return ((unsigned int)(r()*255.f)) +
253
			((unsigned int)(g()*255.f*256.f)) +
254
			((unsigned int)(b()*255.f*256.f*256.f));
255
}
256

257
template<>
258
unsigned int Vec3Packed<int>::ToRGB() const
259
{
260
	return (r()&0xFF) | ((g()&0xFF)<<8) | ((b()&0xFF)<<16);
261
}
262

263
template<>
264
float Vec3Packed<float>::Length() const
265
{
266
	return sqrtf(Length2());
267
}
268

269
template<>
270
void Vec3Packed<float>::SetLength(const float l)
271
{
272
	(*this) *= l / Length();
273
}
274

275
template<>
276
Vec3Packed<float> Vec3Packed<float>::WithLength(const float l) const
277
{
278
	return (*this) * l / Length();
279
}
280

281
template<>
282
float Vec3Packed<float>::Distance2To(const Vec3Packed<float> &other) const {
283
	return Vec3Packed<float>(other-(*this)).Length2();
284
}
285

286
template<>
287
Vec3Packed<float> Vec3Packed<float>::Normalized() const
288
{
289
	return (*this) / Length();
290
}
291

292
template<>
293
float Vec3Packed<float>::Normalize()
294
{
295
	float len = Length();
296
	(*this) = (*this)/len;
297
	return len;
298
}
299

300
template<>
301
float Vec4<float>::Length() const
302
{
303
#if defined(_M_SSE)
304
	float ret;
305
	__m128 xyzw = _mm_loadu_ps(&x);
306
	__m128 sq = _mm_mul_ps(xyzw, xyzw);
307
	const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq));
308
	const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
309
	_mm_store_ss(&ret, _mm_sqrt_ss(res));
310
	return ret;
311
#elif PPSSPP_ARCH(ARM64_NEON)
312
	float32x4_t sq = vmulq_f32(vec, vec);
313
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
314
	float32x2_t add2 = vpadd_f32(add1, add1);
315
	float32x2_t res = vsqrt_f32(add2);
316
	return vget_lane_f32(res, 0);
317
#else
318
	return sqrtf(Length2());
319
#endif
320
}
321

322
template<>
323
void Vec4<float>::SetLength(const float l)
324
{
325
	(*this) *= l / Length();
326
}
327

328
template<>
329
Vec4<float> Vec4<float>::WithLength(const float l) const
330
{
331
	return (*this) * l / Length();
332
}
333

334
template<>
335
float Vec4<float>::Distance2To(const Vec4<float> &other) const {
336
	return Vec4<float>(other-(*this)).Length2();
337
}
338

339
template<>
340
Vec4<float> Vec4<float>::Normalized() const
341
{
342
	return (*this) / Length();
343
}
344

345
template<>
346
float Vec4<float>::Normalize()
347
{
348
	float len = Length();
349
	(*this) = (*this)/len;
350
	return len;
351
}
352

353
}; // namespace Math3D
354

355
Product

Resources

Company