cosmopolitan/third_party/avir/avir_float4_sse.h

/* clang-format off */
//$ nobt
//$ nocpp

/**
 * @file avir_float4_sse.h
 *
 * @brief Inclusion file for the "float4" type.
 *
 * This file includes the "float4" SSE-based type used for SIMD variable
 * storage and processing.
 *
 * AVIR Copyright (c) 2015-2019 Aleksey Vaneev
 */

#ifndef AVIR_FLOAT4_SSE_INCLUDED
#define AVIR_FLOAT4_SSE_INCLUDED

#include "third_party/avir/avir.h"
#include "libc/bits/mmintrin.h"
#include "libc/bits/xmmintrin.h"
#include "libc/bits/xmmintrin.h"
#include "libc/bits/xmmintrin.h"
#include "libc/bits/emmintrin.h"

namespace avir {

/**
 * @brief Packed SIMD type holding 4 single-precision floats.
 *
 * The class is a thin wrapper around the SSE "__m128" type: every arithmetic
 * operator processes all 4 elements at once via the corresponding SSE
 * intrinsic. The class can be used as the "fptype" argument of the
 * avir::fpclass_def class.
 */

class float4
{
public:
	/**
	 * Default constructor, leaves the packed value uninitialized.
	 */

	float4()
	{
	}

	/**
	 * @param s Object to copy the packed value from.
	 */

	float4( const float4& s )
		: value( s.value )
	{
	}

	/**
	 * @param s Raw SSE value to wrap.
	 */

	float4( const __m128 s )
		: value( s )
	{
	}

	/**
	 * @param s Scalar value that is replicated into all 4 elements.
	 */

	float4( const float s )
		: value( _mm_set1_ps( s ))
	{
	}

	/**
	 * @param s Object to copy the packed value from.
	 * @return Reference to *this.
	 */

	float4& operator = ( const float4& s )
	{
		value = s.value;

		return *this;
	}

	/**
	 * @param s Raw SSE value to assign.
	 * @return Reference to *this.
	 */

	float4& operator = ( const __m128 s )
	{
		value = s;

		return *this;
	}

	/**
	 * @param s Scalar value that is replicated into all 4 elements.
	 * @return Reference to *this.
	 */

	float4& operator = ( const float s )
	{
		value = _mm_set1_ps( s );

		return *this;
	}

	/**
	 * @return The lowest (first) element of the packed value.
	 */

	operator float () const
	{
		const float lowest = _mm_cvtss_f32( value );

		return lowest;
	}

	/**
	 * Aligned load of 4 elements.
	 *
	 * @param p Source memory location, should be 16-byte aligned.
	 * @return float4 value holding p[ 0 ] to p[ 3 ].
	 */

	static float4 load( const float* const p )
	{
		const __m128 packed = _mm_load_ps( p );

		return packed;
	}

	/**
	 * Unaligned load of 4 elements.
	 *
	 * @param p Source memory location, may have any alignment.
	 * @return float4 value holding p[ 0 ] to p[ 3 ].
	 */

	static float4 loadu( const float* const p )
	{
		const __m128 packed = _mm_loadu_ps( p );

		return packed;
	}

	/**
	 * Unaligned partial load: only the first "lim" elements are read from
	 * memory, the remaining elements of the result are set to 0.
	 *
	 * @param p Source memory location, may have any alignment.
	 * @param lim The maximum number of elements to load, >0. Values above 3
	 * load all 4 elements, values below 2 load a single element.
	 * @return float4 value loaded from the specified memory location.
	 */

	static float4 loadu( const float* const p, int lim )
	{
		if( lim >= 4 )
		{
			return _mm_loadu_ps( p );
		}

		if( lim == 3 )
		{
			return _mm_set_ps( 0.0f, p[ 2 ], p[ 1 ], p[ 0 ]);
		}

		if( lim == 2 )
		{
			return _mm_set_ps( 0.0f, 0.0f, p[ 1 ], p[ 0 ]);
		}

		return _mm_load_ss( p );
	}

	/**
	 * Aligned store of all 4 elements.
	 *
	 * @param[out] p Destination memory location, should be 16-byte aligned.
	 */

	void store( float* const p ) const
	{
		_mm_store_ps( p, value );
	}

	/**
	 * Unaligned store of all 4 elements.
	 *
	 * @param[out] p Destination memory location, may have any alignment.
	 */

	void storeu( float* const p ) const
	{
		_mm_storeu_ps( p, value );
	}

	/**
	 * Unaligned partial store: writes only the "lim" lower elements of *this
	 * value, memory past them is left untouched.
	 *
	 * @param[out] p Destination memory location, may have any alignment.
	 * @param lim The number of lower elements to store, >0. Values above 3
	 * store all 4 elements, values below 2 store a single element.
	 */

	void storeu( float* const p, int lim ) const
	{
		if( lim >= 4 )
		{
			_mm_storeu_ps( p, value );
			return;
		}

		if( lim <= 1 && lim != 2 )
		{
			_mm_store_ss( p, value );
			return;
		}

		// Both remaining cases (2 and 3) begin with the lower pair.

		_mm_storel_pi( reinterpret_cast< __m64* >( p ), value );

		if( lim == 3 )
		{
			// Bring element 2 down to the lowest position and store it.

			const __m128 upper = _mm_movehl_ps( value, value );
			_mm_store_ss( p + 2, upper );
		}
	}

	/**
	 * @param s Value to add element-wise.
	 * @return Reference to *this.
	 */

	float4& operator += ( const float4& s )
	{
		value = _mm_add_ps( value, s.value );

		return *this;
	}

	/**
	 * @param s Value to subtract element-wise.
	 * @return Reference to *this.
	 */

	float4& operator -= ( const float4& s )
	{
		value = _mm_sub_ps( value, s.value );

		return *this;
	}

	/**
	 * @param s Value to multiply by element-wise.
	 * @return Reference to *this.
	 */

	float4& operator *= ( const float4& s )
	{
		value = _mm_mul_ps( value, s.value );

		return *this;
	}

	/**
	 * @param s Value to divide by element-wise.
	 * @return Reference to *this.
	 */

	float4& operator /= ( const float4& s )
	{
		value = _mm_div_ps( value, s.value );

		return *this;
	}

	/**
	 * @param s Right-hand operand.
	 * @return Element-wise sum of *this and "s".
	 */

	float4 operator + ( const float4& s ) const
	{
		float4 r( value );
		r += s;

		return r;
	}

	/**
	 * @param s Right-hand operand.
	 * @return Element-wise difference of *this and "s".
	 */

	float4 operator - ( const float4& s ) const
	{
		float4 r( value );
		r -= s;

		return r;
	}

	/**
	 * @param s Right-hand operand.
	 * @return Element-wise product of *this and "s".
	 */

	float4 operator * ( const float4& s ) const
	{
		float4 r( value );
		r *= s;

		return r;
	}

	/**
	 * @param s Right-hand operand.
	 * @return Element-wise quotient of *this and "s".
	 */

	float4 operator / ( const float4& s ) const
	{
		float4 r( value );
		r /= s;

		return r;
	}

	/**
	 * @return Horizontal sum of elements.
	 */

	float hadd() const
	{
		// Fold the upper pair onto the lower pair: the two lowest elements
		// of "pairs" hold (e0 + e2) and (e1 + e3).

		const __m128 upper = _mm_movehl_ps( value, value );
		const __m128 pairs = _mm_add_ps( value, upper );

		// Move element 1 to the lowest position and add it to element 0.

		const __m128 second = _mm_shuffle_ps( pairs, pairs, 1 );
		const __m128 total = _mm_add_ss( pairs, second );

		return _mm_cvtss_f32( total );
	}

	/**
	 * Function performs in-place addition of a value located in memory and
	 * the specified value.
	 *
	 * @param p Pointer to value where addition happens. May be unaligned.
	 * @param v Value to add.
	 */

	static void addu( float* const p, const float4& v )
	{
		const float4 sum = loadu( p ) + v;
		sum.storeu( p );
	}

	/**
	 * Function performs in-place addition of a value located in memory and
	 * the specified value. Limited to the specified number of elements.
	 *
	 * @param p Pointer to value where addition happens. May be unaligned.
	 * @param v Value to add.
	 * @param lim The element number limit, >0.
	 */

	static void addu( float* const p, const float4& v, const int lim )
	{
		const float4 sum = loadu( p, lim ) + v;
		sum.storeu( p, lim );
	}

	__m128 value; ///< Packed value of 4 floats.
		///<
};

/**
 * SIMD rounding function, exact result.
 *
 * Every element is rounded to the nearest integer using the
 * "round to nearest" MXCSR mode, which is enabled temporarily; the caller's
 * rounding mode is restored before the function returns.
 *
 * Elements whose magnitude is 2^23 or above have no fractional part in
 * single precision and are returned unchanged, as are NaN elements. This
 * matters because the integer conversion used for rounding cannot represent
 * NaN nor values outside the 32-bit signed integer range: it yields
 * 0x80000000 for them, which would otherwise turn such elements into
 * -2147483648.
 *
 * @param v Value to round.
 * @return Rounded SIMD value.
 */

inline float4 round( const float4& v )
{
	const unsigned int prevrm = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );

	const __m128 conv = _mm_cvtepi32_ps( _mm_cvtps_epi32( v.value ));

	_MM_SET_ROUNDING_MODE( prevrm );

	// Clear the sign bit to obtain |v|, then build a per-element mask that is
	// all-ones where |v| < 2^23. The comparison is false for NaN, so NaN
	// elements take the pass-through path as well.

	const __m128 absv = _mm_andnot_ps( _mm_set1_ps( -0.0f ), v.value );
	const __m128 isfrac = _mm_cmplt_ps( absv, _mm_set1_ps( 8388608.0f ));

	// Select the converted result where the mask is set, the source element
	// otherwise.

	return( _mm_or_ps( _mm_and_ps( isfrac, conv ),
		_mm_andnot_ps( isfrac, v.value )));
}

/**
 * SIMD function that limits ("clamps") each packed element to the range
 * [minv; maxv].
 *
 * The lower bound is applied first and the upper bound second, so for an
 * element where "minv" exceeds "maxv" the result is the "maxv" element.
 *
 * @param Value Value to clamp.
 * @param minv Minimal allowed value.
 * @param maxv Maximal allowed value.
 * @return The clamped value.
 */

inline float4 clamp( const float4& Value, const float4& minv,
	const float4& maxv )
{
	const __m128 raised = _mm_max_ps( Value.value, minv.value );
	const __m128 bounded = _mm_min_ps( raised, maxv.value );

	return bounded;
}

typedef fpclass_def< avir :: float4, float > fpclass_float4; ///<
	///< Floating-point processing definition built on the SIMD float4 type,
	///< with "float" as the second argument of avir::fpclass_def. It can be
	///< passed as the "fpclass" template parameter of the
	///< avir::CImageResizer class to perform calculation using the default
	///< interleaved algorithm.
	///<

} // namespace avir

#endif // AVIR_FLOAT4_SSE_INCLUDED
Initial import 2020-06-15 14:18:57 +00:00			`/* clang-format off */`
			`//$ nobt`
			`//$ nocpp`

			`/**`
			`* @file avir_float4_sse.h`
			`*`
			`* @brief Inclusion file for the "float4" type.`
			`*`
			`* This file includes the "float4" SSE-based type used for SIMD variable`
			`* storage and processing.`
			`*`
			`* AVIR Copyright (c) 2015-2019 Aleksey Vaneev`
			`*/`

			`#ifndef AVIR_FLOAT4_SSE_INCLUDED`
			`#define AVIR_FLOAT4_SSE_INCLUDED`

			`#include "third_party/avir/avir.h"`
			`#include "libc/bits/mmintrin.h"`
			`#include "libc/bits/xmmintrin.h"`
			`#include "libc/bits/xmmintrin.h"`
Add glob and some finer tuning of documentation 2020-06-21 07:10:11 +00:00			`#include "libc/bits/xmmintrin.h"`
Initial import 2020-06-15 14:18:57 +00:00			`#include "libc/bits/emmintrin.h"`

			`namespace avir {`

			`/**`
			`* @brief SIMD packed 4-float type.`
			`*`
			`* This class implements a packed 4-float type that can be used to perform`
			`* parallel computation using SIMD instructions on SSE-enabled processors.`
			`* This class can be used as the "fptype" argument of the avir::fpclass_def`
			`* class.`
			`*/`

			`class float4`
			`{`
			`public:`
			`float4()`
			`{`
			`}`

			`float4( const float4& s )`
			`: value( s.value )`
			`{`
			`}`

			`float4( const __m128 s )`
			`: value( s )`
			`{`
			`}`

			`float4( const float s )`
			`: value( _mm_set1_ps( s ))`
			`{`
			`}`

			`float4& operator = ( const float4& s )`
			`{`
			`value = s.value;`
			`return( *this );`
			`}`

			`float4& operator = ( const __m128 s )`
			`{`
			`value = s;`
			`return( *this );`
			`}`

			`float4& operator = ( const float s )`
			`{`
			`value = _mm_set1_ps( s );`
			`return( *this );`
			`}`

			`operator float () const`
			`{`
			`return( _mm_cvtss_f32( value ));`
			`}`

			`/**`
			`* @param p Pointer to memory from where the value should be loaded,`
			`* should be 16-byte aligned.`
			`* @return float4 value loaded from the specified memory location.`
			`*/`

			`static float4 load( const float* const p )`
			`{`
			`return( _mm_load_ps( p ));`
			`}`

			`/**`
			`* @param p Pointer to memory from where the value should be loaded,`
			`* may have any alignment.`
			`* @return float4 value loaded from the specified memory location.`
			`*/`

			`static float4 loadu( const float* const p )`
			`{`
			`return( _mm_loadu_ps( p ));`
			`}`

			`/**`
			`* @param p Pointer to memory from where the value should be loaded,`
			`* may have any alignment.`
			`* @param lim The maximum number of elements to load, >0.`
			`* @return float4 value loaded from the specified memory location, with`
			`* elements beyond "lim" set to 0.`
			`*/`

			`static float4 loadu( const float* const p, int lim )`
			`{`
			`if( lim > 2 )`
			`{`
			`if( lim > 3 )`
			`{`
			`return( _mm_loadu_ps( p ));`
			`}`
			`else`
			`{`
			`return( _mm_set_ps( 0.0f, p[ 2 ], p[ 1 ], p[ 0 ]));`
			`}`
			`}`
			`else`
			`{`
			`if( lim == 2 )`
			`{`
			`return( _mm_set_ps( 0.0f, 0.0f, p[ 1 ], p[ 0 ]));`
			`}`
			`else`
			`{`
			`return( _mm_load_ss( p ));`
			`}`
			`}`
			`}`

			`/**`
			`* Function stores *this value to the specified memory location.`
			`*`
			`* @param[out] p Output memory location, should be 16-byte aligned.`
			`*/`

			`void store( float* const p ) const`
			`{`
			`_mm_store_ps( p, value );`
			`}`

			`/**`
			`* Function stores *this value to the specified memory location.`
			`*`
			`* @param[out] p Output memory location, may have any alignment.`
			`*/`

			`void storeu( float* const p ) const`
			`{`
			`_mm_storeu_ps( p, value );`
			`}`

			`/**`
			`* Function stores "lim" lower elements of *this value to the specified`
			`* memory location.`
			`*`
			`* @param[out] p Output memory location, may have any alignment.`
			`* @param lim The number of lower elements to store, >0.`
			`*/`

			`void storeu( float* const p, int lim ) const`
			`{`
			`if( lim > 2 )`
			`{`
			`if( lim > 3 )`
			`{`
			`_mm_storeu_ps( p, value );`
			`}`
			`else`
			`{`
			`_mm_storel_pi( (__m64*) p, value );`
			`_mm_store_ss( p + 2, _mm_movehl_ps( value, value ));`
			`}`
			`}`
			`else`
			`{`
			`if( lim == 2 )`
			`{`
			`_mm_storel_pi( (__m64*) p, value );`
			`}`
			`else`
			`{`
			`_mm_store_ss( p, value );`
			`}`
			`}`
			`}`

			`float4& operator += ( const float4& s )`
			`{`
			`value = _mm_add_ps( value, s.value );`
			`return( *this );`
			`}`

			`float4& operator -= ( const float4& s )`
			`{`
			`value = _mm_sub_ps( value, s.value );`
			`return( *this );`
			`}`

			`float4& operator *= ( const float4& s )`
			`{`
			`value = _mm_mul_ps( value, s.value );`
			`return( *this );`
			`}`

			`float4& operator /= ( const float4& s )`
			`{`
			`value = _mm_div_ps( value, s.value );`
			`return( *this );`
			`}`

			`float4 operator + ( const float4& s ) const`
			`{`
			`return( _mm_add_ps( value, s.value ));`
			`}`

			`float4 operator - ( const float4& s ) const`
			`{`
			`return( _mm_sub_ps( value, s.value ));`
			`}`

			`float4 operator * ( const float4& s ) const`
			`{`
			`return( _mm_mul_ps( value, s.value ));`
			`}`

			`float4 operator / ( const float4& s ) const`
			`{`
			`return( _mm_div_ps( value, s.value ));`
			`}`

			`/**`
			`* @return Horizontal sum of elements.`
			`*/`

			`float hadd() const`
			`{`
			`const __m128 v = _mm_add_ps( value, _mm_movehl_ps( value, value ));`
			`const __m128 res = _mm_add_ss( v, _mm_shuffle_ps( v, v, 1 ));`
			`return( _mm_cvtss_f32( res ));`
			`}`

			`/**`
			`* Function performs in-place addition of a value located in memory and`
			`* the specified value.`
			`*`
			`* @param p Pointer to value where addition happens. May be unaligned.`
			`* @param v Value to add.`
			`*/`

			`static void addu( float* const p, const float4& v )`
			`{`
			`( loadu( p ) + v ).storeu( p );`
			`}`

			`/**`
			`* Function performs in-place addition of a value located in memory and`
			`* the specified value. Limited to the specified number of elements.`
			`*`
			`* @param p Pointer to value where addition happens. May be unaligned.`
			`* @param v Value to add.`
			`* @param lim The element number limit, >0.`
			`*/`

			`static void addu( float* const p, const float4& v, const int lim )`
			`{`
			`( loadu( p, lim ) + v ).storeu( p, lim );`
			`}`

			`__m128 value; ///< Packed value of 4 floats.`
			`///<`
			`};`

			`/**`
			`* SIMD rounding function, exact result.`
			`*`
			`* @param v Value to round.`
			`* @return Rounded SIMD value.`
			`*/`

			`inline float4 round( const float4& v )`
			`{`
			`unsigned int prevrm = _MM_GET_ROUNDING_MODE();`
			`_MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );`

			`const __m128 res = _mm_cvtepi32_ps( _mm_cvtps_epi32( v.value ));`

			`_MM_SET_ROUNDING_MODE( prevrm );`

			`return( res );`
			`}`

			`/**`
			`* SIMD function "clamps" (clips) the specified packed values so that they are`
			`* not lesser than "minv", and not greater than "maxv".`
			`*`
			`* @param Value Value to clamp.`
			`* @param minv Minimal allowed value.`
			`* @param maxv Maximal allowed value.`
			`* @return The clamped value.`
			`*/`

			`inline float4 clamp( const float4& Value, const float4& minv,`
			`const float4& maxv )`
			`{`
			`return( _mm_min_ps( _mm_max_ps( Value.value, minv.value ), maxv.value ));`
			`}`

			`typedef fpclass_def< avir :: float4, float > fpclass_float4; ///<`
			`///< Class that can be used as the "fpclass" template parameter of the`
			`///< avir::CImageResizer class to perform calculation using default`
			`///< interleaved algorithm, using SIMD float4 type.`
			`///<`

			`} // namespace avir`

			`#endif // AVIR_FLOAT4_SSE_INCLUDED`