🖐🏻 👨🏼‍🌾 ✍🏿 Se necesitan diferentes raíces, diferentes raíces son importantes 🤜🏽 🔋 👖

En lugar de presentar

En primer lugar, me gustaría expresar mi gratitud a todos los que respondieron al primer artículo sobre la optimización del código en C / C ++ utilizando el ejemplo de una función para calcular la raíz cuadrada de un número entero redondeado al número entero más cercano. Gracias a la atención de expertos, se ha corregido un error tipográfico en el texto; la alcancía de algoritmos efectivos se ha repuesto.

Un algoritmo interesante es sqrxi32 de @ Sdima1357 - Ejemplo 1, en lo sucesivo denominado "_i32" para abreviar. El algoritmo "_i32" cumple incondicionalmente la condición de la tarea principal - "redondeo al número entero más cercano" - en todo el conjunto de valores de argumento [0 .. 0xFFFFFFFF], al tiempo que muestra un alto rendimiento.

Ejemplo 1: cálculo de la raíz cuadrada de un número entero, redondeado al número entero más cercano.

uint16_t sqrxi32( uint32_t y )
{
	if ( y == 1 )
		return 1;

	uint32_t xh = y > 0x10000ul ? 0x10000ul : y;
	uint32_t xl = 0;
	uint32_t xc;

	for ( int k = 0; k < 16; k++ )
	{
		xc = ( xh + xl ) >> 1ul;
		if ( xc * xc - xc >= y )
		{
			xh = xc;
		}
		else
		{
			xl = xc;
		}
	}
	return ( xh + xl ) >> 1ul;
}

Otra buena cualidad del algoritmo "_i32" es su predictibilidad temporal. El tiempo de ejecución "_i32" es constante, en contraste con el algoritmo "_evn", que consume tiempo de la máquina en proporción al módulo del argumento.

De Que Trata Este Texto

Observe el efecto complejo de la configuración de compilación y la plataforma de hardware de destino en el rendimiento final, cuando se aplica al mismo código fuente.

El código fuente contiene una solución a un problema con diferentes algoritmos.

— 3 ;
— 3

( main.c)
-
: CubeIDE ( Eclipce CDT)
RELEASE CubeIDE
: «ISO C11 + gnu extensions» (-std=gnu11)
:
- CubeMX — default settings, +48MHz, +USART1, +HAL;
- Runtime lib: Reduced C ( --spec=nano.specs );
- Use float with printf from new lib nano ( -u _printf_float );

FPU «M4» sqrt_fps, (float), «_fps» (Float Point Short) — 2.

2: float

uint16_t sqrt_fps( uint32_t number )
{
	if ( number < 2 )
		return (uint16_t) number;

	float f_rslt = sqrtf( number );
	uint32_t rslt = (uint32_t) f_rslt;

	if ( !( f_rslt - (float) rslt < .5 ) )
		rslt++;

	return (uint16_t) rslt;
}

«_fps» 22- , 1E+5 — 1.

1: "_fps" 1E+6+

[0 .. 1E+5].

— , .

, «x86» «ARM Cortex» . — 2.

Y ( 4), . X — .

( 2), , .

, , , : -O0, -Os, -O3.

( 2) : -O0, -Os, -O3.

100% , ( -O0 ). .

( ‑O0 ) , - .

«M4».

x86

( 3) Y . X — ( 4).

, .

X . .

3: x86

«x86» .

( ‑O0 ) 39% «_fpu» ( ‑Os ) 16% «_fps» ( ‑O3 ). , «x86» .

, -O3 -Os.

M4

«M4» ( 4).

4: M4

«M4» «_fps», — float.

FPU «M4» — 5.

, «_fps» 1E+5 (. 1) , FPU «M4» .

5: M4 FPU

M0

«M0» «M4» FPU ( 5), — 6.

, «M4», «M0» — 48 MHz. , «M0» , «M4», .

6: M0

«_fps» «M0» «_fpu».

( 6) .

( ‑O0 ) «_evn» «_i32». «_evn» , «_i32», .

. , «M4» «x86» .

, , ( 3 6).

, , . .

1. x86

CubeIDE (Eclipse CDT) C
— 3
sqrt_cmp.h — 6
:
1. IDE;
2. — 4
( -O0, -O3, -Os ) .

3: x86 — main.c

#include "sqrt_cmp.h"

int main( void )
{
	main_Of_SqrtComp();
	return 0;
}

4 x86

gcc main.c -o main -I. -Wall -lm -std=gnu11 -O3 && ./main

«x86» , main.c sqrt_cmp.h , (pwd).

7: «x86»

2. STM32

CubeIDE STM32 (CubeMX)
sqrt_cmp.h STM32 — 6
sqrt_cmp.h main.c — 5
IDE
Cambiando el tipo de optimización (-O0, -O3, -Os) observe el resultado

Ejemplo 5: código fuente para STM32 (con espacios <...>) - main.c

< … >
/* Private includes ----------------------------------------------------------*/
/* USER CODE BEGIN Includes */
#include "sqrt_cmp.h"
/* USER CODE END Includes */
< … >
/**
  * @brief  The application entry point.
  * @retval int
  */
int main(void)
{
< … >
  /* Infinite loop */
  /* USER CODE BEGIN WHILE */
  main_Of_SqrtComp();
  while (1)
  {
    /* USER CODE END WHILE */
    /* USER CODE BEGIN 3 */
  }
  /* USER CODE END 3 */

Apéndice 3. Procedimiento para probar otros algoritmos y plataformas

El montaje de prueba para otras plataformas se realiza por analogía.

Para plataformas de hardware distintas de las mencionadas anteriormente (Tabla 3), es probable que se requiera una modificación cosmética del archivo "sqrt_cmp.h".

Ejemplo 6: contenido del archivo sqrt_cmp.h

/******************************************************************************
 * File: sqrt_cmp.h Created on 5 . 2020 .
 * CC0 1.0 Universal (CC0 1.0)
 * Creative Commons Public Domain Dedication
 * No Copyright
 *
 * TAB Size .EQ 4
 ********************************************************************************/

#ifndef __SQRT_CMP_H
#define __SQRT_CMP_H

#include	<math.h>
#include	<stdio.h>
#include	<stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/******************************************************************************
 * Interface of the entry point for all sqrt tests
 ******************************************************************************/

void main_Of_SqrtComp();

/******************************************************************************
 * test case selection: TEST_SET
 * select one of the test suite via a comment.
 ******************************************************************************/

#define TEST_SET			TEST_ALL
//#define TEST_SET			TEST_ROUNDING
//#define TEST_SET			TEST_PERFORMANCE

/******************************************************************************
 * Interfaces of test functions.
 * See implementation of them at the end of this file.
 ******************************************************************************/

typedef uint16_t (*sqrt_func)( uint32_t number );

uint16_t sqrt_fpu( uint32_t number );	// floating point function from article
uint16_t sqrt_evn( uint32_t number );	// integer function from article

uint16_t sqrxi32( uint32_t y );			// integer function from comment by

uint16_t sqrt_fps( uint32_t number );	// optimized floating point function for Cortex M4

										// <-- insert interface of your function here

/******************************************************************************
 * Set to variable named as 'round_test_func' below
 * to the alias of one of the functions above.
 * The NULL will select last function in comp_list[]
 ******************************************************************************/

sqrt_func round_test_func = sqrt_fps;	// specific instance for the rounding test
//sqrt_func round_test_func = sqrxi32;	// specific instance for the rounding test
//sqrt_func round_test_func = sqrt_evn;	// specific instance for the rounding test

//sqrt_func round_test_func = NULL;		// last function in comp_list[]

/******************************************************************************
 * The array of test functions for competing routines is called comp_list[].
 * Adding a new function to the test:
 - copy the implementation of the new function to the end of this file;
 - declare the function interface at the beginning of this file;
 - add the alias and declaration of the new function to
 end of array named comp_list[].
 ******************************************************************************/

// @formatter:off

typedef struct
{
	sqrt_func	fsqrt;
	char *		alias;
} SCompFunc;

SCompFunc comp_list[] =	// competition list
{
	{ sqrt_fpu, "_fpu" },
	{ sqrt_fps, "_fps" },
	{ sqrt_evn, "_evn" },
	{ sqrxi32,  "_i32" }
							// <-- insert your function name & alias here
};

/* @formatter:on */

/******************************************************************************
 * Platform-independent definitions
 ******************************************************************************/

#define PUT_FORMAT_MSG(f_, ...) { \
				sprintf( (char *)s_buf, (char *)f_, ##__VA_ARGS__ ); \
				PUT_MSG( (char *)s_buf ); }

#define MS_PER_SEC	1000
#define US_PER_SEC	( MS_PER_SEC * MS_PER_SEC )

#define ARRAY_SIZE(a) (sizeof a / sizeof *a)	// size of static array at runtime

#define SIRV(f) if ( f ) ;						// suppress Ignore Return Value warning

/******************************************************************************
 * Platform-specific defines
 ******************************************************************************/

#if defined( USE_HAL_DRIVER )	// STM32 ARM Cortex platform

#	include	<string.h>
#	include "main.h"

	//*****************************************************************************
	// Platform-specific defines for the helper functions

#	define SCALE_RATE		1	// must be .GE than 1

#	define X_CLOCK			HAL_GetTick()
#	define X_DELAY( ms )	HAL_Delay( ms )

	//*****************************************************************************
	// Platform-specific defines for the terminal output

#	define USART_HANDLE		huart1	// set valid USART handler alias here defined by the config of MCU
#	define USART_TIMEOUT	150		// max timeout for HAL_UART_Transmit

extern UART_HandleTypeDef USART_HANDLE;
extern HAL_StatusTypeDef HAL_UART_Transmit ( UART_HandleTypeDef *huart, uint8_t *pData, uint16_t Size, uint32_t Timeout );

#	define PUT_MSG( msg ) \
		HAL_UART_Transmit( &USART_HANDLE, (uint8_t *)msg, strlen( (char *)msg ), USART_TIMEOUT )

#	define CPU_CLOCK_MHz	( SystemCoreClock / US_PER_SEC )	// CPU CLK in MHz

#	if defined( STM32F0 )
#		define	CPU_ID ( "STM32 ARM Cortex M0" )
#	elif defined ( STM32F3 )
#		define	CPU_ID ( "STM32 ARM Cortex M4" )
#	else
#		define	CPU_ID ( "Maybe STM32 ARM Cortex" )
#	endif

#	define PUT_SYS_INFO	PUT_FORMAT_MSG( " %s @ "fdU()" MHz\n", CPU_ID, CPU_CLOCK_MHz )

#else	// #if defined( USE_HAL_DRIVER	)

#		include <time.h>
#		include <stdlib.h>

	//*****************************************************************************
	// Platform-specific defines for the helper functions

#	define SCALE_RATE		100		// must be .GE than 1

#	define X_CLOCK			(uint32_t) x_clock()
#	define X_DELAY( ms )	x_delay( ms )

uint32_t x_clock()
{
	uint64_t result = (uint64_t) clock();
	result *= MS_PER_SEC;
	result /= CLOCKS_PER_SEC;
	return (uint32_t) result;
}

void x_delay( uint32_t ms )
{
	uint64_t tm = x_clock();
	while ( ( x_clock() - tm ) < ms )
		;
}

	//*****************************************************************************
	// Platform-specific defines for the terminal output

#	define PUT_MSG( msg ) \
		printf( "%s", (char *)msg ), fflush ( stdout );

#	if defined( __unix__ )		// anybody other platform for gcc

#		define PUT_SYS_INFO	SIRV( system( "cat /proc/cpuinfo | grep 'model name' | head -1 | sed s/'model name\t:'/''/" ) )

#	else

#		define PUT_SYS_INFO	PUT_MSG( "Undefined System & CPU" )

#	endif	// #	if defined( __unix__ )  // anybody other platform for gcc

#endif		// #if defined( USE_HAL_DRIVER	)

#if  ( __WORDSIZE == 64 )

#	define fdI(s)	"%" #s "d"
#	define fdU(s)	"%" #s "u"
#	define fdX(s)	"%" #s "x"

#else	// let's say __WORDSIZE == 32

#	define fdI(s)	"%" #s "ld"
#	define fdU(s)	"%" #s "lu"
#	define fdX(s)	"%" #s "lx"

#endif	// #if ( __WORDSIZE == 64 )

#if defined ( DEBUG ) || defined ( _DEBUG ) // chk build mode of CubeIDE

#	define	BUILD_MODE	"DEBUG"

#else // Maybe Release

#	define	BUILD_MODE	"RELEASE"

#endif	// #if defined ( DEBUG ) || defined ( _DEBUG )

/******************************************************************************
 * the helper data with testing ranges
 ******************************************************************************/

// @formatter:off

typedef struct
{
	uint32_t	start;
	uint32_t	stop;
	uint32_t	repeat;
} STestRange;

STestRange	test_rngs[] =
{
	{ 0, 1000, 100 * SCALE_RATE },
	{ 0, 10000, 10 * SCALE_RATE },
	{ 0, 100000, 1 * SCALE_RATE }
};

uint32_t test_results[ARRAY_SIZE( test_rngs )][ARRAY_SIZE( comp_list ) + 1];

#define MSG_BUFF_SIZE	512

uint8_t s_buf[MSG_BUFF_SIZE];	// buffer for a terminal output

/* @formatter:on */

/******************************************************************************
 * Test sets definitions. Do not change it.
 ******************************************************************************/

#define TEST_ROUNDING		1
#define TEST_PERFORMANCE	2
#define TEST_ALL			( TEST_ROUNDING | TEST_PERFORMANCE )

#ifndef TEST_SET
#	define	TEST_SET	TEST_ALL
#endif

#define HI_ROUND_TEST_RANGE_END		0x007FFFFFUL
#define HI_ROUND_TEST_RANGE_START	( HI_ROUND_TEST_RANGE_END >> 4 )

/******************************************************************************
 * Interface of helper functions
 ******************************************************************************/

void main_Header();
void testRounding();
void testPerformance();

/******************************************************************************
 * Implementation of the entry point for all sqrt tests
 ******************************************************************************/

void main_Of_SqrtComp()
{

	X_DELAY( MS_PER_SEC / 2 );	// suppress the output of a previous instance
								// while the new instance is loading into the MCU

	uint32_t start_time = X_CLOCK;

	main_Header();

	// checking normal and extended ranges for rounding
	if ( TEST_SET & TEST_ROUNDING )
		testRounding();

	// checking normal ranges on execution time
	if ( TEST_SET & TEST_PERFORMANCE )
		testPerformance();

	uint32_t test_time = X_CLOCK - start_time;

	uint32_t test_m = ( test_time / MS_PER_SEC ) / 60;
	uint32_t test_s = ( test_time / MS_PER_SEC ) % 60;
	uint32_t test_ms = test_time % MS_PER_SEC;

	PUT_FORMAT_MSG( "\ndone, spent time: "fdU()" m, "fdU()"."fdU()" s\n", test_m, test_s, test_ms );
}

/******************************************************************************
 * Implementation of the helper functions
 ******************************************************************************/

void main_Header()
{

	PUT_MSG( "\n\n**********************************************************\n" );
	PUT_SYS_INFO;
	PUT_FORMAT_MSG( "*********** %s, built at %s\n", BUILD_MODE, __TIME__ );
}

void testPerformance()
{
	uint32_t i_func, i_rpt, i_rng;
	uint32_t number, first, second, diff;
	uint64_t temp;

	PUT_MSG( "----------+ Performance test" );

	for ( i_rng = 0; i_rng < ARRAY_SIZE( test_rngs ); i_rng++ )
	{
		PUT_MSG( "\n" );
		PUT_FORMAT_MSG( "test range:["fdU()".."fdU()"], repeat="fdU()"\n", test_rngs[i_rng].start, test_rngs[i_rng].stop,
				test_rngs[i_rng].repeat );

		test_results[i_rng][0] = test_rngs[i_rng].stop;

		for ( i_func = 0; i_func < ARRAY_SIZE( comp_list ); i_func++ )
		{
			PUT_FORMAT_MSG( "%s ... ", comp_list[i_func].alias );

			first = X_CLOCK;

			for ( i_rpt = 0; i_rpt < test_rngs[i_rng].repeat; i_rpt++ )
				for ( number = test_rngs[i_rng].start; number < test_rngs[i_rng].stop; number++ )
					comp_list[i_func].fsqrt( number );

			second = X_CLOCK;

			diff = second - first;

			temp = ( test_rngs[i_rng].stop - test_rngs[i_rng].start ) * test_rngs[i_rng].repeat;
			test_results[i_rng][i_func + 1] = (uint32_t) ( temp / diff );

			if ( i_func < ARRAY_SIZE( comp_list ) - 1 )
				PUT_MSG( ", " );
		}
	}

	// small report
	PUT_FORMAT_MSG( "\n----------+ Report: sqrt`s calls per ms\n%10s", "range" );

	for ( i_func = 0; i_func < ARRAY_SIZE( comp_list ); i_func++ )
		PUT_FORMAT_MSG( "%10s", comp_list[i_func].alias );

	for ( i_rng = 0; i_rng < ARRAY_SIZE( test_rngs ); i_rng++ )
	{
		PUT_MSG( "\n" );
		for ( i_func = 0; i_func < ARRAY_SIZE( comp_list ) + 1; i_func++ )
			PUT_FORMAT_MSG( fdU( 10 ), test_results[i_rng][i_func] );
	}

	PUT_FORMAT_MSG( "\n----------+\n%10s", "average" );

	for ( i_func = 0; i_func < ARRAY_SIZE( comp_list ); i_func++ )
	{
		temp = 0;

		for ( i_rng = 0; i_rng < ARRAY_SIZE( test_rngs ); i_rng++ )
			temp += test_results[i_rng][i_func + 1];

		temp /= ARRAY_SIZE( test_rngs );

		PUT_FORMAT_MSG( fdU( 10 ), (uint32_t)temp );
	}
}

void testRoundingFunction( uint32_t start, uint32_t finish, sqrt_func psqrt, char *fname );

void testRounding()
{
	uint16_t i_rng;
	uint16_t f_rng;

	PUT_MSG( "----------+ Rounding test\n" );

	// checking the existence for the test function
	for ( f_rng = 0; f_rng < ARRAY_SIZE( comp_list ); f_rng++ )
		if ( comp_list[f_rng].fsqrt == round_test_func )
			break;

	if ( !( f_rng < ARRAY_SIZE( comp_list ) ) )
	{
		f_rng = ARRAY_SIZE( comp_list ) - 1;
		PUT_FORMAT_MSG( "Value of 'round_test_func' not found.\n" );
	}

	PUT_FORMAT_MSG( "Function '%s' is tested for rounding.\n", comp_list[f_rng].alias );

	// checking standard ranges
	for ( i_rng = 0; i_rng < ARRAY_SIZE( test_rngs ); i_rng++ )
		testRoundingFunction( test_rngs[i_rng].start, test_rngs[i_rng].stop, comp_list[f_rng].fsqrt, comp_list[f_rng].alias );

	// checking extended range
	testRoundingFunction( HI_ROUND_TEST_RANGE_START, HI_ROUND_TEST_RANGE_END, comp_list[f_rng].fsqrt, comp_list[f_rng].alias );
}

void turn_the_fan( uint32_t ms );

void testRoundingFunction( uint32_t start, uint32_t finish, sqrt_func psqrt, char *fname )
{
	uint32_t rf, ri;
	uint32_t n, c = 0;

	PUT_FORMAT_MSG( "test range:["fdU( 10 )".."fdU( 10 )"] ... ", start, finish );

	for ( n = start; n < finish; n++ )
	{
		rf = sqrt_fpu( n );
		ri = ( *psqrt )( n );
		if ( rf != ri )
		{
			if ( c++ > 3 )
			{
				PUT_FORMAT_MSG( "\b\n(!)too many mistakes in '%s', ", fname );
				break;
			}
			else
			{
				double d = sqrt( (double) n );
				PUT_FORMAT_MSG( "\b\n%s("fdU( 10 )")="fdU()" != "fdU(), fname, n, ri, rf );
				PUT_FORMAT_MSG( " (real value is %.6lf)", d );
			}
		}
		turn_the_fan( MS_PER_SEC );
	}

	if ( !c )
	{
		PUT_FORMAT_MSG( "\b done.\n" );
	}
	else
	{
		PUT_FORMAT_MSG( "test failed.\n" );
	}
}

void turn_the_fan( uint32_t ms )
{
	static char ca[] = "|/-\\";
	static uint32_t cs = ARRAY_SIZE(ca) - 1;
	static uint32_t cn = 0;

	static uint32_t at = 0;

	uint32_t ct = X_CLOCK;
	if ( ct - at > ms )
	{
		at = ct;
		PUT_FORMAT_MSG( "\b%c", ca[cn++ % cs] );
	}
}

/******************************************************************************
 * Implementation of the sqrt functions
 ******************************************************************************/

// floating point arg & result with double
uint16_t sqrt_fpu( uint32_t number )
{
	if ( number < 2 )
		return (uint16_t) number;

	double f_rslt = sqrt( number );
	uint32_t rslt = (uint32_t) f_rslt;

	if ( !( f_rslt - (double) rslt < .5 ) )
		rslt++;

	return (uint16_t) rslt;
}

// floating point arg & result with float
uint16_t sqrt_fps( uint32_t number )
{
	if ( number < 2 )
		return (uint16_t) number;

	float f_rslt = sqrtf( number );
	uint32_t rslt = (uint32_t) f_rslt;

	if ( !( f_rslt - (float) rslt < .5 ) )
		rslt++;

	return (uint16_t) rslt;
}

// unsigned integer arg & result

// @formatter:off
uint16_t sqrt_evn ( uint32_t number )
{
	if ( number < 2 )
		return ( uint16_t ) number;

	uint32_t temp;
	uint32_t div;
	uint32_t rslt;

	if ( number & 0xFFFF0000L )
		if ( number & 0xFF000000L )
			if ( number & 0xF0000000L )
				if ( number & 0xE0000000L )
					div = 43771;
				else
					div = 22250;
			else
				if ( number & 0x0C000000L )
					div = 11310;
				else
					div = 5749;
		else
			if ( number & 0x00F00000L )
				if ( number & 0x00C00000L )
					div = 2923;
				else
					div = 1486;
			else
				if ( number & 0x000C0000L )
					div = 755;
				else
					div = 384;
	else
		if ( number & 0xFF00L )
			if ( number & 0xF000L )
				if ( number & 0xC000L )
					div = 195;
				else
					div = 99;
			else
				if ( number & 0x0C00L )
					div = 50;
				else
					div = 25;
		else
			if ( number & 0xF0L )
				if ( number & 0x80L )
					div = 13;
				else
					div = 7;
			else
				div = 3;

	rslt = number;

	while ( 1 )
	{
		temp = number / div;
		temp += div;

		div = temp >> 1;
		div += temp & 1;

		if ( rslt > div )
			rslt = div;
		else
		{
			if ( number / rslt == rslt - 1 && number % rslt == 0 )
				rslt--;

			return ( uint16_t ) rslt;
		}
	}
}
/* @formatter:on */

// unsigned integer arg & result
uint16_t sqrxi32( uint32_t y )
{

	if ( y == 1 )
		return 1;

	uint32_t xh = y > 0x10000ul ? 0x10000ul : y;
	uint32_t xl = 0;
	uint32_t xc;

	for ( int k = 0; k < 16; k++ )
	{
		xc = ( xh + xl ) >> 1ul;
		if ( xc * xc - xc >= y )
		{
			xh = xc;
		}
		else
		{
			xl = xc;
		}
	}
	return ( xh + xl ) >> 1ul;
}

// <-- insert implementation of your function sqrt here

#ifdef __cplusplus
}
#endif

#endif // __SQRT_CMP_H

Se necesitan diferentes raíces, diferentes raíces son importantes

En lugar de presentar

De Que Trata Este Texto

x86

M4

M0

1. x86

2. STM32

Apéndice 3. Procedimiento para probar otros algoritmos y plataformas

More articles: