I've implemented some pcg variants in C to include in my codebase. I've got a 64bit state generating 32 values, and a 128bit state generating 64bit values.
The implementation from the site (pcg_setseq_128_xsl_rr_64_random_r) uses clang or gcc __int128 type which I don't believe can be used with MSVC. So I implemented the few 128bit math operation required (multiply, add, shift left and right) using 2 uint64_t.
The code was working, but when I did a simple performance test, generating 2*u32_max random numbers, my code was about 10 second slower than the official code (23.3s for the official implementation using __int128, 33.8s for my 128bit code), using clang 12.0.1 with -O2. Compiling with MSVC (19.29) was a lot worse, about 93.4s also with -O2 (and MSVC is my primary target).
After looking at the disassembly, I noticed that the 128bit math functions were inlined in the main function, but the register values were pushed on the stack instead of being used directly.
; This is from clang ;... mov qword ptr [rsp+0x20], r11 mov qword ptr [rsp+0x38], r8 mov qword ptr [rsp+0x28], r13 mov qword ptr [rsp+0x30], rsi mov rcx, qword ptr [rsp+0x20] imul rcx, qword ptr [rsp+0x30] mov rsi, qword ptr [rsp+0x38] imul rsi, qword ptr [rsp+0x28] mov rax, qword ptr [rsp+0x20] mul qword ptr [rsp+0x28] ;...
I changed the function (multiply and add) to be macros instead and the code generated (both clang and msvc) uses only register and is much closer to the __int128 code. The performances are a lot closer too (__int128 => 23.7, my code => 25.5).
My question is: is there a particular reason why the compilers pushed the registers on the stack before using them ? Is there a way to keep the function code but make it generate better assembly ?
Here is the code. You'll need pcg_variants.h and pcg-advance-128.c. In random_64_get
you can change which version of the 128bit multiply and add to use.
/* clang pcg_test.c -opcg_test.exe -O2 -fno-exceptions -ffast-math -fno-rtti -Werror -g -msse2 -msse4.1 -gcodeview -gno-column-info -fdiagnostics-absolute-paths */ #include <windows.h> #include <stdint.h> #include <intrin.h> #include <stdio.h> typedef uint8_t u8; typedef uint32_t u32; typedef uint64_t u64; typedef int64_t s64; typedef double r64; #define false 0 #define true 1 #define u32_max 0xffffffff #define safe_macro( macro ) do { macro; } while ( 0 ) #if defined ( __clang__ ) #define _assert( expression ) safe_macro( if ( !( expression ) ) { __builtin_debugtrap( ); } ) #else #define _assert( expression ) safe_macro( if ( !( expression ) ) { __debugbreak( ); } ) #endif #if defined ( __clang__ ) /* * PCG Random Number Generation for C. * * Copyright 2014-2019 Melissa O'Neill <[email protected]>, * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. */ #include "pcg_variants.h" #include "pcg-advance-128.c" #endif #define perf_get_frequency( u64_ptr ) QueryPerformanceFrequency( ( LARGE_INTEGER* ) u64_ptr ) #define perf_get_time( u64_ptr ) QueryPerformanceCounter( ( LARGE_INTEGER* ) u64_ptr ) /* == 128 bit math == */ typedef struct u128_t { u64 low, high; } u128_t; #define u128_make( low, high ) ( u128_t ) { low, high } #define math_u128_mul_inline( r, a, b ) { u128_t t; t.low = _umul128( a.low, b.low, &t.high ); t.high += ( ( a.low * b.high ) + ( a.high * b.low ) ); r = t; } u128_t math_u128_mul( volatile u128_t a, volatile u128_t b ) { /* NOTE simon: Idea is to do similiar thing as written multiplication: multiply each digit from a with each digit of b and add each multiplication result. But we use 64bit instead of digits. The a.low + b.low is a 64bit * 64bit multiply where the result is 128bits. We keep every 128bits using the `mul` instruction using rax (implied) and another register and the result is stored in rdx (high 64bits) and rax (low 64bits). We than multiply a.low with b.high which is a regular 64bit multiply because we can't store the bit that would overflow (bit that would be outside of the 128bits). We do a similar thing with a.high and b.low. We don't need to multiply a.high with b.high as that can only produce bits higher than the 128bits. e.g. If we used 2 digits numbers, the smallest we could do would be 10 * 10 which is 100 and requires 3 digits. The result is adding the results from those calculations. */ u128_t result; u64 high_1 = a.low * b.high; u64 high_2 = a.high * b.low; u64 high_3; result.low = _umul128( a.low, b.low, &high_3 ); result.high = high_1 + high_2 + high_3; return result; } #define math_u128_add_inline( r, a, b ) { u128_t t; t.low = a.low + b.low; t.high = a.high + b.high + ( ( t.low < a.low ) ? 1 : 0 ); r = t; } u128_t math_u128_add( u128_t a, u128_t b ) { u128_t result; result.low = a.low + b.low; result.high = a.high + b.high + ( ( result.low < a.low ) ? 1 : 0 ); return result; } u128_t math_u128_left_shift( u128_t a, u8 bit_count ) { u128_t result; if ( bit_count <= 64 ) { result.low = a.low << bit_count; result.high = a.high << bit_count; result.high |= a.low >> ( 64 - bit_count ); } else { result.low = 0; result.high = a.low << ( bit_count - 64 ); } return result; } u128_t math_u128_right_shift( u128_t a, u8 bit_count ) { u128_t result; if ( bit_count <= 64 ) { result.high = a.high >> bit_count; result.low = a.low >> bit_count; result.low |= a.high << ( 64 - bit_count ); } else { result.high = 0; result.low = a.high >> ( bit_count - 64 ); } return result; } /* == Custom pcg implementation. */ #define random_64_multiplier ( u128_t ) { /* low */ 4865540595714422341ull, /* high */ 2549297995355413924ull } typedef struct random_64_t { u128_t state; u128_t increment; } random_64_t; random_64_t random_64_make( u128_t seed, u128_t sequence ) { random_64_t result = { 0 }; result.increment = math_u128_left_shift( sequence, 1 ); result.increment.low |= 1; result.state = math_u128_add( result.increment, seed ); result.state = math_u128_mul( result.state, random_64_multiplier ); result.state = math_u128_add( result.state, result.increment ); return result; } u64 random_64_get( random_64_t* random_64 ) { /* NOTE simon: 128 xsl rr 64 */ u64 result = 0; random_64->state = math_u128_mul( random_64->state, random_64_multiplier ); // math_u128_mul_inline( random_64->state, random_64->state, random_64_multiplier ); random_64->state = math_u128_add( random_64->state, random_64->increment ); // math_u128_add_inline( random_64->state, random_64->state, random_64->increment ); #if 0 u8 rotation = ( u8 ) math_u128_right_shift( random_64->state, 122 ).low; u64 temp = math_u128_right_shift( random_64->state, 64 ).low; temp ^= random_64->state.low; #else /* NOTE simon: 2^6 => 64 bit rotation. */ u8 rotation = ( u8 ) ( random_64->state.high >> ( 64 - 6 ) ); u64 temp = random_64->state.high ^ random_64->state.low; #endif result = ( temp >> rotation ) | ( temp << ( 64 - rotation ) ); return result; } void random_64_advance( random_64_t* random_64, u128_t delta ) { u128_t multiplier = random_64_multiplier; u128_t increment = random_64->increment; u128_t multiply_accumulator = u128_make( 1, 0 ); u128_t increment_accumulator = u128_make( 0, 0 ); while ( delta.low > 0 || delta.high > 0 ) { if ( delta.low & 1 ) { multiply_accumulator = math_u128_mul( multiply_accumulator, multiplier ); increment_accumulator = math_u128_mul( increment_accumulator, multiplier ); increment_accumulator = math_u128_add( increment_accumulator, increment ); } u128_t temp = math_u128_add( multiplier, u128_make( 1, 0 ) ); increment = math_u128_mul( temp, increment ); multiplier = math_u128_mul( multiplier, multiplier ); delta = math_u128_right_shift( delta, 1 ); /* NOTE simon: delta /= 2. */ } random_64->state = math_u128_mul( multiply_accumulator, random_64->state ); random_64->state = math_u128_add( random_64->state, increment_accumulator ); } u64 random_64_get_range( random_64_t* random_64, u64 min, u64 max ) { u64 result = 0; u64 range = max - min; u64 threshold = ( ( u64 ) ( -( s64 ) range ) ) % range; while ( true ) { u64 number = random_64_get( random_64 ); if ( number >= threshold ) { result = min + ( number % range ); break; } } return result; } int main( int argc, char** argv ) { u64 frequency; perf_get_frequency( &frequency ); u64 sum = 0; struct pcg_state_setseq_128 rng_1; // pcg64_srandom_r( &rng_1, ( pcg128_t ) &rng_1, ( pcg128_t ) &rng_1 ); pcg64_srandom_r( &rng_1, ( pcg128_t ) 0x000000DF8F19FBD0, ( pcg128_t ) 0x000000DF8F19FBD0 ); // random_64_t rng_2 = random_64_make( u128_make( ( u64 ) &rng_1, 0 ), u128_make( ( u64 ) &rng_1, 0 ) ); random_64_t rng_2 = random_64_make( u128_make( ( u64 ) 0x000000DF8F19FBD0, 0 ), u128_make( ( u64 ) 0x000000DF8F19FBD0, 0 ) ); /* Some tests. u64 a_high = ( u64 ) ( rng_1.state >> 64 ); u64 a_low = ( u64 ) ( rng_1.state ); u64 b_high = ( u64 ) ( rng_1.inc >> 64 ); u64 b_low = ( u64 ) ( rng_1.inc ); _assert( a_high == rng_2.state.high ); _assert( a_low == rng_2.state.low ); _assert( b_high == rng_2.increment.high ); _assert( b_low == rng_2.increment.low ); pcg64_advance_r( &rng_1, 12345 ); pcg64_advance_r( &rng_1, 145 ); pcg64_advance_r( &rng_1, 1235 ); pcg64_advance_r( &rng_1, 1245 ); pcg64_advance_r( &rng_1, 14839255 ); random_64_advance( &rng_2, u128_make( 12345, 0 ) ); random_64_advance( &rng_2, u128_make( 145, 0 ) ); random_64_advance( &rng_2, u128_make( 1235, 0 ) ); random_64_advance( &rng_2, u128_make( 1245, 0 ) ); random_64_advance( &rng_2, u128_make( 14839255, 0 ) ); a_high = ( u64 ) ( rng_1.state >> 64 ); a_low = ( u64 ) ( rng_1.state ); b_high = ( u64 ) ( rng_1.inc >> 64 ); b_low = ( u64 ) ( rng_1.inc ); _assert( a_high == rng_2.state.high ); _assert( a_low == rng_2.state.low ); _assert( b_high == rng_2.increment.high ); _assert( b_low == rng_2.increment.low ); */ for ( u32 i = 0; i < u32_max; i++ ) { uint64_t a = pcg64_random_r( &rng_1 ); uint64_t b = random_64_get( &rng_2 ); _assert( a == b ); uint64_t c = a + b; _assert( c == a * 2 ); sum += c; } /* Time */ // pcg64_srandom_r( &rng_1, ( pcg128_t ) &rng_1, ( pcg128_t ) &rng_1 ); pcg64_srandom_r( &rng_1, ( pcg128_t ) 0x000000DF8F19FBD0, ( pcg128_t ) 0x000000DF8F19FBD0 ); u64 pcg_start; perf_get_time( &pcg_start ); for ( u32 i = 0; i < u32_max; i++ ) { uint64_t a = pcg64_random_r( &rng_1 ); uint64_t b = pcg64_random_r( &rng_1 ); uint64_t c = a + b; sum += c; } u64 pcg_end; perf_get_time( &pcg_end ); r64 pcg_time = ( r64 ) ( pcg_end - pcg_start ) / ( r64 ) frequency; printf( "pcg: %f\n", pcg_time ); // rng_2 = random_64_make( u128_make( ( u64 ) &rng_1, 0 ), u128_make( ( u64 ) &rng_1, 0 ) ); rng_2 = random_64_make( u128_make( ( u64 ) 0x000000DF8F19FBD0, 0 ), u128_make( ( u64 ) 0x000000DF8F19FBD0, 0 ) ); u64 custom_start; perf_get_time( &custom_start ); for ( u32 i = 0; i < u32_max; i++ ) { uint64_t a = random_64_get( &rng_2 ); uint64_t b = random_64_get( &rng_2 ); uint64_t c = a + b; sum += c; } u64 custom_end; perf_get_time( &custom_end ); r64 custom_time = ( r64 ) ( custom_end - custom_start ) / ( r64 ) frequency; printf( "custom: %f\n", custom_time ); return ( u32 ) sum; }