From 5eeaa130fd67a71e8de85fb222faae2bd1ed271c Mon Sep 17 00:00:00 2001 From: Randy Heit Date: Wed, 4 Mar 2009 06:06:31 +0000 Subject: [PATCH] - Went back to using RDTSC for timing on Win32. Ironically, QueryPerformanceCounter() is obviously using the TSC for its timing on my machine, yet the overhead it has to do to keep the timer sane is apparently noticeable on a few maps. I suppose I should at some time check clock_gettime() and see if it has similar issues on Linux. SVN r1460 (trunk) --- docs/rh-log.txt | 9 ++++++- src/stats.h | 46 +++++++++++++++++++++++++++++++----- src/win32/i_system.cpp | 53 +++++++++++++++++++++++++++++++++++++----- src/x86.h | 28 +++++++++++----------- 4 files changed, 109 insertions(+), 27 deletions(-) diff --git a/docs/rh-log.txt b/docs/rh-log.txt index 76f9c88fc..9df211d08 100644 --- a/docs/rh-log.txt +++ b/docs/rh-log.txt @@ -1,4 +1,11 @@ -March 3, 2009 (Changes by Graf Zahl) +March 4, 2009 +- Went back to using RDTSC for timing on Win32. Ironically, + QueryPerformanceCounter() is obviously using the TSC for its timing on my + machine, yet the overhead it has to do to keep the timer sane is apparently + visible on a few maps. I suppose I should at some time check clock_gettime() + and see if it has similar issues on Linux. + +March 3, 2009 (Changes by Graf Zahl) - changed: If a monster with the BOSSDEATH flag is crushed A_BossDeath will be called now. - fixed: D'Sparil's second form was missing the BOSSDEATH flag. diff --git a/src/stats.h b/src/stats.h index 179b6a996..dfbe26174 100644 --- a/src/stats.h +++ b/src/stats.h @@ -103,8 +103,40 @@ private: #else // Windows +#include "x86.h" + extern double PerfToSec, PerfToMillisec; -long long QueryPerfCounter(); + +#ifdef _MSC_VER +// Trying to include intrin.h here results in some bizarre errors, so I'm just +// going to duplicate the function prototype instead. 
+//#include <intrin.h> +extern "C" unsigned __int64 __rdtsc(void); +#pragma intrinsic(__rdtsc) +inline unsigned __int64 rdtsc() +{ +#ifndef _M_X64 + if (CPU.bRDTSC) +#endif + { + return __rdtsc(); + } + return 0; +} +#else +inline volatile unsigned long long rdtsc() +{ +#ifndef __amd64__ + if (CPU.bRDTSC) +#endif + { + register unsigned long long tsc asm("eax"); + asm volatile ("\trdtsc\n" : : : "eax, "edx"); + return tsc; + } + return 0; +} +#endif class cycle_t { @@ -122,16 +154,13 @@ public: void Clock() { - // Not using QueryPerformanceCounter directly, so we don't need - // to pull in the Windows headers for every single file that - // wants to do some profiling. - long long time = QueryPerfCounter(); + long long time = rdtsc(); Counter -= time; } void Unclock() { - long long time = QueryPerfCounter(); + long long time = rdtsc(); Counter += time; } @@ -145,6 +174,11 @@ public: return Counter * PerfToMillisec; } + long long GetRawCounter() + { + return Counter; + } + private: long long Counter; }; diff --git a/src/win32/i_system.cpp b/src/win32/i_system.cpp index 7092d8483..94faa24a7 100644 --- a/src/win32/i_system.cpp +++ b/src/win32/i_system.cpp @@ -372,21 +372,62 @@ void SetLanguageIDs () } } +void CalculateCPUSpeed() +{ + LARGE_INTEGER freq; + + QueryPerformanceFrequency (&freq); + + if (freq.QuadPart != 0 && CPU.bRDTSC) + { + LARGE_INTEGER count1, count2; + cycle_t ClockCalibration; + DWORD min_diff; + + ClockCalibration.Reset(); + + // Count cycles for at least 55 milliseconds. + // The performance counter may be very low resolution compared to CPU + // speeds today, so the longer we count, the more accurate our estimate. + // On the other hand, we don't want to count too long, because we don't + // want the user to notice us spend time here, since most users will + // probably never use the performance statistics. + min_diff = freq.LowPart * 11 / 200; + + // Minimize the chance of task switching during the testing by going very + // high priority. 
This is another reason to avoid timing for too long. + SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); + + // Make sure we start timing on a counter boundary. + QueryPerformanceCounter(&count1); + do { QueryPerformanceCounter(&count2); } while (count1.QuadPart == count2.QuadPart); + + // Do the timing loop. + ClockCalibration.Clock(); + do { QueryPerformanceCounter(&count1); } while ((count1.QuadPart - count2.QuadPart) < min_diff); + ClockCalibration.Unclock(); + + SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS); + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL); + + PerfToSec = double(count1.QuadPart - count2.QuadPart) / (double(ClockCalibration.GetRawCounter()) * freq.QuadPart); + PerfToMillisec = PerfToSec * 1000.0; + } + + Printf ("CPU Speed: %.0f MHz\n", 0.001 / PerfToMillisec); +} + // // I_Init // void I_Init (void) { - LARGE_INTEGER perf_freq; - CheckCPUID(&CPU); + CalculateCPUSpeed(); DumpCPUInfo(&CPU); - QueryPerformanceFrequency(&perf_freq); - PerfToSec = 1 / double(perf_freq.QuadPart); - PerfToMillisec = 1000 / double(perf_freq.QuadPart); - // Use a timer event if possible NewTicArrived = CreateEvent (NULL, FALSE, FALSE, NULL); if (NewTicArrived) diff --git a/src/x86.h b/src/x86.h index 5454418bf..058cda081 100644 --- a/src/x86.h +++ b/src/x86.h @@ -1,6 +1,6 @@ -#ifndef X86_H -#define X86_H - +#ifndef X86_H +#define X86_H + struct CPUInfo // 92 bytes { char VendorID[16]; @@ -17,11 +17,11 @@ struct CPUInfo // 92 bytes BYTE APICID; DWORD bSSE3:1; - DWORD DontCare1:8; - DWORD bSSSE3:1; - DWORD DontCare1a:9; - DWORD bSSE41:1; - DWORD bSSE42:1; + DWORD DontCare1:8; + DWORD bSSSE3:1; + DWORD DontCare1a:9; + DWORD bSSE41:1; + DWORD bSSE42:1; DWORD DontCare2a:11; DWORD bFPU:1; @@ -77,10 +77,10 @@ struct CPUInfo // 92 bytes extern "C" CPUInfo CPU; - -void CheckCPUID (CPUInfo *cpu); -void DumpCPUInfo (const CPUInfo *cpu); + +void CheckCPUID (CPUInfo 
*cpu); +void DumpCPUInfo (const CPUInfo *cpu); void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a); - -#endif - + +#endif +