diff --git a/docs/rh-log.txt b/docs/rh-log.txt
index 76f9c88fc..9df211d08 100644
--- a/docs/rh-log.txt
+++ b/docs/rh-log.txt
@@ -1,4 +1,11 @@
-March 3, 2009 (Changes by Graf Zahl)
+March 4, 2009
+- Went back to using RDTSC for timing on Win32. Ironically,
+  QueryPerformanceCounter() is obviously using the TSC for its timing on my
+  machine, yet the overhead it has to do to keep the timer sane is apparently
+  visible on a few maps. I suppose I should at some time check clock_gettime()
+  and see if it has similar issues on Linux.
+
+March 3, 2009 (Changes by Graf Zahl)
 - changed: If a monster with the BOSSDEATH flag is crushed A_BossDeath
   will be called now.
 - fixed: D'Sparil's second form was missing the BOSSDEATH flag.
diff --git a/src/stats.h b/src/stats.h
index 179b6a996..dfbe26174 100644
--- a/src/stats.h
+++ b/src/stats.h
@@ -103,8 +103,44 @@ private:
 #else
 
 // Windows
+#include "x86.h"
+
 extern double PerfToSec, PerfToMillisec;
-long long QueryPerfCounter();
+
+// rdtsc() returns the CPU's time stamp counter, or 0 if CPU.bRDTSC says the
+// instruction is unavailable (that check is compiled out for x64 targets).
+#ifdef _MSC_VER
+// Trying to include intrin.h here results in some bizarre errors, so I'm just
+// going to duplicate the function prototype instead.
+//#include <intrin.h>
+extern "C" unsigned __int64 __rdtsc(void);
+#pragma intrinsic(__rdtsc)
+inline unsigned __int64 rdtsc()
+{
+#ifndef _M_X64
+	if (CPU.bRDTSC)
+#endif
+	{
+		return __rdtsc();
+	}
+	return 0;
+}
+#else
+inline unsigned long long rdtsc()
+{
+#ifndef __amd64__
+	if (CPU.bRDTSC)
+#endif
+	{
+		// RDTSC leaves the low half of the counter in EAX and the high half
+		// in EDX, so read both as outputs and combine them.
+		unsigned int lo, hi;
+		asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
+		return (static_cast<unsigned long long>(hi) << 32) | lo;
+	}
+	return 0;
+}
+#endif
 
 class cycle_t
 {
@@ -122,16 +158,13 @@ public:
 
 	void Clock()
 	{
-		// Not using QueryPerformanceCounter directly, so we don't need
-		// to pull in the Windows headers for every single file that
-		// wants to do some profiling.
-		long long time = QueryPerfCounter();
+		long long time = rdtsc();
 		Counter -= time;
 	}
 
 	void Unclock()
 	{
-		long long time = QueryPerfCounter();
+		long long time = rdtsc();
 		Counter += time;
 	}
 
@@ -145,6 +178,12 @@ public:
 		return Counter * PerfToMillisec;
 	}
 
+	// Returns the accumulated counter value without converting it to time units.
+	long long GetRawCounter()
+	{
+		return Counter;
+	}
+
 private:
 	long long Counter;
 };
diff --git a/src/win32/i_system.cpp b/src/win32/i_system.cpp
index 7092d8483..94faa24a7 100644
--- a/src/win32/i_system.cpp
+++ b/src/win32/i_system.cpp
@@ -372,21 +372,83 @@ void SetLanguageIDs ()
 	}
 }
 
+//
+// CalculateCPUSpeed
+//
+// Calibrates the RDTSC-based cycle_t timers against the performance counter
+// by setting PerfToSec and PerfToMillisec (seconds/milliseconds per TSC
+// tick), then prints the estimated CPU speed. If the calibration cannot be
+// performed, the conversion factors are left alone and nothing is printed.
+//
+
+void CalculateCPUSpeed()
+{
+	LARGE_INTEGER freq;
+
+	QueryPerformanceFrequency (&freq);
+
+	if (freq.QuadPart != 0 && CPU.bRDTSC)
+	{
+		LARGE_INTEGER count1, count2;
+		cycle_t ClockCalibration;
+		LONGLONG min_diff;
+
+		ClockCalibration.Reset();
+
+		// Count cycles for at least 55 milliseconds.
+		// The performance counter may be very low resolution compared to CPU
+		// speeds today, so the longer we count, the more accurate our estimate.
+		// On the other hand, we don't want to count too long, because we don't
+		// want the user to notice us spend time here, since most users will
+		// probably never use the performance statistics.
+		// The target is computed in 64 bits: the performance counter may tick
+		// at CPU speed, and a 32-bit multiply by 11 would then overflow.
+		min_diff = freq.QuadPart * 11 / 200;
+
+		// Minimize the chance of task switching during the testing by going very
+		// high priority. This is another reason to avoid timing for too long.
+		// The previous priorities are saved so that they can be restored.
+		const DWORD old_class = GetPriorityClass(GetCurrentProcess());
+		const int old_priority = GetThreadPriority(GetCurrentThread());
+		SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
+		SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
+
+		// Make sure we start timing on a counter boundary.
+		QueryPerformanceCounter(&count1);
+		do { QueryPerformanceCounter(&count2); } while (count1.QuadPart == count2.QuadPart);
+
+		// Do the timing loop.
+		ClockCalibration.Clock();
+		do { QueryPerformanceCounter(&count1); } while ((count1.QuadPart - count2.QuadPart) < min_diff);
+		ClockCalibration.Unclock();
+
+		// Put the priorities back, falling back to normal if the queries failed.
+		SetPriorityClass(GetCurrentProcess(), old_class != 0 ? old_class : NORMAL_PRIORITY_CLASS);
+		SetThreadPriority(GetCurrentThread(),
+			old_priority != THREAD_PRIORITY_ERROR_RETURN ? old_priority : THREAD_PRIORITY_NORMAL);
+
+		// Only accept the measurement if the TSC actually advanced; a zero
+		// count would make the divisions below meaningless.
+		const long long cycles = ClockCalibration.GetRawCounter();
+		if (cycles > 0)
+		{
+			PerfToSec = double(count1.QuadPart - count2.QuadPart) / (double(cycles) * freq.QuadPart);
+			PerfToMillisec = PerfToSec * 1000.0;
+			Printf ("CPU Speed: %.0f MHz\n", 0.001 / PerfToMillisec);
+		}
+	}
+}
+
 //
 // I_Init
 //
 
 void I_Init (void)
 {
-	LARGE_INTEGER perf_freq;
-
 	CheckCPUID(&CPU);
+	CalculateCPUSpeed();
 	DumpCPUInfo(&CPU);
 
-	QueryPerformanceFrequency(&perf_freq);
-	PerfToSec = 1 / double(perf_freq.QuadPart);
-	PerfToMillisec = 1000 / double(perf_freq.QuadPart);
-
 	// Use a timer event if possible
 	NewTicArrived = CreateEvent (NULL, FALSE, FALSE, NULL);
 	if (NewTicArrived)
diff --git a/src/x86.h b/src/x86.h
index 5454418bf..058cda081 100644
--- a/src/x86.h
+++ b/src/x86.h
@@ -1,6 +1,6 @@
-#ifndef X86_H
-#define X86_H
-
+#ifndef X86_H
+#define X86_H
+
 struct CPUInfo // 92 bytes
 {
 	char VendorID[16];
@@ -17,11 +17,11 @@ struct CPUInfo // 92 bytes
 	BYTE APICID;
 
 	DWORD bSSE3:1;
-	DWORD DontCare1:8;
-	DWORD bSSSE3:1;
-	DWORD DontCare1a:9;
-	DWORD bSSE41:1;
-	DWORD bSSE42:1;
+	DWORD DontCare1:8;
+	DWORD bSSSE3:1;
+	DWORD DontCare1a:9;
+	DWORD bSSE41:1;
+	DWORD bSSE42:1;
 	DWORD DontCare2a:11;
 
 	DWORD bFPU:1;
@@ -77,10 +77,10 @@ struct CPUInfo // 92 bytes
 
 
 extern "C" CPUInfo CPU;
-
-void CheckCPUID (CPUInfo *cpu);
-void DumpCPUInfo (const CPUInfo *cpu);
+
+void CheckCPUID (CPUInfo *cpu);
+void DumpCPUInfo (const CPUInfo *cpu);
 void DoBlending_SSE2(const PalEntry *from, PalEntry *to, int count, int r, int g, int b, int a);
-
-#endif
-
+
+#endif
+