change astar reset optimization to instead rely on memory alignment and
the compiler optimizations for memset/memmove on aligned memory chunks with modern CPUs
This commit is contained in:
parent
71608824e5
commit
8d35af98b2
4 changed files with 174 additions and 95 deletions
|
@ -142,6 +142,14 @@ void SetClipboard(std::string &str);
|
|||
int UTF8GetNext(const std::string &text, int curpos);
|
||||
int UTF8GetPrev(const std::string &text, int curpos);
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
-- SIMD support
|
||||
----------------------------------------------------------------------------*/
|
||||
bool supportsSSE2();
|
||||
bool supportsAVX();
|
||||
void *aligned_malloc(size_t alignment, size_t size);
|
||||
void aligned_free(void *block);
|
||||
|
||||
//@}
|
||||
|
||||
#endif /* __UTIL_H__ */
|
||||
|
|
|
@ -51,10 +51,10 @@
|
|||
----------------------------------------------------------------------------*/
|
||||
|
||||
struct Node {
|
||||
int CostFromStart; /// Real costs to reach this point
|
||||
short int CostToGoal; /// Estimated cost to goal
|
||||
char InGoal; /// is this point in the goal
|
||||
char Direction; /// Direction for trace back
|
||||
int32_t CostFromStart; /// Real costs to reach this point
|
||||
int16_t CostToGoal; /// Estimated cost to goal
|
||||
int8_t InGoal; /// is this point in the goal
|
||||
int8_t Direction; /// Direction for trace back
|
||||
};
|
||||
|
||||
struct Open {
|
||||
|
@ -88,9 +88,6 @@ const int XY2Heading[3][3] = { {7, 6, 5}, {0, 0, 4}, {1, 2, 3}};
|
|||
static Node *AStarMatrix;
|
||||
|
||||
/// a list of close nodes, helps to speed up the matrix cleaning
|
||||
static int *CloseSet;
|
||||
static int CloseSetSize;
|
||||
static int Threshold;
|
||||
static int OpenSetMaxSize;
|
||||
static int AStarMatrixSize;
|
||||
#define MAX_CLOSE_SET_RATIO 4
|
||||
|
@ -106,6 +103,7 @@ static bool AStarFixedEnemyUnitsUnpassable = false;
|
|||
|
||||
static int AStarMapWidth;
|
||||
static int AStarMapHeight;
|
||||
static int AStarMapMax;
|
||||
|
||||
static int AStarGoalX;
|
||||
static int AStarGoalY;
|
||||
|
@ -120,8 +118,9 @@ static Open *OpenSet;
|
|||
/// The size of the open node set
|
||||
static int OpenSetSize;
|
||||
|
||||
static int *CostMoveToCache;
|
||||
static const int CacheNotSet = -5;
|
||||
static int32_t *CostMoveToCache;
|
||||
static int CostMoveToCacheSize;
|
||||
static constexpr int CacheNotSet = -1;
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
-- Profile
|
||||
|
@ -257,19 +256,23 @@ void InitAStar(int mapWidth, int mapHeight)
|
|||
|
||||
AStarMapWidth = mapWidth;
|
||||
AStarMapHeight = mapHeight;
|
||||
AStarMapMax = AStarMapWidth * AStarMapHeight;
|
||||
|
||||
AStarMatrixSize = sizeof(Node) * AStarMapWidth * AStarMapHeight;
|
||||
AStarMatrix = new Node[AStarMapWidth * AStarMapHeight];
|
||||
// align the matrix, the open set, and the cost to move cache
|
||||
// on 32-byte boundary, in case the memset/memmove operations
|
||||
// of the libc we're using has a 128/256/512bit SIMD vector
|
||||
// instruction branch, since we might be clearing 8M of
|
||||
// memory for a 2048x2048 map
|
||||
AStarMatrixSize = sizeof(Node) * AStarMapMax;
|
||||
AStarMatrix = (Node *)aligned_malloc(32, AStarMatrixSize);
|
||||
memset(AStarMatrix, 0, AStarMatrixSize);
|
||||
|
||||
Threshold = AStarMapWidth * AStarMapHeight / MAX_CLOSE_SET_RATIO;
|
||||
CloseSetSize = Threshold;
|
||||
CloseSet = new int[Threshold];
|
||||
OpenSetMaxSize = AStarMapMax / MAX_OPEN_SET_RATIO;
|
||||
OpenSet = (Open *)aligned_malloc(32, OpenSetMaxSize * sizeof(Open));
|
||||
|
||||
OpenSetMaxSize = AStarMapWidth * AStarMapHeight / MAX_OPEN_SET_RATIO;
|
||||
OpenSet = new Open[OpenSetMaxSize];
|
||||
|
||||
CostMoveToCache = new int[AStarMapWidth * AStarMapHeight];
|
||||
CostMoveToCacheSize = sizeof(int32_t) * AStarMapMax;
|
||||
CostMoveToCache = (int32_t*)aligned_malloc(32, CostMoveToCacheSize);
|
||||
memset(CostMoveToCache, CacheNotSet, CostMoveToCacheSize);
|
||||
|
||||
for (int i = 0; i < 9; ++i) {
|
||||
Heading2O[i] = Heading2Y[i] * AStarMapWidth;
|
||||
|
@ -283,15 +286,12 @@ void InitAStar(int mapWidth, int mapHeight)
|
|||
*/
|
||||
void FreeAStar()
|
||||
{
|
||||
delete[] AStarMatrix;
|
||||
aligned_free(AStarMatrix);
|
||||
AStarMatrix = NULL;
|
||||
delete[] CloseSet;
|
||||
CloseSet = NULL;
|
||||
CloseSetSize = 0;
|
||||
delete[] OpenSet;
|
||||
aligned_free(OpenSet);
|
||||
OpenSet = NULL;
|
||||
OpenSetSize = 0;
|
||||
delete[] CostMoveToCache;
|
||||
aligned_free(CostMoveToCache);
|
||||
CostMoveToCache = NULL;
|
||||
|
||||
ProfilePrint();
|
||||
|
@ -312,62 +312,14 @@ static void CostMoveToCacheCleanUp();
|
|||
static void AStarCleanUp()
|
||||
{
|
||||
ProfileBegin("AStarCleanUp");
|
||||
|
||||
if (CloseSetSize >= Threshold) {
|
||||
AStarPrepare();
|
||||
CostMoveToCacheCleanUp();
|
||||
} else {
|
||||
for (int i = 0; i < CloseSetSize; ++i) {
|
||||
AStarMatrix[CloseSet[i]].CostFromStart = 0;
|
||||
AStarMatrix[CloseSet[i]].InGoal = 0;
|
||||
CostMoveToCache[CloseSet[i]] = CacheNotSet;
|
||||
}
|
||||
}
|
||||
AStarPrepare();
|
||||
CostMoveToCacheCleanUp();
|
||||
ProfileEnd("AStarCleanUp");
|
||||
}
|
||||
|
||||
static void CostMoveToCacheCleanUp()
|
||||
{
|
||||
ProfileBegin("CostMoveToCacheCleanUp");
|
||||
int AStarMapMax = AStarMapWidth * AStarMapHeight;
|
||||
#if 1
|
||||
int *ptr = CostMoveToCache;
|
||||
#ifdef __x86_64__
|
||||
union {
|
||||
intptr_t d;
|
||||
int i[2];
|
||||
} conv;
|
||||
conv.i[0] = CacheNotSet;
|
||||
conv.i[1] = CacheNotSet;
|
||||
|
||||
if (((uintptr_t)ptr) & 4) {
|
||||
*ptr++ = CacheNotSet;
|
||||
--AStarMapMax;
|
||||
}
|
||||
#endif
|
||||
while (AStarMapMax > 3) {
|
||||
#ifdef __x86_64__
|
||||
*((intptr_t *)ptr) = conv.d;
|
||||
*((intptr_t *)(ptr + 2)) = conv.d;
|
||||
ptr += 4;
|
||||
#else
|
||||
*ptr++ = CacheNotSet;
|
||||
*ptr++ = CacheNotSet;
|
||||
*ptr++ = CacheNotSet;
|
||||
*ptr++ = CacheNotSet;
|
||||
#endif
|
||||
AStarMapMax -= 4;
|
||||
};
|
||||
while (AStarMapMax) {
|
||||
*ptr++ = CacheNotSet;
|
||||
--AStarMapMax;
|
||||
}
|
||||
#else
|
||||
for (int i = 0; i < AStarMapMax; ++i) {
|
||||
CostMoveToCache[i] = CacheNotSet;
|
||||
}
|
||||
#endif
|
||||
ProfileEnd("CostMoveToCacheCleanUp");
|
||||
memset(CostMoveToCache, CacheNotSet, CostMoveToCacheSize);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -496,16 +448,6 @@ static int AStarFindNode(int eo)
|
|||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
** Add a node to the closed set
|
||||
*/
|
||||
static void AStarAddToClose(int node)
|
||||
{
|
||||
if (CloseSetSize < Threshold) {
|
||||
CloseSet[CloseSetSize++] = node;
|
||||
}
|
||||
}
|
||||
|
||||
#define GetIndex(x, y) (x) + (y) * AStarMapWidth
|
||||
|
||||
/* build-in costmoveto code */
|
||||
|
@ -605,12 +547,19 @@ static int CostMoveToCallBack_Default(unsigned int index, const CUnit &unit)
|
|||
*/
|
||||
static inline int CostMoveTo(unsigned int index, const CUnit &unit)
|
||||
{
|
||||
int *c = &CostMoveToCache[index];
|
||||
int32_t *c = &CostMoveToCache[index];
|
||||
if (*c != CacheNotSet) {
|
||||
return *c;
|
||||
// for performance reasons, CostMoveToCache uses -1 to
|
||||
// indicate it is unset, but the algorithm is simpler
|
||||
// if the range of costs is [-1, INT_MAX]. so we always
|
||||
// store everything +1
|
||||
return *c - 1;
|
||||
}
|
||||
*c = CostMoveToCallBack_Default(index, unit);
|
||||
return *c;
|
||||
*c = CostMoveToCallBack_Default(index, unit) + 1;
|
||||
#ifdef DEBUG
|
||||
Assert(c >= 0);
|
||||
#endif
|
||||
return *c - 1;
|
||||
}
|
||||
|
||||
class AStarGoalMarker
|
||||
|
@ -626,7 +575,6 @@ public:
|
|||
AStarMatrix[offset].InGoal = 1;
|
||||
*goal_reachable = true;
|
||||
}
|
||||
AStarAddToClose(offset);
|
||||
}
|
||||
private:
|
||||
const CUnit &unit;
|
||||
|
@ -957,7 +905,6 @@ int AStarFindPath(const Vec2i &startPos, const Vec2i &goalPos, int gw, int gh,
|
|||
AStarCleanUp();
|
||||
|
||||
OpenSetSize = 0;
|
||||
CloseSetSize = 0;
|
||||
|
||||
if (!AStarMarkGoal(goalPos, gw, gh, tilesizex, tilesizey, minrange, maxrange, unit)) {
|
||||
// goal is not reachable
|
||||
|
@ -981,7 +928,6 @@ int AStarFindPath(const Vec2i &startPos, const Vec2i &goalPos, int gw, int gh,
|
|||
ProfileEnd("AStarFindPath");
|
||||
return ret;
|
||||
}
|
||||
AStarAddToClose(OpenSet[0].O);
|
||||
if (AStarMatrix[eo].InGoal) {
|
||||
ret = PF_REACHED;
|
||||
ProfileEnd("AStarFindPath");
|
||||
|
@ -1067,8 +1013,6 @@ int AStarFindPath(const Vec2i &startPos, const Vec2i &goalPos, int gw, int gh,
|
|||
ProfileEnd("AStarFindPath");
|
||||
return ret;
|
||||
}
|
||||
// we add the point to the close set
|
||||
AStarAddToClose(eo);
|
||||
} else if (new_cost < AStarMatrix[eo].CostFromStart) {
|
||||
// Already visited node, but we have here a better path
|
||||
// I know, it's redundant (but simpler like this)
|
||||
|
|
|
@ -41,8 +41,10 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include <malloc.h>
|
||||
#ifdef WIN32
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_STACKTRACE
|
||||
|
@ -500,3 +502,128 @@ void PrintOnStdOut(const char *format, ...)
|
|||
va_end(valist);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Check SSE/AVX support.
|
||||
This can detect the instruction support of
|
||||
SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, SSE4a, SSE5, and AVX.
|
||||
----------------------------------------------------------------------------*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
|
||||
static void __cpuid(int* cpuinfo, int info)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"xchg %%ebx, %%edi;"
|
||||
"cpuid;"
|
||||
"xchg %%ebx, %%edi;"
|
||||
:"=a" (cpuinfo[0]), "=D" (cpuinfo[1]), "=c" (cpuinfo[2]), "=d" (cpuinfo[3])
|
||||
:"0" (info)
|
||||
);
|
||||
}
|
||||
|
||||
static unsigned long long _xgetbv(unsigned int index)
|
||||
{
|
||||
unsigned int eax, edx;
|
||||
__asm__ __volatile__(
|
||||
"xgetbv;"
|
||||
: "=a" (eax), "=d"(edx)
|
||||
: "c" (index)
|
||||
);
|
||||
return ((unsigned long long)edx << 32) | eax;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
struct SIMDSupport {
|
||||
bool sseSupportted = false;
|
||||
bool sse2Supportted = false;
|
||||
bool sse3Supportted = false;
|
||||
bool ssse3Supportted = false;
|
||||
bool sse4_1Supportted = false;
|
||||
bool sse4_2Supportted = false;
|
||||
bool sse4aSupportted = false;
|
||||
bool sse5Supportted = false;
|
||||
bool avxSupportted = false;
|
||||
};
|
||||
|
||||
static struct SIMDSupport checkSIMDSupport() {
|
||||
struct SIMDSupport s;
|
||||
|
||||
int cpuinfo[4];
|
||||
__cpuid(cpuinfo, 1);
|
||||
|
||||
// Check SSE, SSE2, SSE3, SSSE3, SSE4.1, and SSE4.2 support
|
||||
s.sseSupportted = cpuinfo[3] & (1 << 25) || false;
|
||||
s.sse2Supportted = cpuinfo[3] & (1 << 26) || false;
|
||||
s.sse3Supportted = cpuinfo[2] & (1 << 0) || false;
|
||||
s.ssse3Supportted = cpuinfo[2] & (1 << 9) || false;
|
||||
s.sse4_1Supportted = cpuinfo[2] & (1 << 19) || false;
|
||||
s.sse4_2Supportted = cpuinfo[2] & (1 << 20) || false;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// Check AVX support
|
||||
// References
|
||||
// http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled/
|
||||
// http://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
|
||||
|
||||
s.avxSupportted = cpuinfo[2] & (1 << 28) || false;
|
||||
bool osxsaveSupported = cpuinfo[2] & (1 << 27) || false;
|
||||
if (osxsaveSupported && s.avxSupportted)
|
||||
{
|
||||
// _XCR_XFEATURE_ENABLED_MASK = 0
|
||||
unsigned long long xcrFeatureMask = _xgetbv(0);
|
||||
s.avxSupportted = (xcrFeatureMask & 0x6) == 0x6;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// Check SSE4a and SSE5 support
|
||||
|
||||
// Get the number of valid extended IDs
|
||||
__cpuid(cpuinfo, 0x80000000);
|
||||
int numExtendedIds = cpuinfo[0];
|
||||
if (numExtendedIds >= 0x80000001)
|
||||
{
|
||||
__cpuid(cpuinfo, 0x80000001);
|
||||
s.sse4aSupportted = cpuinfo[2] & (1 << 6) || false;
|
||||
s.sse5Supportted = cpuinfo[2] & (1 << 11) || false;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
bool supportsSSE2()
|
||||
{
|
||||
static struct SIMDSupport s = checkSIMDSupport();
|
||||
return s.sse2Supportted;
|
||||
}
|
||||
|
||||
bool supportsAVX()
|
||||
{
|
||||
static struct SIMDSupport s = checkSIMDSupport();
|
||||
return s.avxSupportted;
|
||||
}
|
||||
|
||||
void *aligned_malloc(size_t alignment, size_t size)
|
||||
{
|
||||
#ifdef WIN32
|
||||
return _aligned_malloc(size, alignment);
|
||||
#elif _ISOC11_SOURCE
|
||||
return aligned_alloc(alignment, size);
|
||||
#else
|
||||
return memalign(alignment, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
void aligned_free(void *block)
|
||||
{
|
||||
#ifdef WIN32
|
||||
_aligned_free(block);
|
||||
#else
|
||||
free(block);
|
||||
#endif
|
||||
}
|
|
@ -201,7 +201,7 @@ void SetClipping(int left, int top, int right, int bottom)
|
|||
{
|
||||
Assert(left <= right && top <= bottom && left >= 0 && left < Video.Width
|
||||
&& top >= 0 && top < Video.Height && right >= 0
|
||||
&& right < Video.Width && bottom >= 0 && bottom < Video.Height);
|
||||
&& right <= Video.Width && bottom >= 0 && bottom <= Video.Height);
|
||||
|
||||
ClipX1 = left;
|
||||
ClipY1 = top;
|
||||
|
|
Loading…
Reference in a new issue