mirror of https://github.com/DarkPlacesEngine/gmqcc.git
Cleanups and documentation

parent 294870c5ba
commit 7e87c61d78

1 changed file with 99 additions and 75 deletions

correct.c (174 changed lines: +99 −75)
@@ -22,6 +22,77 @@
  */
 #include "gmqcc.h"
 
+/*
+ * This is a very clever method for correcting mistakes in QuakeC code,
+ * most notably when invalid identifiers or improper assignments are used;
+ * we can properly look up multiple dictionaries (depending on the rules
+ * of what the task is trying to accomplish) to find the best possible
+ * match.
+ *
+ *
+ * A little about how it works, and probability theory:
+ *
+ * When given an identifier (which we will denote I), we're essentially
+ * just trying to choose the most likely correction for that identifier
+ * (the actual "correction" can very well be the identifier itself).
+ * There is actually no way to know for sure that certain identifiers,
+ * such as "lates", need to be corrected to "late" or "latest" or any
+ * other permutation that looks lexically similar. This is why we must
+ * resort to probabilities. This means that instead of just guessing,
+ * we're trying to find the correction C, out of all possible
+ * corrections, that maximizes the probability of C given the original
+ * identifier I.
+ *
+ * Bayes' theorem suggests the following:
+ *     argmax(C) P(I|C) P(C) / P(I)
+ * Since P(I) is the same for every possible C, we can ignore it, giving:
+ *     argmax(C) P(I|C) P(C)
+ *
+ * This greatly helps visualize how the parts of the expression are
+ * performed; there are essentially three, and from right to left we
+ * perform the following:
+ *
+ * 1: P(C), the probability that a proposed correction C will stand on
+ *    its own. This is called the language model.
+ *
+ * 2: P(I|C), the probability that I would be written when the
+ *    programmer really meant C. This is the error model.
+ *
+ * 3: argmax(C), the control mechanism, an enumerator if you will, one
+ *    that enumerates all feasible values of C to determine the one that
+ *    gives the greatest probability score.
+ *
+ * In reality, requiring a more complex expression involving two
+ * separate models may seem wasteful, but one must recognize that P(C|I)
+ * is already conflating two factors. It's just much simpler to separate
+ * the two models and deal with them explicitly. To properly estimate
+ * P(C|I) you have to consider both the probability of C and the
+ * probability of the transposition from C to I. It's simply cleaner and
+ * more direct to separate the two factors.
+ *
+ * A little information on additional algorithms used:
+ *
+ * Initially when I implemented this corrector, it was very slow.
+ * Need I remind you this is essentially a brute-force attack on
+ * strings, and since every transformation requires dynamic memory
+ * allocations, you can easily imagine where most of the runtime went.
+ * Yes, it went right to malloc. More than THREE MILLION malloc calls
+ * are performed for an identifier about 16 bytes long. This was such a
+ * shock to me. To combat this, a forward allocator (or, as some call
+ * it, a bump-pointer allocator, or just a memory pool) was implemented.
+ *
+ * But of course other factors were making it slow as well. Initially
+ * this used a hashtable, and hashtables have good constant lookup-time
+ * complexity. But the problem wasn't in the hashtable, it was in the
+ * hashing (despite having one of the fastest hash functions known).
+ * Remember those 3 million mallocs? Well, for every malloc there is
+ * also a hash. After 3 million hashes... you start to get very slow.
+ * To combat this I suggested burst tries to Blub. The next day he had
+ * implemented them. Sure enough, this brought the runtime down by a
+ * factor > 100%.
+ */
+
+
+#define CORRECT_POOLSIZE (128*1024*1024)
 /*
  * A forward allocator for the corrector. This corrector requires a lot
  * of allocations. This forward allocator combats all those allocations
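To make the selection step concrete, here is a minimal sketch of the argmax the comment above describes. Every name in it (select_correction, lang_model, error_model, the candidate array) is a hypothetical stand-in, not gmqcc's API; in this file, correct_maximum plays the equivalent role.

    #include <stddef.h>

    /* Pick the candidate C maximizing P(I|C) * P(C); fall back to the
     * identifier itself when no candidate scores above zero. */
    static const char *select_correction(const char *ident,
                                         const char **candidates, size_t count,
                                         double (*lang_model)(const char *),
                                         double (*error_model)(const char *, const char *))
    {
        const char *best  = ident;
        double      score = 0.0;
        size_t      i;

        for (i = 0; i < count; i++) {
            /* error model P(I|C) times language model P(C) */
            double p = error_model(ident, candidates[i]) * lang_model(candidates[i]);
            if (p > score) {
                score = p;
                best  = candidates[i];
            }
        }
        return best;
    }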
@@ -29,8 +100,6 @@
  * allocation isn't wasting a little header space for when NOTRACK isn't
  * defined.
  */
-#define CORRECT_POOLSIZE (128*1024*1024)
-
 static unsigned char **correct_pool_data = NULL;
 static unsigned char *correct_pool_this = NULL;
 static size_t correct_pool_addr = 0;
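The correct_pool_* function bodies fall outside the hunks shown, so here is a generic sketch of the forward (bump) allocator idea under simplified assumptions: one fixed block, allocation by pointer bump, everything released at once. The real implementation differs in detail (it tracks a vector of pool pages in correct_pool_data).

    #include <stdlib.h>

    #define POOLSIZE (128 * 1024 * 1024)

    static unsigned char *pool_base = NULL;
    static size_t         pool_used = 0;

    static void pool_new(void)    { pool_base = malloc(POOLSIZE); pool_used = 0; }
    static void pool_delete(void) { free(pool_base); pool_base = NULL; }

    /* One pointer bump per allocation: no headers, no free lists, and no
     * per-allocation free(); pool_delete() releases everything at once. */
    static void *pool_alloc(size_t bytes)
    {
        unsigned char *mem;
        if (!pool_base || pool_used + bytes > POOLSIZE)
            return NULL;
        mem = pool_base + pool_used;
        pool_used += bytes;
        return mem;
    }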
@@ -65,30 +134,35 @@ static GMQCC_INLINE void correct_pool_delete(void) {
 }
 
 
-static GMQCC_INLINE char *correct_outstr(const char *s) {
-    char *o = util_strdup(s);
+static GMQCC_INLINE char *correct_pool_claim(const char *data) {
+    char *claim = util_strdup(data);
     correct_pool_delete();
-    return o;
+    return claim;
 }
 
-correct_trie_t* correct_trie_new()
-{
+/*
+ * A fast, space-efficient trie for a dictionary of identifiers. This is
+ * faster than a hashtable for one reason: a hashtable itself may have
+ * fast constant lookup time, but the hash itself must be very fast. We
+ * have one of the fastest hash functions for strings, but if you do a
+ * lot of hashing (which we do, almost 3 million hashes per identifier)
+ * a hashtable becomes slow. Very, very slow.
+ */
+correct_trie_t* correct_trie_new() {
     correct_trie_t *t = (correct_trie_t*)mem_a(sizeof(correct_trie_t));
     t->value = NULL;
     t->entries = NULL;
     return t;
 }
 
-void correct_trie_del_sub(correct_trie_t *t)
-{
+void correct_trie_del_sub(correct_trie_t *t) {
     size_t i;
     for (i = 0; i < vec_size(t->entries); ++i)
         correct_trie_del_sub(&t->entries[i]);
     vec_free(t->entries);
 }
 
-void correct_trie_del(correct_trie_t *t)
-{
+void correct_trie_del(correct_trie_t *t) {
     size_t i;
     for (i = 0; i < vec_size(t->entries); ++i)
         correct_trie_del_sub(&t->entries[i]);
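The cost argument in the trie comment is worth making concrete: a trie lookup touches each byte of the key exactly once, with a cheap child scan per byte, while a hashtable must hash the entire key on every probe and then still compare it. Below is a sketch mirroring the shape of correct_trie_get; the node layout (the ch and nentries fields) is illustrative, not gmqcc's exact struct.

    #include <stddef.h>

    typedef struct trie_s {
        void          *value;    /* payload for a key ending at this node */
        unsigned char  ch;       /* byte labeling the edge into this node */
        struct trie_s *entries;  /* children, stored as a dynamic array   */
        size_t         nentries;
    } trie_t;

    static void *trie_get(const trie_t *t, const char *key)
    {
        const unsigned char *data = (const unsigned char*)key;
        while (*data) {
            const trie_t *child = NULL;
            size_t        i;
            /* linear scan of this node's children for the next byte */
            for (i = 0; i < t->nentries; i++) {
                if (t->entries[i].ch == *data) {
                    child = &t->entries[i];
                    break;
                }
            }
            if (!child)
                return NULL; /* key absent; cost bounded by strlen(key) steps */
            t = child;
            ++data;
        }
        return t->value;
    }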
@@ -96,8 +170,7 @@ void correct_trie_del(correct_trie_t *t)
     mem_d(t);
 }
 
-void* correct_trie_get(const correct_trie_t *t, const char *key)
-{
+void* correct_trie_get(const correct_trie_t *t, const char *key) {
     const unsigned char *data = (const unsigned char*)key;
     while (*data) {
         unsigned char ch = *data;
@@ -117,8 +190,7 @@ void* correct_trie_get(const correct_trie_t *t, const char *key)
     return t->value;
 }
 
-void correct_trie_set(correct_trie_t *t, const char *key, void * const value)
-{
+void correct_trie_set(correct_trie_t *t, const char *key, void * const value) {
     const unsigned char *data = (const unsigned char*)key;
     while (*data) {
         unsigned char ch = *data;
@@ -143,57 +215,11 @@ void correct_trie_set(correct_trie_t *t, const char *key, void * const value)
     t->value = value;
 }
 
-/*
- * This is a very clever method for correcting mistakes in QuakeC code
- * most notably when invalid identifiers are used or inproper assignments;
- * we can proprly lookup in multiple dictonaries (depening on the rules
- * of what the task is trying to acomplish) to find the best possible
- * match.
- *
- *
- * A little about how it works, and probability theory:
- *
- * When given an identifier (which we will denote I), we're essentially
- * just trying to choose the most likely correction for that identifier.
- * (the actual "correction" can very well be the identifier itself).
- * There is actually no way to know for sure that certian identifers
- * such as "lates", need to be corrected to "late" or "latest" or any
- * other permutations that look lexically the same. This is why we
- * must advocate the usage of probabilities. This implies that we're
- * trying to find the correction for C, out of all possible corrections
- * that maximizes the probability of C for the original identifer I.
- *
- * Bayes' Therom suggests something of the following:
- *  AC P(I|C) P(C) / P(I)
- * Since P(I) is the same for every possibly I, we can ignore it giving
- *  AC P(I|C) P(C)
- *
- * This greatly helps visualize how the parts of the expression are performed
- * there is essentially three, from right to left we perform the following:
- *
- * 1: P(C), the probability that a proposed correction C will stand on its
- *    own. This is called the language model.
- *
- * 2: P(I|C), the probability that I would be used, when the programmer
- *    really meant C. This is the error model.
- *
- * 3: AC, the control mechanisim, which implies the enumeration of all
- *    feasible values of C, and then determine the one that gives the
- *    greatest probability score. Selecting it as the "correction"
- *
- *
- * The requirement for complex expression involving two models:
- *
- * In reality the requirement for a more complex expression involving
- * two seperate models is considerably a waste. But one must recognize
- * that P(C|I) is already conflating two factors. It's just much simpler
- * to seperate the two models and deal with them explicitaly. To properly
- * estimate P(C|I) you have to consider both the probability of C and
- * probability of the transposition from C to I. It's simply much more
- * cleaner, and direct to seperate the two factors.
- */
-
-/* some hashtable management for dictonaries */
+/*
+ * The implementation of the corrector algorithm begins here. A very
+ * efficient brute-force attack (thanks to tries and the mempool :-)).
+ */
 static size_t *correct_find(correct_trie_t *table, const char *word) {
     return (size_t*)correct_trie_get(table, word);
 }
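"Brute-force attack" here means enumerating every string within edit distance 1 of the identifier, and then distance 2 from those. For a word of length n (n >= 1) over a 26-letter alphabet that is n deletions, n-1 adjacent transpositions, 26n replacements and 26(n+1) insertions, i.e. 54n + 25 distance-1 candidates. A sketch of that count:

    /* Number of edit operations yielding a distance-1 candidate for a
     * word of length n over a 26-letter alphabet; assumes n >= 1. */
    static unsigned long edit1_count(unsigned long n)
    {
        return n               /* deletions                  */
             + (n - 1)         /* adjacent transpositions    */
             + 26 * n          /* single-letter replacements */
             + 26 * (n + 1);   /* single-letter insertions   */
    }

For n = 16 this gives 889 candidates, and expanding each of those again puts the distance-2 set in the high hundreds of thousands; with one allocation (and, before the trie, one hash) per candidate string, the millions of malloc and hash calls cited in the comment above follow directly.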
@@ -420,7 +446,6 @@ char *correct_str(correct_trie_t* table, const char *ident) {
     char **e2;
     char *e1ident;
     char *e2ident;
-    char *found = util_strdup(ident);
 
     size_t e1rows = 0;
     size_t e2rows = 0;
@@ -429,21 +454,20 @@ char *correct_str(correct_trie_t* table, const char *ident) {
 
     /* needs to be allocated for free later */
     if (correct_find(table, ident))
-        return correct_outstr(found);
+        return correct_pool_claim(ident);
 
     if ((e1rows = correct_size(ident))) {
         e1 = correct_edit(ident);
 
-        if ((e1ident = correct_maximum(table, e1, e1rows))) {
-            found = util_strdup(e1ident);
-            return correct_outstr(found);
-        }
+        if ((e1ident = correct_maximum(table, e1, e1rows)))
+            return correct_pool_claim(e1ident);
     }
 
     e2 = correct_known(table, e1, e1rows, &e2rows);
-    if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows)))) {
-        found = util_strdup(e2ident);
-    }
+    if (e2rows && ((e2ident = correct_maximum(table, e2, e2rows))))
+        return correct_pool_claim(e2ident);
 
-    return correct_outstr(found);
+    correct_pool_delete();
+    return util_strdup(ident);
 }
 
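After this change, every path out of correct_str() hands the caller heap memory it owns: either claimed out of the pool via correct_pool_claim, or duplicated directly before the pool dies. A hypothetical caller for illustration; the function and message below are invented, and pairing util_strdup'd results with mem_d follows gmqcc's own allocator convention (an assumption about the surrounding code, not part of this diff):

    #include <stdio.h>
    #include <string.h>
    #include "gmqcc.h"

    void warn_unknown_ident(correct_trie_t *idents, const char *ident)
    {
        char *fix = correct_str(idents, ident);
        if (strcmp(fix, ident))
            printf("`%s` is undefined; did you mean `%s`?\n", ident, fix);
        mem_d(fix); /* every return path of correct_str yields owned memory */
    }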