| summaryrefslogtreecommitdiff | 
diff options
Diffstat (limited to 'src/core/knowledge.c')
| -rw-r--r-- | src/core/knowledge.c | 447 | 
1 files changed, 447 insertions, 0 deletions
diff --git a/src/core/knowledge.c b/src/core/knowledge.c new file mode 100644 index 0000000..31ccb97 --- /dev/null +++ b/src/core/knowledge.c @@ -0,0 +1,447 @@ +#include <stdlib.h> +#include <string.h> +#include <stdint.h> /* defines SIZE_MAX */ + +#include "../io/error.h" + +#include "knowledge.h" + +/* XXX: are we as close to immutable as we want to be? */ +unsigned int const ZoO_knowledge_punctuation_chars_count = 7; +const ZoO_char const ZoO_knowledge_punctuation_chars[7] = +   { +      '!', +      ',', +      '.', +      ':', +      ';', +      '?', +      '~' +   }; + +/* XXX: are we as close to immutable as we want to be? */ +unsigned int const ZoO_knowledge_forbidden_chars_count = 8; +const ZoO_char const ZoO_knowledge_forbidden_chars[8]= +   { +      '(', +      ')', +      '[', +      ']', +      '{', +      '}', +      '<', +      '>' +   }; + +int ZoO_knowledge_find +( +   const struct ZoO_knowledge k [const restrict static 1], +   const ZoO_char word [const restrict static 1], +   ZoO_index result [const restrict static 1] +) +{ +   int cmp; +   ZoO_index i, current_min, current_max; + +   /* This is a binary search. */ + +   if (k->words_count < 1) +   { +      *result = 0; + +      return -1; +   } + +   current_min = 0; + +   /* overflow-safe: k->words_count >= 1 */ +   current_max = (k->words_count - 1); + +   for (;;) +   { +      /* FIXME: overflow-safe? */ +      i = ((current_min + current_max) / 2); + +      if (i == k->words_count) +      { +         *result = k->words_count; + +         return -1; +      } + +      cmp = +         /* XXX: Assumed to be compatible with ZoO_char */ +         strcmp +         ( +            (char *) word, +            (const char *) k->words[k->sorted_indices[i]].word +         ); + +      if (cmp > 0) +      { +         if ((current_min > current_max)) +         { +            *result = (i + 1); + +            return -1; +         } + +         /* FIXME: overflow-safe? */ +         current_min = (i + 1); +      } +      else if (cmp < 0) +      { +         if ((current_min > current_max) || (i == 0)) +         { +            *result = i; + +            return -1; +         } + +         /* overflow-safe */ +         current_max = (i - 1); +      } +      else +      { +         *result = k->sorted_indices[i]; + +         return 0; +      } +   } +} + +static void word_init (struct ZoO_knowledge_word w [const restrict static 1]) +{ +   w->word_size = 0; +   w->word = (ZoO_char *) NULL; +   w->special = ZoO_WORD_HAS_NO_EFFECT; +   w->occurrences = 1; +   w->forward_links_count  = 0; +   w->backward_links_count = 0; +   w->forward_links_occurrences  = (ZoO_index *) NULL; +   w->backward_links_occurrences = (ZoO_index *) NULL; +   w->forward_links  = (ZoO_index *) NULL; +   w->backward_links = (ZoO_index *) NULL; +} + +static int add_punctuation_nodes +( +   struct ZoO_knowledge k [const static 1] +) +{ +   int error; +   char w[2]; +   ZoO_index i, id; + +   if (ZoO_knowledge_learn(k, "START OF LINE", &id) < 0) +   { +      ZoO_S_FATAL("Could not add 'START OF LINE' to knowledge."); + +      return -2; +   } + +   k->words[id].special = ZoO_WORD_STARTS_SENTENCE; +   k->words[id].occurrences = 0; + +   if (ZoO_knowledge_learn(k, "END OF LINE", &id) < 0) +   { +      ZoO_S_FATAL("Could not add 'END OF LINE' to knowledge."); + +      return -2; +   } + +   k->words[id].special = ZoO_WORD_ENDS_SENTENCE; +   k->words[id].occurrences = 0; + +   w[1] = '\0'; + +   error = 0; + +   for (i = 0; i < ZoO_knowledge_punctuation_chars_count; ++i) +   { +      w[0] = ZoO_knowledge_punctuation_chars[i]; + +      if (ZoO_knowledge_learn(k, w, &id) < 0) +      { +         ZoO_WARNING("Could not add '%s' to knowledge.", w); + +         error = -1; +      } +      else +      { +         k->words[id].special = ZoO_WORD_REMOVES_LEFT_SPACE; +         k->words[id].occurrences = 0; +      } +   } + +   return error; +} + +int ZoO_knowledge_initialize (struct ZoO_knowledge k [const static 1]) +{ +   k->words_count = 0; +   k->words = (struct ZoO_knowledge_word *) NULL; +   k->sorted_indices = (ZoO_index *) NULL; + +   if (add_punctuation_nodes(k) < -1) +   { +      ZoO_knowledge_finalize(k); + +      return -1; +   } + +   return 0; +} + +static void finalize_word +( +   struct ZoO_knowledge_word w [const restrict static 1] +) +{ +   if (w->word != (ZoO_char *) NULL) +   { +      free((void *) w->word); + +      w->word = (ZoO_char *) NULL; +   } + +   if (w->forward_links_occurrences != (ZoO_index *) NULL) +   { +      free((void *) w->forward_links_occurrences); + +      w->forward_links_occurrences = (ZoO_index *) NULL; +   } + +   if (w->backward_links_occurrences != (ZoO_index *) NULL) +   { +      free((void *) w->backward_links_occurrences); + +      w->backward_links_occurrences = (ZoO_index *) NULL; +   } + +   if (w->forward_links != (ZoO_index *) NULL) +   { +      free((void *) w->forward_links); + +      w->forward_links = (ZoO_index *) NULL; +   } + +   if (w->backward_links != (ZoO_index *) NULL) +   { +      free((void *) w->backward_links); + +      w->backward_links = (ZoO_index *) NULL; +   } + +   w->forward_links_count  = 0; +   w->backward_links_count = 0; +} + +void ZoO_knowledge_finalize (struct ZoO_knowledge k [const restrict static 1]) +{ +   ZoO_index i; + +   for (i = 0; i < k->words_count; ++i) +   { +      /* prevents k [restrict] */ +      finalize_word(k->words + i); +   } + +   k->words_count = 0; + +   if (k->words != (struct ZoO_knowledge_word *) NULL) +   { +      free((void *) k->words); + +      k->words = (struct ZoO_knowledge_word *) NULL; +   } + +   if (k->sorted_indices != (ZoO_index *) NULL) +   { +      free((void *) k->sorted_indices); + +      k->sorted_indices = (ZoO_index *) NULL; +   } +} + +int ZoO_knowledge_learn +( +   struct ZoO_knowledge k [const static 1], +   const ZoO_char word [const restrict static 1], +   ZoO_index result [const restrict static 1] +) +{ +   struct ZoO_knowledge_word * new_wordlist; +   ZoO_index * new_sorted_indices; +   ZoO_index temp; + +   /* prevents k [restrict] */ +   if (ZoO_knowledge_find(k, word, result) == 0) +   { +      if (k->words[*result].occurrences == ZoO_INDEX_MAX) +      { +         ZoO_WARNING +         ( +            "Maximum number of occurrences has been reached for word '" +            ZoO_CHAR_STRING_SYMBOL +            "'.", +            word +         ); + +         return -1; +      } + +      /* overflow-safe */ +      k->words[*result].occurrences += 1; + +      return 0; +   } + +   if (k->words_count == ZoO_INDEX_MAX) +   { +      ZoO_S_WARNING("Maximum number of words has been reached."); + +      return -1; +   } + +   new_wordlist = +      (struct ZoO_knowledge_word *) realloc +      ( +         (void *) k->words, +         ( +            ( +               /* overflow-safe: k->words_count < ZoO_INDEX_MAX */ +               (size_t) (k->words_count + 1) +            ) +            * sizeof(struct ZoO_knowledge_word) +         ) +      ); + +   if (new_wordlist == (struct ZoO_knowledge_word *) NULL) +   { +      ZoO_ERROR +      ( +         "Could not learn the word '%s': unable to realloc the word list.", +         word +      ); + +      return -1; +   } + +   k->words = new_wordlist; + +   new_sorted_indices = +      (ZoO_index *) realloc +      ( +         (void *) k->sorted_indices, +         ( +            ( +               /* overflow-safe: k->words_count < ZoO_INDEX_MAX */ +               (size_t) (k->words_count + 1) +            ) +            * sizeof(ZoO_index) +         ) +      ); + +   if (new_sorted_indices == (ZoO_index *) NULL) +   { +      ZoO_ERROR +      ( +         "Could not learn the word '" +         ZoO_CHAR_STRING_SYMBOL +         "': unable to realloc the index list.", +         word +      ); + +      return -1; +   } + +   k->sorted_indices = new_sorted_indices; + +   /* We can only move indices right of *result if they exist. */ +   if (*result != k->words_count) +   { +      /* TODO: check if correct. */ +      memmove +      ( +         /* +          * overflow-safe: +          *  - k->words_count < ZoO_INDEX_MAX +          *  - (k->sorted_indices + *result + 1) =< k->words_count +          */ +         (void *) (k->sorted_indices + *result + 1), +         /* overflow-safe: see above */ +         (const void *) (k->sorted_indices + *result), +         ( +            ( +               /* overflow-safe: *result < k->words_count */ +               (size_t) (k->words_count - *result) +            ) +            * sizeof(ZoO_index) +         ) +      ); +   } + +   temp = *result; + +   k->sorted_indices[*result] = k->words_count; + +   *result = k->words_count; + +   word_init(k->words + *result); + +   /* XXX: strlen assumed to work with ZoO_char. */ +   k->words[*result].word_size = strlen(word); + +   if (k->words[*result].word_size == SIZE_MAX) +   { +      ZoO_S_WARNING +      ( +         "Could not learn word that had a size too big to store in a '\\0' " +         "terminated string. Chances are, this is but a symptom of the real " +         "problem." +      ); + +      return -1; +   } + +   /* We also need '\0' */ +   k->words[*result].word_size += 1; + +   k->words[*result].word = +      (ZoO_char *) calloc +      ( +         k->words[*result].word_size, +         sizeof(ZoO_char) +      ); + +   if (k->words[*result].word == (ZoO_char *) NULL) +   { +      ZoO_S_ERROR +      ( +         "Could not learn word due to being unable to allocate the memory to " +         "store it." +      ); + +      k->words[*result].word_size = 0; + +      return -1; +   } + +   memcpy(k->words[*result].word, word, k->words[*result].word_size); + +   /* Safe: k->words_count < ZoO_INDEX_MAX */ +   k->words_count += 1; + +   ZoO_DEBUG +   ( +      ZoO_DEBUG_LEARNING, +      "Learned word {'%s', id: %u, rank: %u}", +      word, +      *result, +      temp +   ); + +   return 0; +} +  | 


