[PATCH] [patternize] implemented support for custom delimiters for tokenization and changed default delimiter set

Mon Feb 7 14:30:42 CET 2011

Signed-off-by: Balint Kovacs <blint at balabit.hu>
---
 modules/dbparser/patternize.c            |  117 +++++++++++++++++++++---------
 modules/dbparser/patternize.h            |   10 ++-
 modules/dbparser/pdbtool.c               |   17 ++++-
 modules/dbparser/tests/test_patternize.c |    6 +-
 4 files changed, 107 insertions(+), 43 deletions(-)

diff --git a/modules/dbparser/patternize.c b/modules/dbparser/patternize.c
index 1a433c2..8903bd9 100644
--- a/modules/dbparser/patternize.c
+++ b/modules/dbparser/patternize.c
@@ -40,7 +40,7 @@
 #define PTZ_MAXLINELEN 10240
 #define PTZ_MAXWORDS 512      /* maximum number of words in one line */
 #define PTZ_LOGTABLE_ALLOC_BASE 3000
-#define PTZ_WORDLIST_CACHE 3 // FIXME: make this a commandline parameter?
+#define PTZ_WORDLIST_CACHE 3 /* FIXME: make this a commandline parameter? */
 
 static LogTagId cluster_tag_id;
 
@@ -85,7 +85,7 @@ static void _ptz_debug_print_word(gpointer key, gpointer value, gpointer dummy)
 
 static void _ptz_debug_print_cluster(gpointer key, gpointer value, gpointer dummy)
 {
-  fprintf(stderr, "%s: %d\n", (gchar*) key, ((Cluster *) value)->support);
+  fprintf(stderr, "%s: %s\n", (gchar*) key, ((Cluster *) value)->words[0]);
 }
 #endif
 
@@ -103,6 +103,28 @@ ptz_str2hash(gchar *string, guint modulo, guint seed)
   return seed % modulo;
 }
 
+/* Return a newly allocated string holding the delimiter characters of str, in
+ * order of occurrence; the caller must g_free() the result. */
+gchar *
+ptz_find_delimiters(gchar *str, gchar *delimdef)
+{
+  GString *delimiters = g_string_sized_new(0);
+  gchar *remainder = str;
+
+  while (*remainder)
+    {
+      /* skip the current token; stops at a delimiter or at the closing NUL */
+      remainder += strcspn(remainder, delimdef);
+      if (*remainder == '\0')
+        break; /* last token has no trailing delimiter: do not step past the NUL */
+      g_string_append_c(delimiters, *remainder);
+      remainder++;
+    }
+
+  /* hand the character buffer over to the caller instead of copying it */
+  return g_string_free(delimiters, FALSE);
+}
+
 gboolean
 ptz_find_frequent_words_remove_key_predicate(gpointer key, gpointer value, gpointer support)
 {
@@ -110,7 +132,7 @@ ptz_find_frequent_words_remove_key_predicate(gpointer key, gpointer value, gpoin
 }
 
 GHashTable *
-ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass)
+ptz_find_frequent_words(GPtrArray *logs, guint support, gchar *delimiters, gboolean two_pass)
 {
   int i, j, pass;
   guint *curr_count;
@@ -148,14 +170,14 @@ ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass)
         {
           msg = (LogMessage *) g_ptr_array_index(logs, i);
           msgstr = (gchar *) log_msg_get_value(msg, LM_V_MESSAGE, &msglen);
-          /* NOTE: we should split on more than a simple space... */
-          words = g_strsplit(msgstr, " ", PTZ_MAXWORDS);
+
+          words = g_strsplit_set(msgstr, delimiters, PTZ_MAXWORDS);
 
           for (j = 0; words[j]; ++j)
             {
               /* NOTE: to calculate the key for the hash, we prefix a word with
-               * its position in the row and a space -- as we split at spaces,
-               * this should not create confusion
+               * its position in the row and a space -- as we always split at
+               * spaces, this should not create confusion
                */
               hash_key = g_strdup_printf("%d %s", j, words[j]);
 
@@ -191,7 +213,7 @@ ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass)
           g_strfreev(words);
         }
 
-      //g_hash_table_foreach(wordlist, _ptz_debug_print_word, NULL);
+      /* g_hash_table_foreach(wordlist, _ptz_debug_print_word, NULL); */
 
       g_hash_table_foreach_remove(wordlist, ptz_find_frequent_words_remove_key_predicate, GUINT_TO_POINTER(support));
     }
@@ -216,7 +238,7 @@ ptz_find_clusters_remove_cluster_predicate(gpointer key, gpointer value, gpointe
   ret = (val->loglines->len < support);
   if (ret)
     {
-      // remove cluster reference from the relevant logs
+      /* remove cluster reference from the relevant logs */
       for (i = 0; i < val->loglines->len; ++i)
         {
           msg = (LogMessage *) g_ptr_array_index(val->loglines, i);
@@ -246,7 +268,7 @@ cluster_free(Cluster *cluster)
 }
 
 GHashTable *
-ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
+ptz_find_clusters_slct(GPtrArray *logs, guint support, gchar *delimiters, guint num_of_samples)
 {
   GHashTable *wordlist;
   GHashTable *clusters;
@@ -259,10 +281,11 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
   gboolean is_candidate;
   Cluster *cluster;
   GString *cluster_key;
+  gchar * msgdelimiters;
 
   /* get the frequent word list */
-  wordlist = ptz_find_frequent_words(logs, support, TRUE);
-//  g_hash_table_foreach(wordlist, _debug_print, NULL);
+  wordlist = ptz_find_frequent_words(logs, support, delimiters, TRUE);
+  /* g_hash_table_foreach(wordlist, _ptz_debug_print_word, NULL); */
 
   /* find the cluster candidates */
   clusters = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, (GDestroyNotify) cluster_free);
@@ -274,8 +297,8 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
 
       g_string_truncate(cluster_key, 0);
 
-      /* NOTE: we should split on more than a simple space... */
-      words = g_strsplit(msgstr, " ", PTZ_MAXWORDS);
+      words = g_strsplit_set(msgstr, delimiters, PTZ_MAXWORDS);
+      msgdelimiters = ptz_find_delimiters(msgstr, delimiters);
 
       is_candidate = FALSE;
       for (j = 0; words[j]; ++j)
@@ -290,12 +313,19 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
             }
           else
             {
-              g_string_append_printf(cluster_key, "%d *%c", j, PTZ_SEPARATOR_CHAR);
+              g_string_append_printf(cluster_key, "%d %c%c", j, PTZ_PARSER_MARKER_CHAR, PTZ_SEPARATOR_CHAR);
             }
 
           g_free(hash_key);
         }
 
+      /* append the delimiters of the message to the cluster key to ensure
+       * uniqueness; otherwise the same words with different delimiters would
+       * still show up as the same cluster
+       */
+      g_string_append_printf(cluster_key, "%s%c", msgdelimiters, PTZ_SEPARATOR_CHAR);
+      g_free(msgdelimiters);
+
       if (is_candidate)
         {
           cluster = (Cluster*) g_hash_table_lookup(clusters, cluster_key->str);
@@ -331,7 +361,7 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
 
   g_hash_table_foreach_remove(clusters, ptz_find_clusters_remove_cluster_predicate, GUINT_TO_POINTER(support));
 
-//  g_hash_table_foreach(clusters, _ptz_debug_print_cluster, NULL);
+  /* g_hash_table_foreach(clusters, _ptz_debug_print_cluster, NULL); */
 
   g_hash_table_unref(wordlist);
   g_string_free(cluster_key, TRUE);
@@ -356,7 +386,7 @@ ptz_find_clusters_step(Patternizer *self, GPtrArray *logs, guint support, guint
 {
   msg_progress("Searching clusters", evt_tag_int("input lines", logs->len), NULL);
   if (self->algo == PTZ_ALGO_SLCT)
-    return ptz_find_clusters_slct(logs, support, num_of_samples);
+    return ptz_find_clusters_slct(logs, support, self->delimiters, num_of_samples);
   else
     {
       msg_error("Unknown clustering algorithm", evt_tag_int("algo_id", self->algo));
@@ -450,6 +480,11 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
   guint parser_counts[PTZ_NUM_OF_PARSERS];
   int i;
   Cluster *cluster;
+  GString *pattern = g_string_new("");
+  guint wordcount;
+  gchar *delimiters;
+
+  cluster = (Cluster *) value;
 
   if (named_parsers)
     {
@@ -459,8 +494,8 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
 
   uuid_gen_random(uuid_string, sizeof(uuid_string));
 
-  printf("      <rule id='%s'>\n", uuid_string);
-  printf("        <!-- support: %d -->\n", ((Cluster *) value)->loglines->len);
+  printf("      <rule id='%s' class='system' provider='patternize'>\n", uuid_string);
+  printf("        <!-- support: %d -->\n", cluster->loglines->len);
   printf("        <patterns>\n");
   printf("          <pattern>");
 
@@ -474,30 +509,45 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
   splitstr = g_strdup_printf("%c", PTZ_SEPARATOR_CHAR);
   words = g_strsplit(skey, splitstr, 0);
   g_free(splitstr);
+
+  /* pop the delimiters from the cluster key */
+  wordcount = g_strv_length(words);
+  delimiters = words[wordcount-1];
+  words[wordcount-1] = 0;
+
    for (i = 0; words[i]; ++i)
     {
-      gchar **word_parts;
+      g_string_truncate(pattern, 0);
 
+      gchar **word_parts;
       word_parts = g_strsplit(words[i], " ", 2);
 
-      if (word_parts[1][0] == '*')
+      if (word_parts[1][0] == PTZ_PARSER_MARKER_CHAR)
         {
           /* NOTE: nasty workaround: do not display last ESTRING as syslog-ng won't handle that well... */
           /* FIXME: enter a simple @STRING@ here instead... */
           if (words[i + 1])
             {
-              printf("@ESTRING:");
+              g_string_append(pattern, "@ESTRING:");
               if (named_parsers)
                 {
-                  // TODO: do not hardcode ESTRING here...
-                  printf(".dict.string%d", parser_counts[PTZ_PARSER_ESTRING]++);
+                  /* TODO: do not hardcode ESTRING here... */
+                  g_string_append_printf(pattern, ".dict.string%d", parser_counts[PTZ_PARSER_ESTRING]++);
                 }
-              printf(": @");
+              g_string_append_printf(pattern, ":%c@", delimiters[i]);
+              escapedstr = g_markup_escape_text(pattern->str, -1);
+              printf("%s", escapedstr);
+              g_free(escapedstr);
             }
         }
       else
         {
-          escapedstr = g_markup_escape_text(word_parts[1], -1);
+          g_string_append(pattern, word_parts[1]);
+
+          if (words[i + 1])
+            g_string_append_printf(pattern, "%c", delimiters[i]);
+
+          escapedstr = g_markup_escape_text(pattern->str, -1);
           if (g_strrstr(escapedstr, "@"))
             {
               escapedparts = g_strsplit(escapedstr, "@", -1);
@@ -505,23 +555,19 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
               escapedstr = g_strjoinv("@@", escapedparts);
               g_strfreev(escapedparts);
             }
-
           printf("%s", escapedstr);
           g_free(escapedstr);
-          if (words[i + 1])
-            printf(" ");
         }
-
       g_strfreev(word_parts);
     }
 
   g_free(skey);
   g_strfreev(words);
+  g_string_free(pattern, TRUE);
 
   printf("</pattern>\n");
   printf("        </patterns>\n");
 
-  cluster = (Cluster *) value;
   if (cluster->samples->len > 0)
     {
       printf("        <examples>\n");
@@ -541,12 +587,12 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
 }
 
 void
-ptz_print_patterndb(GHashTable *clusters, gboolean named_parsers)
+ptz_print_patterndb(GHashTable *clusters, gchar *delimiters, gboolean named_parsers)
 {
   char date[12], uuid_string[37];
   time_t currtime;
 
-  // print the header
+  /* print the header */
   time(&currtime);
   strftime(date, 12, "%Y-%m-%d", localtime(&currtime));
   printf("<patterndb version='3' pub_date='%s'>\n", date);
@@ -555,7 +601,7 @@ ptz_print_patterndb(GHashTable *clusters, gboolean named_parsers)
   printf("  <ruleset name='patternize' id='%s'>\n", uuid_string);
   printf("    <rules>\n");
 
-  g_hash_table_foreach(clusters, ptz_print_patterndb_rule, (gpointer) &named_parsers);
+  g_hash_table_foreach(clusters, ptz_print_patterndb_rule, (gpointer *) &named_parsers);
 
   printf("    </rules>\n");
   printf("  </ruleset>\n");
@@ -612,7 +658,7 @@ ptz_load_file(Patternizer *self, gchar *input_file, GError **error)
 }
 
 Patternizer *
-ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples)
+ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples, gchar *delimiters)
 {
   Patternizer *self = g_new0(Patternizer, 1);
 
@@ -621,6 +667,7 @@ ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_sample
 
   self->support_treshold = support_treshold;
   self->num_of_samples = num_of_samples;
+  self->delimiters = delimiters;
   self->logs = g_ptr_array_sized_new(PTZ_LOGTABLE_ALLOC_BASE);
 
   cluster_tag_id = log_tags_get_by_name(".in_patternize_cluster");
diff --git a/modules/dbparser/patternize.h b/modules/dbparser/patternize.h
index 7fb28ba..2afc2f8 100644
--- a/modules/dbparser/patternize.h
+++ b/modules/dbparser/patternize.h
@@ -32,6 +32,7 @@
 #define PTZ_ITERATE_HIEARARCH 2
 
 #define PTZ_SEPARATOR_CHAR 0x1E
+#define PTZ_PARSER_MARKER_CHAR 0x1A
 
 #define PTZ_NUM_OF_PARSERS 1
 #define PTZ_PARSER_ESTRING 0
@@ -45,6 +46,7 @@ typedef struct _Patternizer
   guint support;
   guint num_of_samples;
   gdouble support_treshold;
+  gchar *delimiters;
 
   // NOTE: for now, we store all logs read in in the memory.
   // This brings in some obvious constraints and should be solved
@@ -61,16 +63,16 @@ typedef struct _Cluster
 } Cluster;
 
 /* only declared for the test program */
-GHashTable *ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass);
-GHashTable *ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples);
+GHashTable *ptz_find_frequent_words(GPtrArray *logs, guint support, gchar *delimiters, gboolean two_pass);
+GHashTable *ptz_find_clusters_slct(GPtrArray *logs, guint support, gchar *delimiters, guint num_of_samples);
 
 
 GHashTable *ptz_find_clusters(Patternizer *self);
-void ptz_print_patterndb(GHashTable *clusters, gboolean named_parsers);
+void ptz_print_patterndb(GHashTable *clusters, gchar *delimiters, gboolean named_parsers);
 
 gboolean ptz_load_file(Patternizer *self, gchar *input_file, GError **error);
 
-Patternizer *ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples);
+Patternizer *ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples, gchar *delimiters);
 void ptz_free(Patternizer *self);
 
 #endif
diff --git a/modules/dbparser/pdbtool.c b/modules/dbparser/pdbtool.c
index 83643c5..e632f78 100644
--- a/modules/dbparser/pdbtool.c
+++ b/modules/dbparser/pdbtool.c
@@ -816,6 +816,7 @@ static gdouble support_treshold = 4.0;
 static gboolean iterate_outliers = FALSE;
 static gboolean named_parsers = FALSE;
 static gint num_of_samples = 1;
+static gchar *delimiters = " :&~?![]=,;()'\"";
 
 static gint
 pdbtool_patternize(int argc, char *argv[])
@@ -825,11 +826,21 @@ pdbtool_patternize(int argc, char *argv[])
   guint iterate = PTZ_ITERATE_NONE;
   gint i;
   GError *error = NULL;
+  GString *delimcheck = g_string_new(" "); /* delims should always include a space */
 
   if (iterate_outliers)
     iterate = PTZ_ITERATE_OUTLIERS;
 
-  if (!(ptz = ptz_new(support_treshold, PTZ_ALGO_SLCT, iterate, num_of_samples)))
+  /* make sure that every character is unique in the delimiter list */
+  for (i = 0; delimiters[i]; i++)
+    {
+      if (strchr(delimcheck->str, delimiters[i]) == NULL)
+        g_string_append_c(delimcheck, delimiters[i]);
+    }
+  delimiters = g_strdup(delimcheck->str);
+  g_string_free(delimcheck, TRUE);
+
+  if (!(ptz = ptz_new(support_treshold, PTZ_ALGO_SLCT, iterate, num_of_samples, delimiters)))
     {
       return 1;
     }
@@ -846,7 +857,7 @@ pdbtool_patternize(int argc, char *argv[])
     }
 
   clusters = ptz_find_clusters(ptz);
-  ptz_print_patterndb(clusters, named_parsers);
+  ptz_print_patterndb(clusters, delimiters, named_parsers);
   g_hash_table_destroy(clusters);
 
  exit:
@@ -865,6 +876,8 @@ static GOptionEntry patternize_options[] =
     "Recursively iterate on the log lines that do not make it into a cluster in the previous step", NULL},
   { "named-parsers",    'n', 0, G_OPTION_ARG_NONE, &named_parsers,
       "Give the parsers a name in the patterns, eg.: .dict.string1, .dict.string2... (default: no)", NULL},
+  { "delimiters",       'd', 0, G_OPTION_ARG_STRING, &delimiters,
+    "Set of characters based on which the log messages are tokenized, defaults to :&~?![]=,;()'\"", "<delimiters>" },
   { "samples",           0, 0, G_OPTION_ARG_INT, &num_of_samples,
     "Number of example lines to add for the patterns (default: 1)", "<samples>" },
   { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL }
diff --git a/modules/dbparser/tests/test_patternize.c b/modules/dbparser/tests/test_patternize.c
index adf7d1c..7a7724a 100644
--- a/modules/dbparser/tests/test_patternize.c
+++ b/modules/dbparser/tests/test_patternize.c
@@ -69,6 +69,7 @@ testcase_frequent_words(gchar* logs, guint support, gchar *expected)
   gchar **expecteds;
   GHashTable *wordlist;
   loglinesType *logmessages;
+  gchar *delimiters = " :&~?![]=,;()'\"";
 
   logmessages = testcase_get_logmessages(logs);
 
@@ -76,7 +77,7 @@ testcase_frequent_words(gchar* logs, guint support, gchar *expected)
 
   for (twopass = 1; twopass <= 2; ++twopass)
     {
-      wordlist = ptz_find_frequent_words(logmessages->logmessages, support, twopass == 1);
+      wordlist = ptz_find_frequent_words(logmessages->logmessages, support, delimiters, twopass == 1);
 
       for (i = 0; expecteds[i]; ++i)
         {
@@ -280,10 +281,11 @@ testcase_find_clusters_slct(gchar* logs, guint support, gchar *expected)
   clusterfindData *find_data;
   GHashTable *clusters;
   Cluster *test_cluster;
+  gchar *delimiters = " :&~?![]=,;()'\"";
 
   logmessages = testcase_get_logmessages(logs);
 
-  clusters = ptz_find_clusters_slct(logmessages->logmessages, support, 0);
+  clusters = ptz_find_clusters_slct(logmessages->logmessages, support, delimiters, 0);
 
   expecteds = g_strsplit(expected, "|", 0);
   for (i = 0; expecteds[i]; ++i)
-- 
1.7.0.4


--------------030108010004080904070102--