[PATCH] [patternize] implemented support for custom delimiters for tokenization and changed default delimiter set
Balint Kovacs
blint at balabit.hu
Mon Feb 7 14:30:42 CET 2011
Signed-off-by: Balint Kovacs <blint at balabit.hu>
---
modules/dbparser/patternize.c | 117 +++++++++++++++++++++---------
modules/dbparser/patternize.h | 10 ++-
modules/dbparser/pdbtool.c | 17 ++++-
modules/dbparser/tests/test_patternize.c | 6 +-
4 files changed, 107 insertions(+), 43 deletions(-)
diff --git a/modules/dbparser/patternize.c b/modules/dbparser/patternize.c
index 1a433c2..8903bd9 100644
--- a/modules/dbparser/patternize.c
+++ b/modules/dbparser/patternize.c
@@ -40,7 +40,7 @@
#define PTZ_MAXLINELEN 10240
#define PTZ_MAXWORDS 512 /* maximum number of words in one line */
#define PTZ_LOGTABLE_ALLOC_BASE 3000
-#define PTZ_WORDLIST_CACHE 3 // FIXME: make this a commandline parameter?
+#define PTZ_WORDLIST_CACHE 3 /* FIXME: make this a commandline parameter? */
static LogTagId cluster_tag_id;
@@ -85,7 +85,7 @@ static void _ptz_debug_print_word(gpointer key, gpointer value, gpointer dummy)
static void _ptz_debug_print_cluster(gpointer key, gpointer value, gpointer dummy)
{
- fprintf(stderr, "%s: %d\n", (gchar*) key, ((Cluster *) value)->support);
+ fprintf(stderr, "%s: %s\n", (gchar*) key, ((Cluster *) value)->words[0]);
}
#endif
@@ -103,6 +103,28 @@ ptz_str2hash(gchar *string, guint modulo, guint seed)
return seed % modulo;
}
+/* Collect, in order of appearance, every character of str that is a member
+ * of delimdef.  Returns a newly allocated string which the caller must
+ * release with g_free().  The scan stops at the terminating NUL, so a str
+ * that does not end in a delimiter is never read past its end. */
+gchar *
+ptz_find_delimiters(gchar *str, gchar *delimdef)
+{
+  gchar *remainder = str;
+  GString *delimiters = g_string_sized_new(32);
+
+  while (remainder[0] != 0)
+    {
+      /* skip the current token: lands on a delimiter or on the terminator */
+      remainder += strcspn(remainder, delimdef);
+      if (remainder[0] == 0)
+        break;
+      g_string_append_c(delimiters, remainder[0]);
+      remainder++;
+    }
+  return g_string_free(delimiters, FALSE);
+}
+
gboolean
ptz_find_frequent_words_remove_key_predicate(gpointer key, gpointer value, gpointer support)
{
@@ -110,7 +132,7 @@ ptz_find_frequent_words_remove_key_predicate(gpointer key, gpointer value, gpoin
}
GHashTable *
-ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass)
+ptz_find_frequent_words(GPtrArray *logs, guint support, gchar *delimiters, gboolean two_pass)
{
int i, j, pass;
guint *curr_count;
@@ -148,14 +170,14 @@ ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass)
{
msg = (LogMessage *) g_ptr_array_index(logs, i);
msgstr = (gchar *) log_msg_get_value(msg, LM_V_MESSAGE, &msglen);
- /* NOTE: we should split on more than a simple space... */
- words = g_strsplit(msgstr, " ", PTZ_MAXWORDS);
+
+ words = g_strsplit_set(msgstr, delimiters, PTZ_MAXWORDS);
for (j = 0; words[j]; ++j)
{
/* NOTE: to calculate the key for the hash, we prefix a word with
- * its position in the row and a space -- as we split at spaces,
- * this should not create confusion
+ * its position in the row and a space -- as we always split at
+ * spaces, this should not create confusion
*/
hash_key = g_strdup_printf("%d %s", j, words[j]);
@@ -191,7 +213,7 @@ ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass)
g_strfreev(words);
}
- //g_hash_table_foreach(wordlist, _ptz_debug_print_word, NULL);
+ /* g_hash_table_foreach(wordlist, _ptz_debug_print_word, NULL); */
g_hash_table_foreach_remove(wordlist, ptz_find_frequent_words_remove_key_predicate, GUINT_TO_POINTER(support));
}
@@ -216,7 +238,7 @@ ptz_find_clusters_remove_cluster_predicate(gpointer key, gpointer value, gpointe
ret = (val->loglines->len < support);
if (ret)
{
- // remove cluster reference from the relevant logs
+ /* remove cluster reference from the relevant logs */
for (i = 0; i < val->loglines->len; ++i)
{
msg = (LogMessage *) g_ptr_array_index(val->loglines, i);
@@ -246,7 +268,7 @@ cluster_free(Cluster *cluster)
}
GHashTable *
-ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
+ptz_find_clusters_slct(GPtrArray *logs, guint support, gchar *delimiters, guint num_of_samples)
{
GHashTable *wordlist;
GHashTable *clusters;
@@ -259,10 +281,11 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
gboolean is_candidate;
Cluster *cluster;
GString *cluster_key;
+ gchar * msgdelimiters;
/* get the frequent word list */
- wordlist = ptz_find_frequent_words(logs, support, TRUE);
-// g_hash_table_foreach(wordlist, _debug_print, NULL);
+ wordlist = ptz_find_frequent_words(logs, support, delimiters, TRUE);
+ /* g_hash_table_foreach(wordlist, _ptz_debug_print_word, NULL); */
/* find the cluster candidates */
clusters = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, (GDestroyNotify) cluster_free);
@@ -274,8 +297,8 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
g_string_truncate(cluster_key, 0);
- /* NOTE: we should split on more than a simple space... */
- words = g_strsplit(msgstr, " ", PTZ_MAXWORDS);
+ words = g_strsplit_set(msgstr, delimiters, PTZ_MAXWORDS);
+ msgdelimiters = ptz_find_delimiters(msgstr, delimiters);
is_candidate = FALSE;
for (j = 0; words[j]; ++j)
@@ -290,12 +313,19 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
}
else
{
- g_string_append_printf(cluster_key, "%d *%c", j, PTZ_SEPARATOR_CHAR);
+ g_string_append_printf(cluster_key, "%d %c%c", j, PTZ_PARSER_MARKER_CHAR, PTZ_SEPARATOR_CHAR);
}
g_free(hash_key);
}
+ /* append the delimiters of the message to the cluster key to assure unicity
+ * otherwise the same words with different delimiters would still show as the
+ * same cluster
+ */
+ g_string_append_printf(cluster_key, "%s%c", msgdelimiters, PTZ_SEPARATOR_CHAR);
+ g_free(msgdelimiters);
+
if (is_candidate)
{
cluster = (Cluster*) g_hash_table_lookup(clusters, cluster_key->str);
@@ -331,7 +361,7 @@ ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples)
g_hash_table_foreach_remove(clusters, ptz_find_clusters_remove_cluster_predicate, GUINT_TO_POINTER(support));
-// g_hash_table_foreach(clusters, _ptz_debug_print_cluster, NULL);
+ /* g_hash_table_foreach(clusters, _ptz_debug_print_cluster, NULL); */
g_hash_table_unref(wordlist);
g_string_free(cluster_key, TRUE);
@@ -356,7 +386,7 @@ ptz_find_clusters_step(Patternizer *self, GPtrArray *logs, guint support, guint
{
msg_progress("Searching clusters", evt_tag_int("input lines", logs->len), NULL);
if (self->algo == PTZ_ALGO_SLCT)
- return ptz_find_clusters_slct(logs, support, num_of_samples);
+ return ptz_find_clusters_slct(logs, support, self->delimiters, num_of_samples);
else
{
msg_error("Unknown clustering algorithm", evt_tag_int("algo_id", self->algo));
@@ -450,6 +480,11 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
guint parser_counts[PTZ_NUM_OF_PARSERS];
int i;
Cluster *cluster;
+ GString *pattern = g_string_new("");
+ guint wordcount;
+ gchar *delimiters;
+
+ cluster = (Cluster *) value;
if (named_parsers)
{
@@ -459,8 +494,8 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
uuid_gen_random(uuid_string, sizeof(uuid_string));
- printf(" <rule id='%s'>\n", uuid_string);
- printf(" <!-- support: %d -->\n", ((Cluster *) value)->loglines->len);
+ printf(" <rule id='%s' class='system' provider='patternize'>\n", uuid_string);
+ printf(" <!-- support: %d -->\n", cluster->loglines->len);
printf(" <patterns>\n");
printf(" <pattern>");
@@ -474,30 +509,45 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
splitstr = g_strdup_printf("%c", PTZ_SEPARATOR_CHAR);
words = g_strsplit(skey, splitstr, 0);
g_free(splitstr);
+
+ /* pop the delimiters from the cluster key */
+ wordcount = g_strv_length(words);
+ delimiters = words[wordcount-1];
+ words[wordcount-1] = 0;
+
for (i = 0; words[i]; ++i)
{
- gchar **word_parts;
+ g_string_truncate(pattern, 0);
+ gchar **word_parts;
word_parts = g_strsplit(words[i], " ", 2);
- if (word_parts[1][0] == '*')
+ if (word_parts[1][0] == PTZ_PARSER_MARKER_CHAR)
{
/* NOTE: nasty workaround: do not display last ESTRING as syslog-ng won't handle that well... */
/* FIXME: enter a simple @STRING@ here instead... */
if (words[i + 1])
{
- printf("@ESTRING:");
+ g_string_append(pattern, "@ESTRING:");
if (named_parsers)
{
- // TODO: do not hardcode ESTRING here...
- printf(".dict.string%d", parser_counts[PTZ_PARSER_ESTRING]++);
+ /* TODO: do not hardcode ESTRING here... */
+ g_string_append_printf(pattern, ".dict.string%d", parser_counts[PTZ_PARSER_ESTRING]++);
}
- printf(": @");
+ g_string_append_printf(pattern, ":%c@", delimiters[i]);
+ escapedstr = g_markup_escape_text(pattern->str, -1);
+ printf("%s", escapedstr);
+ g_free(escapedstr);
}
}
else
{
- escapedstr = g_markup_escape_text(word_parts[1], -1);
+ g_string_append(pattern, word_parts[1]);
+
+ if (words[i + 1])
+ g_string_append_printf(pattern, "%c", delimiters[i]);
+
+ escapedstr = g_markup_escape_text(pattern->str, -1);
if (g_strrstr(escapedstr, "@"))
{
escapedparts = g_strsplit(escapedstr, "@", -1);
@@ -505,23 +555,19 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
escapedstr = g_strjoinv("@@", escapedparts);
g_strfreev(escapedparts);
}
-
printf("%s", escapedstr);
g_free(escapedstr);
- if (words[i + 1])
- printf(" ");
}
-
g_strfreev(word_parts);
}
g_free(skey);
g_strfreev(words);
+ g_string_free(pattern, TRUE);
printf("</pattern>\n");
printf(" </patterns>\n");
- cluster = (Cluster *) value;
if (cluster->samples->len > 0)
{
printf(" <examples>\n");
@@ -541,12 +587,12 @@ ptz_print_patterndb_rule(gpointer key, gpointer value, gpointer user_data)
}
void
-ptz_print_patterndb(GHashTable *clusters, gboolean named_parsers)
+ptz_print_patterndb(GHashTable *clusters, gchar *delimiters, gboolean named_parsers)
{
char date[12], uuid_string[37];
time_t currtime;
- // print the header
+ /* print the header */
time(&currtime);
strftime(date, 12, "%Y-%m-%d", localtime(&currtime));
printf("<patterndb version='3' pub_date='%s'>\n", date);
@@ -555,7 +601,7 @@ ptz_print_patterndb(GHashTable *clusters, gboolean named_parsers)
printf(" <ruleset name='patternize' id='%s'>\n", uuid_string);
printf(" <rules>\n");
- g_hash_table_foreach(clusters, ptz_print_patterndb_rule, (gpointer) &named_parsers);
+ g_hash_table_foreach(clusters, ptz_print_patterndb_rule, (gpointer *) &named_parsers);
printf(" </rules>\n");
printf(" </ruleset>\n");
@@ -612,7 +658,7 @@ ptz_load_file(Patternizer *self, gchar *input_file, GError **error)
}
Patternizer *
-ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples)
+ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples, gchar *delimiters)
{
Patternizer *self = g_new0(Patternizer, 1);
@@ -621,6 +667,7 @@ ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_sample
self->support_treshold = support_treshold;
self->num_of_samples = num_of_samples;
+ self->delimiters = delimiters;
self->logs = g_ptr_array_sized_new(PTZ_LOGTABLE_ALLOC_BASE);
cluster_tag_id = log_tags_get_by_name(".in_patternize_cluster");
diff --git a/modules/dbparser/patternize.h b/modules/dbparser/patternize.h
index 7fb28ba..2afc2f8 100644
--- a/modules/dbparser/patternize.h
+++ b/modules/dbparser/patternize.h
@@ -32,6 +32,7 @@
#define PTZ_ITERATE_HIEARARCH 2
#define PTZ_SEPARATOR_CHAR 0x1E
+#define PTZ_PARSER_MARKER_CHAR 0x1A
#define PTZ_NUM_OF_PARSERS 1
#define PTZ_PARSER_ESTRING 0
@@ -45,6 +46,7 @@ typedef struct _Patternizer
guint support;
guint num_of_samples;
gdouble support_treshold;
+ gchar *delimiters;
// NOTE: for now, we store all logs read in in the memory.
// This brings in some obvious constraints and should be solved
@@ -61,16 +63,16 @@ typedef struct _Cluster
} Cluster;
/* only declared for the test program */
-GHashTable *ptz_find_frequent_words(GPtrArray *logs, guint support, gboolean two_pass);
-GHashTable *ptz_find_clusters_slct(GPtrArray *logs, guint support, guint num_of_samples);
+GHashTable *ptz_find_frequent_words(GPtrArray *logs, guint support, gchar *delimiters, gboolean two_pass);
+GHashTable *ptz_find_clusters_slct(GPtrArray *logs, guint support, gchar *delimiters, guint num_of_samples);
GHashTable *ptz_find_clusters(Patternizer *self);
-void ptz_print_patterndb(GHashTable *clusters, gboolean named_parsers);
+void ptz_print_patterndb(GHashTable *clusters, gchar *delimiters, gboolean named_parsers);
gboolean ptz_load_file(Patternizer *self, gchar *input_file, GError **error);
-Patternizer *ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples);
+Patternizer *ptz_new(gdouble support_treshold, guint algo, guint iterate, guint num_of_samples, gchar *delimiters);
void ptz_free(Patternizer *self);
#endif
diff --git a/modules/dbparser/pdbtool.c b/modules/dbparser/pdbtool.c
index 83643c5..e632f78 100644
--- a/modules/dbparser/pdbtool.c
+++ b/modules/dbparser/pdbtool.c
@@ -816,6 +816,7 @@ static gdouble support_treshold = 4.0;
static gboolean iterate_outliers = FALSE;
static gboolean named_parsers = FALSE;
static gint num_of_samples = 1;
+static gchar *delimiters = " :&~?![]=,;()'\"";
static gint
pdbtool_patternize(int argc, char *argv[])
@@ -825,11 +826,21 @@ pdbtool_patternize(int argc, char *argv[])
guint iterate = PTZ_ITERATE_NONE;
gint i;
GError *error = NULL;
+ GString *delimcheck = g_string_new(" "); /* delims should always include a space */
if (iterate_outliers)
iterate = PTZ_ITERATE_OUTLIERS;
- if (!(ptz = ptz_new(support_treshold, PTZ_ALGO_SLCT, iterate, num_of_samples)))
+ /* make sure that every character is unique in the delimiter list */
+ for (i = 0; delimiters[i]; i++)
+ {
+ if (strchr(delimcheck->str, delimiters[i]) == NULL)
+ g_string_append_c(delimcheck, delimiters[i]);
+ }
+ delimiters = g_strdup(delimcheck->str);
+ g_string_free(delimcheck, TRUE);
+
+ if (!(ptz = ptz_new(support_treshold, PTZ_ALGO_SLCT, iterate, num_of_samples, delimiters)))
{
return 1;
}
@@ -846,7 +857,7 @@ pdbtool_patternize(int argc, char *argv[])
}
clusters = ptz_find_clusters(ptz);
- ptz_print_patterndb(clusters, named_parsers);
+ ptz_print_patterndb(clusters, delimiters, named_parsers);
g_hash_table_destroy(clusters);
exit:
@@ -865,6 +876,8 @@ static GOptionEntry patternize_options[] =
"Recursively iterate on the log lines that do not make it into a cluster in the previous step", NULL},
{ "named-parsers", 'n', 0, G_OPTION_ARG_NONE, &named_parsers,
"Give the parsers a name in the patterns, eg.: .dict.string1, .dict.string2... (default: no)", NULL},
+ { "delimiters", 'd', 0, G_OPTION_ARG_STRING, &delimiters,
+ "Set of characters based on which the log messages are tokenized, defaults to :&~?![]=,;()'\"", "<delimiters>" },
{ "samples", 0, 0, G_OPTION_ARG_INT, &num_of_samples,
"Number of example lines to add for the patterns (default: 1)", "<samples>" },
{ NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL }
diff --git a/modules/dbparser/tests/test_patternize.c b/modules/dbparser/tests/test_patternize.c
index adf7d1c..7a7724a 100644
--- a/modules/dbparser/tests/test_patternize.c
+++ b/modules/dbparser/tests/test_patternize.c
@@ -69,6 +69,7 @@ testcase_frequent_words(gchar* logs, guint support, gchar *expected)
gchar **expecteds;
GHashTable *wordlist;
loglinesType *logmessages;
+ gchar *delimiters = " :&~?![]=,;()'\"";
logmessages = testcase_get_logmessages(logs);
@@ -76,7 +77,7 @@ testcase_frequent_words(gchar* logs, guint support, gchar *expected)
for (twopass = 1; twopass <= 2; ++twopass)
{
- wordlist = ptz_find_frequent_words(logmessages->logmessages, support, twopass == 1);
+ wordlist = ptz_find_frequent_words(logmessages->logmessages, support, delimiters, twopass == 1);
for (i = 0; expecteds[i]; ++i)
{
@@ -280,10 +281,11 @@ testcase_find_clusters_slct(gchar* logs, guint support, gchar *expected)
clusterfindData *find_data;
GHashTable *clusters;
Cluster *test_cluster;
+ gchar *delimiters = " :&~?![]=,;()'\"";
logmessages = testcase_get_logmessages(logs);
- clusters = ptz_find_clusters_slct(logmessages->logmessages, support, 0);
+ clusters = ptz_find_clusters_slct(logmessages->logmessages, support, delimiters, 0);
expecteds = g_strsplit(expected, "|", 0);
for (i = 0; expecteds[i]; ++i)
--
1.7.0.4
--------------030108010004080904070102--
More information about the syslog-ng
mailing list