[syslog-ng] syslog-ng suddenly stops logging

Richard Vigeant richard.vigeant at vantrix.com
Thu Jun 19 21:54:22 CEST 2008


Hi,

I have a configuration where several nodes send all log messages to a  
central server. The
applications on remote nodes send their logs locally either via UDP or  
a unix socket. The
syslog-ng running on remote nodes simply pick up all log messages from  
all sources, i.e. TCP, UDP,
/proc/kmsg, /dev/log and internal, and transmit all messages to the  
central server uisng TCP. The
remote node's config file follows.

We've been having intermittent problems where the central server would  
suddenly stop logging messages
from certain nodes. We noticed that very often restarting syslog-ng on  
the central server would fix
the condition and logging would carry on.

Howver I discovered a new rare case where restarting the central  
syslog-ng didn't work. I found out
by doing a tcpdump that the remote syslog-ng was not sending the log  
messages. I have done an strace
on the remote syslog-ng and it shows that nothing happens after a  
message has been "recvfrom()" or
"read()". Then I have restarted syslog-ng and things went back to  
normal. In the 2nd strace we can see
that there is a "write()" after the "read()".

I'm using syslog-ng 2.0.9.

*********************
Remote node's Config file:
*********************
options {
     chain_hostnames(no);
     log_msg_size(8192);
     log_fifo_size(50000);
     log_iw_size(250);
     log_fetch_limit(25);
     flush_lines(10);
     flush_timeout(10);
};

source local {
     pipe("/proc/kmsg");
     unix-stream("/dev/log" max-connections(50));
     internal();
};

source network {
     udp(localport(514));
     tcp(localport(514));
};

destination spotxdeoam {
     tcp("spotxdeoam" destport(514));
};

destination local {
     tcp("localhost" destport(514));
};

# To change the local log path for modules, change the following  
statement.
destination local_Modules {
     file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/$PROGRAM/ 
messages"
          create_dirs(yes)
          template("$FULLDATE, $HOST, $MSG\n")
          template_escape(no));
};

# To change the local log path for node, change the following statement.
destination local_Nodes {
     file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/messages"
          create_dirs(yes)
          template("$FULLDATE, $HOST, $MSG\n")
          template_escape(no));
};

filter Modules {
     program("^OAM")  or
     program("^TRX")  or
     program("^VA1")  or
[root at c15 Rich]# more /opt/spotxde/software/etc/syslog_ng/syslog- 
ng_trx.conf
options {
     chain_hostnames(no);
     log_msg_size(8192);
     log_fifo_size(50000);
     log_iw_size(250);
     log_fetch_limit(25);
     flush_lines(10);
     flush_timeout(10);
};

source local {
     pipe("/proc/kmsg");
     unix-stream("/dev/log" max-connections(50));
     internal();
};

source network {
     udp(localport(514));
     tcp(localport(514));
};

destination spotxdeoam {
     tcp("spotxdeoam" destport(514));
};

destination local {
     tcp("localhost" destport(514));
};

# To change the local log path for modules, change the following  
statement.
destination local_Modules {
     file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/$PROGRAM/ 
messages"
          create_dirs(yes)
          template("$FULLDATE, $HOST, $MSG\n")
          template_escape(no));
};

# To change the local log path for node, change the following statement.
destination local_Nodes {
     file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/messages"
          create_dirs(yes)
          template("$FULLDATE, $HOST, $MSG\n")
          template_escape(no));
};

filter Modules {
     program("^OAM")  or
     program("^TRX")  or
     program("^VA1")  or
     program("^VA2")  or
     program("^VA3")  or
     program("^VA4")  or
     program("^MM1")  or
     program("^MM3")  or
     program("^MM4")  or
     program("^MM7")  or
     program("^WAP")  or
     program("^STI")  or
     program("^HCA")  or
     program("^RPT")  or
     program("^PRF")  or
     program("^SVW")  or
     program("^MNG")  or
     program("^CDR")  or
     program("^EAIF") or
     program("^TEST") or
     program("^MMSC") or
     program("^SSMA") or
     program("^SMTP") or
     program("^ICAP") or
     program("^BIDA") or
     program("^SMPP") or
     program("^LTRX") or
     program("^MCDN") or
     program("^RMSS") or
     program("^EPGS");
};

log {
     source(local);
     source(network);
     filter(Modules);
     destination(spotxdeoam);
# Remove the comment on the next statement to log locally on the TRX  
nodes the Modules related logs
#    destination(local_Modules);
     flags(final);
};

log {
     source(local);
     source(network);
     destination(spotxdeoam);
# Remove the comment on the next statement to log locally on the TRX  
nodes the Node relatedlogs
#    destination(local_Nodes);
     flags(fallback);
};

*********************

*********************
Following is an strace of syslog-ng that doesn't work.
There were 2 attemps in this one: one using UDP and on using Unix  
socket.
*********************

[root at c15 Rich]# strace -p 8259
Process 8259 attached - interrupt to quit
gettimeofday({1213891616, 539195}, NULL) = 0
gettimeofday({1213891616, 539246}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
22650) = 0
gettimeofday({1213891639, 190435}, NULL) = 0
time(NULL)                              = 1213891639
time(NULL)                              = 1213891639
gettimeofday({1213891639, 190507}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891669, 191715}, NULL) = 0
time(NULL)                              = 1213891669
time(NULL)                              = 1213891669
gettimeofday({1213891669, 191778}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN,  
revents=POLLIN}], 8, 30000) = 1
gettimeofday({1213891676, 326051}, NULL) = 0
recvfrom(3, "<30>Jun 19 12:07:56 RMSS: /syslo"..., 8192, 0,  
{sa_family=AF_INET, sin_port=htons(39166),  
sin_addr=inet_addr("127.0.0.1")}, [16]) = 100
gettimeofday({1213891676, 326294}, NULL) = 0
time(NULL)                              = 1213891676
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
time(NULL)                              = 1213891676
gettimeofday({1213891676, 326621}, NULL) = 0
recvfrom(3, 0x8131ef8, 8192, 0, 0xffffd560, 0xffffd55c) = -1 EAGAIN  
(Resource temporarily unavailable)
gettimeofday({1213891676, 326794}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
22865) = 0
gettimeofday({1213891699, 193074}, NULL) = 0
time(NULL)                              = 1213891699
time(NULL)                              = 1213891699
gettimeofday({1213891699, 193277}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891729, 195368}, NULL) = 0
time(NULL)                              = 1213891729
time(NULL)                              = 1213891729
gettimeofday({1213891729, 195569}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891759, 196643}, NULL) = 0
time(NULL)                              = 1213891759
time(NULL)                              = 1213891759
gettimeofday({1213891759, 196842}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891789, 197931}, NULL) = 0
time(NULL)                              = 1213891789
time(NULL)                              = 1213891789
gettimeofday({1213891789, 198129}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891819, 199219}, NULL) = 0
time(NULL)                              = 1213891819
time(NULL)                              = 1213891819
gettimeofday({1213891819, 199421}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891849, 200510}, NULL) = 0
time(NULL)                              = 1213891849
time(NULL)                              = 1213891849
gettimeofday({1213891849, 200707}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=6, events=POLLIN, revents=POLLIN}, {fd=3,  
events=POLLIN}], 8, 30000) = 1
gettimeofday({1213891867, 753947}, NULL) = 0
accept(6, {sa_family=AF_FILE,  
path 
= 
"??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"}, [2 
]) = 11
setsockopt(11, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0
fcntl64(11, F_GETFL)                    = 0x2 (flags O_RDWR)
fcntl64(11, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl64(11, F_GETFD)                    = 0
fcntl64(11, F_SETFD, FD_CLOEXEC)        = 0
brk(0)                                  = 0x8ae1000
brk(0x8b02000)                          = 0x8b02000
gettimeofday({1213891867, 754658}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=3, events=POLLIN}, {fd=11, events=POLLIN,  
revents=POLLIN|POLLHUP}, {fd=6, events=POLLIN}], 9, 11446) = 1
gettimeofday({1213891867, 754830}, NULL) = 0
read(11, "<30>Jun 19 12:11:07 RMSS: /syslo"..., 8192) = 100
gettimeofday({1213891867, 755001}, NULL) = 0
time(NULL)                              = 1213891867
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
gettimeofday({1213891867, 755271}, NULL) = 0
read(11, "", 8192)                      = 0
close(11)                               = 0
gettimeofday({1213891867, 755520}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8,  
11445) = 0
gettimeofday({1213891879, 201791}, NULL) = 0
time(NULL)                              = 1213891879
time(NULL)                              = 1213891879
gettimeofday({1213891879, 201987}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891909, 203098}, NULL) = 0
time(NULL)                              = 1213891909
time(NULL)                              = 1213891909
gettimeofday({1213891909, 203299}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,  
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8,  
30000) = 0
gettimeofday({1213891939, 205377}, NULL) = 0
time(NULL)                              = 1213891939
time(NULL)                              = 1213891939
gettimeofday({1213891939, 205580}, NULL) = 0
poll(


*******************
And now the strace sequence when it works:
We can see that after the read there is a write to transmit the  
message to the central log server.
*******************

[root at c15 Rich]# strace -p 15310
Process 15310 attached - interrupt to quit
gettimeofday({1213900598, 12321}, NULL) = 0
gettimeofday({1213900598, 12361}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6,  
events=POLLIN}, {fd=3, events=POLLIN}], 7, 2069) = 0
gettimeofday({1213900600, 83036}, NULL) = 0
time(NULL)                              = 1213900600
time(NULL)                              = 1213900600
gettimeofday({1213900600, 83101}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6,  
events=POLLIN, revents=POLLIN}, {fd=3, events=POLLIN}], 7, 30000) = 1
gettimeofday({1213900610, 338336}, NULL) = 0
accept(6, {sa_family=AF_FILE, path=@}, [2]) = 10
setsockopt(10, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0
fcntl64(10, F_GETFL)                    = 0x2 (flags O_RDWR)
fcntl64(10, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl64(10, F_GETFD)                    = 0
fcntl64(10, F_SETFD, FD_CLOEXEC)        = 0
gettimeofday({1213900610, 338937}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=3,  
events=POLLIN}, {fd=10, events=POLLIN, revents=POLLIN|POLLHUP}, {fd=6,  
events=POLLIN}], 8, 19745) = 1
gettimeofday({1213900610, 339123}, NULL) = 0
read(10, "<30>Jun 19 14:36:50 RMSS: /syslo"..., 8192) = 100
gettimeofday({1213900610, 339320}, NULL) = 0
time(NULL)                              = 1213900610
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
gettimeofday({1213900610, 339627}, NULL) = 0
read(10, "", 8192)                      = 0
close(10)                               = 0
gettimeofday({1213900610, 339888}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,  
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=3,  
events=POLLIN}, {fd=6, events=POLLIN}], 7, 10) = 0
gettimeofday({1213900610, 351199}, NULL) = 0
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
write(8, "<30>Jun 19 14:36:50 c15 RMSS: /s"..., 103) = 103
gettimeofday({1213900610, 351494}, NULL) = 0
poll( <unfinished ...>
Process 15310 detached

Richard Vigeant
www.vantrix.com



-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.balabit.hu/pipermail/syslog-ng/attachments/20080619/fe4234dd/attachment-0001.htm 


More information about the syslog-ng mailing list