[syslog-ng] syslog-ng suddenly stops logging
Richard Vigeant
richard.vigeant at vantrix.com
Thu Jun 19 21:54:22 CEST 2008
Hi,
I have a configuration where several nodes send all log messages to a
central server. The
applications on remote nodes send their logs locally either via UDP or
a unix socket. The
syslog-ng running on remote nodes simply pick up all log messages from
all sources, i.e. TCP, UDP,
/proc/kmsg, /dev/log and internal, and transmit all messages to the
central server uisng TCP. The
remote node's config file follows.
We've been having intermittent problems where the central server would
suddenly stop logging messages
from certain nodes. We noticed that very often restarting syslog-ng on
the central server would fix
the condition and logging would carry on.
Howver I discovered a new rare case where restarting the central
syslog-ng didn't work. I found out
by doing a tcpdump that the remote syslog-ng was not sending the log
messages. I have done an strace
on the remote syslog-ng and it shows that nothing happens after a
message has been "recvfrom()" or
"read()". Then I have restarted syslog-ng and things went back to
normal. In the 2nd strace we can see
that there is a "write()" after the "read()".
I'm using syslog-ng 2.0.9.
*********************
Remote node's Config file:
*********************
options {
chain_hostnames(no);
log_msg_size(8192);
log_fifo_size(50000);
log_iw_size(250);
log_fetch_limit(25);
flush_lines(10);
flush_timeout(10);
};
source local {
pipe("/proc/kmsg");
unix-stream("/dev/log" max-connections(50));
internal();
};
source network {
udp(localport(514));
tcp(localport(514));
};
destination spotxdeoam {
tcp("spotxdeoam" destport(514));
};
destination local {
tcp("localhost" destport(514));
};
# To change the local log path for modules, change the following
statement.
destination local_Modules {
file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/$PROGRAM/
messages"
create_dirs(yes)
template("$FULLDATE, $HOST, $MSG\n")
template_escape(no));
};
# To change the local log path for node, change the following statement.
destination local_Nodes {
file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/messages"
create_dirs(yes)
template("$FULLDATE, $HOST, $MSG\n")
template_escape(no));
};
filter Modules {
program("^OAM") or
program("^TRX") or
program("^VA1") or
[root at c15 Rich]# more /opt/spotxde/software/etc/syslog_ng/syslog-
ng_trx.conf
options {
chain_hostnames(no);
log_msg_size(8192);
log_fifo_size(50000);
log_iw_size(250);
log_fetch_limit(25);
flush_lines(10);
flush_timeout(10);
};
source local {
pipe("/proc/kmsg");
unix-stream("/dev/log" max-connections(50));
internal();
};
source network {
udp(localport(514));
tcp(localport(514));
};
destination spotxdeoam {
tcp("spotxdeoam" destport(514));
};
destination local {
tcp("localhost" destport(514));
};
# To change the local log path for modules, change the following
statement.
destination local_Modules {
file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/$PROGRAM/
messages"
create_dirs(yes)
template("$FULLDATE, $HOST, $MSG\n")
template_escape(no));
};
# To change the local log path for node, change the following statement.
destination local_Nodes {
file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/messages"
create_dirs(yes)
template("$FULLDATE, $HOST, $MSG\n")
template_escape(no));
};
filter Modules {
program("^OAM") or
program("^TRX") or
program("^VA1") or
program("^VA2") or
program("^VA3") or
program("^VA4") or
program("^MM1") or
program("^MM3") or
program("^MM4") or
program("^MM7") or
program("^WAP") or
program("^STI") or
program("^HCA") or
program("^RPT") or
program("^PRF") or
program("^SVW") or
program("^MNG") or
program("^CDR") or
program("^EAIF") or
program("^TEST") or
program("^MMSC") or
program("^SSMA") or
program("^SMTP") or
program("^ICAP") or
program("^BIDA") or
program("^SMPP") or
program("^LTRX") or
program("^MCDN") or
program("^RMSS") or
program("^EPGS");
};
log {
source(local);
source(network);
filter(Modules);
destination(spotxdeoam);
# Remove the comment on the next statement to log locally on the TRX
nodes the Modules related logs
# destination(local_Modules);
flags(final);
};
log {
source(local);
source(network);
destination(spotxdeoam);
# Remove the comment on the next statement to log locally on the TRX
nodes the Node relatedlogs
# destination(local_Nodes);
flags(fallback);
};
*********************
*********************
Following is an strace of syslog-ng that doesn't work.
There were 2 attemps in this one: one using UDP and on using Unix
socket.
*********************
[root at c15 Rich]# strace -p 8259
Process 8259 attached - interrupt to quit
gettimeofday({1213891616, 539195}, NULL) = 0
gettimeofday({1213891616, 539246}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
22650) = 0
gettimeofday({1213891639, 190435}, NULL) = 0
time(NULL) = 1213891639
time(NULL) = 1213891639
gettimeofday({1213891639, 190507}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891669, 191715}, NULL) = 0
time(NULL) = 1213891669
time(NULL) = 1213891669
gettimeofday({1213891669, 191778}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN,
revents=POLLIN}], 8, 30000) = 1
gettimeofday({1213891676, 326051}, NULL) = 0
recvfrom(3, "<30>Jun 19 12:07:56 RMSS: /syslo"..., 8192, 0,
{sa_family=AF_INET, sin_port=htons(39166),
sin_addr=inet_addr("127.0.0.1")}, [16]) = 100
gettimeofday({1213891676, 326294}, NULL) = 0
time(NULL) = 1213891676
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
time(NULL) = 1213891676
gettimeofday({1213891676, 326621}, NULL) = 0
recvfrom(3, 0x8131ef8, 8192, 0, 0xffffd560, 0xffffd55c) = -1 EAGAIN
(Resource temporarily unavailable)
gettimeofday({1213891676, 326794}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
22865) = 0
gettimeofday({1213891699, 193074}, NULL) = 0
time(NULL) = 1213891699
time(NULL) = 1213891699
gettimeofday({1213891699, 193277}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891729, 195368}, NULL) = 0
time(NULL) = 1213891729
time(NULL) = 1213891729
gettimeofday({1213891729, 195569}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891759, 196643}, NULL) = 0
time(NULL) = 1213891759
time(NULL) = 1213891759
gettimeofday({1213891759, 196842}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891789, 197931}, NULL) = 0
time(NULL) = 1213891789
time(NULL) = 1213891789
gettimeofday({1213891789, 198129}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891819, 199219}, NULL) = 0
time(NULL) = 1213891819
time(NULL) = 1213891819
gettimeofday({1213891819, 199421}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891849, 200510}, NULL) = 0
time(NULL) = 1213891849
time(NULL) = 1213891849
gettimeofday({1213891849, 200707}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=6, events=POLLIN, revents=POLLIN}, {fd=3,
events=POLLIN}], 8, 30000) = 1
gettimeofday({1213891867, 753947}, NULL) = 0
accept(6, {sa_family=AF_FILE,
path
=
"??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"}, [2
]) = 11
setsockopt(11, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0
fcntl64(11, F_GETFL) = 0x2 (flags O_RDWR)
fcntl64(11, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl64(11, F_GETFD) = 0
fcntl64(11, F_SETFD, FD_CLOEXEC) = 0
brk(0) = 0x8ae1000
brk(0x8b02000) = 0x8b02000
gettimeofday({1213891867, 754658}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=3, events=POLLIN}, {fd=11, events=POLLIN,
revents=POLLIN|POLLHUP}, {fd=6, events=POLLIN}], 9, 11446) = 1
gettimeofday({1213891867, 754830}, NULL) = 0
read(11, "<30>Jun 19 12:11:07 RMSS: /syslo"..., 8192) = 100
gettimeofday({1213891867, 755001}, NULL) = 0
time(NULL) = 1213891867
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
gettimeofday({1213891867, 755271}, NULL) = 0
read(11, "", 8192) = 0
close(11) = 0
gettimeofday({1213891867, 755520}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8,
11445) = 0
gettimeofday({1213891879, 201791}, NULL) = 0
time(NULL) = 1213891879
time(NULL) = 1213891879
gettimeofday({1213891879, 201987}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891909, 203098}, NULL) = 0
time(NULL) = 1213891909
time(NULL) = 1213891909
gettimeofday({1213891909, 203299}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9,
events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8,
30000) = 0
gettimeofday({1213891939, 205377}, NULL) = 0
time(NULL) = 1213891939
time(NULL) = 1213891939
gettimeofday({1213891939, 205580}, NULL) = 0
poll(
*******************
And now the strace sequence when it works:
We can see that after the read there is a write to transmit the
message to the central log server.
*******************
[root at c15 Rich]# strace -p 15310
Process 15310 attached - interrupt to quit
gettimeofday({1213900598, 12321}, NULL) = 0
gettimeofday({1213900598, 12361}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6,
events=POLLIN}, {fd=3, events=POLLIN}], 7, 2069) = 0
gettimeofday({1213900600, 83036}, NULL) = 0
time(NULL) = 1213900600
time(NULL) = 1213900600
gettimeofday({1213900600, 83101}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6,
events=POLLIN, revents=POLLIN}, {fd=3, events=POLLIN}], 7, 30000) = 1
gettimeofday({1213900610, 338336}, NULL) = 0
accept(6, {sa_family=AF_FILE, path=@}, [2]) = 10
setsockopt(10, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0
fcntl64(10, F_GETFL) = 0x2 (flags O_RDWR)
fcntl64(10, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl64(10, F_GETFD) = 0
fcntl64(10, F_SETFD, FD_CLOEXEC) = 0
gettimeofday({1213900610, 338937}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=3,
events=POLLIN}, {fd=10, events=POLLIN, revents=POLLIN|POLLHUP}, {fd=6,
events=POLLIN}], 8, 19745) = 1
gettimeofday({1213900610, 339123}, NULL) = 0
read(10, "<30>Jun 19 14:36:50 RMSS: /syslo"..., 8192) = 100
gettimeofday({1213900610, 339320}, NULL) = 0
time(NULL) = 1213900610
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
gettimeofday({1213900610, 339627}, NULL) = 0
read(10, "", 8192) = 0
close(10) = 0
gettimeofday({1213900610, 339888}, NULL) = 0
poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5,
events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=3,
events=POLLIN}, {fd=6, events=POLLIN}], 7, 10) = 0
gettimeofday({1213900610, 351199}, NULL) = 0
stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0
write(8, "<30>Jun 19 14:36:50 c15 RMSS: /s"..., 103) = 103
gettimeofday({1213900610, 351494}, NULL) = 0
poll( <unfinished ...>
Process 15310 detached
Richard Vigeant
www.vantrix.com
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.balabit.hu/pipermail/syslog-ng/attachments/20080619/fe4234dd/attachment-0001.htm
More information about the syslog-ng
mailing list