Hi, I have a configuration where several nodes send all log messages to a central server. The applications on remote nodes send their logs locally either via UDP or a unix socket. The syslog-ng running on remote nodes simply pick up all log messages from all sources, i.e. TCP, UDP, /proc/kmsg, /dev/log and internal, and transmit all messages to the central server uisng TCP. The remote node's config file follows. We've been having intermittent problems where the central server would suddenly stop logging messages from certain nodes. We noticed that very often restarting syslog-ng on the central server would fix the condition and logging would carry on. Howver I discovered a new rare case where restarting the central syslog-ng didn't work. I found out by doing a tcpdump that the remote syslog-ng was not sending the log messages. I have done an strace on the remote syslog-ng and it shows that nothing happens after a message has been "recvfrom()" or "read()". Then I have restarted syslog-ng and things went back to normal. In the 2nd strace we can see that there is a "write()" after the "read()". I'm using syslog-ng 2.0.9. ********************* Remote node's Config file: ********************* options { chain_hostnames(no); log_msg_size(8192); log_fifo_size(50000); log_iw_size(250); log_fetch_limit(25); flush_lines(10); flush_timeout(10); }; source local { pipe("/proc/kmsg"); unix-stream("/dev/log" max-connections(50)); internal(); }; source network { udp(localport(514)); tcp(localport(514)); }; destination spotxdeoam { tcp("spotxdeoam" destport(514)); }; destination local { tcp("localhost" destport(514)); }; # To change the local log path for modules, change the following statement. destination local_Modules { file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/$PROGRAM/ messages" create_dirs(yes) template("$FULLDATE, $HOST, $MSG\n") template_escape(no)); }; # To change the local log path for node, change the following statement. destination local_Nodes { file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/messages" create_dirs(yes) template("$FULLDATE, $HOST, $MSG\n") template_escape(no)); }; filter Modules { program("^OAM") or program("^TRX") or program("^VA1") or [root@c15 Rich]# more /opt/spotxde/software/etc/syslog_ng/syslog- ng_trx.conf options { chain_hostnames(no); log_msg_size(8192); log_fifo_size(50000); log_iw_size(250); log_fetch_limit(25); flush_lines(10); flush_timeout(10); }; source local { pipe("/proc/kmsg"); unix-stream("/dev/log" max-connections(50)); internal(); }; source network { udp(localport(514)); tcp(localport(514)); }; destination spotxdeoam { tcp("spotxdeoam" destport(514)); }; destination local { tcp("localhost" destport(514)); }; # To change the local log path for modules, change the following statement. destination local_Modules { file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/$PROGRAM/ messages" create_dirs(yes) template("$FULLDATE, $HOST, $MSG\n") template_escape(no)); }; # To change the local log path for node, change the following statement. destination local_Nodes { file("/opt/spotxde/software/var/SPOTxdeServer/logs/$HOST/messages" create_dirs(yes) template("$FULLDATE, $HOST, $MSG\n") template_escape(no)); }; filter Modules { program("^OAM") or program("^TRX") or program("^VA1") or program("^VA2") or program("^VA3") or program("^VA4") or program("^MM1") or program("^MM3") or program("^MM4") or program("^MM7") or program("^WAP") or program("^STI") or program("^HCA") or program("^RPT") or program("^PRF") or program("^SVW") or program("^MNG") or program("^CDR") or program("^EAIF") or program("^TEST") or program("^MMSC") or program("^SSMA") or program("^SMTP") or program("^ICAP") or program("^BIDA") or program("^SMPP") or program("^LTRX") or program("^MCDN") or program("^RMSS") or program("^EPGS"); }; log { source(local); source(network); filter(Modules); destination(spotxdeoam); # Remove the comment on the next statement to log locally on the TRX nodes the Modules related logs # destination(local_Modules); flags(final); }; log { source(local); source(network); destination(spotxdeoam); # Remove the comment on the next statement to log locally on the TRX nodes the Node relatedlogs # destination(local_Nodes); flags(fallback); }; ********************* ********************* Following is an strace of syslog-ng that doesn't work. There were 2 attemps in this one: one using UDP and on using Unix socket. ********************* [root@c15 Rich]# strace -p 8259 Process 8259 attached - interrupt to quit gettimeofday({1213891616, 539195}, NULL) = 0 gettimeofday({1213891616, 539246}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 22650) = 0 gettimeofday({1213891639, 190435}, NULL) = 0 time(NULL) = 1213891639 time(NULL) = 1213891639 gettimeofday({1213891639, 190507}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891669, 191715}, NULL) = 0 time(NULL) = 1213891669 time(NULL) = 1213891669 gettimeofday({1213891669, 191778}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN, revents=POLLIN}], 8, 30000) = 1 gettimeofday({1213891676, 326051}, NULL) = 0 recvfrom(3, "<30>Jun 19 12:07:56 RMSS: /syslo"..., 8192, 0, {sa_family=AF_INET, sin_port=htons(39166), sin_addr=inet_addr("127.0.0.1")}, [16]) = 100 gettimeofday({1213891676, 326294}, NULL) = 0 time(NULL) = 1213891676 stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0 time(NULL) = 1213891676 gettimeofday({1213891676, 326621}, NULL) = 0 recvfrom(3, 0x8131ef8, 8192, 0, 0xffffd560, 0xffffd55c) = -1 EAGAIN (Resource temporarily unavailable) gettimeofday({1213891676, 326794}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 22865) = 0 gettimeofday({1213891699, 193074}, NULL) = 0 time(NULL) = 1213891699 time(NULL) = 1213891699 gettimeofday({1213891699, 193277}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891729, 195368}, NULL) = 0 time(NULL) = 1213891729 time(NULL) = 1213891729 gettimeofday({1213891729, 195569}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891759, 196643}, NULL) = 0 time(NULL) = 1213891759 time(NULL) = 1213891759 gettimeofday({1213891759, 196842}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891789, 197931}, NULL) = 0 time(NULL) = 1213891789 time(NULL) = 1213891789 gettimeofday({1213891789, 198129}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891819, 199219}, NULL) = 0 time(NULL) = 1213891819 time(NULL) = 1213891819 gettimeofday({1213891819, 199421}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891849, 200510}, NULL) = 0 time(NULL) = 1213891849 time(NULL) = 1213891849 gettimeofday({1213891849, 200707}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN, revents=POLLIN}, {fd=3, events=POLLIN}], 8, 30000) = 1 gettimeofday({1213891867, 753947}, NULL) = 0 accept(6, {sa_family=AF_FILE, path = "??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"}, [2 ]) = 11 setsockopt(11, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0 fcntl64(11, F_GETFL) = 0x2 (flags O_RDWR) fcntl64(11, F_SETFL, O_RDWR|O_NONBLOCK) = 0 fcntl64(11, F_GETFD) = 0 fcntl64(11, F_SETFD, FD_CLOEXEC) = 0 brk(0) = 0x8ae1000 brk(0x8b02000) = 0x8b02000 gettimeofday({1213891867, 754658}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=3, events=POLLIN}, {fd=11, events=POLLIN, revents=POLLIN|POLLHUP}, {fd=6, events=POLLIN}], 9, 11446) = 1 gettimeofday({1213891867, 754830}, NULL) = 0 read(11, "<30>Jun 19 12:11:07 RMSS: /syslo"..., 8192) = 100 gettimeofday({1213891867, 755001}, NULL) = 0 time(NULL) = 1213891867 stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0 gettimeofday({1213891867, 755271}, NULL) = 0 read(11, "", 8192) = 0 close(11) = 0 gettimeofday({1213891867, 755520}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8, 11445) = 0 gettimeofday({1213891879, 201791}, NULL) = 0 time(NULL) = 1213891879 time(NULL) = 1213891879 gettimeofday({1213891879, 201987}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891909, 203098}, NULL) = 0 time(NULL) = 1213891909 time(NULL) = 1213891909 gettimeofday({1213891909, 203299}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN|POLLOUT}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=10, events=POLLIN}, {fd=5, events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 8, 30000) = 0 gettimeofday({1213891939, 205377}, NULL) = 0 time(NULL) = 1213891939 time(NULL) = 1213891939 gettimeofday({1213891939, 205580}, NULL) = 0 poll( ******************* And now the strace sequence when it works: We can see that after the read there is a write to transmit the message to the central log server. ******************* [root@c15 Rich]# strace -p 15310 Process 15310 attached - interrupt to quit gettimeofday({1213900598, 12321}, NULL) = 0 gettimeofday({1213900598, 12361}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5, events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6, events=POLLIN}, {fd=3, events=POLLIN}], 7, 2069) = 0 gettimeofday({1213900600, 83036}, NULL) = 0 time(NULL) = 1213900600 time(NULL) = 1213900600 gettimeofday({1213900600, 83101}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5, events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=6, events=POLLIN, revents=POLLIN}, {fd=3, events=POLLIN}], 7, 30000) = 1 gettimeofday({1213900610, 338336}, NULL) = 0 accept(6, {sa_family=AF_FILE, path=@}, [2]) = 10 setsockopt(10, SOL_SOCKET, SO_KEEPALIVE, [0], 4) = 0 fcntl64(10, F_GETFL) = 0x2 (flags O_RDWR) fcntl64(10, F_SETFL, O_RDWR|O_NONBLOCK) = 0 fcntl64(10, F_GETFD) = 0 fcntl64(10, F_SETFD, FD_CLOEXEC) = 0 gettimeofday({1213900610, 338937}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5, events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=3, events=POLLIN}, {fd=10, events=POLLIN, revents=POLLIN|POLLHUP}, {fd=6, events=POLLIN}], 8, 19745) = 1 gettimeofday({1213900610, 339123}, NULL) = 0 read(10, "<30>Jun 19 14:36:50 RMSS: /syslo"..., 8192) = 100 gettimeofday({1213900610, 339320}, NULL) = 0 time(NULL) = 1213900610 stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0 gettimeofday({1213900610, 339627}, NULL) = 0 read(10, "", 8192) = 0 close(10) = 0 gettimeofday({1213900610, 339888}, NULL) = 0 poll([{fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=5, events=POLLIN}, {fd=9, events=POLLIN}, {fd=4, events=POLLIN}, {fd=3, events=POLLIN}, {fd=6, events=POLLIN}], 7, 10) = 0 gettimeofday({1213900610, 351199}, NULL) = 0 stat64("/etc/localtime", {st_mode=S_IFREG|0644, st_size=1252, ...}) = 0 write(8, "<30>Jun 19 14:36:50 c15 RMSS: /s"..., 103) = 103 gettimeofday({1213900610, 351494}, NULL) = 0 poll( <unfinished ...> Process 15310 detached Richard Vigeant www.vantrix.com