tproxy race condition? [RESEND]

15 Dec 2004

      Hi Krisztian and Balazs,

First: a big thank-you for your work on the tproxy code!

I'm trying to use tproxy to implement a fully transparent layer 7 proxy as
follows: TCP connections arrive and are REDIRECTed to a single local port.
A userspace process listen()s on that port, and makes ongoing (transparent)
connections on new TCP sockets by calling bind(), tproxy setsockopt()s and
connect().  In general it works well, but I'm having a few issues which I
think are possibly SMP-related.  I believe I've reduced some of these to a
simple test case, sources for which are attached.  I'm using Linux kernel
2.4.27 and all four patches in cttproxy-2.4.27-2.0.0 patch.  To run the
test case, you need two machines; I think the 'client' must be SMP.

The 'server', 10.0.3.2, listens on a single TCP port and has a simple loop
which accept()s and close()s TCP connections that it receives.

The SMP 'client', 10.0.3.3, has two processes each connecting to the
server.  The clients loop through a port range 32768-49152.  They bind() on
10.0.3.3, receiving some port from the kernel.  They then assign a
transparent port in the loop port range on unregistered IP 10.0.3.253, and
connect() to the server.  (The server has a route set up so that it knows
to return traffic on 10.0.3.253 to the client box).

The problem: once in a while, one of the client processes takes 3s to
connect() to the server.  Then, the resulting TCP connection is NOT
TRANSPARENT (i.e. 10.0.3.3 is used, not 10.0.3.253).  This can be seen by
running "tcpdump host 10.0.3.3" on either box.  However, none of the client
process system calls fail at any point.

In the case that CONFIG_IP_NF_NAT_NRES is set, at the same time this
happens, the _other process_ has a -EINVAL failure in
ip_tproxy_setsockopt_flags(), with corresponding "failed to register NAT
reservation" error in dmesg.  When CONFIG_IP_NF_NAT_NRES is unset, this
failure doesn't happen.  But either way, on the _original process_, the
non-transparent TCP connection happens.

What I believe is happening is as follows: There is evidence in dmesg that
the first SYN packet of the connect() passes through the LOCAL_OUT iptables
hooks (I see "ip_tproxy_fn(): new connection, hook=3" and "ip_tproxy_fn():
new connection, hook=4", but for some reason the packet never actually
makes it onto the wire.  I can't see where it goes missing.  But anyway,
connect() waits 3s and resends the SYN.  This time, as the second packet
goes through the iptables, for some reason it's not translated.  It makes
it onto the wire and the rest of the connection proceeds untranslated.

I haven't been able to progress much further debugging this, and wondered
if you had any ideas?  My principal concern is that the userspace processes
don't receive an error and have no proper way of telling that the
connection is going untransparent.  Am I making a stupid mistake somewhere?

One other curious thing here: MUST_BE_READ_LOCKED(&ip_tproxy_lock) in
ip_tproxy_relatedct_add() fails.  Could this be related in any way?

Finally, what is the purpose of the new CONFIG_IP_NF_NAT_NRES option?

Thank-you for reading this, and for any advice you have!

Jim Minter <jim@minter.demon.co.uk>

== 8< == CLIENT CODE == 8< ==
#include <arpa/inet.h>
#include <linux/netfilter_ipv4/ip_tproxy2.h> // the v2.0 header
#include <sys/socket.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <cerrno>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>

void
error(const char *c) {
  perror(c);
  exit(1);
}

void
log(const char *fmt, ...) {
  char buf[10];
  struct timeval timeval;
  gettimeofday(&timeval, NULL);
  strftime(buf, sizeof(buf), "%T", localtime(&timeval.tv_sec));
  printf("%s.%06u ", buf, timeval.tv_usec);

  va_list args;
  va_start(args, fmt);
  vprintf(fmt, args);
  va_end(args); 
}

int
main(int argc, char **argv) {
  int lo = 32768;
  int hi = 49152;

  setbuf(stderr, NULL);
  setlinebuf(stdout);

  while(1)
    for(int port = lo; port < hi; port++) {
      fprintf(stderr, ".");
      int s = socket(PF_INET, SOCK_STREAM, 0);
      if(s == -1)
        error("socket");

      {
        // seems to be necessary...?
        int param = 1; 
        if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, ¶m, sizeof(param)))
          error("setsockopt SO_REUSEADDR");
      }

      {
        // bind to our local IP, and output the port the kernel gave us
        struct sockaddr_in sockaddr_in;
        sockaddr_in.sin_family = AF_INET;
        sockaddr_in.sin_port = 0;
        sockaddr_in.sin_addr.s_addr = inet_addr("10.0.3.3");

        if(bind(s, (struct sockaddr *)&sockaddr_in, sizeof(sockaddr_in)))
          error("bind");

        socklen_t sl = sizeof(sockaddr_in);
        if(getsockname(s, (struct sockaddr *)&sockaddr_in, &sl))
          error("getsockname");

        log("%u\n", ntohs(sockaddr_in.sin_port));
      }

      {
        // now get ourselves a looped port on unregistered IP 10.0.3.253.
        struct in_tproxy in_tproxy;
        in_tproxy.op = TPROXY_ASSIGN;
        in_tproxy.v.addr.faddr.s_addr = inet_addr("10.0.3.253");
        in_tproxy.v.addr.fport = htons(port);

        if(setsockopt(s, SOL_IP, IP_TPROXY, &in_tproxy, sizeof(in_tproxy)))
          error("setsockopt TPROXY_ASSIGN");

        log("a\n");

        in_tproxy.op = TPROXY_CONNECT;
        in_tproxy.v.addr.faddr.s_addr = inet_addr("10.0.3.2");
        in_tproxy.v.addr.fport = htons(7000);
        if(setsockopt(s, SOL_IP, IP_TPROXY, &in_tproxy, sizeof(in_tproxy)))
          error("setsockopt TPROXY_CONNECT");

        log("b\n");

        in_tproxy.op = TPROXY_FLAGS;
        in_tproxy.v.flags = ITP_CONNECT | ITP_ONCE;
        if(setsockopt(s, SOL_IP, IP_TPROXY, &in_tproxy, sizeof(in_tproxy))) {
          perror("setsockopt TPROXY_FLAGS");
          close(s);
          continue;
        }

        log("c\n");
      }

      {
        // now connect
        struct sockaddr_in sockaddr_in;
        sockaddr_in.sin_family = AF_INET;
        sockaddr_in.sin_port = htons(7000);
        sockaddr_in.sin_addr.s_addr = inet_addr("10.0.3.2");

        if(connect(s, (struct sockaddr *)&sockaddr_in, sizeof(sockaddr_in)))
          error("connect");

        log("d\n");
      }

      {
        // wait for other side to close
        char buf;
        if(read(s, &buf, 1) != 0)
          error("read");
      }

      close(s);

      log("e\n");
    }

  return 0;
}
== 8< == CLIENT CODE ENDS == 8< ==
== 8< == SERVER CODE == 8< ==
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>
#include <cstdlib>

void
error(const char *c) {
  perror(c);
  exit(1);
}

int
main() {
  int s = socket(PF_INET, SOCK_STREAM, 0);
  if(s == -1)
    error("socket");

  {
    int param = 1; 
    if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, ¶m, sizeof(param)))
      error("setsockopt SO_REUSEADDR");
  }

  {
    struct sockaddr_in sockaddr_in;
    sockaddr_in.sin_family = AF_INET;
    sockaddr_in.sin_port = htons(7000);
    sockaddr_in.sin_addr.s_addr = INADDR_ANY;

    if(bind(s, (struct sockaddr *)&sockaddr_in, sizeof(sockaddr_in)))
      error("bind");
  }

  if(listen(s, SOMAXCONN))
    error("listen");

  while(1) {
    int fd = accept(s, NULL, 0);
    if(fd == -1)
      error("accept");

    close(fd);
  }
}
== 8< == SERVER CODE ENDS == 8< ==

jim＠minter.demon.co.uk

KOVACS Krisztian

jim＠minter.demon.co.uk

KOVACS Krisztian

jim＠minter.demon.co.uk

jim＠minter.demon.co.uk

KOVACS Krisztian

jim＠minter.demon.co.uk

KOVACS Krisztian

jim＠minter.demon.co.uk

Lennert Buytenhek

tags

participants (3)