/*
* libslack - http://libslack.org/
*
* Copyright (C) 1999-2004 raf <raf@raf.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
* or visit http://www.gnu.org/copyleft/gpl.html
*
* 20040102 raf <raf@raf.org>
*/

/*

=head1 NAME

I<libslack(net)> - network module

=head1 SYNOPSIS

    #include <slack/std.h>
    #include <slack/net.h>

    typedef struct sockaddr sockaddr_t;
    typedef unsigned short sockport_t;
    typedef struct sockopt_t sockopt_t;

    typedef union sockaddr_any_t sockaddr_any_t;
    typedef struct sockaddr_un sockaddr_un_t;
    typedef struct sockaddr_in sockaddr_in_t;
    typedef struct sockaddr_in6 sockaddr_in6_t;

    typedef struct net_interface_t net_interface_t;
    typedef struct rudp_t rudp_t;

    struct sockopt_t
    {
        int level;
        int optname;
        const void *optval;
        int optlen;
    };

    union sockaddr_any_t
    {
        sockaddr_t any;
        sockaddr_un_t un;
        sockaddr_in_t in;
        sockaddr_in6_t in6;
    };

    struct net_interface_t
    {
        char name[IFNAMSIZ];
        unsigned int index;
        short flags;
        int mtu;
        sockaddr_any_t *addr;
        sockaddr_any_t *brdaddr;
        sockaddr_any_t *dstaddr;
        sockaddr_any_t *hwaddr;
    };

    int net_server(const char *interface, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize);
    int net_client(const char *host, const char *service, sockport_t port, long timeout, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize);
    int net_udp_server(const char *interface, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize);
    int net_udp_client(const char *host, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize);
    int net_create_server(const char *interface, const char *service, sockport_t port, int type, int protocol, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize);
    int net_create_client(const char *host, const char *service, sockport_t port, sockport_t localport, int type, int protocol, long timeout, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize);
    int net_multicast_sender(const char *group, const char *service, sockport_t port, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize, const char *ifname, unsigned int ifindex, int ttl, unsigned int noloopback);
    int net_multicast_receiver(const char *group, const char *service, sockport_t port, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize, const char *ifname, unsigned int ifindex);
    int net_multicast_join(int sockfd, const sockaddr_t *addr, size_t addrsize, const char *ifname, unsigned int ifindex);
    int net_multicast_leave(int sockfd, const sockaddr_t *addr, size_t addrsize, const char *ifname, unsigned int ifindex);
    int net_multicast_set_interface(int sockfd, const char *ifname, unsigned int ifindex);
    int net_multicast_get_interface(int sockfd);
    int net_multicast_set_loopback(int sockfd, unsigned int loopback);
    int net_multicast_get_loopback(int sockfd);
    int net_multicast_set_ttl(int sockfd, int ttl);
    int net_multicast_get_ttl(int sockfd);
    int net_tos_lowdelay(int sockfd);
    int net_tos_throughput(int sockfd);
    int net_tos_reliability(int sockfd);
    int net_tos_lowcost(int sockfd);
    int net_tos_normal(int sockfd);
    struct hostent *net_gethostbyname(const char *name, struct hostent *hostbuf, void **buf, size_t *size, int *herrno);
    struct servent *net_getservbyname(const char *name, const char *proto, struct servent *servbuf, void **buf, size_t *size);
    int net_options(int sockfd, sockopt_t *sockopts);
    List *net_interfaces(void);
    List *net_interfaces_with_locker(Locker *locker);
    List *net_interfaces_by_family(int family);
    List *net_interfaces_by_family_with_locker(int family, Locker *locker);
    rudp_t *rudp_create(void);
    void rudp_release(rudp_t *rudp);
    void *rudp_destroy(rudp_t **rudp);
    ssize_t net_rudp_transact(int sockfd, rudp_t *rudp, const void *obuf, size_t osize, void *ibuf, size_t isize);
    ssize_t net_rudp_transactwith(int sockfd, rudp_t *rudp, const void *obuf, size_t osize, int oflags, void *ibuf, size_t isize, int iflags, sockaddr_any_t *addr, size_t addrsize);
    ssize_t net_pack(int sockfd, long timeout, int flags, const char *format, ...);
    ssize_t net_vpack(int sockfd, long timeout, int flags, const char *format, va_list args);
    ssize_t net_packto(int sockfd, long timeout, int flags, const sockaddr_t *to, size_t tosize, const char *format, ...);
    ssize_t net_vpackto(int sockfd, long timeout, int flags, const sockaddr_t *to, size_t tosize, const char *format, va_list args);
    ssize_t net_unpack(int sockfd, long timeout, int flags, const char *format, ...);
    ssize_t net_vunpack(int sockfd, long timeout, int flags, const char *format, va_list args);
    ssize_t net_unpackfrom(int sockfd, long timeout, int flags, sockaddr_t *from, size_t *fromsize, const char *format, ...);
    ssize_t net_vunpackfrom(int sockfd, long timeout, int flags, sockaddr_t *from, size_t *fromsize, const char *format, va_list args);
    ssize_t pack(void *buf, size_t size, const char *format, ...);
    ssize_t vpack(void *buf, size_t size, const char *format, va_list args);
    ssize_t unpack(void *buf, size_t size, const char *format, ...);
    ssize_t vunpack(void *buf, size_t size, const char *format, va_list args);
    ssize_t net_read(int sockfd, long timeout, char *buf, size_t count);
    ssize_t net_write(int sockfd, long timeout, const char *buf, size_t count);
    ssize_t net_expect(int sockfd, long timeout, const char *format, ...);
    ssize_t net_vexpect(int sockfd, long timeout, const char *format, va_list args);
    ssize_t net_send(int sockfd, long timeout, const char *format, ...);
    ssize_t net_vsend(int sockfd, long timeout, const char *format, va_list args);
    ssize_t sendfd(int sockfd, const void *buf, size_t nbytes, int flags, int fd);
    ssize_t recvfd(int sockfd, void *buf, size_t nbytes, int flags, int *fd);
    int mail(const char *server, const char *sender, const char *recipients, const char *subject, const char *message);

=head1 DESCRIPTION

This module provides functions that create client and server sockets (IPv4,
IPv6 and UNIX domain sockets, stream or datagram), that expect and send text
dialogues, and that pack and unpack packets according to templates. IPv4 and
IPv6 multicasting is supported. Reliability over UDP is provided. There are
also a function to send mail and functions to send and receive open file
descriptors via UNIX domain sockets from one process to another.

=over 4

=cut

*/

#define _BSD_SOURCE /* for gethostbyname_r() under Linux */

#include "config.h"
#include "std.h"

#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netdb.h>
#include <net/if.h>
#define BSD_COMP /* for SIOCGIF... under Solaris */
#include <sys/ioctl.h>
#include <netinet/in_systm.h>
#include <netinet/in.h> /* needed by <netinet/ip.h> under OpenBSD */
#include <netinet/ip.h>

#include "net.h"
#include "err.h"
#include "str.h"
#include "fio.h"
#include "mem.h"

#ifndef HAVE_SNPRINTF
#include "snprintf.h"
#endif

#ifndef HAVE_VSSCANF
#include "vsscanf.h"
#endif

#ifdef SOCKS
#include "socks.h"
#endif

#ifndef MSG_SIZE
#define MSG_SIZE 8192
#endif

#ifndef EPROTO /* Mac OS X doesn't have EPROTO */
#define EPROTO EPROTOTYPE
#endif

#ifndef AF_LOCAL /* Solaris 2.6 doesn't have AF_LOCAL */
#define AF_LOCAL AF_UNIX
#endif

#ifndef TEST

#ifndef HAVE_IFREQ_IFR_IFINDEX
#define ifr_ifindex ifr_index
#endif
#ifndef HAVE_IFREQ_IFR_MTU
#define ifr_mtu ifr_ifindex
#endif

/*
** State used by the reliable UDP (rudp) functions: round trip time
** measurements and estimators, the current retransmission timeout, the
** retransmission count, and the time base and sequence number.
*/

struct rudp_t
{
	double rtt;        /* most recent round trip time in seconds */
	double srtt;       /* smoothed round trip time estimator in seconds */
	double rttvar;     /* smoothed mean deviation in seconds */
	double rto;        /* current retransmission timeout in seconds */
	int nrexmt;        /* number of times retransmitted */
	uint32_t base;     /* number of seconds since epoch at start */
	uint32_t sequence; /* sequence number */
};

/*
** Retransmission limits. Each macro is only defined here when it has not
** already been defined, so any of them can be overridden at compile time.
*/

#ifndef RUDP_RXTMIN
#define RUDP_RXTMIN 2 /* minimum retransmission timeout in seconds */
#endif

#ifndef RUDP_RXTMAX
#define RUDP_RXTMAX 60 /* maximum retransmission timeout in seconds */
#endif

#ifndef RUDP_MAXNREXMT
#define RUDP_MAXNREXMT 3 /* maximum number of times to retransmit */
#endif

/*

=item C<int net_server(const char *interface, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)>

Creates a TCP server socket ready to I<accept(2)> connections on
C<interface> (as determined by I<gethostbyname(3)>).

If C<interface> is C<null>, connections will be accepted on all local
network interfaces. Otherwise, connections will only be accepted on the
specified interface (as determined by I<gethostbyname(3)>).

If C<service> is non-C<null> and is either numeric or is a service name (as
determined by I<getservbyname(3)>), the specified port is used. Otherwise,
C<port> (which must be in host byte order) is used.

If C<interface> is equal to C<"/unix"> and C<service> is an absolute file
system path, the server socket created will be a I<UNIX domain stream
socket>. Otherwise, a TCP server socket is created. If the C<RES_OPTIONS>
environment variable exists and contains the string C<"inet6"> or the
C</etc/resolv.conf> file contains the C<inet6> option, the TCP socket will
be an IPv6 socket. Otherwise, it will be an IPv4 socket.

If C<rcvbufsz> is non-zero, the socket's receive buffer size is set to this
size. Note that you may not get the size you request. If this is important,
use I<getsockopt(2)> to obtain the actual receive buffer size.

If C<sndbufsz> is non-zero, the socket's send buffer size is set to this
size. Note that you may not get the size you ask for. If this is important,
use I<getsockopt(2)> to obtain the actual send buffer size.

If C<addr> and C<addrsize> are not C<null>, the address bound to is stored
in the buffer pointed to by C<addr>. C<*addrsize> specifies the size of the
buffer pointed to by C<addr>. If there is insufficient space, the bound
address is not stored in C<addr>. If C<addrsize> is not C<null>, the length
of the address is stored there.

On success, returns the new socket descriptor. On error, returns C<-1> with
C<errno> set appropriately.

=cut

*/

/*
** Fills sockopts with SO_RCVBUF and/or SO_SNDBUF entries for whichever of
** *rcvbufsz and *sndbufsz is non-zero, followed by a terminating entry whose
** optval is NULL. The entries point at the caller's integers, so those must
** outlive the use of sockopts. The array needs room for up to three entries.
** Returns sockopts.
*/

static sockopt_t *build_sockopts(sockopt_t *sockopts, int *rcvbufsz, int *sndbufsz)
{
	sockopt_t *next = sockopts;

	if (*rcvbufsz != 0)
	{
		next->level = SOL_SOCKET;
		next->optname = SO_RCVBUF;
		next->optval = rcvbufsz;
		next->optlen = sizeof *rcvbufsz;
		++next;
	}

	if (*sndbufsz != 0)
	{
		next->level = SOL_SOCKET;
		next->optname = SO_SNDBUF;
		next->optval = sndbufsz;
		next->optlen = sizeof *sndbufsz;
		++next;
	}

	/* Terminator: only optval is significant */

	next->optval = NULL;

	return sockopts;
}

int net_server(const char *interface, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)
{
	/* Room for SO_RCVBUF, SO_SNDBUF and the terminating entry */
	sockopt_t bufopts[3];

	return net_create_server(interface, service, port, SOCK_STREAM, 0, build_sockopts(bufopts, &rcvbufsz, &sndbufsz), addr, addrsize);
}

/*

=item C<int net_client(const char *host, const char *service, sockport_t port, long timeout, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)>

Creates a TCP client socket and connects to the server listening at C<host>
(as determined by I<gethostbyname(3)>) on the port number specified by
C<service>. C<service> must either be numeric or a service name as
determined by I<getservbyname(3)>. Otherwise, the port number to connect to
is given by C<port> (which must be in host byte order). If C<host> is
C<null>, the client socket connects to the loopback address.

If C<host> is equal to C<"/unix"> and C<service> is an absolute file system
path, the client socket created will be a I<UNIX domain stream socket>.
Otherwise, a TCP client socket is created. If the C<RES_OPTIONS> environment
variable exists and contains the string C<"inet6"> or the
C</etc/resolv.conf> file contains the C<inet6> option, the TCP socket will
be an IPv6 socket. Otherwise, it will be an IPv4 socket.

If C<timeout> is non-zero, it specifies the number of seconds after which to
timeout the attempt to connect to the specified server. This can be useful
if the client may attempt to connect to a service that is blocked by a
firewall that drops its packets or if the host you are connecting to does
not protect itself from SYN floods. The native TCP timeouts are very long
(usually minutes) when faced with an unresponsive network and you may not
want your programs or their users to wait that long.

If C<rcvbufsz> is non-zero, the socket's receive buffer size is set to this
size. Note that you may not get the size you request. If this is important,
use I<getsockopt(2)> to obtain the actual receive buffer size.

If C<sndbufsz> is non-zero, the socket's send buffer size is set to this
size. Note that you may not get the size you ask for. If this is important,
use I<getsockopt(2)> to obtain the actual send buffer size.

If C<addr> and C<addrsize> are not C<null>, the address of the peer is
stored in the buffer pointed to by C<addr>. C<*addrsize> specifies the size
of the buffer pointed to by C<addr>. If there is insufficient space, the
peer's address is not stored in C<addr>. If C<addrsize> is not C<null>, the
size of the address is stored there.

On success, returns the new socket descriptor. On error, returns C<-1> with
C<errno> set appropriately.

=cut

*/

int net_client(const char *host, const char *service, sockport_t port, long timeout, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)
{
	/* Room for SO_RCVBUF, SO_SNDBUF and the terminating entry */
	sockopt_t bufopts[3];

	/* No local port (0); the connect timeout is passed through */
	return net_create_client(host, service, port, 0, SOCK_STREAM, 0, timeout, build_sockopts(bufopts, &rcvbufsz, &sndbufsz), addr, addrsize);
}

/*

=item C<int net_udp_server(const char *interface, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)>

Equivalent to I<net_server(3)> except that a UDP server socket is
created. If C<interface> is equal to C<"/unix"> and C<service> is an
absolute file system path, the server socket created will be a I<UNIX domain
datagram socket>. On success, returns the new socket's file descriptor. On
error, returns C<-1> with C<errno> set appropriately.

=cut

*/

int net_udp_server(const char *interface, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)
{
	/* Room for SO_RCVBUF, SO_SNDBUF and the terminating entry */
	sockopt_t bufopts[3];

	return net_create_server(interface, service, port, SOCK_DGRAM, 0, build_sockopts(bufopts, &rcvbufsz, &sndbufsz), addr, addrsize);
}

/*

=item C<int net_udp_client(const char *host, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)>

Equivalent to I<net_client(3)> except that a UDP client socket is created.
If C<host> is equal to C<"/unix"> and C<service> is an absolute file
system path, the client socket created will be a I<UNIX domain datagram
socket>. On success, returns the new socket's file descriptor. On error,
returns C<-1> with C<errno> set appropriately.

=cut

*/

int net_udp_client(const char *host, const char *service, sockport_t port, int rcvbufsz, int sndbufsz, sockaddr_t *addr, size_t *addrsize)
{
	/* Room for SO_RCVBUF, SO_SNDBUF and the terminating entry */
	sockopt_t bufopts[3];

	/* No local port (0) and no connect timeout (0) */
	return net_create_client(host, service, port, 0, SOCK_DGRAM, 0, 0, build_sockopts(bufopts, &rcvbufsz, &sndbufsz), addr, addrsize);
}

/*

=item C<int net_create_server(const char *interface, const char *service, sockport_t port, int type, int protocol, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize)>

Equivalent to I<net_server(3)> and I<net_udp_server(3)> only more general.
The type of socket is specified by C<type> (e.g. C<SOCK_STREAM> or
C<SOCK_DGRAM>) and C<protocol> (usually zero). If C<sockopts> is not
C<null>, the socket options specified are set before calling I<bind(2)>. On
success, returns the new socket's file descriptor. On error, returns C<-1>
with C<errno> set appropriately.

=cut

*/

/*
** Initialises *un as a UNIX domain socket address with the given family and
** path, and returns it as a generic socket address pointer. The path is
** copied without any length check, so the caller must make sure that it
** (including its nul terminator) fits in un->sun_path.
*/

static sockaddr_t *net_unaddr(sockaddr_un_t *un, size_t family, const char *path)
{
	memset(un, 0, sizeof *un);
	strcpy(un->sun_path, path);
	un->sun_family = family;

	return (sockaddr_t *)un;
}

/*
** Initialises *in as an IPv4 socket address and returns it as a generic
** socket address pointer.
**
** family   - address family to store in sin_family
** addr     - the IPv4 address bytes, already in network byte order
** addrsize - number of bytes available at addr
** port     - port number, already in network byte order
**
** At most sizeof in->sin_addr bytes are copied, so a caller that passes the
** size of a wider object (e.g. an unsigned long on LP64 systems) cannot
** overrun sin_addr into the rest of the structure.
*/

static sockaddr_t *net_inaddr(sockaddr_in_t *in, size_t family, const void *addr, size_t addrsize, sockport_t port)
{
	memset(in, 0, sizeof(sockaddr_in_t));
	in->sin_family = family;

	if (addrsize > sizeof in->sin_addr)
		addrsize = sizeof in->sin_addr;

	memcpy(&in->sin_addr, addr, addrsize);
	in->sin_port = port;

	return (sockaddr_t *)in;
}

#ifdef AF_INET6
/*
** Initialises *in6 as an IPv6 socket address and returns it as a generic
** socket address pointer.
**
** family   - address family to store in sin6_family
** addr     - the IPv6 address bytes
** addrsize - number of bytes available at addr
** port     - port number, already in network byte order
**
** At most sizeof in6->sin6_addr bytes are copied so that an oversized
** addrsize cannot overrun sin6_addr into the rest of the structure.
*/

static sockaddr_t *net_in6addr(sockaddr_in6_t *in6, size_t family, const void *addr, size_t addrsize, sockport_t port)
{
	memset(in6, 0, sizeof(sockaddr_in6_t));
	in6->sin6_family = family;

	if (addrsize > sizeof in6->sin6_addr)
		addrsize = sizeof in6->sin6_addr;

	memcpy(&in6->sin6_addr, addr, addrsize);
	in6->sin6_port = port;

	return (sockaddr_t *)in6;
}
#endif

#ifdef AF_INET6
/*
** Reports whether IPv6 has been requested via the resolver configuration.
** Returns 1 if the RES_OPTIONS environment variable contains "inet6", or if
** /etc/resolv.conf contains a line that starts with "options", followed by a
** space or tab, and that mentions "inet6" after that. Returns 0 otherwise
** (including when /etc/resolv.conf cannot be opened).
*/

static int inet6_required(void)
{
	const char *res_options;
	FILE *resolv_conf;
	char line[BUFSIZ];
	int required = 0;

	if ((res_options = getenv("RES_OPTIONS")) && strstr(res_options, "inet6"))
		return 1;

	if (!(resolv_conf = fopen("/etc/resolv.conf", "r")))
		return 0;

	while (!required && fgets(line, sizeof line, resolv_conf))
	{
		if (strncmp(line, "options", 7) != 0)
			continue;

		/*
		** Require a separator after the keyword. This also guarantees that
		** line[7] is not the terminating nul, so that line + 8 is still
		** inside the string that fgets() produced.
		*/

		if (line[7] != ' ' && line[7] != '\t')
			continue;

		if (strstr(line + 8, "inet6"))
			required = 1;
	}

	fclose(resolv_conf);

	return required;
}
#endif

/*
** Maps a socket type to the protocol name used for service lookups:
** "tcp" for SOCK_STREAM, "udp" for SOCK_DGRAM, and NULL for anything else.
*/

static const char *getprotonamebysocktype(int socktype)
{
	if (socktype == SOCK_STREAM)
		return "tcp";

	if (socktype == SOCK_DGRAM)
		return "udp";

	return NULL;
}

/*
** Looks up the service called name for the protocol that corresponds to the
** socket type (see getprotonamebysocktype()) using net_getservbyname().
** Returns the s_port member of the entry found, or 0 if there is no such
** service. The buffer handed back by net_getservbyname() is freed here.
*/

static sockport_t getservportbynameandtype(const char *name, int type)
{
	struct servent entry;
	struct servent *found;
	void *scratch = NULL;
	size_t scratch_size = 0;
	sockport_t port;

	found = net_getservbyname(name, getprotonamebysocktype(type), &entry, &scratch, &scratch_size);
	port = (found) ? found->s_port : 0;
	free(scratch);

	return port;
}

/*
** Parses service as a decimal port number.
**
** Returns the port number (0..USHRT_MAX) on success. If the parsed value is
** greater than USHRT_MAX, returns the result of set_errno(ERANGE). If
** service does not consist entirely of a number, returns the result of
** set_errno(EDOM). The caller treats -1 as "not a number".
*/

static int service_number(const char *service)
{
	char *endptr = NULL;
	unsigned long val = strtoul(service, &endptr, 10); /* parse once only */

	if (val > USHRT_MAX)
		return set_errno(ERANGE);

	/* No digits at all, or trailing characters after the digits */

	if (endptr == service || *endptr != '\0')
		return set_errno(EDOM);

	return (int)val;
}

static sockport_t service_port(const char *service, int type, int port)
{
	if (service)
	{
		int ret;

		if ((ret = service_number(service)) != -1)
			return htons((sockport_t)ret);

		if ((ret = getservportbynameandtype(service, type)) != 0)
			return ret;
	}

	return htons(port);
}

/*
** Returns non-zero if address is an IPv4 or (where supported) IPv6
** multicast address. Returns 0 otherwise, including for any other address
** family.
*/

static int is_multicast(sockaddr_t *address)
{
	sockaddr_any_t *addr = (sockaddr_any_t *)address;

	switch (addr->any.sa_family)
	{
		case AF_INET:
			/*
			** Read exactly the 32 bit address. Going through a long pointer
			** would read past sin_addr wherever long is wider than 32 bits.
			*/
			return IN_MULTICAST(ntohl(addr->in.sin_addr.s_addr));

#ifdef AF_INET6
		case AF_INET6:
			return IN6_IS_ADDR_MULTICAST(&addr->in6.sin6_addr);
#endif

		default:
			break;
	}

	return 0;
}

/*
** Creates, configures and binds a server socket (and listens on it when
** type is SOCK_STREAM).
**
** interface - "/unix" for a UNIX domain socket (service is then the path),
**             a host name/address to bind to, or null for the wildcard
** service   - numeric port, service name, or UNIX domain socket path
** port      - fallback port in host byte order
** type      - socket type (e.g. SOCK_STREAM, SOCK_DGRAM)
** protocol  - protocol passed to socket()
** sockopts  - optional socket options to set before bind()
** addr      - optional buffer that receives the bound address
** addrsize  - optional; on entry the size of addr, on return the size of
**             the bound address
**
** Returns the socket descriptor, or -1 with errno set. When a failure
** happens after the socket has been created, the socket is closed and the
** errno of the failing call is preserved.
*/

int net_create_server(const char *interface, const char *service, sockport_t port, int type, int protocol, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize)
{
	int sockfd;
	sockaddr_any_t localany;
	sockaddr_t *localaddr;
	size_t localsize;
	struct hostent *hostent;
	int reuse_addr = 1;
	int saved_errno;

	/* Check for UNIX domain socket specification */

	if (interface && !strcmp(interface, "/unix"))
	{
		/* The path must be absolute, not just "/", and fit in sun_path */

		if (!service || *service != '/' || !service[1] || strlen(service) >= sizeof localany.un.sun_path)
			return set_errno(EINVAL);

		localaddr = net_unaddr(&localany.un, AF_LOCAL, service);
		localsize = sizeof localany.un;

		/* Remove any existing file at the path so that bind() can create it */

		unlink(localany.un.sun_path);
	}
	else /* IPv4 or IPv6 */
	{
		/* Set port to service's port number if possible */

		port = service_port(service, type, port);

		/* Set localaddr and localsize to the specified interface, or any */

		if (interface)
		{
			struct hostent hostbuf[1];
			void *buf = NULL;
			size_t size = 0;
			int herrno;

			if (!(hostent = net_gethostbyname(interface, hostbuf, &buf, &size, &herrno)))
			{
				free(buf);
				return set_errno(ENOENT);
			}

			if (hostent->h_addrtype == AF_INET)
			{
				localaddr = net_inaddr(&localany.in, hostent->h_addrtype, hostent->h_addr_list[0], hostent->h_length, port);
				localsize = sizeof localany.in;
			}
#ifdef AF_INET6
			else if (hostent->h_addrtype == AF_INET6)
			{
				localaddr = net_in6addr(&localany.in6, hostent->h_addrtype, hostent->h_addr_list[0], hostent->h_length, port);
				localsize = sizeof localany.in6;
			}
#endif
			else
			{
				free(buf);
				return set_errno(ENOSYS);
			}

			/* The address has been copied into localany */

			free(buf);
		}
		else /* wildcard */
		{
#ifdef AF_INET6
			if (inet6_required())
			{
				localaddr = net_in6addr(&localany.in6, AF_INET6, &in6addr_any, sizeof in6addr_any, port);
				localsize = sizeof localany.in6;
			}
			else
#endif
			{
				/* Exactly 32 bits so that sizeof matches the size of sin_addr */

				uint32_t inaddr_any = htonl(INADDR_ANY);
				localaddr = net_inaddr(&localany.in, AF_INET, &inaddr_any, sizeof inaddr_any, port);
				localsize = sizeof localany.in;
			}
		}
	}

	/* Create the socket */

	if ((sockfd = socket(localaddr->sa_family, type, protocol)) == -1)
		return -1;

	/* Set reuseaddr for tcp servers and udp multicast receivers */

	if ((type == SOCK_STREAM && localaddr->sa_family != AF_LOCAL) || (type == SOCK_DGRAM && is_multicast(localaddr)))
		if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (const void *)&reuse_addr, sizeof reuse_addr) == -1)
			goto fail;

	/* Set any user supplied socket options */

	if (sockopts && net_options(sockfd, sockopts) == -1)
		goto fail;

	/* bind to localaddr */

	if (bind(sockfd, localaddr, localsize) == -1)
		goto fail;

	/* If connection oriented, listen */

	if (type == SOCK_STREAM && listen(sockfd, 1024) == -1)
		goto fail;

	/* Return sockfd, localaddr and localsize */

	if (addr && addrsize && *addrsize >= localsize)
		memcpy(addr, localaddr, localsize);

	if (addrsize)
		*addrsize = localsize;

	return sockfd;

fail:

	/* Report the original failure, not whatever close() does to errno */

	saved_errno = errno;
	close(sockfd);

	return set_errno(saved_errno);
}

/*

=item C<int net_create_client(const char *host, const char *service, sockport_t port, sockport_t localport, int type, int protocol, long timeout, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize)>

Equivalent to I<net_client(3)> and I<net_udp_client(3)> only more general.
The type of socket is specified by C<type> (e.g. C<SOCK_STREAM> or
C<SOCK_DGRAM>) and C<protocol> (usually zero). If C<localport> is not zero,
it is the port (in host byte order) that the local endpoint binds to. If
C<sockopts> is not C<null>, the socket options specified are set before
calling I<bind(2)>. On success, returns the new socket's file descriptor. On
error, returns C<-1> with C<errno> set appropriately.

=cut

*/

/*
** Creates a socket for remoteaddr's address family, applies sockopts,
** optionally binds the local endpoint, and connects to remoteaddr.
**
** remoteaddr - the address to connect to
** remotesize - the size of remoteaddr
** localport  - local port in host byte order (only used for SOCK_DGRAM)
** type       - socket type
** protocol   - protocol passed to socket()
** timeout    - if non-zero, the connect timeout in seconds
** sockopts   - optional socket options to set before binding/connecting
**
** Returns the connected socket descriptor, or -1 with errno set. When a
** failure happens after the socket has been created, the socket is closed
** and the errno of the failing step is preserved.
**
** NOTE(review): the local bind only happens for SOCK_DGRAM sockets, so
** localport is ignored for other socket types. Confirm that this is the
** intended behaviour.
*/

static int net_client_connect(sockaddr_t *remoteaddr, size_t remotesize, sockport_t localport, int type, int protocol, long timeout, sockopt_t *sockopts)
{
	int sockfd;
	int rc;
	int saved_errno;

	/* Create the socket */

	if ((sockfd = socket(remoteaddr->sa_family, type, protocol)) == -1)
		return -1;

	/* Set any user specified socket options */

	if (sockopts && net_options(sockfd, sockopts) == -1)
		goto fail;

	/* If connectionless (or requested), bind (not always needed) */

	if (type == SOCK_DGRAM && (localport || remoteaddr->sa_family == AF_LOCAL))
	{
		sockaddr_any_t localany;
		sockaddr_t *localaddr;
		size_t localsize;

		if (remoteaddr->sa_family == AF_LOCAL)
		{
#if HAVE_UNIX_DOMAIN_WILDCARD
			localaddr = net_unaddr(&localany.un, AF_LOCAL, "");
#else

			/*
			** There is a race condition here. Between the time the path is
			** constructed and bind() creates the inode, another process
			** might create a file with the same path. However, since bind()
			** fails if the path already exists, there's no security risk.
			** Please correct me if I'm wrong. There are bugs, though.
			** bind() will fail when another process creates a file with the
			** same path and the number of possible pathnames is limited by
			** tmpnam(). Fortunately, it's a very large limit.
			** Unfortunately, there's no way around this on some systems
			** (e.g. Solaris). Another annoyance is that the path to which
			** we bind the socket must be unlinked by the application. To
			** get the name, the application must use getsockname() and then
			** unlink() the path when finished with the socket.
			**
			** Linux doesn't have this problem since it lets us bind to ""
			** (the AF_LOCAL equivalent of INADDR_ANY).
			**
			** The easy, elegant, portable solution is to never use UNIX
			** domain datagram sockets. Always use stream sockets instead.
			*/

			char path[L_tmpnam];

			if (!tmpnam(path))
				goto fail;

			/* net_unaddr() copies without checking, so check the length here */

			if (strlen(path) >= sizeof localany.un.sun_path)
			{
				errno = ENAMETOOLONG;
				goto fail;
			}

			localaddr = net_unaddr(&localany.un, AF_LOCAL, path);
#endif
			localsize = sizeof localany.un;
		}
		else
		{
			/*
			** The local address must belong to the same family as the
			** socket (which was created for remoteaddr's family), otherwise
			** bind() fails.
			*/

#ifdef AF_INET6
			if (remoteaddr->sa_family == AF_INET6)
			{
				localaddr = net_in6addr(&localany.in6, AF_INET6, &in6addr_any, sizeof in6addr_any, htons(localport));
				localsize = sizeof localany.in6;
			}
			else
#endif
			{
				/* Exactly 32 bits so that sizeof matches the size of sin_addr */

				uint32_t inaddr_any = htonl(INADDR_ANY);
				localaddr = net_inaddr(&localany.in, AF_INET, &inaddr_any, sizeof inaddr_any, htons(localport));
				localsize = sizeof localany.in;
			}
		}

		if (bind(sockfd, localaddr, localsize) == -1)
			goto fail;
	}

	/* Connect to remoteaddr (possibly with a timeout) */

	if (timeout && nonblock_on(sockfd) == -1)
		goto fail;

	if ((rc = connect(sockfd, remoteaddr, remotesize)) == -1 && errno != EINPROGRESS)
		goto fail;

	if (rc == -1)
	{
		/* The connection is in progress: wait for it, then check its outcome */

		int ready, err = 0;
		socklen_t size = sizeof err;

		if ((ready = rw_timeout(sockfd, timeout, 0)) == -1)
			goto fail;

		if (!(ready & R_OK) && !(ready & W_OK))
			goto fail;

		if (getsockopt(sockfd, SOL_SOCKET, SO_ERROR, (void *)&err, &size) == -1)
			goto fail;

		if (err)
		{
			errno = err;
			goto fail;
		}
	}

	if (timeout && nonblock_off(sockfd) == -1)
		goto fail;

	return sockfd;

fail:

	/* Report the original failure, not whatever close() does to errno */

	saved_errno = errno;
	close(sockfd);

	return set_errno(saved_errno);
}

/*
** Creates a client socket and connects it to a server.
**
** host      - "/unix" for a UNIX domain socket (service is then the path),
**             a host name/address to connect to, or null for the loopback
**             address
** service   - numeric port, service name, or UNIX domain socket path
** port      - fallback port in host byte order
** localport - local port in host byte order (see net_client_connect())
** type      - socket type (e.g. SOCK_STREAM, SOCK_DGRAM)
** protocol  - protocol passed to socket()
** timeout   - if non-zero, the connect timeout in seconds
** sockopts  - optional socket options to set on the socket
** addr      - optional buffer that receives the peer's address
** addrsize  - optional; on entry the size of addr, on return the size of
**             the peer's address
**
** Every address listed for host is tried in turn until one succeeds.
** Returns the socket descriptor, or -1 with errno set by the last attempt.
*/

int net_create_client(const char *host, const char *service, sockport_t port, sockport_t localport, int type, int protocol, long timeout, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize)
{
	int sockfd;
	sockaddr_any_t remoteany;
	sockaddr_t *remoteaddr;
	size_t remotesize;
	struct hostent *hostent = NULL;
	struct hostent hostbuf[1];
	void *buf = NULL;
	size_t size = 0;
	int herrno;
	size_t h = 0;

	/* Check for UNIX domain socket specification */

	if (host && !strcmp(host, "/unix"))
	{
		/* The path must be absolute, not just "/", and fit in sun_path */

		if (!service || *service != '/' || !service[1] || strlen(service) >= sizeof remoteany.un.sun_path)
			return set_errno(EINVAL);

		remoteaddr = net_unaddr(&remoteany.un, AF_LOCAL, service);
		remotesize = sizeof remoteany.un;
	}
	else /* IPv4 or IPv6 */
	{
		/* Set port to service's port number if possible */

		port = service_port(service, type, port);

		/* Set remoteaddr and remotesize to the specified host address, or loopback */

		if (host)
		{
			if (!(hostent = net_gethostbyname(host, hostbuf, &buf, &size, &herrno)))
			{
				free(buf);
				return set_errno(ENOENT);
			}

			if (hostent->h_addrtype == AF_INET)
			{
				remoteaddr = net_inaddr(&remoteany.in, hostent->h_addrtype, hostent->h_addr_list[0], hostent->h_length, port);
				remotesize = sizeof remoteany.in;
			}
#ifdef AF_INET6
			else if (hostent->h_addrtype == AF_INET6)
			{
				remoteaddr = net_in6addr(&remoteany.in6, hostent->h_addrtype, hostent->h_addr_list[0], hostent->h_length, port);
				remotesize = sizeof remoteany.in6;
			}
#endif
			else
			{
				free(buf);
				return set_errno(ENOSYS);
			}
		}
		else /* loopback */
		{
#ifdef AF_INET6
			if (inet6_required())
			{
				remoteaddr = net_in6addr(&remoteany.in6, AF_INET6, &in6addr_loopback, sizeof in6addr_loopback, port);
				remotesize = sizeof remoteany.in6;
			}
			else
#endif
			{
				/*
				** Exactly 32 bits: with a wider type, the first four bytes
				** copied into sin_addr are not the address on big-endian
				** systems.
				*/

				uint32_t inaddr_loopback = htonl(INADDR_LOOPBACK);
				remoteaddr = net_inaddr(&remoteany.in, AF_INET, &inaddr_loopback, sizeof inaddr_loopback, port);
				remotesize = sizeof remoteany.in;
			}
		}
	}

	/* Try to connect to all available addresses */

	for (;;)
	{
		if ((sockfd = net_client_connect(remoteaddr, remotesize, localport, type, protocol, timeout, sockopts)) != -1)
			break;

		/* Try the next address in h_addr_list, if any */

		if (!hostent || !hostent->h_addr_list[++h])
			break;

		if (hostent->h_addrtype == AF_INET)
		{
			remoteaddr = net_inaddr(&remoteany.in, hostent->h_addrtype, hostent->h_addr_list[h], hostent->h_length, port);
			remotesize = sizeof remoteany.in;
		}
#ifdef AF_INET6
		else if (hostent->h_addrtype == AF_INET6)
		{
			remoteaddr = net_in6addr(&remoteany.in6, hostent->h_addrtype, hostent->h_addr_list[h], hostent->h_length, port);
			remotesize = sizeof remoteany.in6;
		}
#endif
	}

	/* None succeeded: report the last attempt's errno, even if free() alters it */

	if (sockfd == -1)
	{
		int saved_errno = errno;

		free(buf);

		return set_errno(saved_errno);
	}

	/* The address in use has been copied into remoteany */

	free(buf);

	/* Return sockfd, remoteaddr and remotesize */

	if (addr && addrsize && *addrsize >= remotesize)
		memcpy(addr, remoteaddr, remotesize);

	if (addrsize)
		*addrsize = remotesize;

	return sockfd;
}

/*

=item C<int net_multicast_sender(const char *group, const char *service, sockport_t port, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize, const char *ifname, unsigned int ifindex, int ttl, unsigned int noloopback)>

Creates a UDP multicast sender socket. C<group> specifies the multicast
group that packets will be sent to.

If the C<RES_OPTIONS> environment variable exists and contains the string
C<"inet6"> or the C</etc/resolv.conf> file contains the C<inet6> option, the
multicast sender will be an IPv6 socket. Otherwise, it will be an IPv4
socket.

C<service> must specify a service name or a numeric port number to use.
Otherwise, C<port> (which must be in host byte order) specifies the port
number to use.

C<sockopts> may contain extra socket options to set.

If C<addr> and C<addrsize> are not C<null>, the multicast group's address is
stored in the buffer pointed to by C<addr>. C<*addrsize> specifies the size
of the buffer pointed to by C<addr>. If there is insufficient space, the
address is not stored in C<addr>. If C<addrsize> is not C<null>, the size of
the address is stored there.

If I<ifname> is not C<null>, it specifies the name of the interface on which
to send the multicast packets. Otherwise, if C<ifindex> is not zero, it
specifies the index of the interface on which to send multicast packets.
Otherwise, the kernel will choose the interface on which to send multicast
packets based on the routing table (which is the default behaviour).

If C<ttl> is greater than C<1>, it specifies the multicast packets' TTL. By
default the TTL is C<1>. See the Multicast-HOWTO for details on the scoping
semantics of the TTL field in multicast packets.

If C<noloopback> is not zero, multicast loopback is disabled. This would
prevent any process on the sending host from receiving the multicast packets
sent via this socket. Multicast loopback is enabled by default.

The socket is connected to the specified multicast group address so that
I<send(2)> must be used to send packets, rather than I<sendto(2)>. This
reduces the time spent sending packets by one third because an unconnected
UDP socket is temporarily connected to the destination address by the kernel
every time I<sendto(2)> is called.

On success, returns the new socket descriptor. On error, returns C<-1> with
C<errno> set appropriately.

=cut

*/

int net_multicast_sender(const char *group, const char *service, sockport_t port, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize, const char *ifname, unsigned int ifindex, int ttl, unsigned int noloopback)
{
	/*
	** Creates the datagram client socket first, then applies the optional
	** multicast settings in order: outgoing interface, TTL, loopback. The
	** first setting that fails stops the sequence, the socket is closed
	** and -1 is returned.
	*/

	int failed = 0;
	int fd = net_create_client(group, service, port, 0, SOCK_DGRAM, 0, 0, sockopts, addr, addrsize);

	if (fd == -1)
		return -1;

	/* Only touch the outgoing interface when the caller named one */

	if (ifname || ifindex)
		failed = (net_multicast_set_interface(fd, ifname, ifindex) == -1);

	/* A TTL of 1 or less is left at whatever the socket already has */

	if (!failed && ttl > 1)
		failed = (net_multicast_set_ttl(fd, ttl) == -1);

	/* Loopback is only ever switched off here, never explicitly on */

	if (!failed && noloopback)
		failed = (net_multicast_set_loopback(fd, 0) == -1);

	if (failed)
	{
		close(fd);
		return -1;
	}

	return fd;
}

/*

=item C<int net_multicast_receiver(const char *group, const char *service, sockport_t port, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize, const char *ifname, unsigned int ifindex)>

Creates a UDP multicast receiver socket. C<group> specifies the multicast
group that the socket will join.

If the C<RES_OPTIONS> environment variable exists and contains the string
C<"inet6"> or the C</etc/resolv.conf> file contains the C<inet6> option, the
multicast receiver socket will be an IPv6 socket. Otherwise, it will be an
IPv4 socket.

C<service> must specify a service name or a numeric port number to use.
Otherwise, C<port> (which must be in host byte order) specifies the port
number to use.

C<sockopts> may contain extra socket options to set.

If C<addr> and C<addrsize> are not C<null>, the multicast group's address is
stored in the buffer pointed to by C<addr>. C<*addrsize> specifies the size
of the buffer pointed to by C<addr>. If there is insufficient space, the
address is not stored in C<addr>. If C<addrsize> is not C<null>, the size of
the address is stored there.

If I<ifname> is not C<null>, it specifies the name of the interface on which
to receive multicast packets. Otherwise, if C<ifindex> is not zero, it
specifies the index of the interface on which to receive multicast packets.
Otherwise, the kernel will choose the interface on which to receive
multicast packets based on the routing table (which is the default
behaviour). The new socket may join the same group on more interfaces by
subsequent calls to I<net_multicast_join(3)>.

On success, returns the new socket descriptor. On error, returns C<-1> with
C<errno> set appropriately.

=cut

*/

int net_multicast_receiver(const char *group, const char *service, sockport_t port, sockopt_t *sockopts, sockaddr_t *addr, size_t *addrsize, const char *ifname, unsigned int ifindex)
{
	/*
	** The address filled in by net_create_server() is handed straight to
	** net_multicast_join(), so local storage stands in for whichever of
	** addr and addrsize the caller left as null.
	*/

	sockaddr_any_t scratch;
	size_t scratchsize = sizeof scratch;
	int fd;

	if (!addr)
		addr = (sockaddr_t *)&scratch;

	if (!addrsize)
		addrsize = &scratchsize;

	fd = net_create_server(group, service, port, SOCK_DGRAM, 0, sockopts, addr, addrsize);

	if (fd == -1)
		return -1;

	/* Join the group; on failure the new socket is closed again */

	if (net_multicast_join(fd, addr, *addrsize, ifname, ifindex) == -1)
	{
		close(fd);
		return -1;
	}

	return fd;
}

/*

=item C<int net_multicast_join(int sockfd, const sockaddr_t *addr, size_t addrsize, const char *ifname, unsigned int ifindex)>

Adds C<sockfd>'s membership to the multicast group specified by C<addr>
whose size is C<addrsize>. If I<ifname> is not C<null>, it specifies the
name of the interface on which to receive multicast packets. Otherwise, if
C<ifindex> is not zero, it specifies the index of the interface on which to
receive multicast packets. Otherwise, the kernel will choose the interface
on which to receive multicast packets based on the routing table (which is
the default behaviour). A multicast socket may join the same group on
multiple interfaces by subsequent calls to I<net_multicast_join(3)>. Note
that there is a system imposed limit on the number of times a socket may
join a multicast group (this limit can be about 20). On success, returns
C<0>. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

#ifndef HAVE_IF_INDEXTONAME
/*
** Fallback used when HAVE_IF_INDEXTONAME is not defined. Searches the list
** returned by net_interfaces() for an entry whose index equals ifindex and
** copies its name (at most IFNAMSIZ bytes) into ifname. Returns ifname when
** an entry matched, or NULL when none did or the list could not be obtained.
*/
static char *if_indextoname(unsigned int ifindex, char *ifname)
{
	char *found = NULL;
	List *list = net_interfaces();

	if (!list)
		return NULL;

	for (;;)
	{
		net_interface_t *entry;

		if (!list_has_next(list))
			break;

		entry = list_next(list);

		if (entry->index != ifindex)
			continue;

		strlcpy(ifname, entry->name, IFNAMSIZ);
		found = ifname;
		break;
	}

	list_release(list);

	return found;
}
#endif

#ifndef HAVE_IF_NAMETOINDEX
/*
** Fallback used when HAVE_IF_NAMETOINDEX is not defined. Searches the list
** returned by net_interfaces() for an entry whose name equals ifname and
** returns that entry's index. Returns 0 when no entry matched or the list
** could not be obtained.
*/
static unsigned int if_nametoindex(const char *ifname)
{
	unsigned int result = 0;
	List *list = net_interfaces();

	if (!list)
		return 0;

	for (;;)
	{
		net_interface_t *entry;

		if (!list_has_next(list))
			break;

		entry = list_next(list);

		if (strcmp(ifname, entry->name) != 0)
			continue;

		result = entry->index;
		break;
	}

	list_release(list);

	return result;
}
#endif

/*
** When the system headers do not provide IPV6_JOIN_GROUP or
** IPV6_LEAVE_GROUP, define them here as aliases for IPV6_ADD_MEMBERSHIP
** and IPV6_DROP_MEMBERSHIP respectively.
*/

#ifndef IPV6_JOIN_GROUP
#define IPV6_JOIN_GROUP IPV6_ADD_MEMBERSHIP
#endif

#ifndef IPV6_LEAVE_GROUP
#define IPV6_LEAVE_GROUP IPV6_DROP_MEMBERSHIP
#endif

/*
** Joins sockfd to the multicast group held in addr.
**
** addr     - group address; its sa_family selects the IPv4 or IPv6 path.
**            A null addr is rejected with EINVAL.
** addrsize - accepted for interface symmetry; not examined here.
** ifname   - interface name, or null.
** ifindex  - interface index, consulted only when ifname is null.
**
** Returns the result of setsockopt(2) on the membership request, or -1 with
** errno set to EINVAL (null addr), ENXIO (interface could not be resolved)
** or EPROTONOSUPPORT (unhandled address family), or -1 when the
** SIOCGIFADDR ioctl fails.
*/
int net_multicast_join(int sockfd, const sockaddr_t *addr, size_t addrsize, const char *ifname, unsigned int ifindex)
{
	const sockaddr_any_t *any = (const sockaddr_any_t *)addr;

	if (!addr)
		return set_errno(EINVAL);

	switch (any->any.sa_family)
	{
		case AF_INET:
		{
			struct ip_mreq mreq[1];
			struct ifreq ifreq[1];

			memcpy(&mreq->imr_multiaddr, &any->in.sin_addr, sizeof mreq->imr_multiaddr);

			if (!ifindex && !ifname)
			{
				/* No interface requested: let the kernel choose */

				mreq->imr_interface.s_addr = htonl(INADDR_ANY);
			}
			else
			{
				/*
				** IPv4 membership identifies the interface by its address,
				** so turn the name (or index) into an address first. The
				** request is zeroed so that no indeterminate bytes are
				** handed to ioctl(2).
				*/

				memset(ifreq, 0, sizeof ifreq);

				if (ifname)
					strlcpy(ifreq->ifr_name, ifname, IFNAMSIZ);
				else if (!if_indextoname(ifindex, ifreq->ifr_name))
					return set_errno(ENXIO);

				if (ioctl(sockfd, SIOCGIFADDR, ifreq) == -1)
					return -1;

				memcpy(&mreq->imr_interface, &((sockaddr_in_t *)&ifreq->ifr_addr)->sin_addr, sizeof mreq->imr_interface);
			}

			/* mreq is an array of one element, so sizeof mreq is the struct size */

			return setsockopt(sockfd, IPPROTO_IP, IP_ADD_MEMBERSHIP, mreq, sizeof mreq);
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			struct ipv6_mreq mreq[1];

			memcpy(&mreq->ipv6mr_multiaddr, &any->in6.sin6_addr, sizeof mreq->ipv6mr_multiaddr);

			/* IPv6 membership identifies the interface by index; a name wins over ifindex */

			if (ifname)
			{
				if ((mreq->ipv6mr_interface = if_nametoindex(ifname)) == 0)
					return set_errno(ENXIO);
			}
			else
			{
				mreq->ipv6mr_interface = ifindex;
			}

			return setsockopt(sockfd, IPPROTO_IPV6, IPV6_JOIN_GROUP, mreq, sizeof mreq);
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_leave(int sockfd, const sockaddr_t *addr, size_t addrsize, const char *ifname, unsigned int ifindex)>

Drops C<sockfd>'s membership from the multicast group specified by C<addr>
whose size is C<addrsize>. If I<ifname> is not C<null>, it specifies the
name of the interface on which to drop group membership. Otherwise, if
C<ifindex> is not zero, it specifies the index of the interface on which to
drop group membership. Otherwise, the interface that joined most recently
will be dropped from the multicast group. On success, returns C<0>. On
error, returns C<-1> with C<errno> set appropriately.

=cut

*/

/*
** Drops sockfd's membership of the multicast group held in addr.
**
** addr     - group address; its sa_family selects the IPv4 or IPv6 path.
**            A null addr is rejected with EINVAL.
** addrsize - accepted for interface symmetry; not examined here.
** ifname   - interface name, or null.
** ifindex  - interface index, consulted only when ifname is null.
**
** Returns the result of setsockopt(2) on the drop request, or -1 with errno
** set to EINVAL (null addr), ENXIO (interface could not be resolved) or
** EPROTONOSUPPORT (unhandled address family), or -1 when the SIOCGIFADDR
** ioctl fails.
*/
int net_multicast_leave(int sockfd, const sockaddr_t *addr, size_t addrsize, const char *ifname, unsigned int ifindex)
{
	const sockaddr_any_t *any = (const sockaddr_any_t *)addr;

	if (!addr)
		return set_errno(EINVAL);

	switch (any->any.sa_family)
	{
		case AF_INET:
		{
			struct ip_mreq mreq[1];
			struct ifreq ifreq[1];

			memcpy(&mreq->imr_multiaddr, &any->in.sin_addr, sizeof mreq->imr_multiaddr);

			if (!ifindex && !ifname)
			{
				/* No interface requested: leave the choice to the kernel */

				mreq->imr_interface.s_addr = htonl(INADDR_ANY);
			}
			else
			{
				/*
				** IPv4 membership identifies the interface by its address,
				** so turn the name (or index) into an address first. The
				** request is zeroed so that no indeterminate bytes are
				** handed to ioctl(2).
				*/

				memset(ifreq, 0, sizeof ifreq);

				if (ifname)
					strlcpy(ifreq->ifr_name, ifname, IFNAMSIZ);
				else if (!if_indextoname(ifindex, ifreq->ifr_name))
					return set_errno(ENXIO);

				if (ioctl(sockfd, SIOCGIFADDR, ifreq) == -1)
					return -1;

				memcpy(&mreq->imr_interface, &((sockaddr_in_t *)&ifreq->ifr_addr)->sin_addr, sizeof mreq->imr_interface);
			}

			/* mreq is an array of one element, so sizeof mreq is the struct size */

			return setsockopt(sockfd, IPPROTO_IP, IP_DROP_MEMBERSHIP, mreq, sizeof mreq);
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			struct ipv6_mreq mreq[1];

			memcpy(&mreq->ipv6mr_multiaddr, &any->in6.sin6_addr, sizeof mreq->ipv6mr_multiaddr);

			/* IPv6 membership identifies the interface by index; a name wins over ifindex */

			if (ifname)
			{
				if ((mreq->ipv6mr_interface = if_nametoindex(ifname)) == 0)
					return set_errno(ENXIO);
			}
			else
			{
				mreq->ipv6mr_interface = ifindex;
			}

			return setsockopt(sockfd, IPPROTO_IPV6, IPV6_LEAVE_GROUP, mreq, sizeof mreq);
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_set_interface(int sockfd, const char *ifname, unsigned int ifindex)>

Specifies the interface on which C<sockfd> will send multicast packets. If
I<ifname> is not C<null>, it specifies the name of the interface on which to
send the multicast packets. Otherwise, if C<ifindex> is not zero, it
specifies the index of the interface on which to send multicast packets.
Otherwise, the kernel will choose the interface on which to send multicast
packets based on the routing table (which is the default behaviour). On
success, returns C<0>. On error, returns C<-1> with C<errno> set
appropriately.

=cut

*/

/*
** Selects the interface used for multicast packets sent on sockfd.
**
** The socket's address family (from getsockname(2)) decides how the
** interface is expressed: IPv4 takes the interface's address, IPv6 takes
** its index. ifname, when not null, takes precedence over ifindex in both
** cases. With neither given, INADDR_ANY (IPv4) or index 0 (IPv6) is set.
**
** Returns the result of setsockopt(2), or -1 with errno set to ENXIO
** (interface could not be resolved) or EPROTONOSUPPORT (unhandled address
** family), or -1 when getsockname(2) or the SIOCGIFADDR ioctl fails.
*/
int net_multicast_set_interface(int sockfd, const char *ifname, unsigned int ifindex)
{
	sockaddr_any_t any;
	socklen_t size = sizeof any; /* getsockname(2) takes a socklen_t *, not a size_t * */

	if (getsockname(sockfd, &any.any, &size) == -1)
		return -1;

	switch (any.any.sa_family)
	{
		case AF_INET:
		{
			struct ifreq ifreq[1];
			struct in_addr inaddr;

			if (!ifindex && !ifname)
			{
				inaddr.s_addr = htonl(INADDR_ANY);
			}
			else
			{
				/* Zeroed so that no indeterminate bytes are handed to ioctl(2) */

				memset(ifreq, 0, sizeof ifreq);

				if (ifname)
					strlcpy(ifreq->ifr_name, ifname, IFNAMSIZ);
				else if (!if_indextoname(ifindex, ifreq->ifr_name))
					return set_errno(ENXIO);

				if (ioctl(sockfd, SIOCGIFADDR, ifreq) == -1)
					return -1;

				memcpy(&inaddr, &((sockaddr_in_t *)&ifreq->ifr_addr)->sin_addr, sizeof inaddr);
			}

			return setsockopt(sockfd, IPPROTO_IP, IP_MULTICAST_IF, &inaddr, sizeof inaddr);
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			/*
			** Start from ifindex and replace it with the index resolved
			** from ifname when a name was supplied, so that the name is
			** actually honoured rather than being overwritten.
			*/

			unsigned int index = ifindex;

			if (ifname && (index = if_nametoindex(ifname)) == 0)
				return set_errno(ENXIO);

			return setsockopt(sockfd, IPPROTO_IPV6, IPV6_MULTICAST_IF, &index, sizeof index);
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_get_interface(int sockfd)>

Returns the index of the interface that C<sockfd> sends multicast packets
on. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

/*
** Reports the index of the interface that sockfd sends multicast packets
** on.
**
** For IPv4 the IP_MULTICAST_IF option yields an address, so the AF_INET
** interface list is searched for an interface that is up and has that
** address; 0 is returned when none matches. For IPv6 the option value is
** the index itself.
**
** Returns the index, or -1 when getsockname(2), getsockopt(2) or the
** interface listing fails, or -1 with errno set to EPROTONOSUPPORT for an
** unhandled address family.
*/
int net_multicast_get_interface(int sockfd)
{
	sockaddr_any_t any;
	socklen_t size = sizeof any; /* getsockname(2) takes a socklen_t *, not a size_t * */

	if (getsockname(sockfd, &any.any, &size) == -1)
		return -1;

	switch (any.any.sa_family)
	{
		case AF_INET:
		{
			struct in_addr inaddr;
			socklen_t optlen = sizeof inaddr;
			unsigned int found = 0;
			List *ifaces;

			if (getsockopt(sockfd, IPPROTO_IP, IP_MULTICAST_IF, &inaddr, &optlen) == -1)
				return -1;

			if (!(ifaces = net_interfaces_by_family(AF_INET)))
				return -1;

			while (list_has_next(ifaces))
			{
				net_interface_t *iface = list_next(ifaces);

				/* Only interfaces that are up are compared; addr is also checked before use */

				if (!(iface->flags & IFF_UP) || !iface->addr)
					continue;

				if (iface->addr->any.sa_family == AF_INET && !memcmp(&inaddr, &iface->addr->in.sin_addr, sizeof inaddr))
				{
					found = iface->index;
					list_break(ifaces);
					break;
				}
			}

			list_release(ifaces);

			return (int)found;
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			unsigned int index;
			socklen_t optlen = sizeof index;

			if (getsockopt(sockfd, IPPROTO_IPV6, IPV6_MULTICAST_IF, &index, &optlen) == -1)
				return -1;

			return (int)index;
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_set_loopback(int sockfd, unsigned int loopback)>

If C<loopback> is zero, multicast loopback is disabled for packets sent on
C<sockfd>. This prevents any process on the sending host from receiving the
multicast packets sent via this socket. If C<loopback> is not zero, multicast
loopback is enabled for packets sent on C<sockfd> (this is the default
behaviour). On success, returns C<0>. On error, returns C<-1> with C<errno>
set appropriately.

=cut

*/

/*
** Enables (loopback non-zero) or disables (loopback zero) multicast
** loopback for packets sent on sockfd.
**
** The socket's address family (from getsockname(2)) selects the option:
** IP_MULTICAST_LOOP with an unsigned char for IPv4, IPV6_MULTICAST_LOOP
** with an unsigned int for IPv6. The value passed to the kernel is always
** exactly 0 or 1, so any non-zero argument means "enabled" and is never
** truncated to zero by the narrowing to unsigned char.
**
** Returns the result of setsockopt(2), or -1 when getsockname(2) fails, or
** -1 with errno set to EPROTONOSUPPORT for an unhandled address family.
*/
int net_multicast_set_loopback(int sockfd, unsigned int loopback)
{
	sockaddr_any_t any;
	socklen_t size = sizeof any; /* getsockname(2) takes a socklen_t *, not a size_t * */

	if (getsockname(sockfd, &any.any, &size) == -1)
		return -1;

	switch (any.any.sa_family)
	{
		case AF_INET:
		{
			unsigned char flag = (loopback) ? 1 : 0;

			return setsockopt(sockfd, IPPROTO_IP, IP_MULTICAST_LOOP, &flag, sizeof flag);
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			unsigned int flag = (loopback) ? 1 : 0;

			return setsockopt(sockfd, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &flag, sizeof flag);
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_get_loopback(int sockfd)>

Returns whether or not multicast packets sent on C<sockfd> can be received
by any process on the sending host. A non-zero return value means yes. A
zero return value means no. On error, returns C<-1> with C<errno> set
appropriately.

=cut

*/

/*
** Reports whether multicast loopback is enabled for packets sent on
** sockfd.
**
** The socket's address family (from getsockname(2)) selects the option:
** IP_MULTICAST_LOOP read as an unsigned char for IPv4, IPV6_MULTICAST_LOOP
** read as an unsigned int for IPv6.
**
** Returns the option value, or -1 when getsockname(2) or getsockopt(2)
** fails, or -1 with errno set to EPROTONOSUPPORT for an unhandled address
** family.
*/
int net_multicast_get_loopback(int sockfd)
{
	sockaddr_any_t any;
	socklen_t size = sizeof any; /* getsockname(2) takes a socklen_t *, not a size_t * */

	if (getsockname(sockfd, &any.any, &size) == -1)
		return -1;

	switch (any.any.sa_family)
	{
		case AF_INET:
		{
			unsigned char flag;
			socklen_t optlen = sizeof flag;

			if (getsockopt(sockfd, IPPROTO_IP, IP_MULTICAST_LOOP, &flag, &optlen) == -1)
				return -1;

			return (int)flag;
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			unsigned int flag;
			socklen_t optlen = sizeof flag;

			if (getsockopt(sockfd, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &flag, &optlen) == -1)
				return -1;

			return (int)flag;
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_set_ttl(int sockfd, int ttl)>

Sets the TTL for multicast packets sent on C<sockfd> to C<ttl>. The default
TTL for multicast packets is C<1>. See the Multicast-HOWTO for details on
the scoping semantics of the TTL field in multicast packets. On success,
returns C<0>. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

/*
** Sets the TTL (IPv4) or hop limit (IPv6) of multicast packets sent on
** sockfd.
**
** The socket's address family (from getsockname(2)) selects the option:
** IP_MULTICAST_TTL with an unsigned char for IPv4, IPV6_MULTICAST_HOPS
** with an int for IPv6. For IPv4 the value has to fit in the unsigned
** char that is passed to the kernel, so a ttl outside 0..255 is rejected
** with EINVAL rather than being silently truncated. For IPv6 the value is
** passed through unchanged and validated by the kernel.
**
** Returns the result of setsockopt(2), or -1 when getsockname(2) fails, or
** -1 with errno set to EINVAL (IPv4 ttl out of range) or EPROTONOSUPPORT
** (unhandled address family).
*/
int net_multicast_set_ttl(int sockfd, int ttl)
{
	sockaddr_any_t any;
	socklen_t size = sizeof any; /* getsockname(2) takes a socklen_t *, not a size_t * */

	if (getsockname(sockfd, &any.any, &size) == -1)
		return -1;

	switch (any.any.sa_family)
	{
		case AF_INET:
		{
			unsigned char hops;

			if (ttl < 0 || ttl > 255)
				return set_errno(EINVAL);

			hops = (unsigned char)ttl;

			return setsockopt(sockfd, IPPROTO_IP, IP_MULTICAST_TTL, &hops, sizeof hops);
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			int hops = ttl;

			return setsockopt(sockfd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof hops);
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_multicast_get_ttl(int sockfd)>

Returns the TTL of multicast packets sent on C<sockfd>. On error, returns
C<-1> with C<errno> set appropriately.

=cut

*/

/*
** Reports the TTL (IPv4) or hop limit (IPv6) of multicast packets sent on
** sockfd.
**
** The socket's address family (from getsockname(2)) selects the option:
** IP_MULTICAST_TTL read as an unsigned char for IPv4, IPV6_MULTICAST_HOPS
** read as an int for IPv6.
**
** Returns the option value, or -1 when getsockname(2) or getsockopt(2)
** fails, or -1 with errno set to EPROTONOSUPPORT for an unhandled address
** family.
*/
int net_multicast_get_ttl(int sockfd)
{
	sockaddr_any_t any;
	socklen_t size = sizeof any; /* getsockname(2) takes a socklen_t *, not a size_t * */

	if (getsockname(sockfd, &any.any, &size) == -1)
		return -1;

	switch (any.any.sa_family)
	{
		case AF_INET:
		{
			unsigned char hops;
			socklen_t optlen = sizeof hops;

			if (getsockopt(sockfd, IPPROTO_IP, IP_MULTICAST_TTL, &hops, &optlen) == -1)
				return -1;

			return (int)hops;
		}

#ifdef AF_INET6
		case AF_INET6:
		{
			int hops;
			socklen_t optlen = sizeof hops;

			if (getsockopt(sockfd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, &optlen) == -1)
				return -1;

			return hops;
		}
#endif

		default:
			return set_errno(EPROTONOSUPPORT);
	}
}

/*

=item C<int net_tos_lowdelay(int sockfd)>

Sets the TOS bits of packets sent on C<sockfd> to request minimum delay.
This is for interactive applications. This results in many small packets.
Use this sparingly. On success, returns C<0>. On error, returns C<-1> with
C<errno> set appropriately.

=cut

*/

int net_tos_lowdelay(int sockfd)
{
	/* Hand IPTOS_LOWDELAY to the IP_TOS option and report setsockopt's result */

	const int lowdelay = IPTOS_LOWDELAY;
	int rc;

	rc = setsockopt(sockfd, IPPROTO_IP, IP_TOS, &lowdelay, sizeof lowdelay);

	return rc;
}

/*

=item C<int net_tos_throughput(int sockfd)>

Sets the TOS bits of packets sent on C<sockfd> to request maximum
throughput. This is for bulk data transfers. Don't forget to also specify
buffer sizes that are large enough to maximise throughput. On success,
returns C<0>. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

int net_tos_throughput(int sockfd)
{
	/* Hand IPTOS_THROUGHPUT to the IP_TOS option and report setsockopt's result */

	const int throughput = IPTOS_THROUGHPUT;
	int rc;

	rc = setsockopt(sockfd, IPPROTO_IP, IP_TOS, &throughput, sizeof throughput);

	return rc;
}

/*

=item C<int net_tos_reliability(int sockfd)>

Sets the TOS bits of packets sent on C<sockfd> to request maximum
reliability. This should only be used for datagram-based internet
management. On success, returns C<0>. On error, returns C<-1> with C<errno>
set appropriately.

=cut

*/

int net_tos_reliability(int sockfd)
{
	/* Hand IPTOS_RELIABILITY to the IP_TOS option and report setsockopt's result */

	const int reliability = IPTOS_RELIABILITY;
	int rc;

	rc = setsockopt(sockfd, IPPROTO_IP, IP_TOS, &reliability, sizeof reliability);

	return rc;
}

/*

=item C<int net_tos_lowcost(int sockfd)>

Sets the TOS bits of packets sent on C<sockfd> to request minimum monetary
cost. Probably a good default. On success, returns C<0>. On error, returns
C<-1> with C<errno> set appropriately.

=cut

*/

/* Supply IPTOS_LOWCOST (0x02) when the system headers do not define it */

#ifndef IPTOS_LOWCOST
#define IPTOS_LOWCOST 0x02
#endif

int net_tos_lowcost(int sockfd)
{
	/* Hand IPTOS_LOWCOST to the IP_TOS option and report setsockopt's result */

	const int lowcost = IPTOS_LOWCOST;
	int rc;

	rc = setsockopt(sockfd, IPPROTO_IP, IP_TOS, &lowcost, sizeof lowcost);

	return rc;
}

/*

=item C<int net_tos_normal(int sockfd)>

Clears the TOS bits of packets sent on C<sockfd> (the default). On success,
returns C<0>. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

int net_tos_normal(int sockfd)
{
	/* Hand an all-zero value to the IP_TOS option and report setsockopt's result */

	const int cleared = 0x00;
	int rc;

	rc = setsockopt(sockfd, IPPROTO_IP, IP_TOS, &cleared, sizeof cleared);

	return rc;
}

/*

=item C<struct hostent *net_gethostbyname(const char *name, struct hostent *hostbuf, void **buf, size_t *size, int *herrno)>

A portable, reentrant I<gethostbyname(3)> that handles its own memory
allocation requirements. Looks up I<name>. On success, returns C<hostbuf>
with any extra data in C<*buf>. C<*size> is the length of C<*buf> on entry
and is updated to reflect the length on exit if a larger buffer was required
to perform the lookup. On error, returns C<null> with C<*herrno> set
appropriately if there was a lookup failure or with C<errno> set
appropriately if there was a memory allocation failure. It is the caller's
responsibility to deallocate C<*buf> using I<free(3)> when the lookup failed
or when the results of the name lookup are no longer required.

Note: If your system has any version of I<gethostbyname_r(3)>, it will be
used. Otherwise, I<gethostbyname(3)> will be used. Even this might be
threadsafe if your system uses thread specific data to make it so.

    struct hostent hostbuf[1], *hostent;
    void *buf = NULL;
    size_t size = 0;
    int herrno;

    if ((hostent = net_gethostbyname("hostname", hostbuf, &buf, &size, &herrno)))
    {
        // use hostent ...
    }

    free(buf);

=cut

*/

/* Evaluates to 1 when exactly one of a and b is zero (or null), otherwise to 0 */
#define xor(a, b) (!(a) ^ !(b))

/*
** Looks up name using whichever gethostbyname variant the build selected
** via the HAVE_FUNC_GETHOSTBYNAME_R_* macros, falling back to plain
** gethostbyname() when none of them is set.
**
** name    - host name to look up.
** hostbuf - caller's result structure (not used by the plain fallback).
** buf     - address of the scratch buffer pointer; allocated or grown here.
** size    - address of the scratch buffer size; updated when it changes.
** herrno  - receives the resolver's error code on lookup failure.
**
** The arguments are rejected through set_errnull(EINVAL) when any pointer
** is null or when exactly one of *buf and *size is zero.
*/
struct hostent *net_gethostbyname(const char *name, struct hostent *hostbuf, void **buf, size_t *size, int *herrno)
{
	if (!name || !hostbuf || !buf || !size || xor(*buf, *size) || !herrno)
		return set_errnull(EINVAL);

#if HAVE_FUNC_GETHOSTBYNAME_R_6

	/* Six argument variant: the result pointer comes back through &ret */

	{
		struct hostent *ret;
		int err;

		/* Start with 1024 bytes when the caller supplied no buffer */

		if (*size == 0 && !(*buf = malloc(*size = 1024)))
			return NULL;

		/* Double the buffer for as long as the call fails and errno is ERANGE */

		while ((err = gethostbyname_r(name, hostbuf, *buf, *size, &ret, herrno)) && errno == ERANGE)
			if (!mem_resize((char **)buf, *size <<= 1))
				return NULL;

		return (err) ? NULL : ret;
	}

#elif HAVE_FUNC_GETHOSTBYNAME_R_5

	/* Five argument variant: the result pointer is the return value */

	{
		struct hostent *ret;

		/* Start with 1024 bytes when the caller supplied no buffer */

		if (*size == 0 && !(*buf = malloc(*size = 1024)))
			return NULL;

		/* Double the buffer for as long as the call fails and errno is ERANGE */

		while (!(ret = gethostbyname_r(name, hostbuf, *buf, *size, herrno)) && errno == ERANGE)
			if (!mem_resize((char **)buf, *size <<= 1))
				return NULL;

		return ret;
	}

#elif HAVE_FUNC_GETHOSTBYNAME_R_3

	/* Three argument variant: the scratch buffer is a struct hostent_data */

	{
		if (*size == 0)
		{
			/* No buffer yet: allocate a zero-filled struct hostent_data */

			if (!(*buf = calloc(1, *size = sizeof(struct hostent_data))))
				return NULL;
		}
		else if (*size < sizeof(struct hostent_data))
		{
			/* Buffer too small: grow it and zero only the newly added tail */

			size_t oldsize = *size;

			if (!mem_resize((char **)buf, *size = sizeof(struct hostent_data)))
				return NULL;

			memset((char *)*buf + oldsize, 0, *size - oldsize);
		}

		if (gethostbyname_r(name, hostbuf, (struct hostent_data *)*buf) == -1)
		{
			/* This variant does not take an h_errno pointer, so copy the global */

			*herrno = h_errno;
			return NULL;
		}

		return hostbuf;
	}

#else

	/*
	** Some systems use thread specific data. Even if this isn't one of
	** them, we have to return something, even if it isn't threadsafe.
	** If we're here and it's not threadsafe, this system probably doesn't
	** support threads anyway.
	*/

	/* Note: this branch returns gethostbyname()'s own pointer; hostbuf, buf and size are untouched */

	{
		struct hostent *ret;

		if (!(ret = gethostbyname(name)))
			*herrno = h_errno;

		return ret;
	}

#endif
}

/*

=item C<struct servent *net_getservbyname(const char *name, const char *proto, struct servent *servbuf, void **buf, size_t *size)>

A portable, reentrant I<getservbyname(3)> that handles its own memory
allocation requirements. Looks up the service C<name> and C<proto>. On
success, returns C<servbuf> with any extra data in C<*buf>. C<*size> is the
length of C<*buf> on entry and is updated to reflect the length on exit if a
larger buffer was required to perform the lookup. On error, returns C<null>
with C<errno> set appropriately. It is the caller's responsibility to
deallocate C<*buf> using I<free(3)> when the lookup failed or when the
results of the name lookup are no longer required.

Note: If your system has any version of I<getservbyname_r(3)>, it will be
used. Otherwise, I<getservbyname(3)> will be used. Even this might be
threadsafe if your system uses thread specific data to make it so.

    struct servent servbuf[1], *servent;
    void *buf = NULL;
    size_t size = 0;

    if ((servent = net_getservbyname("service", "proto", servbuf, &buf, &size)))
    {
        // use servent ...
    }

    free(buf);

=cut

*/

/*
** Looks up the service name/proto using whichever getservbyname variant
** the build selected via the HAVE_FUNC_GETSERVBYNAME_R_* macros, falling
** back to plain getservbyname() when none of them is set.
**
** name    - service name to look up.
** proto   - protocol name, passed through to the lookup unchanged.
** servbuf - caller's result structure (not used by the plain fallback).
** buf     - address of the scratch buffer pointer; allocated or grown here.
** size    - address of the scratch buffer size; updated when it changes.
**
** The arguments are rejected through set_errnull(EINVAL) when name,
** servbuf, buf or size is null or when exactly one of *buf and *size is
** zero.
*/
struct servent *net_getservbyname(const char *name, const char *proto, struct servent *servbuf, void **buf, size_t *size)
{
	if (!name || !servbuf || !buf || !size || xor(*buf, *size))
		return set_errnull(EINVAL);

#if HAVE_FUNC_GETSERVBYNAME_R_6

	/* Six argument variant: the result pointer comes back through &ret */

	{
		struct servent *ret;
		int err;

		/* Start with 128 bytes when the caller supplied no buffer */

		if (*size == 0 && !(*buf = malloc(*size = 128)))
			return NULL;

		/* Double the buffer for as long as the call fails and errno is ERANGE */

		while ((err = getservbyname_r(name, proto, servbuf, *buf, *size, &ret)) && errno == ERANGE)
			if (!mem_resize((char **)buf, *size <<= 1))
				return NULL;

		return (err) ? NULL : ret;
	}

#elif HAVE_FUNC_GETSERVBYNAME_R_5

	/* Five argument variant: the result pointer is the return value */

	{
		struct servent *ret;

		/* Start with 128 bytes when the caller supplied no buffer */

		if (*size == 0 && !(*buf = malloc(*size = 128)))
			return NULL;

		/* Double the buffer for as long as the call fails and errno is ERANGE */

		while (!(ret = getservbyname_r(name, proto, servbuf, *buf, *size)) && errno == ERANGE)
			if (!mem_resize((char **)buf, *size <<= 1))
				return NULL;

		return ret;
	}

#elif HAVE_FUNC_GETSERVBYNAME_R_4

	/* Four argument variant: the scratch buffer is a struct servent_data */

	{
		if (*size == 0)
		{
			/* No buffer yet: allocate a zero-filled struct servent_data */

			if (!(*buf = calloc(1, *size = sizeof(struct servent_data))))
				return NULL;
		}
		else if (*size < sizeof(struct servent_data))
		{
			/* Buffer too small: grow it and zero only the newly added tail */

			size_t oldsize = *size;

			if (!mem_resize((char **)buf, *size = sizeof(struct servent_data)))
				return NULL;

			memset((char *)*buf + oldsize, 0, *size - oldsize);
		}

		if (getservbyname_r(name, proto, servbuf, (struct servent_data *)*buf) == -1)
			return NULL;

		return servbuf;
	}

#else

	/*
	** Some systems use thread specific data. Even if this isn't one of
	** them, we have to return something, even if it isn't threadsafe.
	** If we're here and it's not threadsafe, this system probably doesn't
	** support threads anyway. Of course, that's no consolation if some
	** function further up the stack is in the middle of a getservent()
	** loop.
	*/

	/* Note: this branch returns getservbyname()'s own pointer; servbuf, buf and size are untouched */

	return getservbyname(name, proto);

#endif
}

/*

=item C<int net_options(int sockfd, sockopt_t *sockopts)>

Sets an arbitrary number of socket options for the socket C<sockfd>. The
options to set are specified by C<sockopts> which is an array of
C<sockopt_t> structures. Each I<sockopt_t> structure contains the C<level>,
C<optname>, C<optval> and C<optlen> parameters to be passed to
I<setsockopt(2)>. The array must end with a structure whose C<optval>
element is C<null>. On success, returns C<0>. On error, returns C<-1> with
C<errno> set appropriately. If I<setsockopt(2)> returns an error,
I<net_options(3)> will continue to set any further options but will
ultimately return an error itself.

=cut

*/

int net_options(int sockfd, sockopt_t *sockopts)
{
	/*
	** Applies every entry of sockopts, up to the first one whose optval
	** is null, with setsockopt(2). A failing entry does not stop the
	** walk; it only turns the final result into -1.
	*/

	int status = 0;
	sockopt_t *opt = sockopts;

	if (sockfd == -1)
		return set_errno(EBADF);

	if (!sockopts)
		return set_errno(EINVAL);

	while (opt->optval)
	{
		if (setsockopt(sockfd, opt->level, opt->optname, opt->optval, opt->optlen) == -1)
			status = -1;

		++opt;
	}

	return status;
}

/*
** Frees a net_interface_t together with the four address buffers it points
** to. A null iface is ignored.
*/
static void iface_release(net_interface_t *iface)
{
	if (!iface)
		return;

	free(iface->addr);
	free(iface->brdaddr);
	free(iface->dstaddr);
	free(iface->hwaddr);
	free(iface);
}

/*

=item C<List *net_interfaces(void)>

Returns the list of network interfaces. For each interface, calls
I<ioctl(2)> to obtain the interface's flags, hardware address, network
address, broadcast address if applicable, destination address if applicable,
MTU and index. On success, returns a list of I<net_interface_t> objects. It
is the caller's responsibility to deallocate the list with
I<list_release(3)>. On error, returns C<null> with C<errno> set
appropriately. Note that on Solaris, neither the hardware address nor the
index can be returned. This function guesses the index in this case which
seems to work. If the C<RES_OPTIONS> environment variable contains the
string C<"inet6">, then only IPv6 interfaces are returned. Otherwise, only
IPv4 interfaces are returned.

=cut

*/

List *net_interfaces(void)
{
	/* Same as net_interfaces_with_locker() with a null locker */

	return net_interfaces_with_locker(NULL);
}

/*

=item C<List *net_interfaces_with_locker(Locker *locker)>

Equivalent to I<net_interfaces(3)> except that multiple threads accessing
the returned list will be synchronised by C<locker>.

=cut

*/

List *net_interfaces_with_locker(Locker *locker)
{
	/*
	** The address family defaults to AF_INET. Where AF_INET6 is available
	** and inet6_required() says so, AF_INET6 is used instead.
	*/

	int family = AF_INET;

#ifdef AF_INET6
	if (inet6_required())
		family = AF_INET6;
#endif

	return net_interfaces_by_family_with_locker(family, locker);
}

/*

=item C<List *net_interfaces_by_family(int family)>

Equivalent to I<net_interfaces(3)> except that C<family> specifies the
required address family.

=cut

*/

List *net_interfaces_by_family(int family)
{
	/* Same as net_interfaces_by_family_with_locker() with a null locker */

	return net_interfaces_by_family_with_locker(family, NULL);
}

/*

=item C<List *net_interfaces_by_family_with_locker(int family, Locker *locker)>

Equivalent to I<net_interfaces_with_locker(3)> except that C<family>
specifies the required address family.

=cut

*/

/*
** Builds a list of net_interface_t objects, one per entry returned by the
** SIOCGIFCONF ioctl on a datagram socket of the given family.
**
** family - address family used to create the query socket.
** locker - passed to list_create_with_locker() for the returned list.
**
** For every entry the flags and name are recorded. For entries that are up,
** the address is copied and, where the corresponding ioctls exist, the
** hardware, broadcast and destination addresses are fetched. The MTU is -1
** when it cannot be obtained; the index is a running counter when
** SIOCGIFINDEX is unavailable or fails.
**
** Returns the list, or NULL on failure. All failure paths release the list,
** the ioctl buffer and the socket, and preserve the errno of the call that
** failed.
*/
List *net_interfaces_by_family_with_locker(int family, Locker *locker)
{
	List *ret;
	int sockfd;
	size_t size, stride, lastsize = 0;
	char *buf = NULL, *ptr;
	struct ifconf ifc[1];
	int index = 0;
	int saved_errno;

	if (!(ret = list_create_with_locker(locker, (list_release_t *)iface_release)))
		return NULL;

	if ((sockfd = socket(family, SOCK_DGRAM, 0)) == -1)
		goto fail;

	/*
	** Obtain the list of network interfaces. The buffer is grown until two
	** successive successful calls report the same length, which shows that
	** the result was not cut short by the buffer size.
	*/

	for (size = 100 * sizeof(struct ifreq); ; size += 10 * sizeof(struct ifreq))
	{
		if (!mem_resize(&buf, size))
			goto fail;

		ifc->ifc_len = size;
		ifc->ifc_buf = buf;

		if (ioctl(sockfd, SIOCGIFCONF, ifc) == -1)
		{
			/* EINVAL before any success is treated as "buffer too small": retry larger */

			if (errno != EINVAL || lastsize != 0)
				goto fail;
		}
		else
		{
			if ((size_t)ifc->ifc_len == lastsize)
				break;

			lastsize = ifc->ifc_len;
		}
	}

	/* Obtain details of each network interface */

	for (ptr = buf; ptr < buf + lastsize; )
	{
		struct ifreq *ifr = (struct ifreq *)ptr;
		struct ifreq ifrcopy[1];
		net_interface_t *iface;

		if (!(iface = calloc(1, sizeof(net_interface_t))))
			goto fail;

		if (!list_append(ret, iface))
		{
			/* The list never took ownership, so iface is freed here */

			saved_errno = errno;
			mem_release(iface);
			errno = saved_errno;
			goto fail;
		}

		/*
		** Work out the address size and the distance to the next entry.
		** With sa_len, an entry is the name followed by the address, but
		** never shorter than a whole struct ifreq. Without sa_len, the
		** entries are whole struct ifreq objects, which can be larger than
		** the name plus a struct sockaddr (e.g. on 64-bit Linux).
		*/

#ifdef HAVE_SOCKADDR_SA_LEN
		size = ifr->ifr_addr.sa_len;
		stride = sizeof ifr->ifr_name + size;

		if (stride < sizeof(struct ifreq))
			stride = sizeof(struct ifreq);
#else
		switch (ifr->ifr_addr.sa_family)
		{
#ifdef AF_INET6
			case AF_INET6:
				size = sizeof(sockaddr_in6_t);
				break;
#endif
			case AF_INET:
			default:
				size = sizeof(sockaddr_t);
				break;
		}

		stride = sizeof(struct ifreq);
#endif

		ptr += stride;
		*ifrcopy = *ifr;

		/* Get the interface's flags */

		if (ioctl(sockfd, SIOCGIFFLAGS, ifrcopy) == -1)
			goto fail;

		iface->flags = ifrcopy->ifr_flags;
		strlcpy(iface->name, ifr->ifr_name, IFNAMSIZ);

		/* Get the interface's address */

		if (iface->flags & IFF_UP)
		{
			if (!(iface->addr = calloc(1, size)))
				goto fail;

			memcpy(iface->addr, &ifr->ifr_addr, size);

			/* Get the interface's hardware address */

#ifdef SIOCGIFHWADDR
			if (ioctl(sockfd, SIOCGIFHWADDR, ifrcopy) == 0)
			{
				if (!(iface->hwaddr = mem_new(sockaddr_t)))
					goto fail;

				memcpy(iface->hwaddr, &ifrcopy->ifr_hwaddr, sizeof(sockaddr_t));
			}
#endif

			/* Get the interface's broadcast address */

#ifdef SIOCGIFBRDADDR
			if (iface->flags & IFF_BROADCAST)
			{
				if (ioctl(sockfd, SIOCGIFBRDADDR, ifrcopy) == -1)
					goto fail;

				if (!(iface->brdaddr = calloc(1, size)))
					goto fail;

				memcpy(iface->brdaddr, &ifrcopy->ifr_broadaddr, size);
			}
#endif

			/* Get the interface's destination address (for Point-To-Point) */

#ifdef SIOCGIFDSTADDR
			if (iface->flags & IFF_POINTOPOINT)
			{
				if (ioctl(sockfd, SIOCGIFDSTADDR, ifrcopy) == -1)
					goto fail;

				if (!(iface->dstaddr = calloc(1, size)))
					goto fail;

				memcpy(iface->dstaddr, &ifrcopy->ifr_dstaddr, size);
			}
#endif
		}

		/* Get the interface's Maximum Transmission Unit */

#ifdef SIOCGIFMTU
		if (ioctl(sockfd, SIOCGIFMTU, ifrcopy) != -1)
			iface->mtu = ifrcopy->ifr_mtu;
		else
#endif
			iface->mtu = -1;

		/* Get the interface's index */

#ifdef SIOCGIFINDEX
		if (ioctl(sockfd, SIOCGIFINDEX, ifrcopy) != -1)
			iface->index = ifrcopy->ifr_ifindex;
		else
#endif
			iface->index = ++index; /* Must fake it under Solaris */
	}

	mem_release(buf);
	close(sockfd);

	return ret;

fail:

	/* Release everything acquired so far without losing the original errno */

	saved_errno = errno;
	list_release(ret);
	mem_release(buf);

	if (sockfd != -1)
		close(sockfd);

	errno = saved_errno;

	return NULL;
}

/*

=item C<rudp_t *rudp_create(void)>

Allocates and initialises a retransmission timeout estimator for providing
reliability over UDP. It is the caller's responsibility to deallocate the
estimator using I<rudp_release(3)> or I<rudp_destroy(3)>. Note that each
retransmission timer may only be used for a single destination address. If a
UDP socket communicates with multiple peers, a separate estimator must be
used for each peer. On success, returns the RTO estimator. On error, returns
C<null> with C<errno> set appropriately. See the EXAMPLES section.

=cut

*/

/* Retransmission timeout computed from an estimator's fields: srtt plus four times rttvar */
#define	RUDP_RTO_CALC(rudp) ((rudp)->srtt + (4.0 * (rudp)->rttvar))

/* Clamps rto to the range bounded by RUDP_RXTMIN and RUDP_RXTMAX */
static double rudp_minmax(double rto)
{
	double clamped = rto;

	if (clamped < RUDP_RXTMIN)
		clamped = RUDP_RXTMIN;
	else if (clamped > RUDP_RXTMAX)
		clamped = RUDP_RXTMAX;

	return clamped;
}

static int rudp_init(rudp_t *rudp)
{
	/*
	** Resets the RTO estimator, C<rudp>, to its starting values. Must be
	** called when reliable UDP transactions time out. On success, returns
	** C<0>. On error, returns C<-1> with C<errno> set appropriately. See
	** the EXAMPLES section.
	*/

	if (rudp == NULL)
		return set_errno(EINVAL);

	rudp->sequence = 0;

	/* The timeout is derived from the fields set just before it, then clamped */

	rudp->rtt = 0.0;
	rudp->srtt = 0.0;
	rudp->rttvar = 0.75;
	rudp->rto = rudp_minmax(RUDP_RTO_CALC(rudp));

	return 0;
}

rudp_t *rudp_create(void)
{
	rudp_t *rudp;
	struct timeval now[1];

	if (gettimeofday(now, NULL) == -1)
		return NULL;

	if (!(rudp = mem_new(rudp_t)))
		return NULL;

	rudp->base = now->tv_sec;
	rudp_init(rudp);

	return rudp;
}

/*

=item C<void rudp_release(rudp_t *rudp)>

Releases (deallocates) the RTO estimator, C<rudp>. See the EXAMPLES section.

=cut

*/

void rudp_release(rudp_t *rudp)
{
	/* The estimator is handed straight to free(3); nothing else is cleaned up here */
	free(rudp);
}

/*

=item C<void *rudp_destroy(rudp_t **rudp)>

Destroys (deallocates and sets to C<null>) the RTO estimator, C<*rudp>.
Returns C<null>.

=cut

*/

void *rudp_destroy(rudp_t **rudp)
{
	/* Nothing to do without a handle, or when the handle is already null */
	if (!rudp || !*rudp)
		return NULL;

	rudp_release(*rudp);
	*rudp = NULL;

	return NULL;
}

/*

C<uint32_t rudp_timestamp(rudp_t *rudp)>

Returns the number of milliseconds since C<rudp> was created in a 32 bit
integer. This number needs to be stored in reliable UDP packet headers so
that the round trip time can be calculated. On error, returns
C<(uint32_t)-1> with C<errno> set appropriately. See the EXAMPLES section.

*/

static uint32_t rudp_timestamp(rudp_t *rudp)
{
	struct timeval tv;

	if (rudp == NULL)
		return (uint32_t)set_errno(EINVAL);

	if (gettimeofday(&tv, NULL) == -1)
		return (uint32_t)-1;

	/* Whole seconds since rudp->base scaled to ms, plus the current sub-second ms */
	return (uint32_t)((tv.tv_sec - rudp->base) * 1000) + (tv.tv_usec / 1000);
}

/*

C<uint32_t rudp_newpack(rudp_t *rudp)>

Prepares the RTO estimator, C<rudp>, for a new packet that is about to be
sent and returns a 32 bit sequence number for this new packet. This number
needs to be stored in reliable UDP packet headers so that the round trip
time can be calculated. On error, returns C<(uint32_t)-1> with C<errno> set
appropriately. See the EXAMPLES section.

*/

static uint32_t rudp_newpack(rudp_t *rudp)
{
	if (rudp == NULL)
		return (uint32_t)set_errno(EINVAL);

	/* A new packet starts with no retransmissions against it */
	rudp->nrexmt = 0;

	return ++rudp->sequence;
}

/*

C<double rudp_start(rudp_t *rudp)>

Returns C<rudp>'s current retransmission timeout in seconds. On error,
returns C<-1.0> with C<errno> set appropriately. See the EXAMPLES section.

*/

static double rudp_start(rudp_t *rudp)
{
	/* The current RTO is reported as is; a null estimator is an EINVAL error */
	return rudp ? rudp->rto : (double)set_errno(EINVAL);
}

/*

C<int rudp_stop(rudp_t *rudp, uint32_t rtt)>

Updates the RTO estimator C<rudp>. C<rtt> is the round trip time in
milliseconds. Call this after successfully receiving a response to a
reliable UDP packet. On success, returns C<0>. On error, returns C<-1> with
C<errno> set appropriately. See the EXAMPLES section.

*/

static int rudp_stop(rudp_t *rudp, uint32_t rtt)
{
	double err;

	if (rudp == NULL)
		return set_errno(EINVAL);

	/* Record the new sample in seconds (rtt arrives in milliseconds) */
	rudp->rtt = rtt / 1000.0;

	/* How far the sample is from the smoothed estimate */
	err = rudp->rtt - rudp->srtt;

	/* Move srtt an eighth of the way, and rttvar a quarter of the way */
	rudp->srtt += err / 8;
	rudp->rttvar += (fabs(err) - rudp->rttvar) / 4;

	rudp->rto = rudp_minmax(RUDP_RTO_CALC(rudp));

	return 0;
}

/*

C<int rudp_timeout(rudp_t *rudp)>

Informs C<rudp> that its retransmission timer has expired. This causes
C<rudp>'s RTO to double until the retransmission limit (3) is reached at
which point it returns C<-1> with C<errno> set to C<ETIMEDOUT>. On success,
returns C<0>. On error, returns C<-1> with C<errno> set appropriately. See
the EXAMPLES section.

*/

static int rudp_timeout(rudp_t *rudp)
{
	if (rudp == NULL)
		return set_errno(EINVAL);

	/* Back off: the next wait is twice as long (not clamped here) */
	rudp->rto *= 2;
	rudp->nrexmt += 1;

	/* Give up once the retransmission count passes RUDP_MAXNREXMT */
	return (rudp->nrexmt > RUDP_MAXNREXMT) ? set_errno(ETIMEDOUT) : 0;
}

/*

=item C<ssize_t net_rudp_transact(int sockfd, rudp_t *rudp, const void *obuf, size_t osize, void *ibuf, size_t isize)>

Provides reliable (not infallible) UDP transactions over C<sockfd>, a socket
created with I<net_udp_client(3)> or I<net_create_client(3)>. Sends C<osize>
bytes, starting at C<obuf>, to the address to which C<sockfd> is connected.
C<rudp> is the retransmission timeout estimator as created by
I<rudp_create(3)>. The message is prepended by an 8 byte header that
contains a timestamp and a sequence number. This is required to enable
calculation of the RTT. The peer must expect this header and include it
verbatim in its response. Note that the same retransmission timeout
estimator (C<rudp>) should be used for all transactions. Waits for a
response. If the retransmission timer expires before a response is received,
the retransmission timer is updated and the packet is retransmitted. This
continues until either a response is received or the packet has been
retransmitted three times with no response. If there is a response, at most
I<isize> bytes are received in C<ibuf>. On success, returns the number of
bytes received. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

ssize_t net_rudp_transact(int sockfd, rudp_t *rudp, const void *obuf, size_t osize, void *ibuf, size_t isize)
{
	/*
	** Delegates to net_rudp_transactwith() with both flags arguments
	** zero and no peer address (addr null, addrsize 0).
	*/
	return net_rudp_transactwith(sockfd, rudp, obuf, osize, 0, ibuf, isize, 0, NULL, 0);
}

/*

=item C<ssize_t net_rudp_transactwith(int sockfd, rudp_t *rudp, const void *obuf, size_t osize, int oflags, void *ibuf, size_t isize, int iflags, sockaddr_any_t *addr, size_t addrsize)>

Equivalent to I<net_rudp_transact(3)> except that C<sockfd> is a socket
created with I<net_udp_server(3)> or I<net_create_server(3)>. C<addr> is the
address of the peer. C<addrsize> is the size of C<addr>. I<sendmsg(2)> and
I<recvmsg(2)> are used instead of using I<writev(2)> and I<readv(2)>.
C<oflags> is passed to I<sendmsg(2)> as the C<flags> argument. C<iflags> is
passed to I<recvmsg(2)> as the C<flags> argument. Note that each
retransmission timer may only be used for a single destination address. If a
UDP socket communicates with multiple peers, a separate estimator must be
used for each peer. On success, returns the number of bytes received. On
error, returns C<-1> with C<errno> set appropriately. The EXAMPLES section
below contains the code for this function.

=cut

*/

ssize_t net_rudp_transactwith(int sockfd, rudp_t *rudp, const void *obuf, size_t osize, int oflags, void *ibuf, size_t isize, int iflags, sockaddr_any_t *addr, size_t addrsize)
{
	/*
	** Sends obuf behind an 8 byte header (sequence number + timestamp)
	** and waits for a reply carrying the same sequence number. When the
	** wait times out, the packet is sent again with a fresh timestamp
	** until rudp_timeout() reports failure. Returns the number of payload
	** bytes received (header excluded), or -1 on error.
	**
	** The header fields are stored exactly as held in memory (no byte
	** order conversion is applied here); the reply's timestamp is only
	** compared against this host's own clock.
	*/
	struct { uint32_t sequence, timestamp; } ohdr[1], ihdr[1];
	struct msghdr omsg[1], imsg[1];
	struct iovec ovec[2], ivec[2];
	int sequence;
	uint32_t timestamp;
	double timeout;
	long timeout_sec;
	long timeout_usec;
	ssize_t bytes;

	if (sockfd < 0 || !rudp || !obuf || !osize || !ibuf || !isize)
		return set_errno(EINVAL);

	/* The message headers are only needed for the sendmsg/recvmsg path */

	if (addr)
	{
		memset(omsg, 0, sizeof omsg);
		omsg->msg_name = (void *)addr;
		omsg->msg_namelen = addrsize;
		omsg->msg_iov = ovec;
		omsg->msg_iovlen = 2;

		memset(imsg, 0, sizeof imsg);
		imsg->msg_iov = ivec;
		imsg->msg_iovlen = 2;
	}

	/* Scatter/gather vectors: element 0 is the header, element 1 the payload */

	ovec[0].iov_base = (void *)ohdr;
	ovec[0].iov_len = sizeof ohdr;
	ovec[1].iov_base = (void *)obuf;
	ovec[1].iov_len = osize;

	ivec[0].iov_base = (void *)ihdr;
	ivec[0].iov_len = sizeof ihdr;
	ivec[1].iov_base = ibuf;
	ivec[1].iov_len = isize;

	/* One sequence number per transaction; it is reused for retransmissions */

	if ((sequence = rudp_newpack(rudp)) == -1)
		return -1;

	ohdr->sequence = sequence;

sendagain:

	/* Every (re)transmission carries a fresh timestamp */

	if ((timestamp = rudp_timestamp(rudp)) == -1)
		return -1;

	ohdr->timestamp = timestamp;

	if (addr)
	{
		if (sendmsg(sockfd, omsg, oflags) == -1)
			return -1;
	}
	else
	{
		if (writev(sockfd, ovec, 2) == -1)
			return -1;
	}

	/* Split the current RTO (seconds, as a double) into whole seconds and microseconds */

	if ((timeout = rudp_start(rudp)) == -1)
		return -1;

	timeout_sec = (long)timeout;
	timeout_usec = (long)((timeout - timeout_sec) * 1000000);

recvagain:

	if (read_timeout(sockfd, timeout_sec, timeout_usec) == -1)
	{
		/* Timed out with retries left: back off and resend */

		if (errno == ETIMEDOUT && rudp_timeout(rudp) != -1)
			goto sendagain;

		/* Any other failure, or retries exhausted: reset the estimator */

		rudp_init(rudp);

		return -1;
	}

	if (addr)
	{
		if ((bytes = recvmsg(sockfd, imsg, iflags)) == -1)
			return -1;
	}
	else
	{
		if ((bytes = readv(sockfd, ivec, 2)) == -1)
			return -1;
	}

	/* Ignore runt packets and replies to a different sequence number; wait again with the same timeout values */

	if (bytes < sizeof ihdr || ihdr->sequence != ohdr->sequence)
		goto recvagain;

	/* RTT in ms = current timestamp minus the timestamp echoed in the reply */

	if (rudp_stop(rudp, rudp_timestamp(rudp) - ihdr->timestamp) == -1)
		return -1;

	return bytes - sizeof ihdr;
}

/*

=item C<ssize_t net_pack(int sockfd, long timeout, int flags, const char *format, ...)>

Creates a packet containing data packed by I<pack(3)> as specified by
C<format> and sends it on the connected socket, C<sockfd>, with I<send(2)>.
If C<timeout> is non-zero, it is the number of seconds to wait for the send
buffer to have enough space for the new data before timing out (This only
applies to TCP sockets since UDP has no send buffer). C<flags> is passed to
I<send(2)>. This is intended for use with UDP. It can work reliably with TCP
but only when the application protocol involves each peer packing and
unpacking alternately, each waiting for the other's response before making
their next response. On success, returns the number of bytes packed and
sent. On error, returns C<-1> with C<errno> set appropriately.

Note, the I<net_pack(3)> functions can sometimes be inappropriate as they
inherently involve copying existing data into a new buffer before writing
it. It is much faster to not copy the data at all. When possible (i.e. when
the data is already in network byte order), use I<writev(2)> instead to
write multiple non-contiguous buffers in a single system call.

=cut

*/

ssize_t net_pack(int sockfd, long timeout, int flags, const char *format, ...)
{
	/*
	** Variadic front end for net_vpack(): collects the arguments after
	** format into a va_list and returns whatever net_vpack() returns.
	** The result is held in an ssize_t so that it is not narrowed
	** through an int on the way out.
	*/
	va_list args;
	ssize_t rc;

	va_start(args, format);
	rc = net_vpack(sockfd, timeout, flags, format, args);
	va_end(args);

	return rc;
}

/*

=item C<ssize_t net_vpack(int sockfd, long timeout, int flags, const char *format, va_list args)>

Equivalent to I<net_pack(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

ssize_t net_vpack(int sockfd, long timeout, int flags, const char *format, va_list args)
{
	/*
	** Packs the arguments into a local MSG_SIZE byte buffer with vpack(),
	** optionally waits (timeout seconds) for the socket to become
	** writable, then sends the packed bytes with send(2). Returns the
	** result of send(2), or -1 if packing or waiting fails.
	*/
	char buf[MSG_SIZE];
	ssize_t rc; /* same type as vpack()'s return value, so nothing is narrowed */

	if ((rc = vpack(buf, MSG_SIZE, format, args)) == -1)
		return -1;

	/* A zero timeout means don't wait for writability at all */

	if (timeout && write_timeout(sockfd, timeout, 0) == -1)
		return -1;

	return send(sockfd, buf, (size_t)rc, flags);
}

/*

=item C<ssize_t net_packto(int sockfd, long timeout, int flags, const sockaddr_t *to, size_t tosize, const char *format, ...)>

Creates a packet containing data packed by I<pack(3)> as specified by
C<format> and sends it on the unconnected socket, C<sockfd>, to the address
specified by C<to> with length C<tosize> with I<sendto(2)>. C<flags> is
passed to I<sendto(2)>. If C<timeout> is non-zero, it is the number of
seconds to wait for the send buffer to have enough space for the new data
before timing out. This only applies to TCP sockets since UDP has no send
buffer. On success, returns the number of bytes packed and sent. On error,
returns C<-1> with C<errno> set appropriately.

=cut

*/

ssize_t net_packto(int sockfd, long timeout, int flags, const sockaddr_t *to, size_t tosize, const char *format, ...)
{
	/*
	** Variadic front end for net_vpackto(): collects the arguments after
	** format into a va_list and returns whatever net_vpackto() returns.
	** The result is held in an ssize_t so that it is not narrowed
	** through an int on the way out.
	*/
	va_list args;
	ssize_t rc;

	va_start(args, format);
	rc = net_vpackto(sockfd, timeout, flags, to, tosize, format, args);
	va_end(args);

	return rc;
}

/*

=item C<ssize_t net_vpackto(int sockfd, long timeout, int flags, const sockaddr_t *to, size_t tosize, const char *format, va_list args)>

Equivalent to I<net_packto(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

ssize_t net_vpackto(int sockfd, long timeout, int flags, const sockaddr_t *to, size_t tosize, const char *format, va_list args)
{
	/*
	** Packs the arguments into a local MSG_SIZE byte buffer with vpack(),
	** optionally waits (timeout seconds) for the socket to become
	** writable, then sends the packed bytes to the address to/tosize with
	** sendto(2). Returns the result of sendto(2), or -1 if packing or
	** waiting fails.
	*/
	char buf[MSG_SIZE];
	ssize_t rc; /* same type as vpack()'s return value, so nothing is narrowed */

	if ((rc = vpack(buf, MSG_SIZE, format, args)) == -1)
		return -1;

	/* A zero timeout means don't wait for writability at all */

	if (timeout && write_timeout(sockfd, timeout, 0) == -1)
		return -1;

	return sendto(sockfd, buf, (size_t)rc, flags, to, tosize);
}

/*

=item C<ssize_t net_unpack(int sockfd, long timeout, int flags, const char *format, ...)>

Receives a packet of data on the connected socket, C<sockfd>, with
I<recv(2)>, and unpacks it with I<unpack(3)> as specified by C<format>.
C<flags> is passed to I<recv(2)>. C<timeout> is the number of seconds to
wait before timing out. On success, returns the number of bytes received and
unpacked. On error, returns C<-1> with C<errno> set appropriately.

Note, the I<net_unpack(3)> functions can sometimes be inappropriate as they
inherently involve reading data into a single buffer and then copying it
into multiple target buffers. It is much faster to not copy the data at all.
When possible (i.e. when the data is already in network byte order), use
I<readv(2)> instead to read into multiple non-contiguous buffers in a single
system call.

=cut

*/

ssize_t net_unpack(int sockfd, long timeout, int flags, const char *format, ...)
{
	/*
	** Variadic front end for net_vunpack(): collects the arguments after
	** format into a va_list and returns whatever net_vunpack() returns.
	** The result is held in an ssize_t so that it is not narrowed
	** through an int on the way out.
	*/
	va_list args;
	ssize_t rc;

	va_start(args, format);
	rc = net_vunpack(sockfd, timeout, flags, format, args);
	va_end(args);

	return rc;
}

/*

=item C<ssize_t net_vunpack(int sockfd, long timeout, int flags, const char *format, va_list args)>

Equivalent to I<net_unpack(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

ssize_t net_vunpack(int sockfd, long timeout, int flags, const char *format, va_list args)
{
	/*
	** Waits up to timeout seconds for sockfd to become readable, receives
	** one packet of at most MSG_SIZE bytes with recv(2) and unpacks the
	** bytes actually received with vunpack(). Returns the result of
	** vunpack(), or -1 if waiting or receiving fails.
	*/
	char buf[MSG_SIZE];
	ssize_t rc; /* same type as recv(2)'s return value, so nothing is narrowed */

	if (read_timeout(sockfd, timeout, 0) == -1)
		return -1;

	if ((rc = recv(sockfd, buf, MSG_SIZE, flags)) == -1)
		return -1;

	/* Only the bytes that were received are offered to vunpack() */

	return vunpack(buf, (size_t)rc, format, args);
}

/*

=item C<ssize_t net_unpackfrom(int sockfd, long timeout, int flags, sockaddr_t *from, size_t *fromsize, const char *format, ...)>

Receives a packet of data on the unconnected socket, C<sockfd>, with
I<recvfrom(2)>, and unpacks it with I<unpack(3)> as specified by C<format>.
If C<from> is non-C<null>, the source address of the message is stored
there. C<fromsize> is a value-result parameter, initialized to the size of
the C<from> buffer, and modified on return to indicate the actual size of
the address stored there. C<flags> is passed to I<recvfrom(2)>. C<timeout>
is the number of seconds to wait before timing out. On success, returns the
number of bytes received and unpacked. On error, returns C<-1> with C<errno>
set appropriately.

=cut

*/

ssize_t net_unpackfrom(int sockfd, long timeout, int flags, sockaddr_t *from, size_t *fromsize, const char *format, ...)
{
	/*
	** Variadic front end for net_vunpackfrom(): collects the arguments
	** after format into a va_list and returns whatever net_vunpackfrom()
	** returns. The result is held in an ssize_t so that it is not
	** narrowed through an int on the way out.
	*/
	va_list args;
	ssize_t rc;

	va_start(args, format);
	rc = net_vunpackfrom(sockfd, timeout, flags, from, fromsize, format, args);
	va_end(args);

	return rc;
}

/*

=item C<ssize_t net_vunpackfrom(int sockfd, long timeout, int flags, sockaddr_t *from, size_t *fromsize, const char *format, va_list args)>

Equivalent to I<net_unpackfrom(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

ssize_t net_vunpackfrom(int sockfd, long timeout, int flags, sockaddr_t *from, size_t *fromsize, const char *format, va_list args)
{
	/*
	** Waits up to timeout seconds for sockfd to become readable, receives
	** one packet of at most MSG_SIZE bytes with recvfrom(2) and unpacks
	** the bytes actually received with vunpack(). When fromsize is not
	** null, it supplies the size of the from buffer on entry and receives
	** the size of the stored address on success. Returns the result of
	** vunpack(), or -1 if waiting or receiving fails.
	*/
	char buf[MSG_SIZE];
	socklen_t fromlen = 0;
	ssize_t rc; /* same type as recvfrom(2)'s return value, so nothing is narrowed */

	if (read_timeout(sockfd, timeout, 0) == -1)
		return -1;

	/*
	** recvfrom(2) takes a socklen_t *, which need not be as wide as a
	** size_t. Passing the caller's size_t pointer through a cast would
	** make the call read and write only part of that object, so the
	** length travels through a local of the right type instead.
	*/

	if (fromsize)
		fromlen = (socklen_t)*fromsize;

	if ((rc = recvfrom(sockfd, buf, MSG_SIZE, flags, from, fromsize ? &fromlen : NULL)) == -1)
		return -1;

	if (fromsize)
		*fromsize = (size_t)fromlen;

	return vunpack(buf, (size_t)rc, format, args);
}

/*

=item C<ssize_t pack(void *buf, size_t size, const char *format, ...)>

Packs data into C<buf> as described by C<format>. The arguments after
C<format> contain the data to be packed. C<size> is the size of C<buf>.
Returns the number of bytes packed on success, or -1 on error with C<errno>
set appropriately.

Note, this is based on the I<pack(3)> function in I<perl(1)> (in fact, the
following documentation is from I<perlfunc(1)>) except that the C<*> count
specifier has different semantics, the C<?> count specifier is new, there's
no non-C<nul>-terminated strings or machine dependent formats or uuencoding
or BER integer compression, everything is in network byte order, and floats
are represented as strings so I<pack(3)> is suitable for serialising data to
be written to disk or sent across a network to other hosts. OK, C<v> and
C<w> specifically aren't in network order but sometimes that's needed too.

C<format> can contain the following type specifiers:

    a   A string with arbitrary binary data
    z   A nul terminated string, will be nul padded
    b   A bit string (rounded out to nearest byte boundary)
    h   A hexadecimal string (rounded out to nearest byte boundary)
    c   A char (8 bits)
    s   A short (16 bits)
    i   An int (32 bits)
    l   A long (64 bits - only on some systems)
    f   A single-precision float (length byte + text + nul)
    d   A double-precision float (length byte + text + nul)
    v   A short in "VAX" (little-endian) order (16 bits)
    w   An int in "VAX" (little-endian) order (32 bits)
    p   A pointer (32 bits)
    x   A nul byte
    X   Back up a byte
    @   Null fill to absolute position

The following rules apply:

Each letter may optionally be followed by a number giving a repeat count or
length, or by C<"*"> or C<"?">. A C<"*"> will obtain the repeat count or
length from the next argument (like I<printf(3)>). The count argument must
appear before the first corresponding data argument. When unpacking C<"a">,
C<"z">, C<"b"> or C<"h">, a C<"?"> will obtain the repeat count or length
from the I<size_t> object pointed to by the next argument and the size of
the target buffer in the argument after that. These two arguments must
appear before the first corresponding target buffer argument. This enables
unpacking packets that contain length fields without risking target buffer
overflow.

With all types except C<"a">, C<"z">, C<"b"> and C<"h"> the I<pack(3)>
function will gobble up that many arguments.

The C<"a"> and C<"z"> types gobble just one value, but pack it as a string
of length count (specified by the corresponding number), truncating or
padding with C<nul>s as necessary. It is the caller's responsibility to
ensure that the data arguments point to sufficient memory. When unpacking,
C<"z"> strips everything after the first C<nul>, and C<"a"> returns data
verbatim.

Likewise, the C<"b"> field packs a string that many bits long.

The C<"h"> field packs a string that many nybbles long.

The C<"p"> type packs a pointer. You are responsible for ensuring the memory
pointed to is not a temporary value (which can potentially get deallocated
before you get around to using the packed result). A C<null> pointer is
unpacked if the corresponding packed value for C<"p"> is C<null>. Of course,
C<"p"> is useless if the packed data is to be sent over a network to another
process.

The integer formats C<"c">, C<"s">, C<"i"> and C<"l"> are all in network
byte order and so can safely be packed for sending over a network to another
process. However, C<"l"> relies on a non-ISO C 89 language feature (namely,
the I<long long int> type which is in ISO C 99) and so should not be used in
portable code, even if it is supported on the local system. There is no
guarantee that a long long packed on one system will be unpackable on
another. At least not until C99 is more widespread.

Real numbers (floats and doubles) are packed in text format. Due to the
multiplicity of floating point formats around, this is done to safely
transport real numbers across a network to another process.

It is the caller's responsibility to ensure that there are sufficient
arguments provided to satisfy the requirements of C<format>.

=cut

*/

ssize_t pack(void *buf, size_t size, const char *format, ...)
{
	/*
	** Variadic front end for vpack(): collects the arguments after format
	** into a va_list and returns whatever vpack() returns. The result is
	** held in an ssize_t so that a byte count is not narrowed through an
	** int on the way out.
	*/
	va_list args;
	ssize_t rc;

	va_start(args, format);
	rc = vpack(buf, size, format, args);
	va_end(args);

	return rc;
}

/*

=item C<ssize_t vpack(void *buf, size_t size, const char *format, va_list args)>

Equivalent to I<pack(3)> with the variable argument list specified directly
as for I<vprintf(3)>.

=cut

*/

/*
** GET_COUNT() reads the optional repeat count that follows a type
** specifier into the local variable count. "*" takes the count from the
** next size_t argument, a run of decimal digits is parsed in place, and
** anything else leaves the count at 1. A count that is zero (or too large
** to be a positive ssize_t) makes the enclosing function return
** set_errno(EINVAL). Expects format, args and count to be in scope.
**
** The character is converted to unsigned char before being given to
** isdigit(3), whose argument must be representable as an unsigned char
** (or be EOF).
*/

#define GET_COUNT() \
	count = 1; \
	if (*format == '*') \
		++format, count = va_arg(args, size_t); \
	else if (isdigit((unsigned char)*format)) \
		for (count = 0; isdigit((unsigned char)*format); ++format) \
			count *= 10, count += *format - '0'; \
	if ((ssize_t)count < 1) \
		return set_errno(EINVAL);

/*
** CHECK_SPACE(required) makes the enclosing function return
** set_errno(ENOSPC) unless at least required bytes remain between the
** cursor p and the end of the packet (pkt + size). The test compares
** byte counts rather than computing p + required, because that pointer
** sum could overflow for a large required value and slip past the check.
** Expects p, pkt and size to be in scope.
*/

#define CHECK_SPACE(required) \
	if ((size_t)(p - pkt) > size || (size_t)(required) > size - (size_t)(p - pkt)) \
		return set_errno(ENOSPC);

ssize_t vpack(void *buf, size_t size, const char *format, va_list args)
{
	/*
	** Walks format one type specifier at a time, taking the data for each
	** from args and appending its packed form at the cursor p, which
	** starts at the beginning of buf (pkt). GET_COUNT() reads the repeat
	** count or length for each specifier and CHECK_SPACE() refuses to
	** write past pkt + size. Returns the number of bytes between pkt and
	** the final cursor position, or the result of set_errno() on error.
	*/
	size_t count;
	unsigned char *pkt = buf;
	unsigned char *p = pkt;
	char tmp[128]; /* text form of a float or double */

	if (!pkt || !format)
		return set_errno(EINVAL);

	while (*format)
	{
		switch (*format++)
		{
			case 'a': /* A string with arbitrary binary data */
			{
				void *data;
				GET_COUNT()
				CHECK_SPACE(count)
				if (!(data = va_arg(args, void *)))
					return set_errno(EINVAL);
				memcpy(p, data, count);
				p += count;
				break;
			}

			case 'z': /* A nul terminated string, will be nul padded */
			{
				char *data;
				size_t len;
				GET_COUNT()
				CHECK_SPACE(count)
				if (!(data = va_arg(args, char *)))
					return set_errno(EINVAL);
				/* Copy at most count bytes of text, then nul fill the rest of the field */
				len = strlen(data);
				if (len > count)
					len = count;
				memcpy(p, data, len);
				p += len;
				count -= len;
				if (count)
					memset(p, 0, count);
				p += count;
				break;
			}

			case 'b': /* A bit string (rounded out to nearest byte boundary) */
			{
				char *data;
				unsigned char byte;
				int shift;
				GET_COUNT()
				CHECK_SPACE((count + 7) >> 3)
				if (!(data = va_arg(args, char *)))
					return set_errno(EINVAL);
				/* Bits are filled from the most significant end of each byte */
				byte = 0x00;
				shift = 7;
				while (count--)
				{
					switch (*data++)
					{
						case '0':
							break;
						case '1':
							byte |= 1 << shift;
							break;
						default:
							return set_errno(EINVAL);
					}
					if (--shift == -1)
					{
						*p++ = byte;
						byte = 0x00;
						shift = 7;
					}
				}
				/* Flush a partially filled final byte */
				if (shift != 7)
					*p++ = byte;
				break;
			}

			case 'h': /* A hex string (rounded out to nearest byte boundary) */
			{
				char *data;
				unsigned char byte;
				int shift;
				GET_COUNT()
				CHECK_SPACE((count + 1) >> 1)
				if (!(data = va_arg(args, char *)))
					return set_errno(EINVAL);
				/* The high nybble of each byte is filled first */
				byte = 0x00;
				shift = 4;
				while (count--)
				{
					unsigned char nybble = *data++;
					switch (nybble)
					{
						case '0': case '1': case '2': case '3': case '4':
						case '5': case '6': case '7': case '8': case '9':
							byte |= (nybble - '0') << shift;
							break;
						case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
							byte |= (nybble - 'a' + 10) << shift;
							break;
						case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
							byte |= (nybble - 'A' + 10) << shift;
							break;
						default:
							return set_errno(EINVAL);
					}
					if ((shift -= 4) == -4)
					{
						*p++ = byte;
						byte = 0x00;
						shift = 4;
					}
				}
				/* Flush a final byte that only received its high nybble */
				if (shift != 4)
					*p++ = byte;
				break;
			}

			case 'c': /* A char (8 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count)
				while (count--)
					*p++ = (unsigned char)va_arg(args, int);
				break;
			}

			case 's': /* A short (16 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 1)
				while (count--)
				{
					unsigned short data = (unsigned short)va_arg(args, int);
					*p++ = (data >> 8) & 0xff;
					*p++ = data & 0xff;
				}

				break;
			}

			case 'i': /* An int (32 bits) */
			case 'p': /* A pointer (32 bits) */
			{
				/* Both are read from the argument list as an int and written most significant byte first */
				GET_COUNT()
				CHECK_SPACE(count << 2)
				while (count--)
				{
					unsigned long data = (unsigned long)va_arg(args, int);
					*p++ = (data >> 24) & 0xff;
					*p++ = (data >> 16) & 0xff;
					*p++ = (data >> 8) & 0xff;
					*p++ = data & 0xff;
				}

				break;
			}

#ifdef HAVE_LONG_LONG
			case 'l': /* A long (64 bits - only on some systems) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 3)
				while (count--)
				{
					unsigned long long data = (unsigned long long)va_arg(args, long long);
					*p++ = (data >> 56) & 0xff;
					*p++ = (data >> 48) & 0xff;
					*p++ = (data >> 40) & 0xff;
					*p++ = (data >> 32) & 0xff;
					*p++ = (data >> 24) & 0xff;
					*p++ = (data >> 16) & 0xff;
					*p++ = (data >> 8) & 0xff;
					*p++ = data & 0xff;
				}

				break;
			}
#else
			case 'l': /* A long (64 bits - only on some systems) */
			{
				return set_errno(ENOSYS);
			}
#endif

			case 'f': /* A single-precision float (length byte + text + nul) */
			case 'd': /* A double-precision float (length byte + text + nul) */
			{
				/* Floats arrive as doubles through the argument list, so both share this code */
				GET_COUNT()
				while (count--)
				{
					double data = va_arg(args, double);
					int rc = snprintf(tmp, 128, "%g", data);
					size_t len;
					if (rc == -1 || rc >= 128)
						return set_errno(ENOSPC);
					/* len covers the text and its nul; one more byte holds len itself */
					len = strlen(tmp) + 1;
					CHECK_SPACE(len + 1)
					*p++ = len & 0xff;
					memcpy(p, tmp, len);
					p += len;
				}

				break;
			}

			case 'v': /* A short in "VAX" (little-endian) order (16 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 1)
				while (count--)
				{
					unsigned short data = (unsigned short)va_arg(args, int);
					*p++ = data & 0xff;
					*p++ = (data >> 8) & 0xff;
				}

				break;
			}

			case 'w': /* An int in "VAX" (little-endian) order (32 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 2)
				while (count--)
				{
					unsigned int data = (unsigned int)va_arg(args, int);
					*p++ = data & 0xff;
					*p++ = (data >> 8) & 0xff;
					*p++ = (data >> 16) & 0xff;
					*p++ = (data >> 24) & 0xff;
				}

				break;
			}

			case 'x': /* A nul byte */
			{
				GET_COUNT()
				CHECK_SPACE(count)
				memset(p, 0, count);
				p += count;
				break;
			}

			case 'X': /* Back up a byte */
			{
				GET_COUNT()
				/*
				** Compare against the number of bytes written so far
				** rather than forming p - count first: that pointer
				** would be invalid when count is too large, and could
				** wrap around and slip past a comparison with pkt.
				*/
				if (count > (size_t)(p - pkt))
					return set_errno(EINVAL);
				p -= count;
				break;
			}

			case '@': /* Null fill to absolute position */
			{
				GET_COUNT()
				if (count > size)
					return set_errno(ENOSPC);
				/* The target position may not lie behind the cursor */
				if (pkt + count < p)
					return set_errno(EINVAL);
				memset(p, 0, count - (p - pkt));
				p += count - (p - pkt);
				break;
			}

			default:
			{
				return set_errno(EINVAL);
			}
		}
	}

	return p - pkt;
}

/*

=item C<ssize_t unpack(void *buf, size_t size, const char *format, ...)>

Unpacks the data in C<buf> which was packed by I<pack(3)>. C<size> is the
size of C<buf>. C<format> must be equivalent to the C<format> argument to
the call to I<pack(3)> that packed the data. The remaining arguments must be
pointers to variables that will hold the unpacked data or C<null>. If any
are C<null> the corresponding data will be skipped (i.e. not unpacked).
Unpacked C<"z">, C<"b"> and C<"h"> strings are always C<nul> terminated. It
is the caller's responsibility to ensure that the pointers into which these
strings are unpacked contain enough memory (count + 1 bytes). It is the
caller's responsibility to ensure that the non-C<null> pointers into which
C<"a"> strings are unpacked also contain enough memory (count bytes). It is
the caller's responsibility to ensure that there are sufficient arguments
supplied to satisfy the requirements of C<format>, even if they are just
C<null> pointers. Returns the number of bytes unpacked on success, or -1 on
error.

=cut

*/

ssize_t unpack(void *buf, size_t size, const char *format, ...)
{
	/*
	** Variadic front end for vunpack(): collects the arguments after
	** format into a va_list and returns whatever vunpack() returns. The
	** result is held in an ssize_t so that a byte count is not narrowed
	** through an int on the way out.
	*/
	va_list args;
	ssize_t rc;

	va_start(args, format);
	rc = vunpack(buf, size, format, args);
	va_end(args);

	return rc;
}

/*

=item C<ssize_t vunpack(void *buf, size_t size, const char *format, va_list args)>

Equivalent to I<unpack(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

/*
** GET_COUNT_LIMIT() reads the optional count that follows a type
** specifier into the local variables count and limit. "*" takes both
** from the next size_t argument. "?" takes count from the size_t object
** pointed to by the next argument and limit from the size_t argument
** after that. A run of decimal digits sets both to the parsed number.
** Anything else leaves both at 1. The enclosing function returns
** set_errno(EINVAL) when either value is zero (or too large to be a
** positive ssize_t) or when the "?" pointer is null, and
** set_errno(ENOSPC) when count exceeds limit. Expects format, args,
** count and limit to be in scope.
**
** The character is converted to unsigned char before being given to
** isdigit(3), whose argument must be representable as an unsigned char
** (or be EOF).
*/

#define GET_COUNT_LIMIT() \
	limit = count = 1; \
	if (*format == '*') \
		++format, limit = count = va_arg(args, size_t); \
	else if (*format == '?') \
	{ \
		size_t *countp = va_arg(args, size_t *); \
		if (!countp) \
			return set_errno(EINVAL); \
		count = *countp; \
		limit = va_arg(args, size_t); \
		++format; \
	} \
	else if (isdigit((unsigned char)*format)) \
	{ \
		for (count = 0; isdigit((unsigned char)*format); ++format) \
			count *= 10, count += *format - '0'; \
		limit = count; \
	} \
	if ((ssize_t)count < 1 || (ssize_t)limit < 1) \
		return set_errno(EINVAL); \
	if (count > limit) \
		return set_errno(ENOSPC);

/*
** CHECK_SKIP(count, action): when the target pointer data is null, steps
** the cursor p over count packed bytes and then performs action (break or
** continue at the call sites) instead of unpacking. Expects data and p to
** be in scope.
*/

#define CHECK_SKIP(count, action) \
	if (!data) \
	{ \
		p += (count); \
		action; \
	}

ssize_t vunpack(void *buf, size_t size, const char *format, va_list args)
{
	unsigned char *pkt = buf;
	unsigned char *p = pkt;
	size_t count, limit;

	if (!pkt || !format)
		return set_errno(EINVAL);

	while (*format)
	{
		switch (*format++)
		{
			case 'a': /* A string with arbitrary binary data */
			{
				void *data;
				GET_COUNT_LIMIT()
				CHECK_SPACE(count)
				data = va_arg(args, void *);
				CHECK_SKIP(count, break)
				memcpy(data, p, count);
				p += count;
				break;
			}

			case 'z': /* A nul terminated string, will be nul padded */
			{
				char *data;
				size_t len;
				GET_COUNT_LIMIT()
				CHECK_SPACE(count)
				data = va_arg(args, char *);
				CHECK_SKIP(count, break)
				for (len = 0; p + len < pkt + size && p[len]; ++len)
					;
				if (len > count)
					len = count;
				memcpy(data, p, len);
				p += len;
				count -= len;
				memset(data + len, 0, count ? count : 1);
				p += count;
				break;
			}

			case 'b': /* A bit string (rounded out to nearest byte boundary) */
			{
				char bin[] = "01";
				char *data;
				int shift;
				GET_COUNT_LIMIT()
				CHECK_SPACE((count + 7) >> 3)
				data = va_arg(args, char *);
				CHECK_SKIP((count + 7) >> 3, break)
				shift = 7;
				while (count--)
				{
					*data++ = bin[(*p & (0x01 << shift)) >> shift];
					if (--shift == -1)
						++p, shift = 7;
				}
				if (shift != 7)
					++p;
				*data = '\0';
				break;
			}

			case 'h': /* A hex string (rounded out to nearest byte boundary) */
			{
				char hex[] = "0123456789abcdef";
				char *data;
				int shift;
				GET_COUNT_LIMIT()
				CHECK_SPACE((count + 1) >> 1)
				data = va_arg(args, char *);
				CHECK_SKIP((count + 1) >> 1, break)
				shift = 4;
				while (count--)
				{
					*data++ = hex[(*p & (0x0f << shift)) >> shift];
					if ((shift -= 4) == -4)
						++p, shift = 4;
				}
				if (shift != 4)
					++p;
				*data = '\0';
				break;
			}

			case 'c': /* A char (8 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count)
				while (count--)
				{
					signed char *data = va_arg(args, signed char *);
					CHECK_SKIP(1, continue)
					*data = (signed char)*p++;
				}
				break;
			}

			case 's': /* A short (16 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 1)
				while (count--)
				{
					signed short *data = va_arg(args, signed short *);
					CHECK_SKIP(2, continue)
					*data = (signed short)*p++ << 8;
					*data |= *p++;
				}
				break;
			}

			case 'i': /* An int (32 bits) */
			case 'p': /* A pointer (32 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 2)
				while (count--)
				{
					signed long *data = va_arg(args, signed long *);
					CHECK_SKIP(4, continue)
					*data = (signed long)*p++ << 24;
					*data |= (signed long)*p++ << 16;
					*data |= (signed long)*p++ << 8;
					*data |= (signed long)*p++;
				}

				break;
			}

			case 'v': /* A short in "VAX" (little-endian) order (16 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 1)
				while (count--)
				{
					signed short *data = va_arg(args, signed short *);
					CHECK_SKIP(2, continue)
					*data = *p++;
					*data |= (unsigned short)*p++ << 8;
				}
				break;
			}

			case 'w': /* An int in "VAX" (little-endian) order (32 bits) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 2)
				while (count--)
				{
					signed int *data = va_arg(args, signed int *);
					CHECK_SKIP(4, continue)
					*data = (signed long)*p++;
					*data |= (signed long)*p++ << 8;
					*data |= (signed long)*p++ << 16;
					*data |= (signed long)*p++ << 24;
				}

				break;
			}

#ifdef HAVE_LONG_LONG
			case 'l': /* A long (64 bits - only on some systems) */
			{
				GET_COUNT()
				CHECK_SPACE(count << 3)
				while (count--)
				{
					signed long long *data = va_arg(args, signed long long *);
					CHECK_SKIP(8, continue)
					*data = (signed long long)*p++ << 56;
					*data |= (signed long long)*p++ << 48;
					*data |= (signed long long)*p++ << 40;
					*data |= (signed long long)*p++ << 32;
					*data |= (signed long long)*p++ << 24;
					*data |= (signed long long)*p++ << 16;
					*data |= (signed long long)*p++ << 8;
					*data |= (signed long long)*p++;
				}

				break;
			}
#else
			case 'l':
			{
				return set_errno(ENOSYS);
			}
#endif

			case 'f': /* A single-precision float (length byte + text + nul) */
			{
				GET_COUNT()
				while (count--)
				{
					float *data = va_arg(args, float *);
					size_t len;
					CHECK_SPACE(1);
					len = (size_t)*p++;
					CHECK_SPACE(len)
					CHECK_SKIP(len, continue)
					sscanf((const char *)p, "%g", data);
					p += len;
				}

				break;
			}

			case 'd': /* A double-precision float (length byte + text + nul) */
			{
				GET_COUNT()
				while (count--)
				{
					double *data = va_arg(args, double *);
					size_t len;
					CHECK_SPACE(1);
					len = (size_t)*p++;
					CHECK_SPACE(len);
					CHECK_SKIP(len, continue)
					sscanf((const char *)p, "%lg", data);
					p += len;
				}

				break;
			}

			case 'x': /* A nul byte */
			{
				GET_COUNT()
				CHECK_SPACE(count)
				p += count;
				break;
			}

			case 'X': /* Back up a byte */
			{
				GET_COUNT()
				if (p - count < pkt)
					return set_errno(EINVAL);
				p -= count;
				break;
			}

			case '@': /* Null fill to absolute position */
			{
				GET_COUNT()
				if (count > size)
					return set_errno(ENOSPC);
				if (pkt + count < p)
					return set_errno(EINVAL);
				p += count - (p - pkt);
				break;
			}

			default:
			{
				return set_errno(EINVAL);
			}
		}
	}

	return p - pkt;
}

/*

=item C<ssize_t net_read(int sockfd, long timeout, char *buf, size_t count)>

Repeatedly calls I<read(2)> on the connection oriented socket, C<sockfd>,
until C<count> bytes have been read into C<buf> or until EOF is encountered
or until it times out (after C<timeout> seconds). On success, returns the
number of bytes read. On error, returns C<-1> with C<errno> set
appropriately.

=cut

*/

ssize_t net_read(int sockfd, long timeout, char *buf, size_t count)
{
	size_t total = 0;

	/* Keep reading until the request is satisfied, EOF is seen or an error occurs */

	while (total < count)
	{
		ssize_t got;

		/* Wait (up to timeout seconds) for the socket to become readable */

		if (read_timeout(sockfd, timeout, 0) == -1)
			return -1;

		got = read(sockfd, buf + total, count - total);

		if (got == -1)
			return -1;

		/* EOF: report whatever has been read so far */

		if (got == 0)
			break;

		total += (size_t)got;
	}

	return (ssize_t)total;
}

/*

=item C<ssize_t net_write(int sockfd, long timeout, const char *buf, size_t count)>

Repeatedly calls I<write(2)> on the connection oriented socket, C<sockfd>,
until C<count> bytes from C<buf> have been written or until it times out
(after C<timeout> seconds). On success, returns the number of bytes written.
On error, returns C<-1>.

=cut

*/

ssize_t net_write(int sockfd, long timeout, const char *buf, size_t count)
{
	size_t done = 0;

	/* Keep writing until everything has been handed to the kernel */

	while (done < count)
	{
		ssize_t put;

		/* Wait (up to timeout seconds) for the socket to become writable */

		if (write_timeout(sockfd, timeout, 0) == -1)
			return -1;

		put = write(sockfd, buf + done, count - done);

		/* A failed or zero-length write ends the transfer; pass its result back as is */

		if (put <= 0)
			return put;

		done += (size_t)put;
	}

	return (ssize_t)done;
}

/*

=item C<ssize_t net_expect(int sockfd, long timeout, const char *format, ...)>

Expects and confirms a formatted text message from a remote connection on
the socket, C<sockfd>. C<timeout> is the number of seconds to wait before
timing out. If C<timeout> is 0, times out immediately. On success, returns
the number of conversions performed (see I<scanf(3)>). When the connection
closes, returns C<0>. On error, returns C<-1> with C<errno> set
appropriately.

B<Note:> This is generally unreliable. When TCP segments get lost in
transit, the resent bytes can form part of a larger segment so the
"boundaries" that you may expect in your input can fail to appear. This can
lead to lost data (read but not expected). This can only really be used
safely when the application protocol involves each peer reading and writing
alternately, each waiting for the other's response before making their
next response. In short, I<net_expect(3)> should only be used in concert
with I<net_send(3)>.

=cut

*/

ssize_t net_expect(int sockfd, long timeout, const char *format, ...)
{
	ssize_t conversions;
	va_list ap;

	/* Bundle the variable arguments and let net_vexpect() do the work */

	va_start(ap, format);
	conversions = net_vexpect(sockfd, timeout, format, ap);
	va_end(ap);

	return conversions;
}

/*

=item C<ssize_t net_vexpect(int sockfd, long timeout, const char *format, va_list args)>

Equivalent to I<net_expect(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

ssize_t net_vexpect(int sockfd, long timeout, const char *format, va_list args)
{
	char msg[MSG_SIZE + 1];
	ssize_t len;

	/* Wait (up to timeout seconds) for something to arrive */

	if (read_timeout(sockfd, timeout, 0) == -1)
		return -1;

	/* A single read: at most MSG_SIZE bytes, leaving room for the nul byte */

	len = read(sockfd, msg, MSG_SIZE);

	if (len <= 0)
		return len;

	/* Terminate the received text so that it can be scanned as a string */

	msg[len] = '\0';

	return vsscanf(msg, format, args);
}

/*

=item C<ssize_t net_send(int sockfd, long timeout, const char *format, ...)>

Sends a formatted string (see I<printf(3)>) to a remote connection on the
socket, C<sockfd>. C<timeout> is the number of seconds to wait before timing
out. On success, returns the number of bytes written. On error, returns
C<-1> with C<errno> set appropriately.

=cut

*/

ssize_t net_send(int sockfd, long timeout, const char *format, ...)
{
	ssize_t written;
	va_list ap;

	/* Bundle the variable arguments and let net_vsend() do the work */

	va_start(ap, format);
	written = net_vsend(sockfd, timeout, format, ap);
	va_end(ap);

	return written;
}

/*

=item C<ssize_t net_vsend(int sockfd, long timeout, const char *format, va_list args)>

Equivalent to I<net_send(3)> with the variable argument list specified
directly as for I<vprintf(3)>.

=cut

*/

ssize_t net_vsend(int sockfd, long timeout, const char *format, va_list args)
{
	char buf[MSG_SIZE + 1];
	ssize_t bytes;

	/* Format the message into a local buffer (at most MSG_SIZE bytes plus nul) */

	bytes = vsnprintf(buf, MSG_SIZE + 1, format, args);

	/*
	 * vsnprintf() reports failure with any negative value, not just -1, and
	 * reports truncation by returning the length that would have been needed.
	 * Either way the message can't be sent. A negative length must never
	 * reach net_write() where it would be converted to a huge size_t.
	 */

	if (bytes < 0 || bytes > MSG_SIZE)
		return set_errno(ENOSPC);

	return net_write(sockfd, timeout, buf, bytes);
}

/*

=item C<ssize_t sendfd(int sockfd, const void *buf, size_t nbytes, int flags, int fd)>

Sends the open file descriptor, C<fd>, to another process (related or
unrelated) on the other end of the UNIX domain socket, C<sockfd>. Equivalent
to I<send(2)> in all other respects. UNIX domain sockets can be created
using I<net_client(3)> or I<net_server(3)> with a first argument of
C<"/unix"> or using I<socketpair(2)> or I<pipe(2)> (under SVR4). It is safe
to I<close(2)> (and even I<unlink(2)>) the file descriptor after sending it.
The kernel won't really close it (or delete it) until the receiving process
closes the descriptor. If the sender doesn't close C<fd>, both processes
share the same file table entry in the kernel. This means sharing file
position if the descriptor refers to a regular file. If the receiver doesn't
receive the file descriptor with I<recvfd(3)> when it is sent, the
descriptor will be closed. A file descriptor must always be passed along
with some normal data. Linux doesn't support calling I<recv(2)> with a
C<null> buffer or zero length. On success, returns the number of bytes sent
(as for I<send(2)>). On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

ssize_t sendfd(int sockfd, const void *buf, size_t nbytes, int flags, int fd)
{
	/*
	 * Start from an all-zero message header so that any field that isn't
	 * assigned below (e.g. msg_flags or platform-specific members) has a
	 * defined value when the header is handed to sendmsg(2).
	 */

	struct msghdr mesg[1] = {{0}};
	struct iovec iov[1];

#ifdef HAVE_MSGHDR_MSG_CONTROL

/* Solaris8 doesn't have these */

#ifndef CMSG_ALIGN
#define CMSG_ALIGN(len) (((len) + sizeof(size_t) - 1) & (size_t)~(sizeof(size_t) - 1))
#endif

#ifndef CMSG_SPACE
#define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len))
#endif

#ifndef CMSG_LEN
#define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len))
#endif

	/* The union forces the control buffer to be aligned for a struct cmsghdr */

	union
	{
		struct cmsghdr align;
		char control[CMSG_SPACE(sizeof(int))];
	}
	control;

	struct cmsghdr *cmsg;

	mesg->msg_control = control.control;
	mesg->msg_controllen = sizeof control.control;

	/* A single SCM_RIGHTS control message whose payload is the descriptor */

	cmsg = CMSG_FIRSTHDR(mesg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;

	*((int *)CMSG_DATA(cmsg)) = fd;

#else

	/* Older interface: the descriptor travels as "access rights" */

	mesg->msg_accrights = (caddr_t)&fd;
	mesg->msg_accrightslen = sizeof(int);

#endif

	/* No destination address is supplied */

	mesg->msg_name = NULL;
	mesg->msg_namelen = 0;

	/* The caller's ordinary data accompanies the descriptor */

	mesg->msg_iov = iov;
	mesg->msg_iovlen = 1;

	iov->iov_base = (void *)buf;
	iov->iov_len = nbytes;

	/* The result of sendmsg(2) is returned unchanged */

	return sendmsg(sockfd, mesg, flags);
}

/*

=item C<ssize_t recvfd(int sockfd, void *buf, size_t nbytes, int flags, int *fd)>

Receives an open file descriptor (which is stored in C<*fd>) from another
process (related or unrelated) on the other end of the UNIX domain socket,
C<sockfd>. Equivalent to I<recv(2)> in all other respects. UNIX domain
sockets can be created using I<net_client(3)> or I<net_server(3)> with a
first argument of C<"/unix"> or using I<socketpair(2)> or I<pipe(2)> (under
SVR4). If the sender doesn't close the file descriptor, both processes share
the same file table entry in the kernel. This means sharing file position if
the descriptor refers to a regular file. If the sender sends the same file
descriptor multiple times, all received file descriptors also share the same
file table entry in the kernel. If the receiver doesn't receive the file
descriptor with I<recvfd(3)> when it is sent with I<sendfd(3)>, the
descriptor will be closed. A file descriptor must always be passed along
with some normal data. Linux doesn't support calling I<recv(2)> with a
C<null> buffer or zero length. Don't set C<MSG_PEEK> in C<flags> (the
results are unpredictable). On success, returns the number of bytes
received. On error, returns C<-1> with C<errno> set appropriately. An
end-of-file (zero bytes received) is also reported as C<-1>. If the file
descriptor was not passed, C<*fd> is set to C<-1>.

=cut

*/

ssize_t recvfd(int sockfd, void *buf, size_t nbytes, int flags, int *fd)
{
	/*
	 * Start from an all-zero message header so that no field is handed to
	 * recvmsg(2) with an indeterminate value.
	 */

	struct msghdr mesg[1] = {{0}};
	struct iovec iov[1];
	ssize_t rc;

#ifdef HAVE_MSGHDR_MSG_CONTROL

	/* The union forces the control buffer to be aligned for a struct cmsghdr */

	union
	{
		struct cmsghdr align;
		char control[CMSG_SPACE(sizeof(int))];
	}
	control;

	struct cmsghdr *cmsg;

	mesg->msg_control = control.control;
	mesg->msg_controllen = sizeof control.control;

#else

	/* Older interface: the descriptor arrives as "access rights" */

	int newfd;

	mesg->msg_accrights = (caddr_t)&newfd;
	mesg->msg_accrightslen = sizeof(int);

#endif

	if (!fd)
		return set_errno(EINVAL);

	/*
	 * Nothing has been received yet. Setting this up front means that *fd
	 * is well defined on every return path, including when recvmsg(2)
	 * fails or reports end-of-file.
	 */

	*fd = -1;

	/* The sender's address is not wanted */

	mesg->msg_name = NULL;
	mesg->msg_namelen = 0;

	/* The ordinary data that accompanies the descriptor goes into buf */

	mesg->msg_iov = iov;
	mesg->msg_iovlen = 1;

	iov->iov_base = buf;
	iov->iov_len = nbytes;

	/* Both an error and a zero byte result are reported as -1 */

	if ((rc = recvmsg(sockfd, mesg, flags)) <= 0)
		return -1;

#ifdef HAVE_MSGHDR_MSG_CONTROL

	/* Only accept a control message carrying exactly one SCM_RIGHTS descriptor */

	if ((cmsg = CMSG_FIRSTHDR(mesg)) && cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
	{
		if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
			*fd = *((int *)CMSG_DATA(cmsg));
	}

#else

	*fd = (mesg->msg_accrightslen == sizeof(int)) ? newfd : -1;

#endif

	return rc;
}

#if 0
/*

C<ssize_t recvcred(int sockfd, void *buf, size_t nbytes, int flags, struct fcred *cred)>

Receives the user credentials of the process on the other end of the UNIX
domain socket, C<sockfd> and stores them in C<*cred>. Equivalent to
I<recv(2)> in all other respects. Requires that the C<LOCAL_CREDS> socket
option (with level C<0>) has been set for C<sockfd> in advance. On datagram
sockets, user credentials accompany every datagram. On stream sockets, user
credentials are sent only once, the first time data is sent. On success,
returns the number of bytes received. On error, returns C<-1> with C<errno>
set appropriately. If the user credentials were not provided by the kernel,
C<cred> is filled with zero bytes (so C<fc_ngroups == 0>).

*/

#ifndef MAXLOGNAME
#define MAXLOGNAME 16
#endif

/*
 * User credentials as copied out of an SCM_CREDS control message by
 * recvcred(). The effective group id is fc_groups[0] (see fc_gid).
 */
struct fcred
{
	uid_t fc_ruid;             /* real user id */
	gid_t fc_rgid;             /* real group id */
	char fc_login[MAXLOGNAME]; /* setlogin() name */
	uid_t fc_uid;              /* effective user id */
	short fc_ngroups;          /* number of groups */
	gid_t fc_groups[NGROUPS];  /* supplementary group ids */
};

#define fc_gid fc_groups[0]    /* effective group id */

static ssize_t recvcred(int sockfd, void *buf, size_t nbytes, int flags, struct fcred *cred)
{
	struct msghdr mesg[1];
	struct iovec iov[1];
	ssize_t rc;

	/* The union forces the control buffer to be aligned for a struct cmsghdr */

	union
	{
		struct cmsghdr align;
		char control[CMSG_SPACE(sizeof(struct fcred))];
	}
	control;

	struct cmsghdr *cmsg;

	mesg->msg_control = control.control;
	mesg->msg_controllen = sizeof control.control;

	mesg->msg_name = NULL;
	mesg->msg_namelen = 0;

	mesg->msg_iov = iov;
	mesg->msg_iovlen = 1;

	iov->iov_base = buf;
	iov->iov_len = nbytes;

	if ((rc = recvmsg(sockfd, mesg, flags)) == -1)
		return rc;

	if (cred)
	{
		/*
		 * Credentials are only present when the kernel returned at least a
		 * whole control message header. (The original test was inverted: it
		 * rejected every control message that was larger than a bare header,
		 * so the credentials were never copied.)
		 */

		if (mesg->msg_controllen < sizeof(struct cmsghdr) || !(cmsg = CMSG_FIRSTHDR(mesg)) || cmsg->cmsg_len != CMSG_LEN(sizeof(struct fcred)) || cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_CREDS)
			memset(cred, 0, sizeof(struct fcred));
		else
			memcpy(cred, CMSG_DATA(cmsg), sizeof(struct fcred));
	}

	return rc;
}
#endif

/*

=item C<int mail(const char *server, const char *sender, const char *recipients, const char *subject, const char *message)>

Sends a mail message consisting of C<subject> and C<message> from C<sender>
to the addresses in C<recipients>. C<recipients> contains mail addresses
separated by sequences of comma and/or space characters. C<message> must not
contain any lines containing only a C<'.'> character. On success, returns
C<0>. On error, returns C<-1> with C<errno> set appropriately.

=cut

*/

/*
 * Sends an SMTP "RCPT TO" command over smtp for each address in recipients
 * (addresses are separated by commas and/or spaces) and expects a 250 reply
 * to each one. On success, returns 0. On error, returns -1: EINVAL when
 * recipients contains no addresses, EPROTO when the server's reply isn't
 * 250, otherwise whatever the failing call set.
 */

static int rcpt(int smtp, const char *recipients)
{
	List *list = split(recipients, ", ");

	if (!list)
		return -1;

	/* An empty list must still be released before giving up */

	if (list_length(list) <= 0)
	{
		list_release(list);
		return set_errno(EINVAL);
	}

	while (list_has_next(list) == 1)
	{
		char *recipient = cstr((String *)list_next(list));
		int rc, code;

		if (net_send(smtp, 10, "RCPT TO: <%s>\r\n", recipient) == -1 ||
			(rc = net_expect(smtp, 10, "%d", &code)) == -1)
		{
			list_release(list);
			return -1;
		}

		/* rc is checked first so that code is only examined when it was parsed */

		if (rc != 1 || code != 250)
		{
			list_release(list);
			return set_errno(EPROTO);
		}
	}

	list_release(list);
	return 0;
}

int mail(const char *server, const char *sender, const char *recipients, const char *subject, const char *message)
{
	int smtp;
	int code;
	int rc;
	char c;

	if (!sender || !recipients)
		return set_errno(EINVAL);

	if ((smtp = net_client(server, "smtp", 25, 5, 0, 0, NULL, NULL)) == -1)
		return -1;

	/*
	 * Helpers for the dialogue below. Every failure closes the connection
	 * before returning. Note that try_expect expands to TWO statements (the
	 * I/O check and the reply check) so it must never be used as the
	 * unbraced body of a loop or conditional.
	 */

#define fail { close(smtp); return -1; }
#define try(action) if ((action) == -1) fail
#define try_send(args) try(net_send args)
#define try_expect(args, cnv, resp) try(rc = net_expect args) \
	if (rc != (cnv) || code != (resp)) { close(smtp); return set_errno(EPROTO); }

	net_tos_lowdelay(smtp);

	/* The greeting: a 220 code followed by '-' when more lines follow */

	try_expect((smtp, 10, "%d%c", &code, &c), 2, 220)

	/*
	 * The braces are essential. Without them only the first half of
	 * try_expect is the loop body: the reply is not checked for each line
	 * and, if the connection closes (net_expect returns 0), c is never
	 * updated and the loop never ends.
	 */

	while (c == '-')
	{
		try_expect((smtp, 10, "%d%c", &code, &c), 2, 220)
	}

	/* The envelope */

	try_send((smtp, 10, "HELO localhost\r\n"))
	try_expect((smtp, 10, "%d", &code), 1, 250)
	try_send((smtp, 10, "MAIL FROM: <%s>\r\n", sender))
	try_expect((smtp, 10, "%d", &code), 1, 250)
	try(rcpt(smtp, recipients))
	try_send((smtp, 10, "DATA\r\n"))
	try_expect((smtp, 10, "%d", &code), 1, 354)

	net_tos_throughput(smtp);

	/* The headers, a blank line, the body and the terminating "." line */

	try_send((smtp, 10, "From: %s\r\n", sender))
	try_send((smtp, 10, "To: %s\r\n", recipients))
	try_send((smtp, 10, "Subject: %s\r\n\r\n", (subject) ? subject : ""))
	try_send((smtp, 10, "%s\r\n.\r\n", (message) ? message : ""))
	try_expect((smtp, 10, "%d", &code), 1, 250)

	net_tos_lowdelay(smtp);

	/* The farewell */

	try_send((smtp, 10, "QUIT\r\n"))
	try_expect((smtp, 10, "%d", &code), 1, 221)
	close(smtp);

	return 0;
}

/*

=back

=head1 SOCKET OPTION NOTES

Here is some vital information about socket options that never made it into
the I<setsockopt(2)> manpage (where it would be most useful). It's from
I<"UNIX Network Programming: Networking APIs: Sockets and XTI (Volume 1)">
by W. Richard Stevens.

=over 4

=item C<SO_LINGER>

Never set this option. There are two (bad) reasons why people set this
option. The first reason is to avoid having to wait until after a socket has
left the C<TIME_WAIT> state before restarting a server that has terminated.
The C<TIME_WAIT> state is your friend. Do not try to avoid it. If you avoid
it, you break TCP. If you break TCP, you will be punished. Set
C<SO_REUSEADDR> instead. This is what it's for.

The other reason is to know when the peer has received all sent data. This
probably doesn't work the way you want. It can only tell you when the peer
TCP has acknowledged the data. It cannot tell you when the peer application
has read the data. To do this, use I<shutdown(2)> with a second argument of
C<SHUT_WR> and then call I<read(2)> until it returns 0. This tells you that
the peer application has read all sent data, knows that it has read all sent
data (because it received your FIN) and has closed its half of the
connection with either I<close(2)> or I<shutdown(2)> with a second argument
of C<SHUT_WR> (because you have received the peer's FIN). Then you can
I<close(2)> the socket safe in the knowledge that no data has been lost.

If you set C<SO_LINGER> with a zero timeout, the peer will think your
application has crashed or aborted the connection (because it receives an
RST). The only time to use C<SO_LINGER> is when this is the behaviour you
want.

=item C<SO_REUSEADDR>

Use this option for every TCP server socket. The net server functions set
this option for every TCP server socket. This means that if your server
dies, the new process that replaces it will be able to bind to the server's
port immediately. This option is also needed when multiple copies of a
multicast application need to run on the same host and C<SO_REUSEPORT> isn't
defined. This option must be set before I<bind(2)>.

=item C<TCP_NODELAY>

Avoid setting this option whenever possible (i.e. most of the time). It
disables the Nagle algorithm. The Nagle algorithm is your friend. It stops
you polluting the network with annoying little packets. If you must set it,
please ensure that the traffic is restricted to your own network and leave
the Internet alone.

Setting this option is often the wrong solution to a bad network programming
practice. If an application protocol involves immediate responses to each
message and exceptionally long delays are experienced, it's probably due to
the message being sent with multiple small I<write(2)>s (e.g. application
header first, then data) instead of a single I<write(2)>.

If a message is sent in small I<write(2)>s, the first I<write(2)> will
result in a small segment being sent. If the data in that segment does not
contain enough information for the peer to respond immediately, the peer TCP
will not ACK the segment until the ACK timer expires (50ms - 200ms). This is
the delayed ACK algorithm. The sending TCP will not send the second small
segment (containing the remainder of the message) until the first small
segment has been acknowledged by the peer TCP. This is the Nagle Algorithm.

The solution to this problem is not to disable the Nagle algorithm, but
rather to modify the application so that the message is sent in a single
call to I<writev(2)>. Avoid copying separate buffers into a single buffer
and then calling I<write(2)> as it is less efficient.

This option should only be set when the peer application does not respond to
each message and there can be no delay in sending the messages (e.g. real
time monitoring systems) or when, even though the peer does respond to each
message, the application can't hang around waiting for the response to the
previous message before sending the next message (e.g. highly interactive
applications like The X Window System).

=item C<SO_SNDBUF>

This option specifies how much unacknowledged data you are willing to have
out in the network before you stop sending data and wait for some
acknowledgement. For bulk transfers, the send and receive buffer sizes need
to be set to the capacity of the pipe (i.e. the bandwidth-delay product)
otherwise throughput will be limited by the buffer sizes rather than by the
network. The bandwidth-delay product is the bandwidth of the network
multiplied by the round trip time. Here are some examples. Note that these
values are for raw bandwidth, not data bandwidth. Actual values will be
smaller due to packet header overhead.

 Network                   | Bandwidth(bps) | RTT(ms) | Buffer(bytes)
 --------------------------+----------------+---------+--------------
 Ethernet LAN (10Mb/s)     |    10,000,000  |    3    |      3,750
 Ethernet LAN (100Mb/s)    |   100,000,000  |    3    |     37,500
 T1, transcontinental      |     1,544,000  |   60    |     11,580
 T1, satellite             |     1,544,000  |  500    |     96,500
 T3, transcontinental      |    45,000,000  |   60    |    337,500
 Gigabit, transcontinental | 1,000,000,000  |   60    |  7,500,000
 Gigabit Satellite Network |   155,520,000  |  500    |  9,720,000
  (SONET OC-3)             |                |         |
 Gigabit Satellite Network |   622,080,000  |  500    | 38,880,000
  (SONET OC-12)            |                |         |

Of course, it's generally impossible to know in advance what the bandwidth
or RTT will be and they can both change during the life of the connection.
Ideally, the kernel would automatically adjust buffer sizes as needed, but
don't hold your breath. Unless you know exactly what kind of network your
application will be running on, it's best to set buffer sizes to values
obtained from the user via a configuration file or user interface. Bear in
mind that most kernels don't support buffer sizes larger than a few hundred
kilobytes anyway.

Also note that TCP over satellite connections can behave very badly.
Everything is fine provided that there's no congestion. However, if a single
packet is lost, throughput will halve due to congestion avoidance, every
segment sent since the lost packet will have to be retransmitted (that's
38MB!) and it takes five minutes to reach maximum throughput again due to
the long RTT. TCP needs selective ACKs to fix this. Fortunately,
Linux (and probably other) systems support selective ACKs.

This option can also be used to avoid the dreaded interaction between the
Nagle Algorithm and Delayed ACK algorithm during bulk data transfer. This
interaction cannot occur during bulk transfer if the send buffer size is at
least 3 times the Maximum Segment Size (MSS). Having a send buffer this
large means that the sender is always capable of sending 2 full segments. If
the receiver's receive buffer size isn't large enough to accept both
segments, it will ACK each segment without delay (to indicate that it is
running out of buffer space). If the receiver's receive buffer size is large
enough to accept both segments, it will ACK every second segment without
delay (so as not to disrupt your TCP's RTT calculations). The buffer size
should actually be an even multiple of the MSS (i.e. at least 4 times the
MSS). Here are some examples.

    Link     | MTU(bytes) | MSS(bytes) | 4*MSS(bytes)
    ---------+------------+------------+--------------
    Ethernet |    1,500   |    1,460   |     5,840
    ATM      |    9,188   |    9,148   |    36,592
    HIPPI    |   65,535   |   65,495   |   261,980

Some TCP implementations automatically round the send and receive buffer
sizes up to an even multiple of the MSS after establishing the connection.
So if you set these options, do so before establishing the connection (i.e.
before I<listen(2)> or I<connect(2)>). The net server and client functions
set these options at the right time if requested.

This option, when set for UDP sockets, limits the maximum datagram size that
can be sent.

=item C<SO_RCVBUF>

Much of what was said about the send buffer size applies to the size of the
peer's receive buffer. If your application is willing to accept large
amounts of data, it needs to advertise the fact by having a large receive
buffer. If the long fat pipe TCP options are required (Window Scale), they
must be negotiated during connection setup (in the SYN packets) so this
option must be set before I<listen(2)> or I<connect(2)>. The net server and
client functions set this option at the right time if requested.

This option, when set for UDP sockets, specifies how many received datagrams
to queue before discarding datagrams.

=item C<SO_KEEPALIVE>

This option causes TCP to send a probe after two hours of inactivity to
check that the connection is still alive. Many people think that two hours
is too long to wait so they implement application level heartbeats instead
(e.g. BGP routing daemons send keepalive packets every 30 seconds). Many
people think that this functionality belongs in the application anyway. The
POSIX.1g standard requires the C<TCP_KEEPALIVE> option which lets you
specify how many seconds to wait before sending the probe but this option
isn't widely implemented yet. Until it is, the C<SO_KEEPALIVE> option is not
very useful.

=back

=head1 PROTOCOL DESIGN NOTES

Here are some things to consider when designing packet headers and
distributed algorithms gleaned from I<"Interconnections: Bridges, Routers,
Switches and Internetworking Protocols"> by Radia Perlman.

=over 4

=item Simplicity versus Flexibility versus Optimality

Simple protocols are more likely to be successfully implemented and
deployed. Various factors complicate a protocol:

=over 4

=item *

Design by committee (multiple ways to do the same thing).

=item *

Backwards Compatibility.

=item *

Flexibility.

=item *

Optimality.

=item *

Underspecification (leaving decisions to the implementer).

=item *

Exotic features.

=back

=item Knowing the problem you're trying to solve

Solve at least one actual problem. Do nothing that is of no use.

=item Overhead and Scaling

Calculate the overhead of algorithms and protocols. Does it scale? How far?
Does it matter?

=item Operation above capacity

If there are assumptions about the size of the problem, either make them
impossibly huge or cope when the limit is exceeded.

=item Compact IDs versus Object Identifiers

Identifiers take two forms: (1) centrally administered numbers (e.g. port
numbers) which are short, fixed size, fast and easy to locate but hard to
obtain, and (2) hierarchical identifiers (e.g. MIB names) with decentralised
administration. These are large, variable size, slow and hard to locate (no
central authority) but easy to obtain.

=item Optimising for the most common or important case

If some information in a packet is rarely needed, make it an option. It is
better for a few packets to be larger and slower than for all other packets
to bear unused overhead.

=item Forward Compatibility

=over 4

=item Large Enough Fields

It is better to overestimate than to underestimate. It makes protocols live
longer.

=item Independence of Layers

Don't assume addresses are IPv4 addresses.

=item Reserved Fields

Spare bits must be transmitted as zero and ignored upon receipt. That way,
they can later be used by future versions to encode features that can safely
be ignored by earlier versions.

=item Version Number Field

Version numbers can be a simple number, or split into major and minor
version components. Minor version increments indicate backwards compatible
changes. Major version increments indicate incompatible changes. If a node
receives a packet with a version it doesn't know about, it should drop it or
respond with the version it does understand. The other node can switch to
the older protocol when it receives this packet. However, nodes should
occasionally forget that the other node speaks an older version of the
protocol to prevent two nodes from incorrectly thinking that the other can
only speak an old version of the protocol.

Avoid having version numbers wrap around by making it huge or by
incrementing versions very rarely. If the version can wrap, make the highest
possible version number indicate that the actual version follows in a larger
field.

=item Options

Another way to provide for future protocol evolutions is to allow options to
be appended. Options should be encoded as <type, length, value> and the
length must be interpreted in the same way for all options. This allows
unknown options to be skipped. Some options should cause the packet to be
dropped. The type field can be used to specify whether the node should skip
the option or drop the packet: e.g. skip options with odd numbered types and
drop packets when options with even numbered types are encountered.

=back

=item Migration

When migrating from one protocol to another incompatible protocol, it's
easiest to keep them separate (e.g. dual IPv4/IPv6 stacks) because migration
can't be done atomically and it can be difficult to translate between two
protocols.

=item Parameters

=over 4

=item *

Have parameters when there are settings that the user may want to control.

=item *

Don't have parameters just because you can't decide on the setting. Who else
will?

=item *

Choose or calculate parameters when possible to reduce human involvement.

=item *

Make it possible to change parameters one at a time throughout a network
without things breaking.

=item *

Nodes can report their parameters to their neighbours so they adjust their
own parameters accordingly or detect misconfiguration.

=back

=item Making Multiprotocol Operation Possible

Have a field that indicates the protocol type. This can allow multiplexing
of mini-protocols within the application if the need ever arises.

=item Robustness

There are three kinds of robustness. I<Simple robustness> is when a node can
cope when other nodes go down. I<Self stabilising robustness> is when, even
though a node may not cope with another node malfunctioning, it will return
to correct behaviour when the malfunctioning node is fixed. I<Byzantine
robustness> is when a node behaves properly even when malicious or
malfunctioning nodes are operating. In this day and age, Byzantine
robustness is a necessity.

=over 4

=item *

Exercise every single line of code, then torture every single line of code.

=item *

Sometimes it's better to crash than to malfunction.

=item *

Sometimes you can partition a network to contain a problem.

=item *

Test connectivity, don't assume it.

=item *

Simple checksums can be tricked. Use MD5 or public key signatures when
practical. Use encryption and authentication when possible (e.g. Secure
Sockets Layer/Secure Shell tunnels).

=item *

Process packets quickly to avoid denial of service attacks.

=back

=item Determinism versus Stability

Elections can be deterministic (the same node wins every time it is up) or
stable (once a node is elected, it stays elected until it goes down). If
every node is configured with a priority, and the election winner increases
its priority by N after winning an election, then you can achieve
deterministic elections by configuring nodes with priorities that differ by
more than N and you can achieve stable elections by configuring nodes with
the same priority.

=item Performance for Correctness

Understand the performance requirements that define a "correct"
implementation. For example, processing packets at wire speed is necessary
to avoid denials of service.

=back

=head1 ERRORS

These are the errors generated by the functions that return -1 on error.
Additional errors may be generated and returned from the underlying system
calls. See their manual pages.

=over 4

=item ENOENT

I<gethostbyname(3)> failed to identify the C<host> or C<interface> argument
passed to one of the socket functions.

=item ENOSYS

I<gethostbyname(3)> returned an address from an unsupported address family.

The C<"l"> format was used with I<pack(3)> or I<unpack(3)> when the system
doesn't support it or it wasn't compiled into I<libslack>.

=item EINVAL

A string argument is C<null>.

A pack format count is not a positive integer.

An unpack count or limit argument is not a positive integer.

An argument containing C<"a">, C<"z">, C<"b"> or C<"h"> data to be packed is
C<null>.

An argument containing C<"b"> data to be packed contains characters outside
the range [01].

An argument containing C<"h"> data to be packed contains characters outside
the range [0-9a-fA-F].

An C<"X"> pack instruction is trying to go back past the start of the
packet.

The count argument to an C<"@"> pack instruction refers to a location before
that where the instruction was encountered (i.e. it's trying to pack
leftwards).

The C<format> argument to I<pack(3)> or I<unpack(3)> contains an illegal
character.

An unpack C<?> indirect count argument is C<null>.

=item ENOSPC

A message was too large to be sent with I<net_send(3)>.

A packet was too small to store all of the data to be packed or unpacked.

An unpack C<?> indirect count argument points to a number greater than the
subsequent limit argument (not enough space in the target buffer).

=item ETIMEDOUT

I<net_expect(3)> or I<net_send(3)> timed out.

=item EPROTO (or EPROTOTYPE on Mac OS X)

I<mail(3)> encountered an error in the dialogue with the SMTP server. The
most likely cause of this is a missing or inadequate domain name for the
sender address on systems where I<sendmail(8)> requires a real domain name.

=back

=head1 MT-Level

MT-Safe

=head1 EXAMPLES

A TCP server:

    #include <slack/std.h>
    #include <slack/net.h>

    void provide_service(int fd) { write(fd, "ok\n", 3); }

    int main()
    {
        int servfd, clntfd;

        if ((servfd = net_server(NULL, "service", 30000, 0, 0, NULL, NULL)) == -1)
            return 1;

        while ((clntfd = accept(servfd, NULL, NULL)) != -1)
        {
            pid_t pid;

            switch (pid = fork())
            {
                case -1: return 1;
                case  0: provide_service(clntfd); _exit(EXIT_SUCCESS);
                default: close(clntfd); break;
            }
        }

        return EXIT_FAILURE; // reached only if accept() fails
    }

A TCP client:

    #include <slack/std.h>
    #include <slack/net.h>

    void request_service(int fd) {} // Do something here
    void process_response(int fd) {} // Do something here

    int main()
    {
        int sockfd;

        if ((sockfd = net_client("localhost", "service", 30000, 5, 0, 0, NULL, NULL)) == -1)
            return EXIT_FAILURE;

        request_service(sockfd);
        process_response(sockfd);
        close(sockfd);
        return EXIT_SUCCESS;
    }

A UDP server:

    #include <slack/std.h>
    #include <slack/net.h>

    void provide_service(char *pkt) {} // Do something here

    int main()
    {
        char pkt[8];
        sockaddr_any_t addr;
        size_t addrsize;
        int servfd;

        if ((servfd = net_udp_server(NULL, "service", 30000, 0, 0, NULL, NULL)) == -1)
            return EXIT_FAILURE;

        for (;;)
        {
            addrsize = sizeof addr;

            if (recvfrom(servfd, pkt, 8, 0, &addr.any, &addrsize) == -1)
                return EXIT_FAILURE;

            provide_service(pkt);

            if (sendto(servfd, pkt, 8, 0, &addr.any, addrsize) == -1)
                return EXIT_FAILURE;
        }

        return EXIT_SUCCESS; // unreached
    }

A UDP client:

    #include <slack/std.h>
    #include <slack/net.h>

    void build_request(char *pkt) {} // Do something here
    void process_response(char *pkt) {} // Do something here

    int main()
    {
        char pkt[8];
        int sockfd = net_udp_client("localhost", "service", 30000, 0, 0, NULL, NULL);
        if (sockfd == -1)
            return EXIT_FAILURE;

        build_request(pkt);

        if (send(sockfd, pkt, 8, 0) == -1)
            return EXIT_FAILURE;

        if (recv(sockfd, pkt, 8, 0) == -1)
            return EXIT_FAILURE;

        process_response(pkt);

        close(sockfd);

        return EXIT_SUCCESS;
    }

A reliable UDP client:

    #include <slack/std.h>
    #include <slack/net.h>

    void build_request(char *pkt) {} // Do something here
    void process_response(char *pkt) {} // Do something here

    int main()
    {
        char opkt[8], ipkt[8];
        int sockfd;
        rudp_t *rudp;

        if ((sockfd = net_udp_client("localhost", "echo", 7, 0, 0, NULL, NULL)) == -1)
            return EXIT_FAILURE;

        if (!(rudp = rudp_create()))
            return EXIT_FAILURE;

        build_request(opkt);

        if (net_rudp_transact(sockfd, rudp, opkt, 8, ipkt, 8) == -1)
            return EXIT_FAILURE;

        process_response(ipkt);

        rudp_re